├── .gitignore ├── cuda_acceleration_demo ├── conv.cpp ├── matmul_tile.cu ├── matmul_tile_full.cu └── sparse_mv.cu ├── ddp_example ├── README.md ├── data.py ├── data │ └── wikitext-2 │ │ ├── test.txt │ │ ├── train.txt │ │ └── valid.txt ├── main.py ├── model.py ├── requirements.txt └── synthetic.py ├── decoding └── decoding.ipynb ├── deepspeed_example ├── DeepSpeed-Example.ipynb ├── README.md ├── deepspeed_chat.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── dschat │ ├── rlhf │ │ ├── ppo_trainer.py │ │ └── rlhf_engine.py │ └── utils │ │ ├── data │ │ ├── data_utils.py │ │ └── raw_datasets.py │ │ ├── ds_utils.py │ │ ├── model │ │ ├── model_utils.py │ │ └── reward_model.py │ │ ├── module │ │ └── lora.py │ │ ├── perf.py │ │ └── utils.py ├── main.py ├── requirements.txt ├── run_llama2_7b.sh └── setup.py ├── imgs ├── broadcast_matrix_matrix.png ├── broadcast_matrix_vector.png ├── broadcast_mv_scalar.png ├── broadcast_rule.png ├── data_storage_operators.png ├── high_level_abstraction.png ├── reduce.jpg └── strides.png ├── mini_tensorflow ├── mini_tensorflow.ipynb └── mini_tensorflow_full.ipynb ├── minitorch_notebook └── minitorch_architecture.ipynb ├── simple_cuda_demo ├── CUDA_Code_Examples.ipynb ├── example_matadd.cu ├── example_matmul.cu ├── example_matmul2.cu ├── example_vector_add.cu ├── example_window_sum.cu ├── test_matmul.py ├── test_vector_add.py └── test_window_sum.py ├── tensor_demo ├── indexing_broadcasting.ipynb └── miniTorch │ ├── .vscode │ └── settings.json │ ├── LICENSE │ ├── minitorch.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ └── top_level.txt │ ├── minitorch │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── __init__.cpython-39.pyc │ │ ├── autodiff.cpython-38.pyc │ │ ├── autodiff.cpython-39.pyc │ │ ├── cuda_kernel_ops.cpython-39.pyc │ │ ├── cuda_ops.cpython-39.pyc │ │ ├── datasets.cpython-38.pyc │ │ ├── datasets.cpython-39.pyc │ │ ├── fast_conv.cpython-39.pyc │ │ ├── fast_ops.cpython-39.pyc │ │ ├── module.cpython-39.pyc │ │ ├── nn.cpython-39.pyc │ │ ├── operators.cpython-38.pyc │ │ ├── operators.cpython-39.pyc │ │ ├── optim.cpython-39.pyc │ │ ├── scalar.cpython-39.pyc │ │ ├── scalar_functions.cpython-39.pyc │ │ ├── tensor.cpython-38.pyc │ │ ├── tensor.cpython-39.pyc │ │ ├── tensor_data.cpython-38.pyc │ │ ├── tensor_data.cpython-39.pyc │ │ ├── tensor_functions.cpython-38.pyc │ │ ├── tensor_functions.cpython-39.pyc │ │ ├── tensor_ops.cpython-38.pyc │ │ ├── tensor_ops.cpython-39.pyc │ │ └── testing.cpython-39.pyc │ ├── autodiff.py │ ├── datasets.py │ ├── module.py │ ├── operators.py │ ├── optim.py │ ├── tensor.py │ ├── tensor_data.py │ ├── tensor_functions.py │ └── tensor_ops.py │ ├── requirements.extra.txt │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ └── style.sh └── tokenization └── tokenization.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /cuda_acceleration_demo/conv.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | using namespace std; 4 | #define TILE_WIDTH 16 5 | 6 | 7 | void naive_conv(int N, int H, int W, int K, int C_IN, int C_OUT, float *input, float *output, float *kernel) { 8 | int h_out = H - K + 1; 9 | int w_out = W - K + 1; 10 | // kernel: C_OUT * C_IN * K * K 11 | // input: N * C_IN * H * W 12 | // output: N * C_OUT * h_out * w_out 
13 |     for(int n = 0; n < N; n++) { // for each image in the mini-batch
14 |         for(int c_in = 0; c_in < C_IN; c_in++) { // for each input feature map (channel)
15 |             for(int c_out = 0; c_out < C_OUT; c_out++) { // for each output feature map
16 |                 for(int h = 0; h < h_out; h++) {
17 |                     for(int w = 0; w < w_out; w++) {
18 |                         for(int i = 0; i < K; i++) {
19 |                             for(int j = 0; j < K; j++) {
20 |                                 output[((n * C_OUT + c_out) * h_out + h) * w_out + w] += input[((n * C_IN + c_in) * H + h + i) * W + w + j] * kernel[((c_out * C_IN + c_in) * K + i) * K + j];
21 |                             }
22 |                         }
23 |                     }
24 |                 }
25 |             }
26 |         }
27 |     }
28 | }
29 | 
30 | void unroll_conv(int N, int H, int W, int K, int C_IN, int C_OUT, float *input, float *output, float *kernel) {
31 |     int h_out = H - K + 1;
32 |     int w_out = W - K + 1;
33 |     int W_unroll = C_IN * K * K;
34 |     int H_unroll = h_out * w_out;
35 |     float* input_unroll = new float[W_unroll * H_unroll];
36 |     for(int i = 0; i < N; i++) {
37 |         unroll(input + i * C_IN * H * W, input_unroll, C_IN, H, W, K, h_out, w_out);
38 |         gemm(input_unroll, kernel, output + i * C_OUT * h_out * w_out, W_unroll, C_OUT, H_unroll);
39 |     }
40 | }
41 | 
42 | void im2col(float* input, int C_IN, int H, int W, int K, float* output) {
43 |     int h_out = H - K + 1;
44 |     int w_out = W - K + 1;
45 |     int h_unroll = C_IN * K * K;
46 |     int w_unroll = h_out * w_out;
47 | 
48 |     for (int c = 0; c < C_IN; ++c) {
49 |         for(int h = 0; h < h_out; h++) {
50 |             for(int w = 0; w < w_out; w++) {
51 |                 for(int i = 0; i < K; i++) {
52 |                     for(int j = 0; j < K; j++) {
53 |                         // row (c * K * K + i * K + j), column (h * w_out + w) of the unrolled matrix
54 |                         output[(c * K * K + i * K + j) * w_unroll + h * w_out + w] = input[c * H * W + (h + i) * W + w + j];
55 |                     }
56 |                 }
57 |             }
58 |         }
59 |     }
60 | }
61 | 
62 | void unroll(float *input, float *input_unroll, int C_IN, int H, int W, int K, int h_out, int w_out) {
63 |     for(int c_in = 0; c_in < C_IN; c_in++) {
64 |         int w_base = c_in * K * K;
65 |         for(int h = 0; h < h_out; h++) {
66 |             for(int w = 0; w < w_out; w++) {
67 |                 for(int i = 0; i < K; i++) {
68 |                     for(int j = 0; j < K; j++) {
69 |                         input_unroll[(w_base + i * K + j) * h_out * w_out + h * w_out + w] = input[c_in * H * W + (h + i) * W + w + j];
70 |                     }
71 |                 }
72 |             }
73 |         }
74 |     }
75 | }
--------------------------------------------------------------------------------
/cuda_acceleration_demo/matmul_tile.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | 
4 | using namespace std;
5 | #define TILE_WIDTH 16
6 | 
7 | void matrix_multiply(float **a, float **b, float **c, int N) {
8 |     for (int i = 0; i < N; i++) {
9 |         for (int j = 0; j < N; j++) {
10 |             c[i][j] = 0;
11 |             for (int k = 0; k < N; k++) {
12 |                 c[i][j] += a[i][k] * b[k][j];
13 |             }
14 |         }
15 |     }
16 | }
17 | 
18 | /**
19 |  * @brief compute C=A*B using tile size TILE_WIDTH
20 |  *
21 |  * @param d_A matrix A
22 |  * @param d_B matrix B
23 |  * @param d_C result matrix C
24 |  * @param N size of matrix (number of rows and columns)
25 |  *
26 |  * hint: define two matrices of size TILE_WIDTH x TILE_WIDTH in shared memory;
27 |  * slide the tiles along the matrices and accumulate the partial sums of products.
28 |  */
29 | __global__ void MatMulTiledKernel(float* d_A, float* d_B, float* d_C, int N) {
30 |     // define two matrices in shared memory
31 | 
32 | 
33 |     // define the row and column in the result matrix of current thread
34 | 
35 | 
36 |     // iterate over tiles along row and column in d_A and d_B
37 | 
38 | 
39 | 
40 |     // store result
41 | 
42 | 
43 | }
--------------------------------------------------------------------------------
/cuda_acceleration_demo/matmul_tile_full.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | 
4 | using namespace std;
5 | #define TILE_WIDTH 16
6 | 
7 | void matrix_multiply(float **a, float **b, float **c, int N) {
8 |     for (int i = 0; i < N; i++) {
9 |         for (int j = 0; j < N; j++) {
10 |             c[i][j] = 0;
11 |             for (int k = 0; k < N; k++) {
12 |                 c[i][j] += a[i][k] * b[k][j];
13 |             }
14 |         }
15 |     }
16 | }
17 | 
18 | /**
19 |  * @brief compute C=A*B using tile size TILE_WIDTH
20 |  *
21 |  * @param d_A matrix A
22 |  * @param d_B matrix B
23 |  * @param d_C result matrix C
24 |  * @param N size of matrix (number of rows and columns); assumed to be a multiple of TILE_WIDTH
25 |  */
26 | __global__ void MatMulTiledKernel(float* d_A, float* d_B, float* d_C, int N) {
27 |     __shared__ float As[TILE_WIDTH][TILE_WIDTH];
28 |     __shared__ float Bs[TILE_WIDTH][TILE_WIDTH];
29 | 
30 |     // Determine the row and col of the C element to be calculated by this thread
31 |     int row = blockIdx.y * blockDim.y + threadIdx.y;
32 |     int col = blockIdx.x * blockDim.x + threadIdx.x;
33 |     float Cvalue = 0;
34 |     for(int ph = 0; ph < N/TILE_WIDTH; ++ph) {
35 |         As[threadIdx.y][threadIdx.x] = d_A[row * N + ph * TILE_WIDTH + threadIdx.x];
36 |         Bs[threadIdx.y][threadIdx.x] = d_B[(ph * TILE_WIDTH + threadIdx.y) * N + col];
37 |         __syncthreads();
38 |         for(int k = 0; k < TILE_WIDTH; ++k) {
39 |             Cvalue += As[threadIdx.y][k] * Bs[k][threadIdx.x];
40 |         }
41 |         __syncthreads();
42 |     }
43 |     d_C[row * N + col] = Cvalue;
44 | }
--------------------------------------------------------------------------------
/cuda_acceleration_demo/sparse_mv.cu:
--------------------------------------------------------------------------------
1 | #include <cstdio>
2 | #include <cuda_runtime.h>
3 | 
4 | using namespace std;
5 | #define TILE_WIDTH 16
6 | 
7 | void spmv_csr(float *data, int *col_index, int *row_ptr, float *x, float *y, int n) {
8 |     for(int row = 0; row < n; row++) {
9 |         float dot = 0;
10 |         int row_start = row_ptr[row];
11 |         int row_end = row_ptr[row + 1];
12 |         for(int elem = row_start; elem < row_end; elem++) {
13 |             dot += data[elem] * x[col_index[elem]];
14 |         }
15 |         y[row] += dot;
16 |     }
17 | }
18 | 
19 | __global__ void SpMVCSRKernel(float *data, int *col_index, int *row_ptr, float *x, float *y, int num_rows) {
20 |     int row = blockIdx.x * blockDim.x + threadIdx.x;
21 |     if(row < num_rows) {
22 |         float dot = 0;
23 |         int row_start = row_ptr[row];
24 |         int row_end = row_ptr[row + 1];
25 |         for(int elem = row_start; elem < row_end; elem++) {
26 |             dot += data[elem] * x[col_index[elem]];
27 |         }
28 |         y[row] += dot;
29 |     }
30 | }
--------------------------------------------------------------------------------
/ddp_example/README.md:
--------------------------------------------------------------------------------
1 | # Language Model Training Example
2 | 
3 | This example is adapted from [PyTorch Word Language Model](https://github.com/pytorch/examples/tree/main/word_language_model) and demonstrates how to train a language model using PyTorch, both on a single GPU and using Distributed Data Parallel (DDP) for multiple GPUs.
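For reference, the CSR layout consumed by `spmv_csr` and `SpMVCSRKernel` in `cuda_acceleration_demo/sparse_mv.cu` above can be sanity-checked against a dense matrix-vector product in plain NumPy. This is a minimal illustrative sketch with a made-up 4x4 matrix; it is not a file from the repository:

```python
import numpy as np

# Dense matrix encoded below in CSR form:
# [[3 0 1 0]
#  [0 0 0 0]
#  [0 2 4 1]
#  [1 0 0 1]]
data      = np.array([3., 1., 2., 4., 1., 1., 1.])  # non-zero values, stored row by row
col_index = np.array([0, 2, 1, 2, 3, 0, 3])         # column of each non-zero
row_ptr   = np.array([0, 2, 2, 5, 7])               # where each row starts in `data`

x = np.array([1., 2., 3., 4.])
y = np.zeros(4)
for row in range(4):
    # same loop body as the CUDA kernel, with one row handled per "thread"
    for elem in range(row_ptr[row], row_ptr[row + 1]):
        y[row] += data[elem] * x[col_index[elem]]

dense = np.array([[3., 0., 1., 0.],
                  [0., 0., 0., 0.],
                  [0., 2., 4., 1.],
                  [1., 0., 0., 1.]])
assert np.allclose(y, dense @ x)  # y == [6., 0., 20., 5.]
```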
4 | 
5 | ## Instructions
6 | 
7 | ### Running Training on a Single GPU
8 | To train the model on a single GPU, run the following command:
9 | ```
10 | torchrun --nproc_per_node=1 main.py
11 | ```
12 | 
13 | This will start training with the default settings. You should see output similar to:
14 | ```
15 | | Epoch 1 | Time: 13.50s | Train Loss 7.85 | Valid Loss 7.21 | Perplexity 1355.19
16 | | Epoch 2 | Time: 13.32s | Train Loss 7.30 | Valid Loss 6.93 | Perplexity 1026.65
17 | | Epoch 3 | Time: 13.36s | Train Loss 7.09 | Valid Loss 6.79 | Perplexity 885.36
18 | | Epoch 4 | Time: 13.48s | Train Loss 6.97 | Valid Loss 6.69 | Perplexity 805.83
19 | | Epoch 5 | Time: 13.52s | Train Loss 6.89 | Valid Loss 6.62 | Perplexity 746.42
20 | | End of Training | Test Loss 6.53 | Test Perplexity 687.53
21 | ```
22 | 
23 | ### Running Distributed Data Parallel (DDP) Training on Multiple GPUs
24 | Set `--nproc_per_node` to the number of GPUs you wish to use. For example, to use 2 GPUs:
25 | ```
26 | torchrun --nproc_per_node=2 main.py --ddp
27 | ```
28 | 
29 | This will start the training process with DDP enabled, and you should see output similar to:
30 | ```
31 | | Epoch 1 | Time: 7.48s | Train Loss 8.14 | Valid Loss 7.47 | Perplexity 1757.71
32 | | Epoch 2 | Time: 7.28s | Train Loss 7.56 | Valid Loss 7.21 | Perplexity 1353.03
33 | | Epoch 3 | Time: 7.29s | Train Loss 7.36 | Valid Loss 7.05 | Perplexity 1154.24
34 | | Epoch 4 | Time: 7.31s | Train Loss 7.23 | Valid Loss 6.94 | Perplexity 1027.78
35 | | Epoch 5 | Time: 7.32s | Train Loss 7.13 | Valid Loss 6.85 | Perplexity 945.97
36 | | End of Training | Test Loss 6.77 | Test Perplexity 873.09
37 | ```
38 | 
39 | Training and validation loss and perplexity plots are saved as `training_metrics*.png`.
40 | 
41 | ### Debugging DDP
42 | If DDP initialization hangs, your GPUs are likely not connected via NVLink; in that case, set `export NCCL_P2P_DISABLE=1`.
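### How `torchrun` and the script connect
`torchrun` launches one process per GPU and exports the `RANK`, `LOCAL_RANK`, and `WORLD_SIZE` environment variables; `torch.distributed.init_process_group` reads them during initialization, which is the handshake `main.py` relies on when `--ddp` is passed. The snippet below is a minimal illustrative sketch (not a file in this repo) of that setup with the `nccl` backend:
```
import os
import torch
import torch.distributed as dist

def setup():
    # torchrun sets LOCAL_RANK, RANK and WORLD_SIZE for every worker it spawns
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")  # rank/world size are read from the environment
    return dist.get_rank(), dist.get_world_size()

if __name__ == "__main__":
    rank, world_size = setup()
    print(f"rank {rank}/{world_size} using GPU {torch.cuda.current_device()}")
    dist.destroy_process_group()
```
Launched with `torchrun --nproc_per_node=2 sketch.py`, each of the two processes prints its own rank.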
43 | -------------------------------------------------------------------------------- /ddp_example/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import open 3 | import torch 4 | from torch.utils.data import Dataset, DataLoader 5 | 6 | class Dictionary(object): 7 | def __init__(self): 8 | self.word2idx = {} 9 | self.idx2word = [] 10 | 11 | def add_word(self, word): 12 | if word not in self.word2idx: 13 | self.idx2word.append(word) 14 | self.word2idx[word] = len(self.idx2word) - 1 15 | return self.word2idx[word] 16 | 17 | def __len__(self): 18 | return len(self.idx2word) 19 | 20 | 21 | class Corpus(object): 22 | def __init__(self, path): 23 | self.dictionary = Dictionary() 24 | self.train = self.tokenize(os.path.join(path, 'train.txt')) 25 | self.valid = self.tokenize(os.path.join(path, 'valid.txt')) 26 | self.test = self.tokenize(os.path.join(path, 'test.txt')) 27 | 28 | def tokenize(self, path): 29 | """Tokenizes a text file.""" 30 | assert os.path.exists(path) 31 | # Add words to the dictionary 32 | with open(path, 'r', encoding="utf8") as f: 33 | for line in f: 34 | words = line.split() + [''] 35 | for word in words: 36 | self.dictionary.add_word(word) 37 | 38 | # Tokenize file content 39 | with open(path, 'r', encoding="utf8") as f: 40 | idss = [] 41 | for line in f: 42 | words = line.split() + [''] 43 | ids = [] 44 | for word in words: 45 | ids.append(self.dictionary.word2idx[word]) 46 | idss.append(torch.tensor(ids).type(torch.int64)) 47 | ids = torch.cat(idss) 48 | 49 | return ids 50 | 51 | class TextDataset(Dataset): 52 | """Formats corpus into sequences for language modeling""" 53 | def __init__(self, data, seq_len, device, batch_size = 128): 54 | self.device = device 55 | self.seq_len = seq_len 56 | data = data.narrow(0, 0, data.size(0) // batch_size * batch_size) 57 | self.data = data.view(batch_size, -1).t().contiguous().to(device) 58 | self.device = device 59 | 60 | self.indices = list(range(0, self.data.size(0)-self.seq_len, self.seq_len)) 61 | 62 | def __len__(self): 63 | return len(self.indices) 64 | 65 | def __getitem__(self, idx): 66 | start_idx = self.indices[idx] 67 | inputs = self.data[start_idx: start_idx + self.seq_len] 68 | targets = self.data[start_idx + 1: start_idx + self.seq_len + 1].view(-1) 69 | return inputs.to(self.device), targets.to(self.device) 70 | 71 | def get_dataloader(dataset, seq_len, batch_size=1, sampler=None): 72 | loader = DataLoader( 73 | dataset, 74 | batch_size=batch_size, 75 | sampler=sampler, 76 | shuffle=(sampler is None), 77 | ) 78 | return loader 79 | 80 | -------------------------------------------------------------------------------- /ddp_example/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import math 4 | import os 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.distributed as dist 9 | from torch.nn.parallel import DistributedDataParallel as DDP 10 | from torch.utils.data import DistributedSampler 11 | import matplotlib.pyplot as plt 12 | 13 | import data 14 | import model 15 | from data import TextDataset, get_dataloader 16 | 17 | def setup_ddp(): 18 | dist.init_process_group("nccl") 19 | rank = dist.get_rank() 20 | torch.cuda.set_device(rank) 21 | return rank, dist.get_world_size() 22 | 23 | def cleanup_ddp(): 24 | dist.destroy_process_group() 25 | 26 | def parse_args(): 27 | parser = argparse.ArgumentParser(description='PyTorch 
Wikitext-2 Transformer Language Model') 28 | parser.add_argument('--ddp', action='store_true', help='run with ddp', default=False) 29 | parser.add_argument('--batch_size', type=int, default=128) 30 | return parser.parse_args() 31 | 32 | def plot_training_metrics(train_losses, val_losses, ppl_values, save_path='training_metrics.png', ddp=False): 33 | if ddp: 34 | save_path = save_path.replace('.png', '_ddp.png') 35 | 36 | epochs = range(1, len(train_losses) + 1) 37 | fig, axs = plt.subplots(3, 1, figsize=(8, 12)) 38 | axs[0].plot(epochs, train_losses, marker='o', linestyle='-', color='blue', label='Training Loss') 39 | axs[1].plot(epochs, val_losses, marker='o', linestyle='-', color='red', label='Validation Loss') 40 | axs[2].plot(epochs, ppl_values, marker='o', linestyle='-', color='green', label='Perplexity') 41 | for ax, title in zip(axs, ['Training Loss', 'Validation Loss', 'Perplexity']): 42 | ax.set_xlabel('Epochs') 43 | ax.set_ylabel('Value') 44 | ax.set_title(title) 45 | ax.grid(True) 46 | ax.legend() 47 | plt.tight_layout() 48 | plt.savefig(save_path) 49 | print(f"Plot saved as {save_path}") 50 | 51 | class Trainer: 52 | def __init__(self, args, model, train_loader, val_loader, test_loader, ntokens, device): 53 | self.args = args 54 | self.device = device 55 | self.ntokens = ntokens 56 | self.model = model.to(device) 57 | self.lr = 0.05 58 | if args.ddp: 59 | self.model = DDP(self.model, device_ids=[device.index]) 60 | self.lr *= dist.get_world_size() #scale learning rate to compensate gradient averaging 61 | 62 | self.train_loader = train_loader 63 | self.val_loader = val_loader 64 | self.test_loader = test_loader 65 | self.criterion = nn.NLLLoss() 66 | self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr) 67 | self.best_val_loss = None 68 | 69 | def train_one_epoch(self, epoch): 70 | if self.args.ddp: 71 | self.train_loader.sampler.set_epoch(epoch) 72 | 73 | self.model.train() 74 | total_loss = 0. 75 | start_time = time.time() 76 | for data, targets in self.train_loader: 77 | self.model.zero_grad() 78 | data = data.squeeze(0).clone() 79 | targets = targets.squeeze(0).clone() 80 | output = self.model(data) 81 | targets = targets.view(-1) 82 | output = output.view(-1, self.ntokens) 83 | loss = self.criterion(output, targets) 84 | loss.backward() 85 | self.optimizer.step() 86 | total_loss = total_loss + loss.item() 87 | 88 | if self.args.ddp: 89 | total_loss_tensor = torch.tensor(total_loss, device=self.device) 90 | dist.all_reduce(total_loss_tensor, op=dist.ReduceOp.SUM) 91 | 92 | return (total_loss_tensor / dist.get_world_size() / len(self.train_loader)).item() 93 | else: 94 | return total_loss / len(self.train_loader) 95 | 96 | def evaluate_model(self, loader): 97 | self.model.eval() 98 | total_loss = 0. 
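        # Evaluation mirrors the training loop but runs under torch.no_grad(); when
        # --ddp is enabled, each rank's summed loss is combined with all_reduce below
        # and divided by the world size to recover the mean loss across processes.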
99 | with torch.no_grad(): 100 | for data, targets in loader: 101 | data = data.squeeze(0) 102 | targets = targets.squeeze(0) 103 | output = self.model(data) 104 | output = output.view(-1, self.ntokens) 105 | total_loss = total_loss + self.criterion(output, targets).item() 106 | 107 | if self.args.ddp: 108 | total_loss_tensor = torch.tensor(total_loss, device=self.device) 109 | dist.all_reduce(total_loss_tensor, op=dist.ReduceOp.SUM) 110 | 111 | return (total_loss_tensor / dist.get_world_size() / len(loader)).item() 112 | else: 113 | return total_loss / len(loader) 114 | 115 | def train_model(self, num_epochs=5): 116 | train_losses, val_losses, ppl_values = [], [], [] 117 | try: 118 | for epoch in range(1, num_epochs + 1): 119 | epoch_start_time = time.time() 120 | train_loss = self.train_one_epoch(epoch) 121 | val_loss = self.evaluate_model(self.val_loader) 122 | ppl = math.exp(val_loss) 123 | 124 | if not self.args.ddp or (self.args.ddp and self.device.index == 0): 125 | train_losses.append(train_loss) 126 | val_losses.append(val_loss) 127 | ppl_values.append(ppl) 128 | print(f'| Epoch {epoch} | Time: {time.time() - epoch_start_time:.2f}s | Train Loss {train_loss:.2f} | Valid Loss {val_loss:.2f} | Perplexity {ppl:.2f}') 129 | 130 | if self.best_val_loss is None or val_loss < self.best_val_loss: 131 | torch.save(self.model, 'model.pt') 132 | self.best_val_loss = val_loss 133 | else: 134 | self.lr /= 4.0 135 | except KeyboardInterrupt: 136 | print('Exiting from training early') 137 | 138 | self.test_model() 139 | if not self.args.ddp or (self.args.ddp and self.device.index == 0): 140 | plot_training_metrics(train_losses, val_losses, ppl_values, ddp=self.args.ddp) 141 | 142 | def test_model(self): 143 | test_loss = self.evaluate_model(self.test_loader) 144 | if not self.args.ddp or (self.args.ddp and self.device.index == 0): 145 | print(f'| End of Training | Test Loss {test_loss:.2f} | Test Perplexity {math.exp(test_loss):.2f}') 146 | 147 | 148 | def main(): 149 | args = parse_args() 150 | 151 | if args.ddp: 152 | rank, world_size = setup_ddp() 153 | device = torch.device(f"cuda:{rank}") 154 | else: 155 | rank, world_size = 0, 1 156 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 157 | 158 | torch.manual_seed(42) 159 | corpus = data.Corpus('./data/wikitext-2') 160 | ntokens = len(corpus.dictionary) 161 | seq_len = 40 162 | 163 | corpus.train = TextDataset(corpus.train, seq_len, device, args.batch_size) 164 | corpus.valid = TextDataset(corpus.valid, seq_len, device, args.batch_size) 165 | corpus.test = TextDataset(corpus.test, seq_len, device, args.batch_size) 166 | 167 | sampler = DistributedSampler(corpus.train, world_size, rank, shuffle=False) if args.ddp else None 168 | train_loader = get_dataloader(corpus.train, seq_len=40, sampler=sampler) 169 | val_loader = get_dataloader(corpus.valid, seq_len=40) 170 | test_loader = get_dataloader(corpus.test, seq_len=40) 171 | 172 | model_inst = model.TransformerModel(ntokens, 256, 8, 256, 4, 0.2) 173 | trainer = Trainer(args, model_inst, train_loader, val_loader, test_loader, ntokens, device) 174 | trainer.train_model() 175 | 176 | if args.ddp: 177 | cleanup_ddp() 178 | 179 | if __name__ == "__main__": 180 | main() 181 | -------------------------------------------------------------------------------- /ddp_example/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class 
PositionalEncoding(nn.Module): 7 | 8 | def __init__(self, d_model, dropout=0.1, max_len=5000): 9 | super(PositionalEncoding, self).__init__() 10 | self.dropout = nn.Dropout(p=dropout) 11 | 12 | pe = torch.zeros(max_len, d_model) 13 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 14 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 15 | pe[:, 0::2] = torch.sin(position * div_term) 16 | pe[:, 1::2] = torch.cos(position * div_term) 17 | pe = pe.unsqueeze(0).transpose(0, 1) 18 | self.register_parameter('pe', nn.Parameter(pe, requires_grad=False)) 19 | 20 | def forward(self, x): 21 | x = x + self.pe[:x.size(0), :] 22 | return self.dropout(x) 23 | 24 | class TransformerModel(nn.Transformer): 25 | """Container module with an encoder, a recurrent or transformer module, and a decoder.""" 26 | 27 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): 28 | super(TransformerModel, self).__init__(d_model=ninp, nhead=nhead, dim_feedforward=nhid, num_encoder_layers=nlayers) 29 | self.model_type = 'Transformer' 30 | self.pos_encoder = PositionalEncoding(ninp, dropout) 31 | 32 | self.input_emb = nn.Embedding(ntoken, ninp) 33 | self.ninp = ninp 34 | self.decoder = nn.Linear(ninp, ntoken) 35 | 36 | self.init_weights() 37 | 38 | def _generate_square_subsequent_mask(self, sz): 39 | return torch.log(torch.tril(torch.ones(sz,sz))) 40 | 41 | def init_weights(self): 42 | initrange = 0.1 43 | nn.init.uniform_(self.input_emb.weight, -initrange, initrange) 44 | nn.init.zeros_(self.decoder.bias) 45 | nn.init.uniform_(self.decoder.weight, -initrange, initrange) 46 | 47 | def forward(self, src, has_mask=True): 48 | mask = None 49 | if has_mask: 50 | mask = self._generate_square_subsequent_mask(len(src)) 51 | src = self.input_emb(src) * math.sqrt(self.ninp) 52 | src = self.pos_encoder(src) 53 | output = self.encoder(src, mask=mask) 54 | output = self.decoder(output) 55 | return F.log_softmax(output, dim=-1) 56 | -------------------------------------------------------------------------------- /ddp_example/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.26.4 2 | torch==2.5.1 3 | torchaudio==2.5.1 4 | torcheval==0.0.7 5 | torchvision==0.20.1 6 | nvidia-cuda-cupti-cu11==11.8.87 7 | nvidia-cuda-cupti-cu12==12.4.127 8 | nvidia-cuda-nvrtc-cu11==11.7.99 9 | nvidia-cuda-nvrtc-cu12==12.4.127 10 | nvidia-cuda-runtime-cu11==11.7.99 11 | nvidia-cuda-runtime-cu12==12.4.127 12 | matplotlib==3.7.1 13 | matplotlib-inline==0.1.6 14 | nvidia-nccl-cu11==2.20.5 15 | nvidia-nccl-cu12==2.21.5 16 | -------------------------------------------------------------------------------- /ddp_example/synthetic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | import torch.optim as optim 7 | 8 | from torch.nn.parallel import DistributedDataParallel as DDP 9 | from torch.utils.data import Dataset, DataLoader, TensorDataset 10 | from torch.utils.data.distributed import DistributedSampler 11 | 12 | 13 | class ToyModel(nn.Module): 14 | def __init__(self): 15 | super(ToyModel, self).__init__() 16 | self.net1 = nn.Linear(10, 10) 17 | self.relu = nn.ReLU() 18 | self.net2 = nn.Linear(10, 5) 19 | 20 | def forward(self, x): 21 | return self.net2(self.relu(self.net1(x))) 22 | 23 | 24 | def create_dataset(num_samples): 25 | # Here we create a synthetic dataset 26 | inputs = 
torch.randn(num_samples, 10) 27 | labels = torch.randn(num_samples, 5) 28 | return TensorDataset(inputs, labels) 29 | 30 | 31 | def train_single_gpu(dataset, batch_size=1024, num_epochs=100): 32 | # The following code only train with one gpu 33 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 34 | model = ToyModel().to(device) 35 | loss_fn = nn.MSELoss() 36 | optimizer = optim.SGD(model.parameters(), lr=0.001) 37 | 38 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) 39 | 40 | start_time = time.time() 41 | for epoch in range(num_epochs): 42 | for inputs, labels in dataloader: 43 | inputs, labels = inputs.to(device), labels.to(device) 44 | optimizer.zero_grad() 45 | outputs = model(inputs) 46 | loss = loss_fn(outputs, labels) 47 | loss.backward() 48 | optimizer.step() 49 | end_time = time.time() 50 | 51 | return end_time - start_time 52 | 53 | 54 | def train_ddp(dataset, batch_size=1024, num_epochs=100): 55 | # We initialize process group with nccl for communication 56 | torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) 57 | dist.init_process_group("nccl") 58 | 59 | rank = dist.get_rank() 60 | world_size = dist.get_world_size() 61 | 62 | device_id = rank % torch.cuda.device_count() 63 | model = ToyModel().to(device_id) 64 | ddp_model = DDP(model, device_ids=[device_id]) 65 | 66 | loss_fn = nn.MSELoss() 67 | optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 68 | 69 | # We sample using DistributedSampler such that each GPU gets unique data 70 | sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True) 71 | dataloader = DataLoader(dataset, batch_size=batch_size // world_size, sampler=sampler) 72 | 73 | start_time = time.time() 74 | for epoch in range(num_epochs): 75 | sampler.set_epoch(epoch) # we will have different shuffling per epoch 76 | for inputs, labels in dataloader: 77 | inputs, labels = inputs.to(device_id), labels.to(device_id) 78 | optimizer.zero_grad() 79 | outputs = ddp_model(inputs) 80 | loss = loss_fn(outputs, labels) 81 | loss.backward() 82 | optimizer.step() 83 | end_time = time.time() 84 | 85 | dist.destroy_process_group() 86 | return end_time - start_time 87 | 88 | 89 | if __name__ == "__main__": 90 | import sys 91 | 92 | NUM_SAMPLES = 100000 93 | BATCH_SIZE = 1024 94 | NUM_EPOCHS = 100 95 | 96 | dataset = create_dataset(NUM_SAMPLES) 97 | 98 | if "--ddp" in sys.argv: 99 | # Run DDP training 100 | total_time = train_ddp(dataset, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS) 101 | print(f"DDP Training Time: {total_time:.4f} seconds") 102 | 103 | else: 104 | # Run Single-GPU training 105 | total_time = train_single_gpu(dataset, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS) 106 | print(f"Single-GPU Training Time: {total_time:.4f} seconds") 107 | 108 | -------------------------------------------------------------------------------- /decoding/decoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "[[4, 0, 4, 0, 4, 0, 4, 0, 4, 0], 6.931471805599453]\n", 13 | "[[4, 0, 4, 0, 4, 0, 4, 0, 4, 1], 7.154615356913663]\n", 14 | "[[4, 0, 4, 0, 4, 0, 4, 0, 3, 0], 7.154615356913663]\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from math import log\n", 20 | "from numpy import array\n", 21 | "from numpy import argmax\n", 22 | "\n", 23 | "# beam search\n", 24 | "def 
beam_search_decoder(next_token_probs, k):\n", 25 | "\t\"\"\"beam search decoding \n", 26 | " \n", 27 | " next_token_probs: next token probabilities \n", 28 | " k: beam size\n", 29 | "\t\"\"\"\n", 30 | "\tsequences = [[list(), 0.0]]\n", 31 | "\t# walk over each step in sequence\n", 32 | "\tfor token_probs in next_token_probs:\n", 33 | "\t\tall_candidates = list()\n", 34 | "\t\t# expand each current candidate\n", 35 | "\t\tfor current_seq in range(len(sequences)):\n", 36 | "\t\t\tseq, score = sequences[current_seq]\n", 37 | "\t\t\tfor tk_id in range(len(token_probs)):\n", 38 | "\t\t\t\tcandidate = [seq + [tk_id], score - log(token_probs[tk_id])]\n", 39 | "\t\t\t\tall_candidates.append(candidate)\n", 40 | "\t\t# order all candidates by score\n", 41 | "\t\tordered = sorted(all_candidates, key=lambda tup:tup[1])\n", 42 | "\t\t# select k best\n", 43 | "\t\tsequences = ordered[:k]\n", 44 | "\treturn sequences\n", 45 | "\n", 46 | "# define a sequence of 10 words over a vocab of 5 words\n", 47 | "data = [[0.1, 0.2, 0.3, 0.4, 0.5],\n", 48 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 49 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 50 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 51 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 52 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 53 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 54 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1],\n", 55 | "\t\t[0.1, 0.2, 0.3, 0.4, 0.5],\n", 56 | "\t\t[0.5, 0.4, 0.3, 0.2, 0.1]]\n", 57 | "data = array(data)\n", 58 | "# decode sequence\n", 59 | "result = beam_search_decoder(data, 3)\n", 60 | "# print result\n", 61 | "for seq in result:\n", 62 | "\tprint(seq)" 63 | ] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "base", 69 | "language": "python", 70 | "name": "python3" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 3 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython3", 82 | "version": "3.9.18" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /deepspeed_example/DeepSpeed-Example.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOm999wxkBzBdw34in6tbAt"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["This is adopted from https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md.\n","\n","We only demonstrate the SFT step here, but feel free to try out the reward modeling and RLHF training!"],"metadata":{"id":"edL-x2OUcFsT"}},{"cell_type":"markdown","source":["Env Setup: Make sure you are using GPU runtime.(e.g. 
T4)"],"metadata":{"id":"a8fmdQUScgoE"}},{"cell_type":"markdown","source":["Inside the training script `main.py`, we will need some modifications...\n","\n","(Note: libraries such huggingface accelerate integrates with Deepspeed, see https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed for more details!)"],"metadata":{"id":"ztHx5i5ee1I9"}},{"cell_type":"code","source":["!pip install deepspeed"],"metadata":{"id":"noq0zvYqn7kC"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import deepspeed\n","from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n","from deepspeed import get_accelerator\n","\n","import argparse\n","import torch\n","\n","#skip the details here, please check actual main.py\n","def get_train_ds_config(offload,\n"," dtype,\n"," stage=2,\n"," enable_hybrid_engine=False,\n"," inference_tp_size=1,\n"," release_inference_cache=False,\n"," pin_parameters=True,\n"," tp_gather_partition_size=8,\n"," max_out_tokens=512,\n"," enable_tensorboard=False,\n"," enable_mixed_precision_lora=False,\n"," tb_path=\"\",\n"," tb_name=\"\"):\n"," pass\n","def parse_args():\n"," pass\n","def to_device():\n"," pass\n","\n","def main():\n"," args = parse_args()\n","\n"," if args.local_rank == -1:\n"," device = torch.device(get_accelerator().device_name())\n"," else:\n"," get_accelerator().set_device(args.local_rank)\n"," device = torch.device(get_accelerator().device_name(), args.local_rank)\n"," # Initializes the distributed backend which will take care of sychronizing nodes/GPUs\n"," # torch.distributed.init_process_group(backend='nccl')\n"," deepspeed.init_distributed()\n"," ds_config = get_train_ds_config(offload=args.offload,\n"," dtype=args.dtype,\n"," stage=args.zero_stage,\n"," enable_tensorboard=args.enable_tensorboard,\n"," tb_path=args.tensorboard_path,\n"," tb_name=\"step1_model\")\n"," # Deepspeed provide custom cpu adam if we want to offload optimizer to cpu\n"," AdamOptimizer = DeepSpeedCPUAdam if args.offload else FusedAdam\n"," # Initialize model with deepspeed\n"," # DeepSpeed model training is accomplished using the DeepSpeed engine.\n"," # The engine can wrap any arbitrary model of type torch.nn.module and\n"," # has a minimal set of APIs for training and checkpointing the model.\n"," model, optimizer, _, lr_scheduler = deepspeed.initialize(\n"," model=model,\n"," optimizer=optimizer,\n"," args=args,\n"," config=ds_config,\n"," lr_scheduler=lr_scheduler,\n"," dist_init_required=True)\n"," train_dataloader = None\n"," #DeepSpeed automatically performs the necessary operations required for distributed data parallel training,\n"," #in mixed precision, with a pre-defined learning rate scheduler. No code change needed.\n"," for epoch in range(args.num_train_epochs):\n"," model.train()\n"," import time\n"," for step, batch in enumerate(train_dataloader):\n"," start = time.time()\n"," batch = to_device(batch, device)\n"," outputs = model(**batch, use_cache=False)\n"," loss = outputs.loss\n"," if args.print_loss:\n"," print(\n"," f\"Epoch: {epoch}, Step: {step}, Rank: {torch.distributed.get_rank()}, loss = {loss}\"\n"," )\n"," model.backward(loss)\n"," model.step()\n"," end = time.time()"],"metadata":{"id":"FVdLubB9eq1G","executionInfo":{"status":"ok","timestamp":1743549424621,"user_tz":240,"elapsed":28,"user":{"displayName":"Kath Choi","userId":"14493180204401828909"}}},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":["Let's see what is needed to run the training script with deepspeed command line:\n","1. 
specify your training script `main.py`\n","2. specify args such as `model_name_or_path`, `zero_stage` etc.\n","\n","To see the full list of args supported, see https://www.deepspeed.ai/docs/config-json/"],"metadata":{"id":"-dlTlGeTeNxx"}},{"cell_type":"markdown","source":["Recall that:\n","1. Optimizer state partitioning (ZeRO stage 1)\n","2. Gradient partitioning (ZeRO stage 2)\n","3. Parameter partitioning (ZeRO stage 3)"],"metadata":{"id":"il2y5cowhv43"}},{"cell_type":"markdown","source":["The bash scipt looks something like this"],"metadata":{"id":"Swm3dKlOm4RJ"}},{"cell_type":"code","source":["\"\"\"\n","ZERO_STAGE=$1\n","OUTPUT=./output_llama2_7b\n","if [ \"$ZERO_STAGE\" == \"\" ]; then\n"," ZERO_STAGE=3\n","fi\n","mkdir -p $OUTPUT\n","\n","deepspeed main.py \\\n"," --data_split 2,4,4 \\\n"," --model_name_or_path meta-llama/Llama-2-7b-hf \\\n"," --per_device_train_batch_size 1 \\\n"," --per_device_eval_batch_size 4 \\\n"," --max_seq_len 512 \\\n"," --learning_rate 9.65e-6 \\\n"," --weight_decay 0. \\\n"," --num_train_epochs 3 \\\n"," --gradient_accumulation_steps 4 \\\n"," --lr_scheduler_type cosine \\\n"," --num_warmup_steps 0 \\\n"," --seed 1234 \\\n"," --gradient_checkpointing \\\n"," --dtype bf16 \\\n"," --zero_stage $ZERO_STAGE \\\n"," --deepspeed \\\n"," --output_dir $OUTPUT \\\n","\"\"\""],"metadata":{"id":"tlbF27LkeFBZ"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Then we can run the command in a cluster and monitor the nvidia-smi to see the how much memory is saved!"],"metadata":{"id":"B8iioP1jSivH"}}]} -------------------------------------------------------------------------------- /deepspeed_example/README.md: -------------------------------------------------------------------------------- 1 | This example is adopted from https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat, with some modifications. 2 | 3 | ### Request GPUs 4 | ```bash 5 | srun --cpus-per-task=5 --gpus=4 --mem=256GB --partition= --time=10:34:56 --pty bash 6 | ``` 7 | 8 | ### Installation 9 | 10 | ```bash 11 | pip install -r requirements.txt 12 | pip install -e . 13 | ``` 14 | 15 | ### Finetuning LLaMA2-7b 16 | If stage is not provided, the script will use DeepSpeed ZeRO stage 3 by default. 17 | Using stage 0 would disable ZeRO. (You will probably see CUDA OOM errors!) 
18 | ```bash 19 | bash run_llama2_7b.sh 20 | ``` 21 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.2 2 | Name: deepspeed-chat 3 | Version: 0.1 4 | Home-page: https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat 5 | Requires-Dist: datasets>=2.8.0 6 | Requires-Dist: sentencepiece>=0.1.97 7 | Requires-Dist: protobuf==3.20.3 8 | Requires-Dist: accelerate>=0.15.0 9 | Requires-Dist: torch>=1.12.0 10 | Requires-Dist: deepspeed>=0.9.2 11 | Requires-Dist: transformers!=4.33.2,>=4.31.0 12 | Requires-Dist: tensorboard 13 | Provides-Extra: azureml 14 | Requires-Dist: azure-ml-component; extra == "azureml" 15 | Requires-Dist: azureml-core; extra == "azureml" 16 | Dynamic: home-page 17 | Dynamic: provides-extra 18 | Dynamic: requires-dist 19 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | setup.py 2 | deepspeed_chat.egg-info/PKG-INFO 3 | deepspeed_chat.egg-info/SOURCES.txt 4 | deepspeed_chat.egg-info/dependency_links.txt 5 | deepspeed_chat.egg-info/requires.txt 6 | deepspeed_chat.egg-info/top_level.txt -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | torch>=1.12.0 6 | deepspeed>=0.9.2 7 | transformers!=4.33.2,>=4.31.0 8 | tensorboard 9 | 10 | [azureml] 11 | azure-ml-component 12 | azureml-core 13 | -------------------------------------------------------------------------------- /deepspeed_example/deepspeed_chat.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/rlhf/rlhf_engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import time 6 | import torch 7 | import deepspeed 8 | from deepspeed.ops.adam import FusedAdam 9 | from deepspeed.ops.adam import DeepSpeedCPUAdam 10 | from transformers import AutoModelForCausalLM, get_scheduler 11 | 12 | from dschat.utils.ds_utils import get_train_ds_config, get_eval_ds_config 13 | from dschat.utils.module.lora import convert_linear_layer_to_lora, only_optimize_lora_parameters, make_model_gradient_checkpointing_compatible 14 | from dschat.utils.model.model_utils import create_hf_model, create_critic_model 15 | from dschat.utils.utils import get_optimizer_grouped_parameters 16 | """ 17 | TODOs: 18 | * support HF models for critic (for debugging), must be a previously saved ckpt from step-2 19 | * determine ds_config/zero_stage based on model size, gpu style, world size, etc 20 | - get model size by creating simple meta model 21 | - 1.3b: zero-2 for actor/ref models, zero-0 for others 22 | - 13b+: zero-3 for all models 23 | """ 24 | 25 | 26 | def log_init(model_name, stime=None): 27 | if torch.distributed.get_rank() == 0: 28 | tag = "start" if stime is None else "end" 29 | suffix = "ing" if stime is None else "ed" 30 | duration = "" 31 | if stime is not None: 32 | duration = "(duration: {:.2f}s)".format(time.time() - stime) 33 | msg = f"[{tag}] Initializ{suffix} {model_name} Model [{tag}] {duration}" 34 | stars = (90 - len(msg)) // 2 35 | extra_star = "*" if (90 - len(msg)) % 2 == 1 else "" 36 | print("*" * stars + msg + "*" * stars + extra_star) 37 | return time.time() 38 | 39 | 40 | class DeepSpeedRLHFEngine(): 41 | 42 | def __init__(self, actor_model_name_or_path, critic_model_name_or_path, 43 | tokenizer, args, num_total_iters): 44 | self.args = args 45 | self.num_total_iters = num_total_iters 46 | self.tokenizer = tokenizer 47 | 48 | self.actor = self._init_actor( 49 | actor_model_name_or_path=actor_model_name_or_path) 50 | self.ref = self._init_ref( 51 | actor_model_name_or_path=actor_model_name_or_path) 52 | self.actor_ema = None 53 | if self.args.enable_ema: 54 | self.actor_ema = self._init_ema( 55 | actor_model_name_or_path=actor_model_name_or_path) 56 | self.critic = self._init_critic( 57 | critic_model_name_or_path=critic_model_name_or_path) 58 | self.reward = self._init_reward( 59 | critic_model_name_or_path=critic_model_name_or_path) 60 | if self.args.critic_gradient_checkpointing: 61 | self.critic.gradient_checkpointing_enable() 62 | 63 | def _init_actor(self, actor_model_name_or_path): 64 | stime = log_init("Actor") 65 | 66 | # DS Config 67 | ds_config = get_train_ds_config( 68 | offload=self.args.offload, 69 | dtype=self.args.dtype, 70 | stage=self.args.actor_zero_stage, 71 | enable_hybrid_engine=self.args.enable_hybrid_engine, 72 | inference_tp_size=self.args.inference_tp_size, 73 | release_inference_cache=self.args.release_inference_cache, 74 | pin_parameters=(not self.args.unpin_actor_parameters), 75 | tp_gather_partition_size=self.args.tp_gather_partition_size, 76 | max_out_tokens=self.args.max_prompt_seq_len + 77 | self.args.max_answer_seq_len, 78 | enable_tensorboard=self.args.enable_tensorboard, 79 | enable_mixed_precision_lora=self.args.enable_mixed_precision_lora, 80 | tb_path=self.args.tensorboard_path, 81 | tb_name="step3_actor") 82 | ds_config[ 83 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 84 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 85 | ds_config[ 86 | 'train_batch_size'] = 
self.args.per_device_training_batch_size * torch.distributed.get_world_size( 87 | ) * self.args.gradient_accumulation_steps_actor 88 | 89 | # Model 90 | actor_model = create_hf_model( 91 | model_class=AutoModelForCausalLM, 92 | model_name_or_path=actor_model_name_or_path, 93 | tokenizer=self.tokenizer, 94 | ds_config=ds_config, 95 | dropout=self.args.actor_dropout) 96 | 97 | # LoRA 98 | if self.args.actor_lora_dim > 0: 99 | actor_model = convert_linear_layer_to_lora( 100 | actor_model, self.args.actor_lora_module_name, 101 | self.args.actor_lora_dim) 102 | if self.args.only_optimize_lora: 103 | actor_model = only_optimize_lora_parameters(actor_model) 104 | actor_model = make_model_gradient_checkpointing_compatible( 105 | actor_model) 106 | 107 | # Optimizer 108 | AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam 109 | optim_params = get_optimizer_grouped_parameters( 110 | actor_model, self.args.actor_weight_decay, 111 | self.args.actor_lora_learning_rate) 112 | optim = AdamOptimizer(optim_params, 113 | lr=self.args.actor_learning_rate, 114 | betas=(0.9, 0.95)) 115 | 116 | # LR Scheduler 117 | lr_scheduler = get_scheduler( 118 | name=self.args.lr_scheduler_type, 119 | optimizer=optim, 120 | num_warmup_steps=self.args.num_warmup_steps, 121 | num_training_steps=self.num_total_iters, 122 | ) 123 | 124 | # DeepSpeed Engine 125 | #TODO: move enable_hybrid_engine and pin_parameters to ds_config 126 | actor_engine, *_ = deepspeed.initialize(model=actor_model, 127 | optimizer=optim, 128 | lr_scheduler=lr_scheduler, 129 | config=ds_config) 130 | 131 | log_init("Actor", stime=stime) 132 | 133 | return actor_engine 134 | 135 | def _init_ref(self, actor_model_name_or_path): 136 | stime = log_init("Ref") 137 | # DS Config 138 | zero_stage = self.args.actor_zero_stage 139 | if zero_stage != 3: 140 | # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory for ref model 141 | zero_stage = 0 142 | ds_config = get_eval_ds_config(self.args.offload_reference_model, 143 | self.args.dtype, zero_stage) 144 | ds_config[ 145 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 146 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 147 | ds_config[ 148 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 149 | ) * self.args.gradient_accumulation_steps_actor 150 | 151 | ref_model = create_hf_model(AutoModelForCausalLM, 152 | actor_model_name_or_path, self.tokenizer, 153 | ds_config) 154 | 155 | ref_engine, *_ = deepspeed.initialize(model=ref_model, 156 | config=ds_config) 157 | 158 | log_init("Ref", stime=stime) 159 | return ref_engine 160 | 161 | def _init_ema(self, actor_model_name_or_path): 162 | stime = log_init("EMA") 163 | # DS Config 164 | zero_stage = self.args.actor_zero_stage 165 | if zero_stage != 3: 166 | # If actor is ZeRO-3 then we use it for everything, otherwise assume we have enough memory 167 | zero_stage = 0 168 | ds_config = get_eval_ds_config(self.args.offload_reference_model, 169 | self.args.dtype, zero_stage) 170 | ds_config[ 171 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 172 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 173 | ds_config[ 174 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 175 | ) * self.args.gradient_accumulation_steps_actor 176 | 177 | actor_model_ema = create_hf_model(AutoModelForCausalLM, 
178 | actor_model_name_or_path, 179 | self.tokenizer, ds_config) 180 | if self.args.actor_lora_dim > 0: 181 | actor_model_ema = convert_linear_layer_to_lora( 182 | actor_model_ema, self.args.actor_lora_module_name, 183 | self.args.actor_lora_dim) 184 | 185 | ema_engine, *_ = deepspeed.initialize(model=actor_model_ema, 186 | config=ds_config) 187 | 188 | log_init("EMA", stime=stime) 189 | return ema_engine 190 | 191 | def _init_critic(self, critic_model_name_or_path): 192 | stime = log_init("Critic") 193 | ds_config = get_train_ds_config( 194 | offload=self.args.offload, 195 | dtype=self.args.dtype, 196 | stage=self.args.critic_zero_stage, 197 | enable_tensorboard=self.args.enable_tensorboard, 198 | tb_path=self.args.tensorboard_path, 199 | tb_name="step3_critic") 200 | ds_config[ 201 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 202 | #TODO(jeff): we should probably set grad accumlation steps here as well for clarity 203 | ds_config[ 204 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 205 | ) * self.args.gradient_accumulation_steps 206 | 207 | ds_eval_config = get_eval_ds_config(offload=False, 208 | dtype=self.args.dtype, 209 | stage=self.args.critic_zero_stage) 210 | # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. 211 | ds_eval_config[ 212 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 213 | ds_eval_config[ 214 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 215 | ) * self.args.gradient_accumulation_steps 216 | 217 | # Model 218 | critic_model = create_critic_model( 219 | model_name_or_path=critic_model_name_or_path, 220 | tokenizer=self.tokenizer, 221 | ds_config=ds_eval_config, 222 | num_padding_at_beginning=self.args.num_padding_at_beginning, 223 | rlhf_training=True, 224 | dropout=self.args.critic_dropout, 225 | zero_stage=self.args.critic_zero_stage) 226 | 227 | # LoRA 228 | if self.args.critic_lora_dim > 0: 229 | critic_model = convert_linear_layer_to_lora( 230 | critic_model, self.args.critic_lora_module_name, 231 | self.args.critic_lora_dim) 232 | if self.args.only_optimize_lora: 233 | critic_model = only_optimize_lora_parameters(critic_model) 234 | critic_model = make_model_gradient_checkpointing_compatible( 235 | critic_model) 236 | 237 | # Optimizer 238 | AdamOptimizer = DeepSpeedCPUAdam if self.args.offload else FusedAdam 239 | optim_params = get_optimizer_grouped_parameters( 240 | critic_model, self.args.critic_weight_decay, 241 | self.args.critic_lora_learning_rate) 242 | optim = AdamOptimizer(optim_params, 243 | lr=self.args.critic_learning_rate, 244 | betas=(0.9, 0.95)) 245 | 246 | # LR Scheduler 247 | lr_scheduler = get_scheduler( 248 | name=self.args.lr_scheduler_type, 249 | optimizer=optim, 250 | num_warmup_steps=self.args.num_warmup_steps, 251 | num_training_steps=self.num_total_iters, 252 | ) 253 | 254 | # DeepSpeed Engine 255 | critic_engine, *_ = deepspeed.initialize(model=critic_model, 256 | optimizer=optim, 257 | lr_scheduler=lr_scheduler, 258 | config=ds_config) 259 | 260 | log_init("Critic", stime=stime) 261 | return critic_engine 262 | 263 | def _init_reward(self, critic_model_name_or_path): 264 | stime = log_init("Reward") 265 | # DS Config 266 | zero_stage = self.args.critic_zero_stage 267 | if zero_stage != 3: 268 | # If critic is ZeRO-3 then we use it for everything, otherwise assume we have enough memory 269 | zero_stage = 0 270 | 
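        # The reward model only runs forward passes (scoring) during RLHF training,
        # so it uses the lighter eval config below rather than a full training config.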
271 | ds_config = get_eval_ds_config(offload=self.args.offload_reward_model, 272 | dtype=self.args.dtype, 273 | stage=zero_stage) 274 | 275 | # We need to set train batch size and micro batch size here to pass the sanity check of DeepSpeed engine. 276 | ds_config[ 277 | 'train_micro_batch_size_per_gpu'] = self.args.per_device_training_batch_size 278 | ds_config[ 279 | 'train_batch_size'] = self.args.per_device_training_batch_size * torch.distributed.get_world_size( 280 | ) * self.args.gradient_accumulation_steps 281 | 282 | # Model 283 | reward_model = create_critic_model( 284 | model_name_or_path=critic_model_name_or_path, 285 | tokenizer=self.tokenizer, 286 | ds_config=ds_config, 287 | num_padding_at_beginning=self.args.num_padding_at_beginning, 288 | rlhf_training=True, 289 | dropout=self.args.critic_dropout, 290 | zero_stage=zero_stage) 291 | 292 | reward_engine, *_ = deepspeed.initialize(model=reward_model, 293 | config=ds_config) 294 | 295 | log_init("Reward", stime=stime) 296 | return reward_engine 297 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/ds_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import deepspeed.comm as dist 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | GLOBAL_BATCH_SIZE = 32 10 | MICRO_BATCH_SIZE = 4 11 | 12 | 13 | def get_train_ds_config(offload, 14 | dtype, 15 | stage=2, 16 | enable_hybrid_engine=False, 17 | inference_tp_size=1, 18 | release_inference_cache=False, 19 | pin_parameters=True, 20 | tp_gather_partition_size=8, 21 | max_out_tokens=512, 22 | enable_tensorboard=False, 23 | enable_mixed_precision_lora=False, 24 | tb_path="", 25 | tb_name=""): 26 | 27 | device = "cpu" if offload else "none" 28 | if dtype == "fp16": 29 | data_type = "fp16" 30 | dtype_config = {"enabled": True, "loss_scale_window": 100} 31 | elif dtype == "bf16": 32 | data_type = "bfloat16" 33 | dtype_config = {"enabled": True} 34 | zero_opt_dict = { 35 | "stage": stage, 36 | "overlap_comm": True, 37 | "offload_param": { 38 | "device": device 39 | }, 40 | "offload_optimizer": { 41 | "device": device 42 | }, 43 | "stage3_param_persistence_threshold": 1e4, 44 | "stage3_max_live_parameters": 3e7, 45 | "stage3_prefetch_bucket_size": 3e7, 46 | "memory_efficient_linear": False 47 | } 48 | if enable_mixed_precision_lora: 49 | zero_opt_dict["zero_quantized_nontrainable_weights"] = True 50 | if dist.get_world_size() != get_accelerator().device_count(): 51 | zero_opt_dict["zero_hpz_partition_size"] = get_accelerator( 52 | ).device_count() 53 | return { 54 | "train_batch_size": GLOBAL_BATCH_SIZE, 55 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, 56 | "steps_per_print": 10, 57 | "zero_optimization": zero_opt_dict, 58 | data_type: dtype_config, 59 | "gradient_clipping": 1.0, 60 | "prescale_gradients": False, 61 | "wall_clock_breakdown": False, 62 | "hybrid_engine": { 63 | "enabled": enable_hybrid_engine, 64 | "max_out_tokens": max_out_tokens, 65 | "inference_tp_size": inference_tp_size, 66 | "release_inference_cache": release_inference_cache, 67 | "pin_parameters": pin_parameters, 68 | "tp_gather_partition_size": tp_gather_partition_size, 69 | }, 70 | "tensorboard": { 71 | "enabled": enable_tensorboard, 72 | "output_path": f"{tb_path}/ds_tensorboard_logs/", 73 | "job_name": f"{tb_name}_tensorboard" 74 | } 75 | } 76 | 77 | 78 | def 
get_eval_ds_config(offload, dtype, stage=0): 79 | device = "cpu" if offload else "none" 80 | if dtype == "fp16": 81 | data_type = "fp16" 82 | dtype_config = { 83 | "enabled": True, 84 | } 85 | elif dtype == "bf16": 86 | data_type = "bfloat16" 87 | dtype_config = {"enabled": True} 88 | zero_opt_dict = { 89 | "stage": stage, 90 | "stage3_param_persistence_threshold": 1e4, 91 | "offload_param": { 92 | "device": device 93 | }, 94 | "memory_efficient_linear": False 95 | } 96 | return { 97 | "train_batch_size": GLOBAL_BATCH_SIZE, 98 | "train_micro_batch_size_per_gpu": MICRO_BATCH_SIZE, 99 | "steps_per_print": 10, 100 | "zero_optimization": zero_opt_dict, 101 | data_type: dtype_config, 102 | "gradient_clipping": 1.0, 103 | "prescale_gradients": False, 104 | "wall_clock_breakdown": False 105 | } 106 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/model/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import math 7 | import torch 8 | from transformers import ( 9 | AutoConfig, 10 | AutoModel, 11 | ) 12 | from huggingface_hub import snapshot_download 13 | from transformers.integrations.deepspeed import HfDeepSpeedConfig 14 | 15 | from dschat.utils.model.reward_model import RewardModel 16 | from dschat.utils.utils import load_state_dict_into_model, print_rank_0 17 | 18 | 19 | def configure_dropout(model_config, dropout): 20 | if dropout is not None: 21 | for key in ('dropout', 'attention_dropout', 'hidden_dropout', 22 | 'activation_dropout'): 23 | if hasattr(model_config, key): 24 | print(f"Setting model_config.{key} to {dropout}") 25 | setattr(model_config, key, dropout) 26 | 27 | 28 | def causal_lm_model_to_fp32_loss(model): 29 | """ Convert CausalLM model to calculate loss in fp32 """ 30 | 31 | def causal_lm_forward( 32 | input_ids=None, 33 | past_key_values=None, 34 | attention_mask=None, 35 | head_mask=None, 36 | inputs_embeds=None, 37 | labels=None, 38 | use_cache=None, 39 | output_attentions=None, 40 | output_hidden_states=None, 41 | return_dict=None, 42 | **deprecated_arguments, 43 | ): 44 | kwargs = dict() if model.config.model_type == "llama" else dict( 45 | head_mask=head_mask) 46 | output = model.__original_forward__( 47 | input_ids=input_ids, 48 | past_key_values=past_key_values, 49 | attention_mask=attention_mask, 50 | inputs_embeds=inputs_embeds, 51 | labels=None, 52 | use_cache=use_cache, 53 | output_attentions=output_attentions, 54 | output_hidden_states=output_hidden_states, 55 | return_dict=return_dict, 56 | **kwargs) 57 | 58 | return_dict = isinstance(output, dict) 59 | lm_logits = output.logits if return_dict else output[0] 60 | loss = None 61 | if labels is not None: 62 | # move labels to correct device to enable model parallelism 63 | labels = labels.to(lm_logits.device) 64 | # Shift so that tokens < n predict n 65 | shift_logits = lm_logits[..., :-1, :].float().contiguous() 66 | shift_labels = labels[..., 1:].contiguous() 67 | batch_size, seq_length, vocab_size = shift_logits.shape 68 | # Flatten the tokens 69 | loss_fct = torch.nn.CrossEntropyLoss() 70 | loss = loss_fct( 71 | shift_logits.view(batch_size * seq_length, vocab_size), 72 | shift_labels.view(batch_size * seq_length)) 73 | 74 | if not return_dict: 75 | # re-pack output with fp32 loss 76 | return ((loss, ) + output) if loss is not None else output 77 | 78 | output.loss = loss 79 
| return output 80 | 81 | model.__original_forward__ = model.forward 82 | model.forward = causal_lm_forward 83 | 84 | 85 | def create_hf_model(model_class, 86 | model_name_or_path, 87 | tokenizer, 88 | ds_config=None, 89 | rlhf_training=False, 90 | dropout=None): 91 | model_config = AutoConfig.from_pretrained(model_name_or_path) 92 | configure_dropout(model_config, dropout) 93 | 94 | # Note: dschf is defined in function scope to avoid global effects 95 | # https://huggingface.co/docs/transformers/main_classes/deepspeed#nontrainer-deepspeed-integration 96 | if ds_config is not None and ds_config["zero_optimization"]["stage"] == 3: 97 | dschf = HfDeepSpeedConfig(ds_config) 98 | else: 99 | dschf = None 100 | if rlhf_training: 101 | # the weight loading is handled by create critic model 102 | model = model_class.from_config(model_config) 103 | else: 104 | model = model_class.from_pretrained( 105 | model_name_or_path, 106 | from_tf=bool(".ckpt" in model_name_or_path), 107 | config=model_config) 108 | 109 | model.config.end_token_id = tokenizer.eos_token_id 110 | model.config.pad_token_id = model.config.eos_token_id 111 | model.resize_token_embeddings(int( 112 | 8 * 113 | math.ceil(len(tokenizer) / 8.0))) # make the vocab size multiple of 8 114 | 115 | return model 116 | 117 | 118 | def create_critic_model(model_name_or_path, 119 | tokenizer, 120 | ds_config, 121 | num_padding_at_beginning=0, 122 | rlhf_training=False, 123 | dropout=None, 124 | zero_stage=0, 125 | compute_fp32_loss=False): 126 | # OPT model family always put a padding token at the beginning of the sequence, 127 | # we did not see this in other models but not sure if it is a general rule 128 | 129 | import time 130 | 131 | start = time.time() 132 | critic_model = create_hf_model(AutoModel, model_name_or_path, tokenizer, 133 | ds_config, rlhf_training, dropout) 134 | end = time.time() 135 | print_rank_0(f">Creating model from_config took {end - start} seconds", 136 | None) 137 | 138 | critic_model = RewardModel( 139 | critic_model, 140 | tokenizer, 141 | num_padding_at_beginning=num_padding_at_beginning, 142 | compute_fp32_loss=compute_fp32_loss) 143 | 144 | if rlhf_training: 145 | # load critic model from checkpoint 146 | 147 | if not os.path.isdir(model_name_or_path): 148 | model_name_or_path = snapshot_download(model_name_or_path) 149 | model_ckpt_path = os.path.join(model_name_or_path, 'pytorch_model.bin') 150 | assert os.path.exists( 151 | model_ckpt_path 152 | ), f"Cannot find model checkpoint at {model_ckpt_path}" 153 | 154 | start = time.time() 155 | model_ckpt_state_dict = torch.load(model_ckpt_path, map_location='cpu') 156 | end = time.time() 157 | print_rank_0(f">Creating model from_config took {end - start} seconds", 158 | None) 159 | 160 | # load critic model from checkpoint with zero-stage 3 compatibility 161 | # this functionality may be moved to DS checkpoint load API in future 162 | start = time.time() 163 | load_state_dict_into_model(critic_model, 164 | model_ckpt_state_dict, 165 | "", 166 | zero_stage=zero_stage) 167 | end = time.time() 168 | 169 | print_rank_0(f">Creating model from_config took {end - start} seconds", 170 | None) 171 | 172 | return critic_model 173 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/model/reward_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import torch 6 | from torch import nn 7 | 8 | 9 | ## Note that the following code is modified from 10 | ## https://github.com/CarperAI/trlx/blob/main/examples/summarize_rlhf/reward_model/reward_model.py 11 | class RewardModel(nn.Module): 12 | 13 | def __init__(self, 14 | base_model, 15 | tokenizer, 16 | num_padding_at_beginning=0, 17 | compute_fp32_loss=False): 18 | super().__init__() 19 | self.config = base_model.config 20 | self.num_padding_at_beginning = num_padding_at_beginning 21 | if hasattr(self.config, "word_embed_proj_dim"): 22 | # `OPT` models use word_embed_proj_dim as final output 23 | # https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py#L497 24 | self.v_head = nn.Linear(self.config.word_embed_proj_dim, 25 | 1, 26 | bias=False) 27 | else: 28 | # `gpt-neo(x)` models use `hidden_size` attribute names instead of `n_embd`` 29 | self.config.n_embd = self.config.hidden_size if hasattr( 30 | self.config, "hidden_size") else self.config.n_embd 31 | self.v_head = nn.Linear(self.config.n_embd, 1, bias=False) 32 | self.rwtransformer = base_model 33 | self.PAD_ID = tokenizer.pad_token_id 34 | self.compute_fp32_loss = compute_fp32_loss 35 | 36 | def gradient_checkpointing_enable(self): 37 | self.rwtransformer.gradient_checkpointing_enable() 38 | 39 | def gradient_checkpointing_disable(self): 40 | self.rwtransformer.gradient_checkpointing_disable() 41 | 42 | def forward(self, 43 | input_ids=None, 44 | past_key_values=None, 45 | attention_mask=None, 46 | position_ids=None, 47 | head_mask=None, 48 | inputs_embeds=None, 49 | use_cache=False): 50 | loss = None 51 | 52 | if self.config.model_type == "llama": 53 | kwargs = dict() 54 | else: 55 | kwargs = dict(head_mask=head_mask) 56 | 57 | transformer_outputs = self.rwtransformer( 58 | input_ids, 59 | past_key_values=past_key_values, 60 | attention_mask=attention_mask, 61 | inputs_embeds=inputs_embeds, 62 | use_cache=use_cache, 63 | **kwargs) 64 | 65 | hidden_states = transformer_outputs[0] 66 | rewards = self.v_head(hidden_states).squeeze(-1) 67 | chosen_mean_scores = [] 68 | rejected_mean_scores = [] 69 | 70 | # Split the inputs and rewards into two parts, chosen and rejected 71 | assert len(input_ids.shape) == 2 72 | bs = input_ids.shape[0] // 2 73 | seq_len = input_ids.shape[1] 74 | 75 | chosen_ids = input_ids[:bs] # bs x seq x 1 76 | rejected_ids = input_ids[bs:] 77 | chosen_rewards = rewards[:bs] 78 | rejected_rewards = rewards[bs:] 79 | 80 | # Compute pairwise loss. Only backprop on the different tokens before padding 81 | loss = 0. 
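        # The loop below implements the pairwise ranking objective commonly used for
        # reward-model training: for each chosen/rejected pair it finds the first token
        # where the two sequences diverge, truncates both reward streams from that index
        # up to the later of their end positions (first padding token after the leading
        # pads, or seq_len), and accumulates
        #     loss_i = -mean(log(sigmoid(r_chosen - r_rejected)))
        # over that span, pushing the chosen response toward a higher reward. The score
        # reported per sequence is the reward at its last non-padding token.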
82 | for i in range(bs): 83 | chosen_id = chosen_ids[i] 84 | rejected_id = rejected_ids[i] 85 | chosen_reward = chosen_rewards[i] 86 | rejected_reward = rejected_rewards[i] 87 | 88 | c_inds = (chosen_id == self.PAD_ID).nonzero() 89 | c_ind = c_inds[self.num_padding_at_beginning].item() if len( 90 | c_inds 91 | ) > self.num_padding_at_beginning else seq_len # OPT model pads the first token, so we need to use the second padding token as the end of the sequence 92 | check_divergence = (chosen_id != rejected_id).nonzero() 93 | 94 | if len(check_divergence) == 0: 95 | end_ind = rejected_reward.size(-1) 96 | divergence_ind = end_ind - 1 97 | r_ind = c_ind 98 | else: 99 | # Check if there is any padding otherwise take length of sequence 100 | r_inds = (rejected_id == self.PAD_ID).nonzero() 101 | r_ind = r_inds[self.num_padding_at_beginning].item( 102 | ) if len(r_inds) > self.num_padding_at_beginning else seq_len 103 | end_ind = max(c_ind, r_ind) 104 | divergence_ind = check_divergence[0] 105 | assert divergence_ind > 0 106 | c_truncated_reward = chosen_reward[divergence_ind:end_ind] 107 | r_truncated_reward = rejected_reward[divergence_ind:end_ind] 108 | chosen_mean_scores.append( 109 | chosen_reward[c_ind - 1]) #use the end score for reference 110 | rejected_mean_scores.append(rejected_reward[r_ind - 1]) 111 | 112 | if self.compute_fp32_loss: 113 | c_truncated_reward = c_truncated_reward.float() 114 | r_truncated_reward = r_truncated_reward.float() 115 | loss += -torch.nn.functional.logsigmoid(c_truncated_reward - 116 | r_truncated_reward).mean() 117 | 118 | loss = loss / bs 119 | chosen_mean_scores = torch.stack(chosen_mean_scores) 120 | rejected_mean_scores = torch.stack(rejected_mean_scores) 121 | return { 122 | "loss": loss, 123 | "chosen_mean_scores": chosen_mean_scores, 124 | "rejected_mean_scores": rejected_mean_scores, 125 | } 126 | 127 | def forward_value(self, 128 | input_ids=None, 129 | attention_mask=None, 130 | past_key_values=None, 131 | position_ids=None, 132 | head_mask=None, 133 | inputs_embeds=None, 134 | return_value_only=False, 135 | prompt_length=0, 136 | use_cache=False): 137 | 138 | if self.config.model_type == "llama": 139 | kwargs = dict() 140 | else: 141 | kwargs = dict(head_mask=head_mask) 142 | 143 | transformer_outputs = self.rwtransformer( 144 | input_ids, 145 | past_key_values=past_key_values, 146 | attention_mask=attention_mask, 147 | inputs_embeds=inputs_embeds, 148 | use_cache=use_cache, 149 | **kwargs) 150 | hidden_states = transformer_outputs[0] 151 | values = self.v_head(hidden_states).squeeze(-1) 152 | if return_value_only: 153 | return values 154 | else: 155 | # [0 0 0 0 prompt, answer, 0 0 0 0 ] for step 3, we have padding at the beginning 156 | # [prompt, answer, 0, 0, 0, 0] this is normal 157 | assert prompt_length > 1, "prompt_length must be greater than 1 to help select the end score" 158 | bs = values.size(0) 159 | seq_len = input_ids.shape[1] 160 | chosen_end_scores = [ 161 | ] # we use this name for consistency with the original forward function 162 | for i in range(bs): 163 | input_id = input_ids[i] 164 | value = values[i] 165 | 166 | c_inds = (input_id[prompt_length:] == self.PAD_ID).nonzero() 167 | # here we only use the answer part of the sequence so we do not need to care about the padding at the beginning 168 | c_ind = c_inds[0].item() + prompt_length if len( 169 | c_inds) > 0 else seq_len 170 | chosen_end_scores.append(value[c_ind - 1]) 171 | return { 172 | "values": values, 173 | "chosen_end_scores": torch.stack(chosen_end_scores), 174 | } 
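# A minimal usage sketch of RewardModel.forward for reference. The names below
# (`base_model`, `tokenizer`, `chosen_ids`, `rejected_ids`, `mask`) are hypothetical
# placeholders, not part of this file:
#
#   rm = RewardModel(base_model, tokenizer, num_padding_at_beginning=1)
#   # forward() expects chosen and rejected sequences concatenated along the batch dim,
#   # i.e. input_ids has shape (2 * bs, seq_len) with the chosen half first
#   out = rm(input_ids=torch.cat([chosen_ids, rejected_ids], dim=0),
#            attention_mask=torch.cat([mask, mask], dim=0))
#   out["loss"]                # pairwise ranking loss averaged over the batch
#   out["chosen_mean_scores"]  # reward at the last non-pad token of each chosen sequence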
175 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/module/lora.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import math 6 | import torch 7 | from torch import nn 8 | import torch.nn.functional as F 9 | from deepspeed.compression.helper import recursive_getattr, recursive_setattr 10 | import deepspeed 11 | 12 | 13 | class LinearLayer_LoRA(nn.Module): 14 | # an simple implementation of LoRA 15 | # for now only support Linear Layer 16 | def __init__(self, 17 | weight, 18 | lora_dim=0, 19 | lora_scaling=1, 20 | lora_droppout=0, 21 | bias=None): 22 | super(LinearLayer_LoRA, self).__init__() 23 | self.weight = weight 24 | self.bias = bias 25 | 26 | if lora_dim <= 0: 27 | raise ValueError( 28 | "You are training to use LoRA, whose reduced dim should be larger than 1" 29 | ) 30 | 31 | try: 32 | # for zero stage 3 33 | rows, columns = weight.ds_shape 34 | except: 35 | rows, columns = weight.shape 36 | self.lora_right_weight = nn.Parameter(torch.zeros( 37 | columns, 38 | lora_dim)) # apply transpose so in forward we do not need to 39 | self.lora_left_weight = nn.Parameter(torch.zeros(lora_dim, rows)) 40 | self.lora_scaling = lora_scaling / lora_dim 41 | 42 | if lora_droppout > 0: 43 | self.lora_dropout = nn.Dropout(lora_droppout) 44 | else: 45 | self.lora_dropout = nn.Identity() 46 | 47 | self.reset_parameters() 48 | # disable the original weight gradient 49 | self.weight.requires_grad = False 50 | # fuse LoRA to the original weight 51 | self.fuse_lora = False 52 | 53 | def eval(self): 54 | self.lora_dropout.eval() 55 | 56 | # self.fuse_lora_weight() 57 | 58 | def train(self, mode=True): 59 | self.lora_dropout.train(mode) 60 | # self.unfuse_lora_weight() 61 | 62 | def reset_parameters(self): 63 | nn.init.kaiming_uniform_(self.lora_right_weight, a=math.sqrt(5)) 64 | nn.init.zeros_(self.lora_left_weight) 65 | 66 | def fuse_lora_weight(self): 67 | if not self.fuse_lora: 68 | self.weight.data += self.lora_scaling * torch.matmul( 69 | self.lora_left_weight.t(), self.lora_right_weight.t()) 70 | self.fuse_lora = True 71 | 72 | def unfuse_lora_weight(self): 73 | if self.fuse_lora: 74 | self.weight.data -= self.lora_scaling * torch.matmul( 75 | self.lora_left_weight.t(), self.lora_right_weight.t()) 76 | self.fuse_lora = False 77 | 78 | def forward(self, input): 79 | if self.fuse_lora: 80 | return F.linear(input, self.weight, self.bias) 81 | else: 82 | return F.linear( 83 | input, self.weight, 84 | self.bias) + (self.lora_dropout(input) @ self.lora_right_weight 85 | @ self.lora_left_weight) * self.lora_scaling 86 | 87 | 88 | # convert the linear layer to LoRA 89 | def convert_linear_layer_to_lora(model, 90 | part_module_name, 91 | lora_dim=0, 92 | lora_scaling=1, 93 | lora_droppout=0): 94 | replace_name = [] 95 | for name, module in model.named_modules(): 96 | if isinstance(module, nn.Linear) and part_module_name in name: 97 | replace_name.append(name) 98 | for name in replace_name: 99 | module = recursive_getattr(model, name) 100 | tmp = LinearLayer_LoRA( 101 | module.weight, lora_dim, lora_scaling, lora_droppout, 102 | module.bias).to(module.weight.device).to(module.weight.dtype) 103 | recursive_setattr(model, name, tmp) 104 | return model 105 | 106 | 107 | def _z3_params_to_fetch(param_list): 108 | return [ 109 | p for p in param_list 110 | if hasattr(p, 'ds_id') and p.ds_status == 
deepspeed.runtime.zero. 111 | partition_parameters.ZeroParamStatus.NOT_AVAILABLE 112 | ] 113 | 114 | 115 | # convert the LoRA layer to linear layer 116 | def convert_lora_to_linear_layer(model): 117 | replace_name = [] 118 | for name, module in model.named_modules(): 119 | if isinstance(module, LinearLayer_LoRA): 120 | replace_name.append(name) 121 | for name in replace_name: 122 | module = recursive_getattr(model, name) 123 | zero_stage_3 = hasattr(module.weight, 'ds_id') 124 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([ 125 | module.weight, module.bias, module.lora_left_weight, 126 | module.lora_right_weight 127 | ]), 128 | modifier_rank=0, 129 | enabled=zero_stage_3): 130 | module.fuse_lora_weight() 131 | return model 132 | 133 | 134 | def only_optimize_lora_parameters(model, force_optimize_params=[]): 135 | # turn off the gradient of all the parameters except the LoRA parameters 136 | for name, param in model.named_parameters(): 137 | if "lora_right_weight" in name or "lora_left_weight" in name or name in force_optimize_params: 138 | param.requires_grad = True 139 | else: 140 | param.requires_grad = False 141 | return model 142 | 143 | 144 | def make_model_gradient_checkpointing_compatible(model): 145 | # Higgingface added this enable input require grads function to make gradient checkpointing work for lora-only optimization 146 | if hasattr(model, "enable_input_require_grads"): 147 | model.enable_input_require_grads() 148 | elif hasattr(model, "get_input_embeddings"): 149 | 150 | def make_inputs_require_grad(module, input, output): 151 | output.requires_grad_(True) 152 | 153 | model.get_input_embeddings().register_forward_hook( 154 | make_inputs_require_grad) 155 | return model 156 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/perf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import torch 7 | 8 | 9 | # This function can be used to print throughput for Step 1 and 2 only 10 | def print_throughput(hf_model, args, e2e_time, rank=0): 11 | if rank <= 0: 12 | hf_config = hf_model.config 13 | num_layers, hidden_size, vocab_size = get_hf_configs(hf_config) 14 | 15 | gpus_per_model = torch.distributed.get_world_size() 16 | seq_length = args.max_seq_len 17 | batch_size = args.per_device_train_batch_size 18 | samples_per_second = batch_size / e2e_time 19 | checkpoint_activations_factor = 4 if args.gradient_checkpointing else 3 20 | if args.lora_dim > 0: 21 | k = args.lora_dim * 2 / hidden_size 22 | checkpoint_activations_factor -= (1 - k) 23 | 24 | hf_model._num_params = sum([ 25 | p.ds_numel if hasattr(p, "ds_tensor") else p.numel() 26 | for p in hf_model.parameters() 27 | ]) 28 | params_in_billions = hf_model._num_params / (1e9) 29 | 30 | # Megatron paper's formula to calculate training flops 31 | train_flops_per_iteration = calculate_flops( 32 | checkpoint_activations_factor, batch_size, seq_length, hf_config) 33 | 34 | train_tflops = train_flops_per_iteration / (e2e_time * gpus_per_model * 35 | (10**12)) 36 | 37 | param_string = f"{params_in_billions:.3f} B" if params_in_billions != 0 else "NA" 38 | print( 39 | f"Model Parameters: {param_string}, Latency: {e2e_time:.2f}s, TFLOPs: {train_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Sequence Length: {seq_length}" 40 | ) 41 | 42 | 43 | # Enhanced version of the function above that provides calculations and printing for Step 3 44 | def print_throughput_step3(actor_model, 45 | critic_model, 46 | args, 47 | e2e_time, 48 | gen_exp_time, 49 | train_time, 50 | rank=0): 51 | if rank <= 0: 52 | # Actor model passed here is a HF model. 53 | actor_hf_config = actor_model.config 54 | # Critic model passed here is a DeepSpeed Engine. The module inside is the Reward model (that wraps a HF model). 
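        # Cost model used below (same convention as print_throughput above):
        #   - checkpoint_activations_factor is 3 for a plain step (one forward pass plus a
        #     backward pass costing roughly two forwards) and 4 when activation checkpointing
        #     recomputes the forward pass;
        #   - with LoRA, roughly a (2 * lora_dim / hidden_size) fraction of each weight matrix
        #     is trainable, so the factor is reduced by (1 - 2 * lora_dim / hidden_size);
        #   - per-iteration training FLOPs then follow the Megatron-LM estimate
        #     24 * factor * batch * seq * layers * hidden^2
        #       * (1 + seq / (6 * hidden) + vocab / (16 * layers * hidden)).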
55 | critic_hf_config = critic_model.module.config 56 | 57 | actor_num_layers, actor_hidden_size, actor_vocab_size = get_hf_configs( 58 | actor_hf_config) 59 | critic_num_layers, critic_hidden_size, critic_vocab_size = get_hf_configs( 60 | critic_hf_config) 61 | 62 | gpus_per_model = torch.distributed.get_world_size() 63 | seq_length = args.max_answer_seq_len + args.max_prompt_seq_len 64 | batch_size = args.per_device_generation_batch_size * args.generation_batches * args.ppo_epochs * gpus_per_model * 1 if args.unsupervised_dataset_name is None else 2 65 | samples_per_second = batch_size / e2e_time 66 | 67 | actor_checkpoint_activations_factor = 4 if args.actor_gradient_checkpointing else 3 68 | critic_checkpoint_activations_factor = 4 if args.critic_gradient_checkpointing else 3 69 | if args.actor_lora_dim > 0: 70 | k = args.actor_lora_dim * 2 / actor_hidden_size 71 | actor_checkpoint_activations_factor -= (1 - k) 72 | if args.critic_lora_dim > 0: 73 | k = args.critic_lora_dim * 2 / critic_hidden_size 74 | critic_checkpoint_activations_factor -= (1 - k) 75 | 76 | actor_model._num_params = sum([ 77 | p.ds_numel if hasattr(p, "ds_tensor") else p.numel() 78 | for p in actor_model.parameters() 79 | ]) 80 | actor_params_in_billions = actor_model._num_params / (1e9) 81 | 82 | critic_model._num_params = sum([ 83 | p.ds_numel if hasattr(p, "ds_tensor") else p.numel() 84 | for p in critic_model.parameters() 85 | ]) 86 | critic_params_in_billions = critic_model._num_params / (1e9) 87 | 88 | # Megatron paper's formula to calculate training flops 89 | 90 | actor_train_flops_per_iteration = calculate_flops( 91 | actor_checkpoint_activations_factor, batch_size, seq_length, 92 | actor_hf_config) 93 | critic_train_flops_per_iteration = calculate_flops( 94 | critic_checkpoint_activations_factor, batch_size, seq_length, 95 | critic_hf_config) 96 | 97 | total_train_flops = actor_train_flops_per_iteration + critic_train_flops_per_iteration 98 | train_tflops = total_train_flops / (train_time * gpus_per_model * 99 | (10**12)) 100 | 101 | gen_bs = args.per_device_generation_batch_size * gpus_per_model 102 | 103 | # Modified formula for calculating flops in the forward pass only 104 | gen_flops_per_iteration = ( 105 | 24 * gen_bs * seq_length * actor_num_layers * 106 | (actor_hidden_size**2)) * ( 107 | 1.0 + (seq_length / (6.0 * actor_hidden_size)) + 108 | (actor_vocab_size / 109 | (16.0 * actor_num_layers * actor_hidden_size))) 110 | 111 | gen_tflops = gen_flops_per_iteration / (gen_exp_time * gpus_per_model * 112 | (10**12)) 113 | 114 | if actor_hf_config.torch_dtype == torch.float16: 115 | num_bytes = 2 116 | elif actor_hf_config.torch_dtype == torch.float32: 117 | num_bytes = 4 118 | else: 119 | num_bytes = -1 120 | 121 | pertok_lat = gen_exp_time / args.max_answer_seq_len 122 | gen_bw = 1 / pertok_lat * actor_model._num_params * num_bytes / 1e9 123 | 124 | total_flops_per_iteration = total_train_flops + gen_flops_per_iteration * args.generation_batches 125 | total_tflops = total_flops_per_iteration / (e2e_time * gpus_per_model * 126 | (10**12)) 127 | 128 | print( 129 | f"End-to-End => Latency: {e2e_time:.2f}s, TFLOPs: {total_tflops:.2f}, Samples/sec: {samples_per_second:.2f}, Time/seq {e2e_time/batch_size:.2f}s, Batch Size: {batch_size}, Total Seq. Length: {seq_length}" 130 | ) 131 | print( 132 | f"Generation => Latency: {gen_exp_time:.2f}s, Per-token Latency {pertok_lat*1000:.2f} ms, TFLOPs: {gen_tflops:.2f}, BW: {gen_bw if num_bytes > 0 else num_bytes:.2f} GB/sec, Answer Seq. 
Length: {args.max_answer_seq_len}" 133 | ) 134 | print( 135 | f"Training => Latency: {train_time:.2f}s, TFLOPs: {train_tflops:.2f}" 136 | ) 137 | actor_param_string = f"{actor_params_in_billions:.3f} B" if actor_params_in_billions != 0 else "NA" 138 | critic_param_string = f"{critic_params_in_billions:.3f} B" if critic_params_in_billions != 0 else "NA" 139 | print( 140 | f"Actor Model Parameters => {actor_param_string}, Critic Model Parameters => {critic_param_string}" 141 | ) 142 | 143 | 144 | # Helper function to calculate FLOPs using the Megatron-LM paper's formula 145 | def calculate_flops(checkpoint_activations_factor, batch_size, seq_length, 146 | hf_config): 147 | num_layers, hidden_size, vocab_size = get_hf_configs(hf_config) 148 | flops_per_iteration = (24 * checkpoint_activations_factor * batch_size * 149 | seq_length * num_layers * (hidden_size**2)) * ( 150 | 1.0 + (seq_length / (6.0 * hidden_size)) + 151 | (vocab_size / 152 | (16.0 * num_layers * hidden_size))) 153 | return flops_per_iteration 154 | 155 | 156 | def get_hf_configs(hf_config): 157 | num_layers = getattr(hf_config, "num_hidden_layers", 158 | getattr(hf_config, "n_layer", None)) 159 | hidden_size = getattr(hf_config, "hidden_size", 160 | getattr(hf_config, "n_embd", None)) 161 | vocab_size = getattr(hf_config, "vocab_size", None) 162 | assert all( 163 | (num_layers, hidden_size, vocab_size) 164 | ), "Could not determine number of layers, hidden size, and vocab size of the model" 165 | 166 | return num_layers, hidden_size, vocab_size 167 | -------------------------------------------------------------------------------- /deepspeed_example/dschat/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import random 8 | import numpy as np 9 | from transformers import set_seed, AutoTokenizer 10 | import json 11 | import deepspeed 12 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 13 | from deepspeed.accelerator import get_accelerator 14 | import torch.nn as nn 15 | 16 | 17 | def print_rank_0(msg, rank=None): 18 | if rank is not None and rank <= 0: 19 | print(msg) 20 | elif is_rank_0(): 21 | print(msg) 22 | 23 | 24 | def is_rank_0(): 25 | """Check whether it is rank 0.""" 26 | if torch.distributed.is_initialized(): 27 | if torch.distributed.get_rank() == 0: 28 | return True 29 | else: 30 | return False 31 | else: 32 | return True 33 | 34 | 35 | def to_device(batch, device): 36 | output = {} 37 | for k, v in batch.items(): 38 | try: 39 | output[k] = v.to(device) 40 | except: 41 | output[k] = v 42 | return output 43 | 44 | 45 | class MovingAverage: 46 | 47 | def __init__(self): 48 | self.count = 0 49 | self.total = 0 50 | self.mean = 0 51 | 52 | def update(self, num): 53 | self.total += num 54 | self.count += 1 55 | self.mean = self.total / self.count 56 | 57 | return self.mean 58 | 59 | 60 | class ExponentialMovingAverage: 61 | 62 | def __init__(self, alpha=0.9): 63 | self.alpha = alpha 64 | self.ema = None 65 | 66 | def update(self, num): 67 | prev_ema = num if self.ema is None else self.ema 68 | self.ema = self.alpha * prev_ema + (1.0 - self.alpha) * num 69 | return self.ema 70 | 71 | def get(self): 72 | return self.ema if self.ema is not None else 0. 
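# Worked example of the smoothing above (values computed by hand for alpha = 0.9):
#   ema = ExponentialMovingAverage(alpha=0.9)
#   ema.update(1.0)  # -> 1.0, the first sample seeds the average
#   ema.update(2.0)  # -> 0.9 * 1.0 + 0.1 * 2.0 = 1.1
#   ema.update(2.0)  # -> 0.9 * 1.1 + 0.1 * 2.0 = 1.19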
73 | 74 | 75 | def get_tokenizer(model_name_or_path, fast_tokenizer=True): 76 | if "llama" in model_name_or_path: 77 | from transformers.models.llama import LlamaTokenizer 78 | tokenizer = LlamaTokenizer.from_pretrained( 79 | model_name_or_path, fast_tokenizer=fast_tokenizer) 80 | if tokenizer.pad_token is None: 81 | # assert tokenizer.eos_token is not None 82 | # tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) 83 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 84 | tokenizer.padding_side = 'right' 85 | else: 86 | tokenizer = AutoTokenizer.from_pretrained( 87 | model_name_or_path, fast_tokenizer=fast_tokenizer) 88 | tokenizer.pad_token = tokenizer.eos_token 89 | # make sure tokenizer is right pad in our logic 90 | tokenizer.padding_side = 'right' 91 | return tokenizer 92 | 93 | 94 | def load_hf_tokenizer(model_name_or_path, 95 | fast_tokenizer=True, 96 | add_special_tokens=None): 97 | if os.path.exists(model_name_or_path): 98 | # Locally tokenizer loading has some issue, so we need to force download 99 | model_json = os.path.join(model_name_or_path, "config.json") 100 | if os.path.exists(model_json): 101 | model_json_file = json.load(open(model_json)) 102 | model_name = model_json_file.get("_name_or_path", 103 | model_name_or_path) 104 | tokenizer = get_tokenizer(model_name, 105 | fast_tokenizer=fast_tokenizer) 106 | else: 107 | tokenizer = get_tokenizer(model_name_or_path, 108 | fast_tokenizer=fast_tokenizer) 109 | 110 | if add_special_tokens is not None: 111 | add_special_tokens = [add_special_tokens] if isinstance(add_special_tokens, str) \ 112 | else add_special_tokens 113 | tokenizer.add_special_tokens( 114 | {'additional_special_tokens': add_special_tokens}) 115 | 116 | return tokenizer 117 | 118 | 119 | def save_hf_format(model, tokenizer, args, sub_folder=""): 120 | # used to save huggingface format, so we can use it for hf.from_pretrained 121 | model_to_save = model.module if hasattr(model, 'module') else model 122 | CONFIG_NAME = "config.json" 123 | WEIGHTS_NAME = "pytorch_model.bin" 124 | output_dir = os.path.join(args.output_dir, sub_folder) 125 | os.makedirs(output_dir, exist_ok=True) 126 | output_model_file = os.path.join(output_dir, WEIGHTS_NAME) 127 | output_config_file = os.path.join(output_dir, CONFIG_NAME) 128 | save_dict = model_to_save.state_dict() 129 | for key in list(save_dict.keys()): 130 | if "lora" in key: 131 | del save_dict[key] 132 | torch.save(save_dict, output_model_file) 133 | model_to_save.config.to_json_file(output_config_file) 134 | tokenizer.save_vocabulary(output_dir) 135 | 136 | 137 | def set_random_seed(seed): 138 | if seed is not None: 139 | set_seed(seed) 140 | random.seed(seed) 141 | np.random.seed(seed) 142 | torch.manual_seed(seed) 143 | get_accelerator().manual_seed_all(seed) 144 | 145 | 146 | def get_all_reduce_mean(tensor): 147 | torch.distributed.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM) 148 | tensor = tensor / torch.distributed.get_world_size() 149 | return tensor 150 | 151 | 152 | # This function is a modified version of code available in the from_pretrained API of HuggingFace Transformers 153 | # The code is copied and modified from: https://github.com/huggingface/transformers/blob/5ee9693a1c77c617ebc43ef20194b6d3b674318e/src/transformers/modeling_utils.py#L498 154 | # This function helps load a HF format checkpoint into a DeepSpeed wrapped model that has been sharded using ZeRO Stage 3 155 | def load_state_dict_into_model(model_to_load=None, 156 | state_dict=None, 157 | start_prefix="", 158 | 
zero_stage=0): 159 | 160 | # copy state_dict so _load_from_state_dict can modify it 161 | metadata = getattr(state_dict, "_metadata", None) 162 | state_dict = state_dict.copy() 163 | if metadata is not None: 164 | state_dict._metadata = metadata 165 | 166 | error_msgs = [] 167 | 168 | # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants 169 | # so we need to apply the function recursively. 170 | def load(module: nn.Module, state_dict, prefix=""): 171 | local_metadata = {} if metadata is None else metadata.get( 172 | prefix[:-1], {}) 173 | args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) 174 | # Parameters of module and children will start with prefix. We can exit early if there are none in this 175 | # state_dict 176 | if len([key for key in state_dict if key.startswith(prefix)]) > 0: 177 | if zero_stage == 3: 178 | # In sharded models, each shard has only part of the full state_dict, so only gather 179 | # parameters that are in the current state_dict. 180 | named_parameters = dict( 181 | module.named_parameters(prefix=prefix[:-1], recurse=False)) 182 | params_to_gather = [ 183 | named_parameters[k] for k in state_dict.keys() 184 | if k in named_parameters 185 | ] 186 | if len(params_to_gather) > 0: 187 | # because zero3 puts placeholders in model params, this context 188 | # manager gathers (unpartitions) the params of the current layer, then loads from 189 | # the state dict and then re-partitions them again 190 | with deepspeed.zero.GatheredParameters(params_to_gather, 191 | modifier_rank=0): 192 | if torch.distributed.get_rank() == 0: 193 | module._load_from_state_dict(*args) 194 | else: 195 | module._load_from_state_dict(*args) 196 | 197 | for name, child in module._modules.items(): 198 | if child is not None: 199 | load(child, state_dict, prefix + name + ".") 200 | 201 | load(model_to_load, state_dict, prefix=start_prefix) 202 | # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so 203 | # it's safe to delete it. 
204 | del state_dict 205 | 206 | return error_msgs 207 | 208 | 209 | def get_optimizer_grouped_parameters( 210 | model, 211 | weight_decay, 212 | lora_lr=5e-4, 213 | no_decay_name_list=[ 214 | "bias", "layer_norm.weight", "layernorm.weight", "norm.weight", 215 | "ln_f.weight" 216 | ], 217 | lora_name_list=["lora_right_weight", "lora_left_weight"], 218 | ): 219 | optimizer_grouped_parameters = [ 220 | { 221 | "params": [ 222 | p for n, p in model.named_parameters() 223 | if (not any(nd in n.lower() for nd in no_decay_name_list) 224 | and p.requires_grad and not any(nd in n.lower() 225 | for nd in lora_name_list)) 226 | ], 227 | "weight_decay": 228 | weight_decay, 229 | }, 230 | { 231 | "params": [ 232 | p for n, p in model.named_parameters() 233 | if (not any(nd in n.lower() for nd in no_decay_name_list) 234 | and p.requires_grad and any(nd in n.lower() 235 | for nd in lora_name_list)) 236 | ], 237 | "weight_decay": 238 | weight_decay, 239 | "lr": 240 | lora_lr 241 | }, 242 | { 243 | "params": [ 244 | p for n, p in model.named_parameters() 245 | if (any(nd in n.lower() 246 | for nd in no_decay_name_list) and p.requires_grad) 247 | ], 248 | "weight_decay": 249 | 0.0, 250 | }, 251 | ] 252 | 253 | non_empty_groups = [] 254 | for group in optimizer_grouped_parameters: 255 | if group["params"]: 256 | non_empty_groups.append(group) 257 | return non_empty_groups 258 | 259 | 260 | def _z3_params_to_fetch(param_list): 261 | return [ 262 | p for p in param_list 263 | if hasattr(p, 'ds_id') and p.ds_status == ZeroParamStatus.NOT_AVAILABLE 264 | ] 265 | 266 | 267 | def moving_average(model, model_ema, beta=0.992, device=None, zero_stage=0): 268 | zero_stage_3 = (zero_stage == 3) 269 | with torch.no_grad(): 270 | for param, param_ema in zip(model.parameters(), 271 | model_ema.parameters()): 272 | # TODO: use prefiltering for efficiency 273 | params_to_fetch = _z3_params_to_fetch([param, param_ema 274 | ]) if zero_stage_3 else [] 275 | should_gather_param = len(params_to_fetch) > 0 276 | with deepspeed.zero.GatheredParameters( 277 | params_to_fetch, enabled=should_gather_param): 278 | data = param.data 279 | if device is not None: 280 | data = data.to(device) 281 | param_ema.data.copy_(torch.lerp(data, param_ema.data, beta)) 282 | 283 | 284 | def save_zero_three_model(model_ema, global_rank, save_dir, zero_stage=0): 285 | zero_stage_3 = (zero_stage == 3) 286 | os.makedirs(save_dir, exist_ok=True) 287 | WEIGHTS_NAME = "pytorch_model.bin" 288 | output_model_file = os.path.join(save_dir, WEIGHTS_NAME) 289 | 290 | model_to_save = model_ema.module if hasattr(model_ema, 291 | 'module') else model_ema 292 | if not zero_stage_3: 293 | if global_rank == 0: 294 | torch.save(model_to_save.state_dict(), output_model_file) 295 | else: 296 | output_state_dict = {} 297 | for k, v in model_to_save.named_parameters(): 298 | 299 | if hasattr(v, 'ds_id'): 300 | with deepspeed.zero.GatheredParameters(_z3_params_to_fetch([v 301 | ]), 302 | enabled=zero_stage_3): 303 | v_p = v.data.cpu() 304 | else: 305 | v_p = v.cpu() 306 | if global_rank == 0 and "lora" not in k: 307 | output_state_dict[k] = v_p 308 | if global_rank == 0: 309 | torch.save(output_state_dict, output_model_file) 310 | del output_state_dict 311 | -------------------------------------------------------------------------------- /deepspeed_example/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets>=2.8.0 2 | sentencepiece>=0.1.97 3 | protobuf==3.20.3 4 | accelerate>=0.15.0 5 | torch>=1.12.0 6 | 
deepspeed>=0.9.0 7 | transformers>=4.31.0,!=4.33.2 8 | tensorboard 9 | -------------------------------------------------------------------------------- /deepspeed_example/run_llama2_7b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | ZERO_STAGE=$1 7 | OUTPUT=./output_llama2_7b 8 | if [ "$ZERO_STAGE" == "" ]; then 9 | ZERO_STAGE=3 10 | fi 11 | mkdir -p $OUTPUT 12 | 13 | deepspeed main.py \ 14 | --data_split 2,4,4 \ 15 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 16 | --per_device_train_batch_size 1 \ 17 | --per_device_eval_batch_size 4 \ 18 | --max_seq_len 512 \ 19 | --learning_rate 9.65e-6 \ 20 | --weight_decay 0. \ 21 | --num_train_epochs 3 \ 22 | --gradient_accumulation_steps 4 \ 23 | --lr_scheduler_type cosine \ 24 | --num_warmup_steps 0 \ 25 | --seed 1234 \ 26 | --gradient_checkpointing \ 27 | --dtype bf16 \ 28 | --zero_stage $ZERO_STAGE \ 29 | --deepspeed \ 30 | --output_dir $OUTPUT \ 31 | #&> $OUTPUT/training.log 32 | -------------------------------------------------------------------------------- /deepspeed_example/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | # setup.py: install script for deepspeed_chat 8 | """ 9 | to install deepspeed_chat and its dependencies for development work, 10 | run this cmd from the root directory: 11 | pip install -e . 12 | """ 13 | import setuptools 14 | 15 | setuptools.setup( 16 | name="deepspeed-chat", 17 | version="0.1", 18 | url= 19 | "https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat", 20 | include_package_data=True, 21 | packages=setuptools.find_packages(include=['dschat']), 22 | install_requires=[ 23 | "datasets>=2.8.0", "sentencepiece>=0.1.97", "protobuf==3.20.3", 24 | "accelerate>=0.15.0", "torch>=1.12.0", "deepspeed>=0.9.2", 25 | "transformers>=4.31.0,!=4.33.2", "tensorboard" 26 | ], 27 | extras_require={ 28 | "azureml": [ 29 | "azure-ml-component", 30 | "azureml-core", 31 | ], 32 | }) 33 | -------------------------------------------------------------------------------- /imgs/broadcast_matrix_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_matrix_matrix.png -------------------------------------------------------------------------------- /imgs/broadcast_matrix_vector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_matrix_vector.png -------------------------------------------------------------------------------- /imgs/broadcast_mv_scalar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_mv_scalar.png -------------------------------------------------------------------------------- /imgs/broadcast_rule.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/broadcast_rule.png -------------------------------------------------------------------------------- /imgs/data_storage_operators.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/data_storage_operators.png -------------------------------------------------------------------------------- /imgs/high_level_abstraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/high_level_abstraction.png -------------------------------------------------------------------------------- /imgs/reduce.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/reduce.jpg -------------------------------------------------------------------------------- /imgs/strides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/imgs/strides.png -------------------------------------------------------------------------------- /mini_tensorflow/mini_tensorflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "class Graph():\n", 12 | " \"\"\" Define a computation graph, which contains\n", 13 | " placeholders: to store the input data\n", 14 | " variables: to store the network parameters\n", 15 | " constants: some static data\n", 16 | " operations: the mathmatical operations for each neural network layer\n", 17 | " \"\"\"\n", 18 | " def __init__(self):\n", 19 | " \"\"\"\n", 20 | " Please define attributes for placeholders, variables, constants, operations,\n", 21 | " using lists.\n", 22 | " \"\"\"\n", 23 | " [implement]\n", 24 | "\n", 25 | " def as_default(self):\n", 26 | " \"\"\"\n", 27 | " define a global default computation graph\n", 28 | " \"\"\"\n", 29 | " global _default_graph\n", 30 | " _default_graph = [implement]\n", 31 | " \n", 32 | " def add_operation(self, op):\n", 33 | " \"\"\"\n", 34 | " add op to operations\n", 35 | " \"\"\"\n", 36 | " [implement]\n", 37 | "\n", 38 | " def add_placeholder(self, holder):\n", 39 | " \"\"\"\n", 40 | " add holder to placeholders\n", 41 | " \"\"\"\n", 42 | " [implement]\n", 43 | " \n", 44 | " def add_variable(self, var):\n", 45 | " \"\"\"\n", 46 | " add var to variables\n", 47 | " \"\"\"\n", 48 | " [implement]\n", 49 | " \n", 50 | " def add_constant(self, c):\n", 51 | " \"\"\"\n", 52 | " add c to constants\n", 53 | " \"\"\"\n", 54 | " [implement]\n", 55 | "\n", 56 | "class Operation():\n", 57 | " \"\"\"\n", 58 | " Define network operation node. 
It should contain input nodes and one output node.\n", 59 | " \"\"\"\n", 60 | " def __init__(self, input_nodes=None):\n", 61 | " \"\"\"\n", 62 | " Define two attributes here: \n", 63 | " input_nodes\n", 64 | " output\n", 65 | " Add the current Operation node to _default_graph's operations\n", 66 | " \"\"\"\n", 67 | " [implement]\n", 68 | " \n", 69 | " # Append operation to the list of operations of the default graph\n", 70 | " [implement]\n", 71 | "\n", 72 | " def forward(self):\n", 73 | " pass\n", 74 | "\n", 75 | " def backward(self):\n", 76 | " pass\n", 77 | "\n", 78 | "\n", 79 | "class BinaryOperation(Operation):\n", 80 | " \"\"\"\n", 81 | " define binary operations\n", 82 | " \"\"\"\n", 83 | " def __init__(self, a, b):\n", 84 | " \"\"\"\n", 85 | " a, b are input nodes. please initialize\n", 86 | " \"\"\"\n", 87 | " [implement]\n", 88 | "\n", 89 | "class add(BinaryOperation):\n", 90 | " \"\"\"\n", 91 | " Computes a + b, element-wise\n", 92 | " \"\"\"\n", 93 | " def forward(self, a, b):\n", 94 | " [implement]\n", 95 | "\n", 96 | " def backward(self, upstream_grad):\n", 97 | " raise NotImplementedError\n", 98 | "\n", 99 | "class multiply(BinaryOperation):\n", 100 | " \"\"\"\n", 101 | " Computes a * b, element-wise\n", 102 | " \"\"\"\n", 103 | " def forward(self, a, b):\n", 104 | " [implement]\n", 105 | "\n", 106 | " def backward(self, upstream_grad):\n", 107 | " raise NotImplementedError\n", 108 | "\n", 109 | "class divide(BinaryOperation):\n", 110 | " \"\"\"\n", 111 | " Returns the true division of the inputs, element-wise\n", 112 | " \"\"\"\n", 113 | " def forward(self, a, b):\n", 114 | " return np.true_divide(a, b)\n", 115 | "\n", 116 | " def backward(self, upstream_grad):\n", 117 | " raise NotImplementedError\n", 118 | "\n", 119 | "class matmul(BinaryOperation):\n", 120 | " \"\"\"\n", 121 | " Multiplies matrix a by matrix b, producing a * b\n", 122 | " \"\"\"\n", 123 | " def forward(self, a, b):\n", 124 | " \"\"\"\n", 125 | " using numpy.dot to perform matrix multiplication on a and b\n", 126 | " \"\"\"\n", 127 | " [implement]\n", 128 | "\n", 129 | " def backward(self, upstream_grad):\n", 130 | " raise NotImplementedError\n", 131 | " \n", 132 | "class Placeholder():\n", 133 | " \"\"\"\n", 134 | " define placeholder. 
It should contain a value attribute to store value\n", 135 | " \"\"\"\n", 136 | " def __init__(self):\n", 137 | " \"\"\"\n", 138 | " initialize the value to None, add the current node to default graph's placeholder\n", 139 | " \"\"\"\n", 140 | " [implement]\n", 141 | "\n", 142 | "class Constant():\n", 143 | " \"\"\"\n", 144 | " Define a constant node\n", 145 | " \"\"\"\n", 146 | " def __init__(self, value=None):\n", 147 | " \"\"\"\n", 148 | " define internal __value to store the value, add the current node to graph's constant\n", 149 | " \"\"\"\n", 150 | " [implement]\n", 151 | "\n", 152 | " @property\n", 153 | " def value(self):\n", 154 | " \"\"\"\n", 155 | " return the internal value\n", 156 | " \"\"\"\n", 157 | " [implement]\n", 158 | "\n", 159 | " @value.setter\n", 160 | " def value(self, value):\n", 161 | " raise ValueError(\"Cannot reassign value.\")\n", 162 | " \n", 163 | "class Variable():\n", 164 | " \"\"\"\n", 165 | " define a variable node (for parameter) with initial_value\n", 166 | " \"\"\"\n", 167 | " def __init__(self, initial_value=None):\n", 168 | " \"\"\"\n", 169 | " assign initial_value to value, add the current node to graph's variables\n", 170 | " \"\"\"\n", 171 | " [implement]\n", 172 | "\n", 173 | "def topology_sort(operation):\n", 174 | " \"\"\"\n", 175 | " implement topological sort to order the operations, starting from current node\n", 176 | " \"\"\"\n", 177 | " ordering = []\n", 178 | " visited_nodes = set()\n", 179 | "\n", 180 | " def recursive_helper(node):\n", 181 | " \"\"\"\n", 182 | " for each Operation node (using isinstance)\n", 183 | " recursively find the incoming nodes, visit them first and add node to visited nodes. \n", 184 | " \"\"\"\n", 185 | " [implement]\n", 186 | "\n", 187 | " # start recursive depth-first search\n", 188 | " [implement]\n", 189 | "\n", 190 | " return ordering\n", 191 | "\n", 192 | "# session = Session()\n", 193 | "# output = session.run(some_operation, {\n", 194 | "# X: train_X # [1,2,...,n_features]\n", 195 | "# })\n", 196 | "\n", 197 | "class Session():\n", 198 | " \"\"\"\n", 199 | " A session provides a context to run the computation graph\n", 200 | " \"\"\"\n", 201 | " def run(self, operation, feed_dict={}):\n", 202 | " \"\"\"\n", 203 | " apply topological sort on the computation graph starting from the operation node\n", 204 | " operation is the final operation node\n", 205 | " feed_dict: a dictionary that maps Placeholder to actual data value (in numpy)\n", 206 | " if a node is a placeholder, it should take value from feed_dict, \n", 207 | " if a node is variable or constant, it just use the node's value\n", 208 | " it a node is an operation, it should get the node's input_nodes, and then apply forward\n", 209 | " \"\"\"\n", 210 | " nodes_sorted = topology_sort(operation)\n", 211 | "\n", 212 | " for node in nodes_sorted:\n", 213 | " [implement]\n", 214 | "\n", 215 | " return operation.output" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 8, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "2.7\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# create default graph\n", 233 | "Graph().as_default()\n", 234 | "\n", 235 | "# construct computational graph by creating some nodes\n", 236 | "# implement a simple network for y = a * x + b\n", 237 | "# a and b are constant\n", 238 | "# x is input\n", 239 | "[implement]\n", 240 | "\n", 241 | "\n", 242 | "# create a session object\n", 243 | "[implement]\n", 244 | "\n", 245 | 
"# run computational graph to compute the output for 'res'\n", 246 | "[implement]\n" 247 | ] 248 | } 249 | ], 250 | "metadata": { 251 | "kernelspec": { 252 | "display_name": "base", 253 | "language": "python", 254 | "name": "python3" 255 | }, 256 | "language_info": { 257 | "codemirror_mode": { 258 | "name": "ipython", 259 | "version": 3 260 | }, 261 | "file_extension": ".py", 262 | "mimetype": "text/x-python", 263 | "name": "python", 264 | "nbconvert_exporter": "python", 265 | "pygments_lexer": "ipython3", 266 | "version": "3.9.18" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 2 271 | } 272 | -------------------------------------------------------------------------------- /mini_tensorflow/mini_tensorflow_full.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "\n", 11 | "class Graph():\n", 12 | " \"\"\" Define a computation graph, which contains\n", 13 | " placeholders: to store the input data\n", 14 | " variables: to store the network parameters\n", 15 | " constants: some static data\n", 16 | " operations: the mathmatical operations for each neural network layer\n", 17 | " \"\"\"\n", 18 | " def __init__(self):\n", 19 | " \"\"\"\n", 20 | " Please define attributes for placeholders, variables, constants, operations,\n", 21 | " using lists.\n", 22 | " \"\"\"\n", 23 | " self.operations = []\n", 24 | " self.placeholders = []\n", 25 | " self.variables = []\n", 26 | " self.constants = []\n", 27 | "\n", 28 | " def as_default(self):\n", 29 | " \"\"\"\n", 30 | " define a global default computation graph\n", 31 | " \"\"\"\n", 32 | " global _default_graph\n", 33 | " _default_graph = self\n", 34 | " \n", 35 | " def add_operation(self, op):\n", 36 | " \"\"\"\n", 37 | " add op to operations\n", 38 | " \"\"\"\n", 39 | " self.operations.append(op)\n", 40 | "\n", 41 | " def add_placeholder(self, holder):\n", 42 | " \"\"\"\n", 43 | " add holder to placeholders\n", 44 | " \"\"\"\n", 45 | " self.placeholders.append(holder)\n", 46 | " \n", 47 | " def add_variable(self, var):\n", 48 | " \"\"\"\n", 49 | " add var to variables\n", 50 | " \"\"\"\n", 51 | " self.variables.append(var)\n", 52 | " \n", 53 | " def add_constant(self, c):\n", 54 | " \"\"\"\n", 55 | " add c to constants\n", 56 | " \"\"\"\n", 57 | " self.constants.append(c)\n", 58 | "\n", 59 | "class Operation():\n", 60 | " \"\"\"\n", 61 | " Define network operation node. It should contain input nodes and one output node.\n", 62 | " \"\"\"\n", 63 | " def __init__(self, input_nodes=None):\n", 64 | " \"\"\"\n", 65 | " Define two attributes here: \n", 66 | " input_nodes\n", 67 | " output\n", 68 | " Add the current Operation node to _default_graph's operations\n", 69 | " \"\"\"\n", 70 | " self.input_nodes = input_nodes\n", 71 | " self.output = None\n", 72 | " \n", 73 | " # Append operation to the list of operations of the default graph\n", 74 | " _default_graph.add_operation(self)\n", 75 | "\n", 76 | " def forward(self):\n", 77 | " pass\n", 78 | "\n", 79 | " def backward(self):\n", 80 | " pass\n", 81 | "\n", 82 | "\n", 83 | "class BinaryOperation(Operation):\n", 84 | " \"\"\"\n", 85 | " define binary operations\n", 86 | " \"\"\"\n", 87 | " def __init__(self, a, b):\n", 88 | " \"\"\"\n", 89 | " a, b are input nodes. 
please initialize\n", 90 | " \"\"\"\n", 91 | " super().__init__([a, b])\n", 92 | "\n", 93 | "class add(BinaryOperation):\n", 94 | " \"\"\"\n", 95 | " Computes a + b, element-wise\n", 96 | " \"\"\"\n", 97 | " def forward(self, a, b):\n", 98 | " return a + b\n", 99 | "\n", 100 | " def backward(self, upstream_grad):\n", 101 | " raise NotImplementedError\n", 102 | "\n", 103 | "class multiply(BinaryOperation):\n", 104 | " \"\"\"\n", 105 | " Computes a * b, element-wise\n", 106 | " \"\"\"\n", 107 | " def forward(self, a, b):\n", 108 | " return a * b\n", 109 | "\n", 110 | " def backward(self, upstream_grad):\n", 111 | " raise NotImplementedError\n", 112 | "\n", 113 | "class divide(BinaryOperation):\n", 114 | " \"\"\"\n", 115 | " Returns the true division of the inputs, element-wise\n", 116 | " \"\"\"\n", 117 | " def forward(self, a, b):\n", 118 | " return np.true_divide(a, b)\n", 119 | "\n", 120 | " def backward(self, upstream_grad):\n", 121 | " raise NotImplementedError\n", 122 | "\n", 123 | "class matmul(BinaryOperation):\n", 124 | " \"\"\"\n", 125 | " Multiplies matrix a by matrix b, producing a * b\n", 126 | " \"\"\"\n", 127 | " def forward(self, a, b):\n", 128 | " \"\"\"\n", 129 | " using numpy.dot to perform matrix multiplication on a and b\n", 130 | " \"\"\" \n", 131 | " return a.dot(b)\n", 132 | "\n", 133 | " def backward(self, upstream_grad):\n", 134 | " raise NotImplementedError\n", 135 | " \n", 136 | "class Placeholder():\n", 137 | " \"\"\"\n", 138 | " define placeholder. It should contain a value attribute to store value\n", 139 | " \"\"\"\n", 140 | " def __init__(self):\n", 141 | " \"\"\"\n", 142 | " initialize the value to None, add the current node to default graph's placeholder\n", 143 | " \"\"\" \n", 144 | " self.value = None\n", 145 | " _default_graph.add_placeholder(self)\n", 146 | "\n", 147 | "class Constant():\n", 148 | " \"\"\"\n", 149 | " Define a constant node\n", 150 | " \"\"\"\n", 151 | " def __init__(self, value=None):\n", 152 | " \"\"\"\n", 153 | " define internal __value to store the value, add the current node to graph's constant\n", 154 | " \"\"\" \n", 155 | " self.__value = value\n", 156 | " _default_graph.add_constant(self)\n", 157 | "\n", 158 | " @property\n", 159 | " def value(self):\n", 160 | " \"\"\"\n", 161 | " return the internal value\n", 162 | " \"\"\"\n", 163 | " return self.__value\n", 164 | "\n", 165 | " @value.setter\n", 166 | " def value(self, value):\n", 167 | " raise ValueError(\"Cannot reassign value.\")\n", 168 | " \n", 169 | "class Variable():\n", 170 | " \"\"\"\n", 171 | " define a variable node (for parameter) with initial_value\n", 172 | " \"\"\"\n", 173 | " def __init__(self, initial_value=None):\n", 174 | " \"\"\"\n", 175 | " assign initial_value to value, add the current node to graph's variables\n", 176 | " \"\"\" \n", 177 | " self.value = initial_value\n", 178 | " _default_graph.add_variable(self)\n", 179 | "\n", 180 | "def topology_sort(operation):\n", 181 | " \"\"\"\n", 182 | " implement topological sort to order the operations, starting from current node\n", 183 | " \"\"\"\n", 184 | " ordering = []\n", 185 | " visited_nodes = set()\n", 186 | "\n", 187 | " def recursive_helper(node):\n", 188 | " \"\"\"\n", 189 | " for each Operation node (using isinstance)\n", 190 | " recursively find the incoming nodes, visit them first and add node to visited nodes. 
\n", 191 | " \"\"\" \n", 192 | " if isinstance(node, Operation):\n", 193 | " for input_node in node.input_nodes:\n", 194 | " if input_node not in visited_nodes:\n", 195 | " recursive_helper(input_node)\n", 196 | "\n", 197 | " visited_nodes.add(node)\n", 198 | " ordering.append(node)\n", 199 | "\n", 200 | " # start recursive depth-first search\n", 201 | " recursive_helper(operation)\n", 202 | "\n", 203 | " return ordering\n", 204 | "\n", 205 | "# session = Session()\n", 206 | "# output = session.run(some_operation, {\n", 207 | "# X: train_X # [1,2,...,n_features]\n", 208 | "# })\n", 209 | "\n", 210 | "class Session():\n", 211 | " \"\"\"\n", 212 | " A session provides a context to run the computation graph\n", 213 | " \"\"\"\n", 214 | " def run(self, operation, feed_dict={}):\n", 215 | " \"\"\"\n", 216 | " apply topological sort on the computation graph starting from the operation node\n", 217 | " operation is the final operation node\n", 218 | " feed_dict: a dictionary that maps Placeholder to actual data value (in numpy)\n", 219 | " if a node is a placeholder, it should take value from feed_dict, \n", 220 | " if a node is variable or constant, it just use the node's value\n", 221 | " it a node is an operation, it should get the node's input_nodes, and then apply forward\n", 222 | " \"\"\"\n", 223 | " nodes_sorted = topology_sort(operation)\n", 224 | "\n", 225 | " for node in nodes_sorted:\n", 226 | " if type(node) == Placeholder:\n", 227 | " node.output = feed_dict[node]\n", 228 | " elif type(node) == Variable or type(node) == Constant:\n", 229 | " node.output = node.value\n", 230 | " else:\n", 231 | " inputs = [node.output for node in node.input_nodes]\n", 232 | " node.output = node.forward(*inputs)\n", 233 | "\n", 234 | " return operation.output" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 8, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "name": "stdout", 244 | "output_type": "stream", 245 | "text": [ 246 | "2.7\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "# create default graph\n", 252 | "Graph().as_default()\n", 253 | "\n", 254 | "# construct computational graph by creating some nodes\n", 255 | "# implement a simple network for y = a * x + b\n", 256 | "a = Constant(np.array([2.0, 1.5]))\n", 257 | "b = Constant(0.5)\n", 258 | "x = Placeholder()\n", 259 | "x2 = matmul(a, x)\n", 260 | "y = add(x2, b)\n", 261 | "\n", 262 | "x_data = np.array([0.5, 0.8])\n", 263 | "input_data = {x: x_data}\n", 264 | "\n", 265 | "# create a session object\n", 266 | "session = Session()\n", 267 | "\n", 268 | "# run computational graph to compute the output for 'res'\n", 269 | "out = session.run(y, input_data)\n", 270 | "print(out)" 271 | ] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": "base", 277 | "language": "python", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.9.18" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 2 295 | } 296 | -------------------------------------------------------------------------------- /simple_cuda_demo/example_matadd.cu: -------------------------------------------------------------------------------- 1 | // This program computes a simple version of matrix multiplication 2 | #include 3 | #include 4 | #include 5 | 
#include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | using std::generate; 14 | using std::vector; 15 | 16 | 17 | __global__ void matrixAdd(const int * a, const int * b, 18 | int * c, int N) { 19 | // Compute each thread's global row and column index 20 | int row = blockIdx.x * blockDim.x + threadIdx.x; 21 | int col = blockIdx.y * blockDim.y + threadIdx.y; 22 | 23 | // Iterate over row, and down column 24 | if (row < N && col < N) { 25 | c[row * N + col] = a[row * N + col] + b[row * N + col]; 26 | } 27 | } 28 | 29 | 30 | 31 | // Check result on the CPU 32 | void verify_result_add(vector &a, vector &b, vector &c, int N) { 33 | // For every row... 34 | for (int i = 0; i < N; i++) { 35 | // For every column... 36 | for (int j = 0; j < N; j++) { 37 | // For every element in the row-column pair 38 | // Check against the CPU result 39 | if (a[i * N + j] + b[i * N + j] != c[i * N + j]) { 40 | printf("Error in (%d, %d): %d + %d != %d\n", i, j, a[i * N + j], b[i * N + j], c[i * N + j]); 41 | } 42 | assert(a[i * N + j] + b[i * N + j] == c[i * N + j]); 43 | } 44 | } 45 | } 46 | 47 | int main() { 48 | // Matrix size of 1024 x 1024; 49 | int N = 1 << 10; 50 | 51 | // Size (in bytes) of matrix 52 | size_t bytes = N * N * sizeof(int); 53 | 54 | // Host vectors 55 | vector h_a(N * N); 56 | vector h_b(N * N); 57 | vector h_c(N * N); 58 | 59 | // Initialize matrices 60 | generate(h_a.begin(), h_a.end(), []() { return rand() % 100; }); 61 | generate(h_b.begin(), h_b.end(), []() { return rand() % 100; }); 62 | 63 | 64 | // Allocate device memory 65 | int *d_a, *d_b, *d_c; 66 | cudaMalloc(&d_a, bytes); 67 | cudaMalloc(&d_b, bytes); 68 | cudaMalloc(&d_c, bytes); 69 | 70 | 71 | // Copy data to the device 72 | cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice); 73 | cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice); 74 | 75 | 76 | // Threads per CTA dimension 77 | int THREADS = 32; 78 | 79 | // Blocks per grid dimension (assumes THREADS divides N evenly) 80 | int BLOCKS = N / THREADS; 81 | 82 | // Use dim3 structs for block and grid dimensions 83 | dim3 threads(THREADS, THREADS); // should be <= 1024 84 | dim3 blocks(BLOCKS, BLOCKS); 85 | 86 | // Launch kernel 87 | matrixAdd<<>>(d_a, d_b, d_c, N); 88 | 89 | // Copy back to the host 90 | cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost); 91 | 92 | cudaDeviceSynchronize(); 93 | 94 | // Check CUDA execution 95 | cudaError_t err = cudaGetLastError(); 96 | if (err != cudaSuccess) { 97 | fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); 98 | // Handle the error (e.g., by exiting the program) 99 | } 100 | 101 | // Check result 102 | verify_result_add(h_a, h_b, h_c, N); 103 | 104 | cout << "COMPLETED SUCCESSFULLY\n"; 105 | 106 | // Free memory on device 107 | cudaFree(d_a); 108 | cudaFree(d_b); 109 | cudaFree(d_c); 110 | 111 | return 0; 112 | } 113 | -------------------------------------------------------------------------------- /simple_cuda_demo/example_matmul.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | __global__ void MatmulKernel(const float* a, const float* b, float* out, 9 | int M, int N, int P) { 10 | // Calculate the global thread index and the row and column it corresponds to 11 | // Every thread will compute one element of the output matrix 12 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 13 | int row = idx / P; 14 | int col = idx % P; 15 | // Compute the 
summation of the dot product of the row of a and the column of b 16 | if (row < M && col < P) { 17 | float sum = 0.0; 18 | for (int i = 0; i < N; i++) { 19 | sum += a[row * N + i] * b[i * P + col]; 20 | } 21 | out[row * P + col] = sum; 22 | } 23 | } 24 | 25 | extern "C" { 26 | 27 | // This function takes in arrays which are already on the GPU 28 | // and will return arrays which are also on the GPU 29 | // Copying values between the device memory and host memory is done in the Python code 30 | 31 | void Matmul(const float* a, const float* b, float* c, int M, int N, int P) { 32 | int n = M * P; 33 | int threads_per_block = 256; 34 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 35 | MatmulKernel<<<num_blocks, threads_per_block>>>(a, b, c, M, N, P); 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /simple_cuda_demo/example_matmul2.cu: -------------------------------------------------------------------------------- 1 | // This program computes a simple version of matrix multiplication 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using std::cout; 11 | using std::generate; 12 | using std::vector; 13 | 14 | __global__ void matrixMul(const int *a, const int *b, int *c, int M, int N, int P) { 15 | // Compute each thread's global row and column index 16 | int row = blockIdx.x * blockDim.x + threadIdx.x; 17 | int col = blockIdx.y * blockDim.y + threadIdx.y; 18 | if (row >= M || col >= P) return; 19 | // Iterate over the row of a and down the column of b 20 | c[row * P + col] = 0; 21 | for (int k = 0; k < N; k++) { 22 | // Accumulate results for a single element 23 | c[row * P + col] += a[row * N + k] * b[k * P + col]; 24 | } 25 | } 26 | 27 | // Check result on the CPU 28 | void verify_result(vector<int> &a, vector<int> &b, vector<int> &c, int M, int N, int P) { 29 | // For every row... 30 | for (int i = 0; i < M; i++) { 31 | // For every column... 
32 | for (int j = 0; j < P; j++) { 33 | // For every element in the row-column pair 34 | int tmp = 0; 35 | for (int k = 0; k < N; k++) { 36 | // Accumulate the partial results 37 | tmp += a[i * N + k] * b[k * P + j]; 38 | } 39 | 40 | // Check against the CPU result 41 | if (tmp != c[i * P + j]) { 42 | printf("Error in (%d, %d): %d != %d\n", i, j, tmp, c[i * P + j]); 43 | } 44 | assert(tmp == c[i * P + j]); 45 | } 46 | } 47 | } 48 | 49 | int main() { 50 | // Matrix size of 256 x 1024, 1024 x 512; 51 | int M = 1 << 8; 52 | int N = 1 << 10; 53 | int P = 1 << 9; 54 | 55 | 56 | // Host vectors 57 | vector h_a(M * N); 58 | vector h_b(N * P); 59 | vector h_c(M * P); 60 | 61 | // Initialize matrices 62 | generate(h_a.begin(), h_a.end(), []() { return rand() % 100; }); 63 | generate(h_b.begin(), h_b.end(), []() { return rand() % 100; }); 64 | 65 | // Allocate device memory 66 | int *d_a, *d_b, *d_c; 67 | cudaMalloc(&d_a, M * N * sizeof(int)); 68 | cudaMalloc(&d_b, N * P * sizeof(int)); 69 | cudaMalloc(&d_c, M * P * sizeof(int)); 70 | 71 | // Copy data to the device 72 | cudaMemcpy(d_a, h_a.data(), M * N * sizeof(int), cudaMemcpyHostToDevice); 73 | cudaMemcpy(d_b, h_b.data(), N * P * sizeof(int), cudaMemcpyHostToDevice); 74 | 75 | // Threads per CTA dimension 76 | int THREADS = 32; 77 | 78 | // Blocks per grid dimension (assumes THREADS divides N evenly) 79 | int BLOCKS_X = M / THREADS, BLOCKS_Y = P / THREADS; 80 | 81 | // Use dim3 structs for block and grid dimensions 82 | dim3 threads(THREADS, THREADS); 83 | dim3 blocks(BLOCKS_X, BLOCKS_Y); 84 | 85 | // Launch kernel 86 | matrixMul<<>>(d_a, d_b, d_c, M, N, P); 87 | 88 | // Copy back to the host 89 | cudaMemcpy(h_c.data(), d_c, M * P * sizeof(int), cudaMemcpyDeviceToHost); 90 | 91 | // Check result 92 | verify_result(h_a, h_b, h_c, M, N, P); 93 | 94 | cout << "COMPLETED SUCCESSFULLY\n"; 95 | 96 | // Free memory on device 97 | cudaFree(d_a); 98 | cudaFree(d_b); 99 | cudaFree(d_c); 100 | 101 | return 0; 102 | } 103 | -------------------------------------------------------------------------------- /simple_cuda_demo/example_vector_add.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace std; 12 | using std::generate; 13 | using std::vector; 14 | 15 | __global__ void VecAddKernel(int* A, int* B, int* C, int n) { 16 | // blockDim is size of block along x-axis 17 | // blockIdx is the index of the current thread's block 18 | // threadIdx is the index of the current thread within the block 19 | // Compute the global thread ID 20 | int i = blockDim.x * blockIdx.x + threadIdx.x; 21 | if (i < n) { 22 | // Calculate the addition of the ith element of A and B 23 | C[i] = A[i] + B[i]; 24 | } 25 | } 26 | 27 | 28 | extern "C" { 29 | 30 | void VecAddCPU(int* A, int* B, int* C, int n) { 31 | for(int i = 0; i < n; ++i) { 32 | C[i] = A[i] + B[i]; 33 | } 34 | } 35 | 36 | 37 | void VecAddCUDA(int* Agpu, int* Bgpu, int* Cgpu, int n) { 38 | // In this example, we load the data into the GPU by Python codes. 39 | int threads_per_block = 256; 40 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 41 | VecAddKernel<<>>(Agpu, Bgpu, Cgpu, n); 42 | } 43 | 44 | 45 | void VecAddCUDA2(int* Acpu, int* Bcpu, int* Ccpu, int n) { 46 | // In this example, we load the data into the GPU by C++ codes. 
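    // The body below follows the standard host-managed CUDA workflow:
    // (1) allocate device buffers with cudaMalloc, (2) copy the inputs host-to-device,
    // (3) launch the kernel, (4) copy the result device-to-host, (5) free the device buffers.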
47 | int *dA, *dB, *dC; 48 | // Allocate device memory 49 | cudaMalloc(&dA, n * sizeof(int)); 50 | cudaMalloc(&dB, n * sizeof(int)); 51 | cudaMalloc(&dC, n * sizeof(int)); 52 | // Copy data from host memory to device memory 53 | cudaMemcpy(dA, Acpu, n * sizeof(int), cudaMemcpyHostToDevice); 54 | cudaMemcpy(dB, Bcpu, n * sizeof(int), cudaMemcpyHostToDevice); 55 | // Launch the CUDA kernel 56 | int threads_per_block = 256; 57 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 58 | VecAddKernel<<>>(dA, dB, dC, n); 59 | // Copy the result from device memory to host memory 60 | cudaMemcpy(Ccpu, dC, n * sizeof(int), cudaMemcpyDeviceToHost); 61 | // Free device memory 62 | cudaFree(dA); 63 | cudaFree(dB); 64 | cudaFree(dC); 65 | } 66 | 67 | } 68 | 69 | // Check result on the CPU 70 | void verify_result_vecadd(vector &a, vector &b, vector &c, int N) { 71 | // For every element... 72 | for (int i = 0; i < N; i++) { 73 | // For every element in the row-column pair 74 | // Check against the CPU result 75 | if (a[i] + b[i] != c[i]) { 76 | printf("Error in (%d): %d + %d != %d\n", i, a[i], b[i], c[i]); 77 | } 78 | assert(a[i] + b[i] == c[i]); 79 | } 80 | } 81 | 82 | int main() { 83 | // length of the vector 84 | int n = 1024; 85 | 86 | // Host vectors 87 | vector h_a(n); 88 | vector h_b(n); 89 | vector h_c(n); 90 | 91 | // Initialize matrices 92 | generate(h_a.begin(), h_a.end(), []() { return rand() % 100; }); 93 | generate(h_b.begin(), h_b.end(), []() { return rand() % 100; }); 94 | 95 | VecAddCUDA2(h_a.data(), h_b.data(), h_c.data(), n); 96 | 97 | cudaDeviceSynchronize(); 98 | 99 | // Check CUDA execution 100 | cudaError_t err = cudaGetLastError(); 101 | if (err != cudaSuccess) { 102 | fprintf(stderr, "Error: %s\n", cudaGetErrorString(err)); 103 | // Handle the error (e.g., by exiting the program) 104 | } 105 | 106 | // Check result 107 | verify_result_vecadd(h_a, h_b, h_c, n); 108 | 109 | cout << "Vector add verified! 
COMPLETED SUCCESSFULLY\n"; 110 | } -------------------------------------------------------------------------------- /simple_cuda_demo/example_window_sum.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | #define RADIUS 2 9 | #define THREADS_PER_BLOCK 4 10 | 11 | 12 | __global__ void WindowSumSimpleKernel(float* A, float *B, int n) { 13 | // Each thread will compute one element of B, calculate the global index of the element 14 | int out_idx = blockDim.x * blockIdx.x + threadIdx.x; 15 | if (out_idx < n) { 16 | // Calculate the sum of the elements in the window of 5 17 | float sum = 0; 18 | for (int dx = -RADIUS; dx <= RADIUS; ++dx) { 19 | sum += A[dx + out_idx + RADIUS]; 20 | } 21 | B[out_idx] = sum; 22 | } 23 | } 24 | 25 | __global__ void WindowSumSharedKernel(float* A, float *B, int size_a, int size_b) { 26 | __shared__ float temp[THREADS_PER_BLOCK + 2 * RADIUS]; 27 | int base = blockDim.x * blockIdx.x; 28 | int out_idx = base + threadIdx.x; 29 | // Load the elements into the shared memory 30 | if (base + threadIdx.x < size_a) { 31 | temp[threadIdx.x] = A[base + threadIdx.x]; 32 | } 33 | if (threadIdx.x < 2 * RADIUS && base + THREADS_PER_BLOCK + threadIdx.x < size_a) { 34 | temp[threadIdx.x + THREADS_PER_BLOCK] = A[base + THREADS_PER_BLOCK + threadIdx.x]; 35 | } 36 | // Wait for all threads to finish loading, after this point, all threads will have the same copy of the shared memory 37 | __syncthreads(); 38 | // Until then, start calculating the sum of the elements in the window of 5 39 | if (out_idx < size_b) { 40 | float sum = 0; 41 | for (int dx = -RADIUS; dx <= RADIUS; ++dx) { 42 | // Accessing shared memory within blocks is faster than accessing global memory 43 | sum += temp[threadIdx.x + dx + RADIUS]; 44 | } 45 | B[out_idx] = sum; 46 | } 47 | } 48 | 49 | extern "C" { 50 | 51 | // These two functions take in arrays which are already on the GPU 52 | // and will return arrays which are also on the GPU 53 | // Copying values between the device memory and host memory is done in the python codes 54 | 55 | void WindowSumSimple(float* in_array, float* out_array, int n) { 56 | int num_blocks = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 57 | WindowSumSimpleKernel<<>>(in_array, out_array, n); 58 | } 59 | 60 | void WindowSumShared(float* in_array, float* out_array, int size_a, int size_b) { 61 | int num_blocks = (size_b + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 62 | WindowSumSharedKernel<<>>(in_array, out_array, size_a, size_b); 63 | cudaDeviceSynchronize(); 64 | } 65 | } -------------------------------------------------------------------------------- /simple_cuda_demo/test_matmul.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pycuda.gpuarray as gpuarray 3 | import pycuda.driver as cuda 4 | from pycuda.compiler import SourceModule 5 | import pycuda.autoinit 6 | 7 | import os 8 | import numpy as np 9 | 10 | # Load the shared library 11 | cur_path = os.getcwd() 12 | lib = ctypes.CDLL(os.path.join(cur_path, "matmul.so")) 13 | 14 | m, n, p = 4, 4, 2 15 | np.random.seed(0) 16 | a = np.random.randint(1, 3, [m, n]).astype(np.float32) 17 | b = np.random.randint(1, 3, [n, p]).astype(np.float32) 18 | cgpu = np.zeros([m, p], dtype=np.float32) 19 | cgputile = np.zeros([m, p], dtype=np.float32) 20 | 21 | print(f"Input a: {a}\nInput b: {b}") 22 | 23 | print(f"Numpy matmul: {a @ b}, {type(a@b)}") 24 | 25 | # Define argtypes and 
return type of the C function 26 | lib.Matmul.argtypes = [ 27 | ctypes.POINTER(ctypes.c_float), 28 | ctypes.POINTER(ctypes.c_float), 29 | ctypes.POINTER(ctypes.c_float), 30 | ctypes.c_int, ctypes.c_int, ctypes.c_int, 31 | ] 32 | 33 | lib.Matmul.restype = None 34 | 35 | # Load the arrays to CUDA device 36 | a_gpu = gpuarray.to_gpu(a) 37 | b_gpu = gpuarray.to_gpu(b) 38 | c_gpu = gpuarray.to_gpu(cgpu) 39 | 40 | # Call the C wrapper function with CUDA kernel 41 | lib.Matmul( 42 | ctypes.cast(a_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 43 | ctypes.cast(b_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 44 | ctypes.cast(c_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 45 | ctypes.c_int(m), 46 | ctypes.c_int(n), 47 | ctypes.c_int(p) 48 | ) 49 | 50 | print(f"GPU matmul: {c_gpu}, {type(c_gpu)}") 51 | # Load the gpuarray back to an array on the host 52 | cgpu = c_gpu.get() 53 | print(f"After offload: {cgpu}, {type(cgpu)}") 54 | 55 | # Compare result 56 | ccpu = a @ b 57 | print(f"Compare result: {np.linalg.norm(ccpu - cgpu)}") -------------------------------------------------------------------------------- /simple_cuda_demo/test_vector_add.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pycuda.gpuarray as gpuarray 3 | import pycuda.driver as cuda 4 | from pycuda.compiler import SourceModule 5 | import pycuda.autoinit 6 | 7 | import os 8 | import numpy as np 9 | 10 | # Load the shared library 11 | cur_path = os.getcwd() 12 | lib = ctypes.CDLL(os.path.join(cur_path, "vector_add.so")) 13 | 14 | size = 10 15 | a = np.random.randint(1, 10, size, dtype=np.int32) 16 | b = np.random.randint(1, 10, size, dtype=np.int32) 17 | ccpu = np.zeros(size, dtype=np.int32) 18 | cgpu = np.zeros(size, dtype=np.int32) 19 | 20 | print(f"Input a: {a}\nInput b: {b}") 21 | 22 | print(f"Numpy add: {a + b}, {type(a+b)}") 23 | 24 | # Define argtypes and return types of the C functions 25 | lib.VecAddCPU.argtypes = [ 26 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 27 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 28 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 29 | ctypes.c_int 30 | ] 31 | 32 | lib.VecAddCPU.restype = None 33 | 34 | lib.VecAddCUDA.argtypes = [ 35 | ctypes.POINTER(ctypes.c_int), 36 | ctypes.POINTER(ctypes.c_int), 37 | ctypes.POINTER(ctypes.c_int), 38 | ctypes.c_int, 39 | ] 40 | 41 | lib.VecAddCUDA.restype = None 42 | 43 | # Call the C function 44 | lib.VecAddCPU(a, b, ccpu, size) 45 | 46 | print(f"CPU add: {ccpu}, {type(ccpu)}") 47 | 48 | # Load the arrays to CUDA device 49 | a_gpu = gpuarray.to_gpu(a) 50 | b_gpu = gpuarray.to_gpu(b) 51 | c_gpu = gpuarray.to_gpu(cgpu) 52 | 53 | # Call the C wrapper function with CUDA kernel 54 | lib.VecAddCUDA( 55 | ctypes.cast(a_gpu.ptr, ctypes.POINTER(ctypes.c_int)), 56 | ctypes.cast(b_gpu.ptr, ctypes.POINTER(ctypes.c_int)), 57 | ctypes.cast(c_gpu.ptr, ctypes.POINTER(ctypes.c_int)), 58 | ctypes.c_int(size) 59 | ) 60 | 61 | print(f"GPU add: {c_gpu}, {type(c_gpu)}") 62 | # Load the gpuarray back to an array on the host 63 | cgpu = c_gpu.get() 64 | print(f"After offload: {cgpu}, {type(cgpu)}") 65 | 66 | 67 | 68 | lib.VecAddCUDA2.argtypes = [ 69 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 70 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 71 | np.ctypeslib.ndpointer(dtype=np.int32, ndim=1, flags='C_CONTIGUOUS'), 72 | ctypes.c_int 73 | ] 74 | 75 | lib.VecAddCUDA2.restype = None 76 | cgpu2 = np.zeros(size, dtype=np.int32) 77 
| lib.VecAddCUDA2(a, b, cgpu2, size) 78 | print(f"GPU add2: {cgpu2}, {type(cgpu2)}") 79 | 80 | 81 | -------------------------------------------------------------------------------- /simple_cuda_demo/test_window_sum.py: -------------------------------------------------------------------------------- 1 | import ctypes 2 | import pycuda.gpuarray as gpuarray 3 | import pycuda.driver as cuda 4 | from pycuda.compiler import SourceModule 5 | import pycuda.autoinit 6 | 7 | import os 8 | import numpy as np 9 | from numpy.lib.stride_tricks import sliding_window_view 10 | 11 | # Load the shared library 12 | cur_path = os.getcwd() 13 | lib = ctypes.CDLL(os.path.join(cur_path, "window_sum.so")) 14 | 15 | in_array = np.array([i+1 for i in range(12)], dtype=np.float32) 16 | simple_array = np.zeros(8, dtype=np.float32) 17 | shared_array = np.zeros(8, dtype=np.float32) 18 | 19 | print(f"Input: {in_array}") 20 | win_temp = sliding_window_view(in_array, 5) 21 | np_res = np.sum(win_temp, axis=1) 22 | print(f"Numpy window sum: {np_res}") 23 | 24 | # Define argtypes and returntypes of the C function 25 | lib.WindowSumSimple.argtypes = [ 26 | ctypes.POINTER(ctypes.c_float), 27 | ctypes.POINTER(ctypes.c_float), 28 | ctypes.c_int, 29 | ] 30 | 31 | lib.WindowSumSimple.restype = None 32 | 33 | lib.WindowSumShared.argtypes = [ 34 | ctypes.POINTER(ctypes.c_float), 35 | ctypes.POINTER(ctypes.c_float), 36 | ctypes.c_int, 37 | ctypes.c_int, 38 | ] 39 | 40 | lib.WindowSumShared.restype = None 41 | 42 | # Load the arrays to CUDA device 43 | in_gpu = gpuarray.to_gpu(in_array) 44 | simple_out_gpu = gpuarray.to_gpu(simple_array) 45 | shared_out_gpu = gpuarray.to_gpu(shared_array) 46 | 47 | # Call the C wrapper function with CUDA kernel 48 | lib.WindowSumSimple( 49 | ctypes.cast(in_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 50 | ctypes.cast(simple_out_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 51 | ctypes.c_int(len(simple_array)), 52 | ) 53 | 54 | # Load the gpuarray back to array in the host device 55 | simple_array = simple_out_gpu.get() 56 | print(f"GPU simple window sum: {simple_array}") 57 | 58 | # Call the C wrapper function with CUDA kernel 59 | lib.WindowSumShared( 60 | ctypes.cast(in_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 61 | ctypes.cast(shared_out_gpu.ptr, ctypes.POINTER(ctypes.c_float)), 62 | ctypes.c_int(len(in_gpu)), 63 | ctypes.c_int(len(shared_array)), 64 | ) 65 | 66 | # Load the gpuarray back to array in the host device 67 | shared_array = shared_out_gpu.get() 68 | print(f"GPU shared window sum: {shared_array}") -------------------------------------------------------------------------------- /tensor_demo/miniTorch/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "venv/bin/python", 3 | "python.testing.unittestEnabled": false, 4 | "python.testing.nosetestsEnabled": false, 5 | "python.testing.pytestEnabled": true, 6 | "python.testing.pytestArgs": [ 7 | "tests" 8 | ], 9 | "restructuredtext.confPath": "${workspaceFolder}/docs/source", 10 | "python.linting.enabled": true, 11 | "python.linting.pylintEnabled": false, 12 | "python.linting.banditEnabled": false, 13 | "python.linting.flake8Enabled": true, 14 | "python.linting.mypyEnabled": false, 15 | "python.linting.flake8Args": [ 16 | "--ignore", 17 | "N801, E203, E266, E501, W503, F812, E741, N803, N802, N806" 18 | ], 19 | } 20 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Sasha Rush 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: minitorch 3 | Version: 0.4 4 | License-File: LICENSE 5 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE 2 | README.md 3 | setup.cfg 4 | setup.py 5 | minitorch.egg-info/PKG-INFO 6 | minitorch.egg-info/SOURCES.txt 7 | minitorch.egg-info/dependency_links.txt 8 | minitorch.egg-info/top_level.txt -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__init__.py: -------------------------------------------------------------------------------- 1 | from .datasets import datasets # noqa: F401,F403 2 | from .tensor import * # noqa: F401,F403 3 | from .tensor_data import * # noqa: F401,F403 4 | from .tensor_functions import * # noqa: F401,F403 5 | from .tensor_ops import * # noqa: F401,F403 6 | 7 | version = "0.4" 8 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_data.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_functions.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-38.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/tensor_ops.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/__pycache__/testing.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/llmsystem/llmsys_code_examples/ed76a814ddc7b0beab6ee0c73f158d06c03efc98/tensor_demo/miniTorch/minitorch/__pycache__/testing.cpython-39.pyc -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/autodiff.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any, Iterable, List, Tuple 3 | 4 | from typing_extensions import Protocol 5 | 6 | # ## Task 1.1 7 | # Central Difference calculation 8 | 9 | 10 | def central_difference(f: Any, *vals: Any, arg: int = 0, epsilon: float = 
1e-6) -> Any: 11 | r""" 12 | Computes an approximation to the derivative of `f` with respect to one arg. 13 | 14 | See :doc:`derivative` or https://en.wikipedia.org/wiki/Finite_difference for more details. 15 | 16 | Args: 17 | f : arbitrary function from n-scalar args to one value 18 | *vals : n-float values $x_0 \ldots x_{n-1}$ 19 | arg : the number $i$ of the arg to compute the derivative 20 | epsilon : a small constant 21 | 22 | Returns: 23 | An approximation of $f'_i(x_0, \ldots, x_{n-1})$ 24 | """ 25 | # ASSIGN1.1 26 | vals1 = [v for v in vals] 27 | vals2 = [v for v in vals] 28 | vals1[arg] = vals1[arg] + epsilon 29 | vals2[arg] = vals2[arg] - epsilon 30 | delta = f(*vals1) - f(*vals2) 31 | return delta / (2 * epsilon) 32 | # END ASSIGN1 33 | 34 | 35 | variable_count = 1 36 | 37 | 38 | class Variable(Protocol): 39 | def accumulate_derivative(self, x: Any) -> None: 40 | pass 41 | 42 | @property 43 | def unique_id(self) -> int: 44 | pass 45 | 46 | def is_leaf(self) -> bool: 47 | pass 48 | 49 | def is_constant(self) -> bool: 50 | pass 51 | 52 | @property 53 | def parents(self) -> Iterable["Variable"]: 54 | pass 55 | 56 | def chain_rule(self, d_output: Any) -> Iterable[Tuple["Variable", Any]]: 57 | pass 58 | 59 | 60 | def topological_sort(variable: Variable) -> Iterable[Variable]: 61 | """ 62 | Computes the topological order of the computation graph. 63 | 64 | Args: 65 | variable: The right-most variable 66 | 67 | Returns: 68 | Non-constant Variables in topological order starting from the right. 69 | """ 70 | order: List[Variable] = [] 71 | return order 72 | 73 | 74 | def backpropagate(variable: Variable, deriv: Any) -> None: 75 | """ 76 | Runs backpropagation on the computation graph in order to 77 | compute derivatives for the leave nodes. 78 | 79 | Args: 80 | variable: The right-most variable 81 | deriv : Its derivative that we want to propagate backward to the leaves. 82 | 83 | No return. Should write to its results to the derivative values of each leaf through `accumulate_derivative`. 84 | """ 85 | 86 | 87 | @dataclass 88 | class Context: 89 | """ 90 | Context class is used by `Function` to store information during the forward pass. 91 | """ 92 | 93 | no_grad: bool = False 94 | saved_values: Tuple[Any, ...] = () 95 | 96 | def save_for_backward(self, *values: Any) -> None: 97 | "Store the given `values` if they need to be used during backpropagation." 
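        # Values stored here are read back during the backward pass through the
        # `saved_values` / `saved_tensors` properties below; nothing is saved when `no_grad` is set.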
98 | if self.no_grad: 99 | return 100 | self.saved_values = values 101 | 102 | @property 103 | def saved_tensors(self) -> Tuple[Any, ...]: 104 | return self.saved_values 105 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/datasets.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | from dataclasses import dataclass 4 | from typing import List, Tuple 5 | 6 | 7 | def make_pts(N: int) -> List[Tuple[float, float]]: 8 | X = [] 9 | for i in range(N): 10 | x_1 = random.random() 11 | x_2 = random.random() 12 | X.append((x_1, x_2)) 13 | return X 14 | 15 | 16 | @dataclass 17 | class Graph: 18 | N: int 19 | X: List[Tuple[float, float]] 20 | y: List[int] 21 | 22 | 23 | def simple(N: int) -> Graph: 24 | X = make_pts(N) 25 | y = [] 26 | for x_1, x_2 in X: 27 | y1 = 1 if x_1 < 0.5 else 0 28 | y.append(y1) 29 | return Graph(N, X, y) 30 | 31 | 32 | def diag(N: int) -> Graph: 33 | X = make_pts(N) 34 | y = [] 35 | for x_1, x_2 in X: 36 | y1 = 1 if x_1 + x_2 < 0.5 else 0 37 | y.append(y1) 38 | return Graph(N, X, y) 39 | 40 | 41 | def split(N: int) -> Graph: 42 | X = make_pts(N) 43 | y = [] 44 | for x_1, x_2 in X: 45 | y1 = 1 if x_1 < 0.2 or x_1 > 0.8 else 0 46 | y.append(y1) 47 | return Graph(N, X, y) 48 | 49 | 50 | def xor(N: int) -> Graph: 51 | X = make_pts(N) 52 | y = [] 53 | for x_1, x_2 in X: 54 | y1 = 1 if ((x_1 < 0.5 and x_2 > 0.5) or (x_1 > 0.5 and x_2 < 0.5)) else 0 55 | y.append(y1) 56 | return Graph(N, X, y) 57 | 58 | 59 | def circle(N: int) -> Graph: 60 | X = make_pts(N) 61 | y = [] 62 | for x_1, x_2 in X: 63 | x1, x2 = (x_1 - 0.5, x_2 - 0.5) 64 | y1 = 1 if x1 * x1 + x2 * x2 > 0.1 else 0 65 | y.append(y1) 66 | return Graph(N, X, y) 67 | 68 | 69 | def spiral(N: int) -> Graph: 70 | def x(t: float) -> float: 71 | return t * math.cos(t) / 20.0 72 | 73 | def y(t: float) -> float: 74 | return t * math.sin(t) / 20.0 75 | 76 | X = [ 77 | (x(10.0 * (float(i) / (N // 2))) + 0.5, y(10.0 * (float(i) / (N // 2))) + 0.5) 78 | for i in range(5 + 0, 5 + N // 2) 79 | ] 80 | X = X + [ 81 | (y(-10.0 * (float(i) / (N // 2))) + 0.5, x(-10.0 * (float(i) / (N // 2))) + 0.5) 82 | for i in range(5 + 0, 5 + N // 2) 83 | ] 84 | y2 = [0] * (N // 2) + [1] * (N // 2) 85 | return Graph(N, X, y2) 86 | 87 | 88 | datasets = { 89 | "Simple": simple, 90 | "Diag": diag, 91 | "Split": split, 92 | "Xor": xor, 93 | "Circle": circle, 94 | "Spiral": spiral, 95 | } 96 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/module.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Dict, Optional, Sequence, Tuple 4 | 5 | 6 | class Module: 7 | """ 8 | Modules form a tree that store parameters and other 9 | submodules. They make up the basis of neural network stacks. 10 | 11 | Attributes: 12 | _modules : Storage of the child modules 13 | _parameters : Storage of the module's parameters 14 | training : Whether the module is in training mode or evaluation mode 15 | 16 | """ 17 | 18 | _modules: Dict[str, Module] 19 | _parameters: Dict[str, Parameter] 20 | training: bool 21 | 22 | def __init__(self) -> None: 23 | self._modules = {} 24 | self._parameters = {} 25 | self.training = True 26 | 27 | def modules(self) -> Sequence[Module]: 28 | "Return the direct child modules of this module." 
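        # Only the direct children are returned; `train()` and `eval()` below rely on this
        # to recurse through the whole module tree.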
29 | m: Dict[str, Module] = self.__dict__["_modules"] 30 | return list(m.values()) 31 | 32 | def train(self) -> None: 33 | "Set the mode of this module and all descendent modules to `train`." 34 | # ASSIGN0.4 35 | for m in self.modules(): 36 | m.train() 37 | self.training = True 38 | # END ASSIGN0.4 39 | 40 | def eval(self) -> None: 41 | "Set the mode of this module and all descendent modules to `eval`." 42 | for m in self.modules(): 43 | m.eval() 44 | self.training = False 45 | 46 | def named_parameters(self) -> Sequence[Tuple[str, Parameter]]: 47 | """ 48 | Collect all the parameters of this module and its descendents. 49 | 50 | 51 | Returns: 52 | The name and `Parameter` of each ancestor parameter. 53 | """ 54 | 55 | # Collect our parameters and give them a name. 56 | parameters = {} 57 | for k, v in self._parameters.items(): 58 | parameters[k] = v 59 | 60 | # Recurse down to children submodules 61 | for mod_name, m in self._modules.items(): 62 | for k, v in m.named_parameters(): 63 | parameters[f"{mod_name}.{k}"] = v 64 | return list(parameters.items()) 65 | 66 | def parameters(self) -> Sequence[Parameter]: 67 | "Enumerate over all the parameters of this module and its descendents." 68 | return [j for _, j in self.named_parameters()] 69 | 70 | def add_parameter(self, k: str, v: Any) -> Parameter: 71 | """ 72 | Manually add a parameter. Useful helper for scalar parameters. 73 | 74 | Args: 75 | k: Local name of the parameter. 76 | v: Value for the parameter. 77 | 78 | Returns: 79 | Newly created parameter. 80 | """ 81 | val = Parameter(v, k) 82 | self.__dict__["_parameters"][k] = val 83 | return val 84 | 85 | def __setattr__(self, key: str, val: Parameter) -> None: 86 | if isinstance(val, Parameter): 87 | self.__dict__["_parameters"][key] = val 88 | elif isinstance(val, Module): 89 | self.__dict__["_modules"][key] = val 90 | else: 91 | super().__setattr__(key, val) 92 | 93 | def __getattr__(self, key: str) -> Any: 94 | if key in self.__dict__["_parameters"]: 95 | return self.__dict__["_parameters"][key] 96 | 97 | if key in self.__dict__["_modules"]: 98 | return self.__dict__["_modules"][key] 99 | return None 100 | 101 | def __call__(self, *args: Any, **kwargs: Any) -> Any: 102 | return self.forward(*args, **kwargs) 103 | 104 | def __repr__(self) -> str: 105 | def _addindent(s_: str, numSpaces: int) -> str: 106 | s2 = s_.split("\n") 107 | if len(s2) == 1: 108 | return s_ 109 | first = s2.pop(0) 110 | s2 = [(numSpaces * " ") + line for line in s2] 111 | s = "\n".join(s2) 112 | s = first + "\n" + s 113 | return s 114 | 115 | child_lines = [] 116 | 117 | for key, module in self._modules.items(): 118 | mod_str = repr(module) 119 | mod_str = _addindent(mod_str, 2) 120 | child_lines.append("(" + key + "): " + mod_str) 121 | lines = child_lines 122 | 123 | main_str = self.__class__.__name__ + "(" 124 | if lines: 125 | # simple one-liner info, which most builtin Modules will use 126 | main_str += "\n " + "\n ".join(lines) + "\n" 127 | 128 | main_str += ")" 129 | return main_str 130 | 131 | 132 | class Parameter: 133 | """ 134 | A Parameter is a special container stored in a `Module`. 135 | 136 | It is designed to hold a `Variable`, but we allow it to hold 137 | any value for testing. 
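    Assigning a `Parameter` to an attribute of a `Module` registers it automatically
    through `Module.__setattr__`, so it is picked up by `named_parameters()`.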
138 | """ 139 | 140 | def __init__(self, x: Any, name: Optional[str] = None) -> None: 141 | self.value = x 142 | self.name = name 143 | if hasattr(x, "requires_grad_"): 144 | self.value.requires_grad_(True) 145 | if self.name: 146 | self.value.name = self.name 147 | 148 | def update(self, x: Any) -> None: 149 | "Update the parameter value." 150 | self.value = x 151 | if hasattr(x, "requires_grad_"): 152 | self.value.requires_grad_(True) 153 | if self.name: 154 | self.value.name = self.name 155 | 156 | def __repr__(self) -> str: 157 | return repr(self.value) 158 | 159 | def __str__(self) -> str: 160 | return str(self.value) 161 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/operators.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection of the core mathematical operators used throughout the code base. 3 | """ 4 | 5 | import math 6 | from typing import Callable, Iterable 7 | 8 | # ## Task 0.1 9 | # 10 | # Implementation of a prelude of elementary functions. 11 | 12 | 13 | def mul(x: float, y: float) -> float: 14 | "$f(x, y) = x * y$" 15 | # ASSIGN0.1 16 | return x * y 17 | # END ASSIGN0.1 18 | 19 | 20 | def id(x: float) -> float: 21 | "$f(x) = x$" 22 | # ASSIGN0.1 23 | return x 24 | # END ASSIGN0.1 25 | 26 | 27 | def add(x: float, y: float) -> float: 28 | "$f(x, y) = x + y$" 29 | # ASSIGN0.1 30 | return x + y 31 | # END ASSIGN0.1 32 | 33 | 34 | def neg(x: float) -> float: 35 | "$f(x) = -x$" 36 | # ASSIGN0.1 37 | return -x 38 | # END ASSIGN0.1 39 | 40 | 41 | def lt(x: float, y: float) -> float: 42 | "$f(x) =$ 1.0 if x is less than y else 0.0" 43 | # ASSIGN0.1 44 | return 1.0 if x < y else 0.0 45 | # END ASSIGN0.1 46 | 47 | 48 | def eq(x: float, y: float) -> float: 49 | "$f(x) =$ 1.0 if x is equal to y else 0.0" 50 | # ASSIGN0.1 51 | return 1.0 if x == y else 0.0 52 | # END ASSIGN0.1 53 | 54 | 55 | def max(x: float, y: float) -> float: 56 | "$f(x) =$ x if x is greater than y else y" 57 | # ASSIGN0.1 58 | return x if x > y else y 59 | # END ASSIGN0.1 60 | 61 | 62 | def is_close(x: float, y: float) -> float: 63 | "$f(x) = |x - y| < 1e-2$" 64 | # ASSIGN0.1 65 | return (x - y < 1e-2) and (y - x < 1e-2) 66 | # END ASSIGN0.1 67 | 68 | 69 | def sigmoid(x: float) -> float: 70 | r""" 71 | $f(x) = \frac{1.0}{(1.0 + e^{-x})}$ 72 | 73 | (See https://en.wikipedia.org/wiki/Sigmoid_function ) 74 | 75 | Calculate as 76 | 77 | $f(x) = \frac{1.0}{(1.0 + e^{-x})}$ if x >=0 else $\frac{e^x}{(1.0 + e^{x})}$ 78 | 79 | for stability. 80 | """ 81 | # ASSIGN0.1 82 | if x >= 0: 83 | return 1.0 / (1.0 + math.exp(-x)) 84 | else: 85 | return math.exp(x) / (1.0 + math.exp(x)) 86 | # END ASSIGN0.1 87 | 88 | 89 | def relu(x: float) -> float: 90 | """ 91 | $f(x) =$ x if x is greater than 0, else 0 92 | 93 | (See https://en.wikipedia.org/wiki/Rectifier_(neural_networks) .) 
94 | """ 95 | # ASSIGN0.1 96 | return x if x > 0 else 0.0 97 | # END ASSIGN0.1 98 | 99 | 100 | EPS = 1e-6 101 | 102 | 103 | def log(x: float) -> float: 104 | "$f(x) = log(x)$" 105 | return math.log(x + EPS) 106 | 107 | 108 | def exp(x: float) -> float: 109 | "$f(x) = e^{x}$" 110 | return math.exp(x) 111 | 112 | 113 | def log_back(x: float, d: float) -> float: 114 | r"If $f = log$ as above, compute $d \times f'(x)$" 115 | # ASSIGN0.1 116 | return d / (x + EPS) 117 | # END ASSIGN0.1 118 | 119 | 120 | def inv(x: float) -> float: 121 | "$f(x) = 1/x$" 122 | # ASSIGN0.1 123 | return 1.0 / x 124 | # END ASSIGN0.1 125 | 126 | 127 | def inv_back(x: float, d: float) -> float: 128 | r"If $f(x) = 1/x$ compute $d \times f'(x)$" 129 | # ASSIGN0.1 130 | return -(1.0 / x**2) * d 131 | # END ASSIGN0.1 132 | 133 | 134 | def relu_back(x: float, d: float) -> float: 135 | r"If $f = relu$ compute $d \times f'(x)$" 136 | # ASSIGN0.1 137 | return d if x > 0 else 0.0 138 | # END ASSIGN0.1 139 | 140 | 141 | # ## Task 0.3 142 | 143 | # Small practice library of elementary higher-order functions. 144 | 145 | 146 | def map(fn: Callable[[float], float]) -> Callable[[Iterable[float]], Iterable[float]]: 147 | """ 148 | Higher-order map. 149 | 150 | See https://en.wikipedia.org/wiki/Map_(higher-order_function) 151 | 152 | Args: 153 | fn: Function from one value to one value. 154 | 155 | Returns: 156 | A function that takes a list, applies `fn` to each element, and returns a 157 | new list 158 | """ 159 | # ASSIGN0.3 160 | def _map(ls: Iterable[float]) -> Iterable[float]: 161 | ret = [] 162 | for x in ls: 163 | ret.append(fn(x)) 164 | return ret 165 | 166 | return _map 167 | # END ASSIGN0.3 168 | 169 | 170 | def negList(ls: Iterable[float]) -> Iterable[float]: 171 | "Use `map` and `neg` to negate each element in `ls`" 172 | # ASSIGN0.3 173 | return map(neg)(ls) 174 | # END ASSIGN0.3 175 | 176 | 177 | def zipWith( 178 | fn: Callable[[float, float], float] 179 | ) -> Callable[[Iterable[float], Iterable[float]], Iterable[float]]: 180 | """ 181 | Higher-order zipwith (or map2). 182 | 183 | See https://en.wikipedia.org/wiki/Map_(higher-order_function) 184 | 185 | Args: 186 | fn: combine two values 187 | 188 | Returns: 189 | Function that takes two equally sized lists `ls1` and `ls2`, produce a new list by 190 | applying fn(x, y) on each pair of elements. 191 | 192 | """ 193 | # ASSIGN0.3 194 | def _zipWith(ls1: Iterable[float], ls2: Iterable[float]) -> Iterable[float]: 195 | ret = [] 196 | for x, y in zip(ls1, ls2): 197 | ret.append(fn(x, y)) 198 | return ret 199 | 200 | return _zipWith 201 | # END ASSIGN0.3 202 | 203 | 204 | def addLists(ls1: Iterable[float], ls2: Iterable[float]) -> Iterable[float]: 205 | "Add the elements of `ls1` and `ls2` using `zipWith` and `add`" 206 | # ASSIGN0.3 207 | return zipWith(add)(ls1, ls2) 208 | # END ASSIGN0.3 209 | 210 | 211 | def reduce( 212 | fn: Callable[[float, float], float], start: float 213 | ) -> Callable[[Iterable[float]], float]: 214 | r""" 215 | Higher-order reduce. 
216 | 217 | Args: 218 | fn: combine two values 219 | start: start value $x_0$ 220 | 221 | Returns: 222 | Function that takes a list `ls` of elements 223 | $x_1 \ldots x_n$ and computes the reduction :math:`fn(x_3, fn(x_2, 224 | fn(x_1, x_0)))` 225 | """ 226 | # ASSIGN0.3 227 | def _reduce(ls: Iterable[float]) -> float: 228 | val = start 229 | for l in ls: 230 | val = fn(val, l) 231 | return val 232 | 233 | return _reduce 234 | # END ASSIGN0.3 235 | 236 | 237 | def sum(ls: Iterable[float]) -> float: 238 | "Sum up a list using `reduce` and `add`." 239 | # ASSIGN0.3 240 | return reduce(add, 0.0)(ls) 241 | # END ASSIGN0.3 242 | 243 | 244 | def prod(ls: Iterable[float]) -> float: 245 | "Product of a list using `reduce` and `mul`." 246 | # ASSIGN0.3 247 | return reduce(mul, 1.0)(ls) 248 | # END ASSIGN0.3 249 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/optim.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from .module import Parameter 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, parameters: Sequence[Parameter]): 8 | self.parameters = parameters 9 | 10 | 11 | class SGD(Optimizer): 12 | def __init__(self, parameters: Sequence[Parameter], lr: float = 1.0): 13 | super().__init__(parameters) 14 | self.lr = lr 15 | 16 | def zero_grad(self) -> None: 17 | for p in self.parameters: 18 | if p.value is None: 19 | continue 20 | if hasattr(p.value, "derivative"): 21 | if p.value.derivative is not None: 22 | p.value.derivative = None 23 | if hasattr(p.value, "grad"): 24 | if p.value.grad is not None: 25 | p.value.grad = None 26 | 27 | def step(self) -> None: 28 | for p in self.parameters: 29 | if p.value is None: 30 | continue 31 | if hasattr(p.value, "grad"): 32 | if p.value.grad is not None: 33 | p.update(p.value - self.lr * p.value.grad) 34 | 35 | def _print(self) -> None: 36 | for param in self.parameters: 37 | if param.value is None: 38 | continue 39 | print(param.value.shape) 40 | print(param.value.grad) 41 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/tensor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of the core Tensor object for autodifferentiation. 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from dataclasses import dataclass 8 | from typing import TYPE_CHECKING 9 | 10 | import numpy as np 11 | 12 | from . import operators 13 | from .autodiff import Context, Variable, backpropagate 14 | from .tensor_data import TensorData 15 | from .tensor_functions import ( 16 | EQ, 17 | LT, 18 | Add, 19 | All, 20 | Copy, 21 | Exp, 22 | Inv, 23 | IsClose, 24 | Log, 25 | MatMul, 26 | Mul, 27 | Neg, 28 | Permute, 29 | ReLU, 30 | Sigmoid, 31 | Sum, 32 | View, 33 | tensor, 34 | ) 35 | 36 | if TYPE_CHECKING: 37 | from typing import Any, Iterable, List, Optional, Sequence, Tuple, Type, Union 38 | 39 | import numpy.typing as npt 40 | 41 | from .tensor_data import Shape, Storage, Strides, UserIndex, UserShape, UserStrides 42 | from .tensor_functions import Function 43 | from .tensor_ops import TensorBackend 44 | 45 | TensorLike = Union[float, int, "Tensor"] 46 | 47 | 48 | @dataclass 49 | class History: 50 | """ 51 | `History` stores the history of `Function` operations that was 52 | used to construct the current Variable. 
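    `last_fn` is the `Function` that produced this tensor, `ctx` is the `Context` saved
    during its forward pass, and `inputs` are the argument tensors; a tensor created
    directly by the user has an empty `History`.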
53 | """ 54 | 55 | last_fn: Optional[Type[Function]] = None 56 | ctx: Optional[Context] = None 57 | inputs: Sequence[Tensor] = () 58 | 59 | 60 | _tensor_count = 0 61 | 62 | 63 | class Tensor: 64 | """ 65 | Tensor is a generalization of Scalar in that it is a Variable that 66 | handles multidimensional arrays. 67 | """ 68 | 69 | backend: TensorBackend 70 | history: Optional[History] 71 | grad: Optional[Tensor] 72 | _tensor: TensorData 73 | unique_id: int 74 | name: str 75 | 76 | def __init__( 77 | self, 78 | v: TensorData, 79 | back: Optional[History] = None, 80 | name: Optional[str] = None, 81 | backend: Optional[TensorBackend] = None, 82 | ): 83 | global _tensor_count 84 | _tensor_count += 1 85 | self.unique_id = _tensor_count 86 | assert isinstance(v, TensorData) 87 | assert backend is not None 88 | self._tensor = v 89 | self.history = back 90 | self.backend = backend 91 | self.grad = None 92 | if name is not None: 93 | self.name = name 94 | else: 95 | self.name = str(self.unique_id) 96 | 97 | self.f = backend 98 | 99 | def requires_grad_(self, x: bool) -> None: 100 | self.history = History() 101 | 102 | def requires_grad(self) -> bool: 103 | return self.history is not None 104 | 105 | def to_numpy(self) -> npt.NDArray[np.float64]: 106 | """ 107 | Returns: 108 | Converted to numpy array 109 | """ 110 | return self.contiguous()._tensor._storage.reshape(self.shape) 111 | 112 | # Properties 113 | @property 114 | def shape(self) -> UserShape: 115 | """ 116 | Returns: 117 | shape of the tensor 118 | """ 119 | return self._tensor.shape 120 | 121 | @property 122 | def size(self) -> int: 123 | """ 124 | Returns: 125 | int : size of the tensor 126 | """ 127 | return self._tensor.size 128 | 129 | @property 130 | def dims(self) -> int: 131 | """ 132 | Returns: 133 | int : dimensionality of the tensor 134 | """ 135 | return self._tensor.dims 136 | 137 | def _ensure_tensor(self, b: TensorLike) -> Tensor: 138 | "Turns a python number into a tensor with the same backend." 
139 | if isinstance(b, (int, float)): 140 | c = Tensor.make([b], (1,), backend=self.backend) 141 | else: 142 | b._type_(self.backend) 143 | c = b 144 | return c 145 | 146 | # Functions 147 | def __add__(self, b: TensorLike) -> Tensor: 148 | return Add.apply(self, self._ensure_tensor(b)) 149 | 150 | def __sub__(self, b: TensorLike) -> Tensor: 151 | return Add.apply(self, -self._ensure_tensor(b)) 152 | 153 | def __mul__(self, b: TensorLike) -> Tensor: 154 | return Mul.apply(self, self._ensure_tensor(b)) 155 | 156 | def __truediv__(self, b: TensorLike) -> Tensor: 157 | return Mul.apply(self, Inv.apply(self._ensure_tensor(b))) 158 | 159 | def __rtruediv__(self, b: TensorLike) -> Tensor: 160 | return Mul.apply(self._ensure_tensor(b), Inv.apply(self)) 161 | 162 | def __matmul__(self, b: Tensor) -> Tensor: 163 | "Not used until Module 3" 164 | return MatMul.apply(self, b) 165 | 166 | def __lt__(self, b: TensorLike) -> Tensor: 167 | return LT.apply(self, self._ensure_tensor(b)) 168 | 169 | def __eq__(self, b: TensorLike) -> Tensor: # type: ignore[override] 170 | return EQ.apply(self, self._ensure_tensor(b)) 171 | 172 | def __gt__(self, b: TensorLike) -> Tensor: 173 | return LT.apply(self._ensure_tensor(b), self) 174 | 175 | def __neg__(self) -> Tensor: 176 | return Neg.apply(self) 177 | 178 | def __radd__(self, b: TensorLike) -> Tensor: 179 | return self + b 180 | 181 | def __rmul__(self, b: TensorLike) -> Tensor: 182 | return self * b 183 | 184 | def all(self, dim: Optional[int] = None) -> Tensor: 185 | if dim is None: 186 | return All.apply(self.view(self.size), self._ensure_tensor(0)) 187 | else: 188 | return All.apply(self, self._ensure_tensor(dim)) 189 | 190 | def is_close(self, y: Tensor) -> Tensor: 191 | return IsClose.apply(self, y) 192 | 193 | def sigmoid(self) -> Tensor: 194 | return Sigmoid.apply(self) 195 | 196 | def relu(self) -> Tensor: 197 | return ReLU.apply(self) 198 | 199 | def log(self) -> Tensor: 200 | return Log.apply(self) 201 | 202 | def exp(self) -> Tensor: 203 | return Exp.apply(self) 204 | 205 | def item(self) -> float: 206 | assert self.size == 1 207 | x: float = self._tensor._storage[0] 208 | return x 209 | 210 | def sum(self, dim: Optional[int] = None) -> Tensor: 211 | "Compute the sum over dimension `dim`" 212 | if dim is None: 213 | return Sum.apply(self.contiguous().view(self.size), self._ensure_tensor(0)) 214 | else: 215 | return Sum.apply(self, self._ensure_tensor(dim)) 216 | 217 | def mean(self, dim: Optional[int] = None) -> Tensor: 218 | "Compute the mean over dimension `dim`" 219 | if dim is not None: 220 | return self.sum(dim) / self.shape[dim] 221 | else: 222 | return self.sum() / self.size 223 | 224 | def var(self, dim: Optional[int] = None) -> Tensor: 225 | "Compute the variance over dimension `dim`" 226 | if dim is not None: 227 | shape = self.shape 228 | 229 | mean = self.sum(dim) / self.shape[dim] 230 | mean = mean.contiguous().view(shape) 231 | 232 | diff = self.__sub__(mean) ** 2 233 | diff = diff.sum(dim) / self.shape[dim] 234 | 235 | return diff 236 | else: 237 | shape = self.shape 238 | mean = self.sum() / self.size 239 | mean = mean.contiguous().view(shape) 240 | 241 | diff = self.__sub__(mean) ** 2 242 | diff = diff.sum() / self.size 243 | 244 | return diff 245 | 246 | def permute(self, *order: int) -> Tensor: 247 | "Permute tensor dimensions to *order" 248 | return Permute.apply(self, tensor(list(order))) 249 | 250 | def view(self, *shape: int) -> Tensor: 251 | "Change the shape of the tensor to a new shape with the same size" 252 | return 
View.apply(self, tensor(list(shape))) 253 | 254 | def contiguous(self) -> Tensor: 255 | "Return a contiguous tensor with the same data" 256 | return Copy.apply(self) 257 | 258 | def __repr__(self) -> str: 259 | return self._tensor.to_string() 260 | 261 | def __getitem__(self, key: Union[int, UserIndex]) -> float: 262 | key2 = (key,) if isinstance(key, int) else key 263 | return self._tensor.get(key2) 264 | 265 | def __setitem__(self, key: Union[int, UserIndex], val: float) -> None: 266 | key2 = (key,) if isinstance(key, int) else key 267 | self._tensor.set(key2, val) 268 | 269 | # Internal methods used for autodiff. 270 | def _type_(self, backend: TensorBackend) -> None: 271 | self.backend = backend 272 | if backend.cuda: # pragma: no cover 273 | self._tensor.to_cuda_() 274 | 275 | def _new(self, tensor_data: TensorData) -> Tensor: 276 | return Tensor(tensor_data, backend=self.backend) 277 | 278 | @staticmethod 279 | def make( 280 | storage: Union[Storage, List[float]], 281 | shape: UserShape, 282 | strides: Optional[UserStrides] = None, 283 | backend: Optional[TensorBackend] = None, 284 | ) -> Tensor: 285 | "Create a new tensor from data" 286 | return Tensor(TensorData(storage, shape, strides), backend=backend) 287 | 288 | def expand(self, other: Tensor) -> Tensor: 289 | """ 290 | Method used to allow for backprop over broadcasting. 291 | This method is called when the output of `backward` 292 | is a different size than the input of `forward`. 293 | 294 | 295 | Parameters: 296 | other : backward tensor (must broadcast with self) 297 | 298 | Returns: 299 | Expanded version of `other` with the right derivatives 300 | 301 | """ 302 | 303 | # Case 1: Both the same shape. 304 | if self.shape == other.shape: 305 | return other 306 | 307 | # Case 2: Backward is a smaller than self. Broadcast up. 308 | true_shape = TensorData.shape_broadcast(self.shape, other.shape) 309 | buf = self.zeros(true_shape) 310 | self.backend.id_map(other, buf) 311 | if self.shape == true_shape: 312 | return buf 313 | 314 | # Case 3: Still different, reduce extra dims. 315 | out = buf 316 | orig_shape = [1] * (len(out.shape) - len(self.shape)) + list(self.shape) 317 | for dim, shape in enumerate(out.shape): 318 | if orig_shape[dim] == 1 and shape != 1: 319 | out = self.backend.add_reduce(out, dim) 320 | assert out.size == self.size, f"{out.shape} {self.shape}" 321 | # START CODE CHANGE (2021) 322 | return Tensor.make(out._tensor._storage, self.shape, backend=self.backend) 323 | # END CODE CHANGE (2021) 324 | 325 | def zeros(self, shape: Optional[UserShape] = None) -> Tensor: 326 | def zero(shape: UserShape) -> Tensor: 327 | return Tensor.make( 328 | [0.0] * int(operators.prod(shape)), shape, backend=self.backend 329 | ) 330 | 331 | if shape is None: 332 | out = zero(self.shape) 333 | else: 334 | out = zero(shape) 335 | out._type_(self.backend) 336 | return out 337 | 338 | def tuple(self) -> Tuple[Storage, Shape, Strides]: 339 | return self._tensor.tuple() 340 | 341 | def detach(self) -> Tensor: 342 | return Tensor(self._tensor, backend=self.backend) 343 | 344 | # Variable elements for backprop 345 | 346 | def accumulate_derivative(self, x: Any) -> None: 347 | """ 348 | Add `val` to the the derivative accumulated on this variable. 349 | Should only be called during autodifferentiation on leaf variables. 350 | 351 | Args: 352 | x : value to be accumulated 353 | """ 354 | assert self.is_leaf(), "Only leaf variables can have derivatives." 
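        # Create a zero-filled gradient tensor on first use, then accumulate into it, so a
        # leaf that feeds several downstream nodes sums the contributions from all of them.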
355 | if self.grad is None: 356 | self.grad = Tensor.make( 357 | [0] * int(operators.prod(self.shape)), self.shape, backend=self.backend 358 | ) 359 | self.grad += x 360 | 361 | def is_leaf(self) -> bool: 362 | "True if this variable created by the user (no `last_fn`)" 363 | return self.history is not None and self.history.last_fn is None 364 | 365 | def is_constant(self) -> bool: 366 | return self.history is None 367 | 368 | @property 369 | def parents(self) -> Iterable[Variable]: 370 | assert self.history is not None 371 | return self.history.inputs 372 | 373 | def chain_rule(self, d_output: Any) -> Iterable[Tuple[Variable, Any]]: 374 | h = self.history 375 | assert h is not None 376 | assert h.last_fn is not None 377 | assert h.ctx is not None 378 | 379 | x = h.last_fn._backward(h.ctx, d_output) 380 | assert len(x) == len(h.inputs), f"Bug in function {h.last_fn}" 381 | return [ 382 | (inp, inp.expand(self._ensure_tensor(d_in))) 383 | for inp, d_in in zip(h.inputs, x) 384 | ] 385 | 386 | def backward(self, grad_output: Optional[Tensor] = None) -> None: 387 | if grad_output is None: 388 | assert self.shape == (1,), "Must provide grad_output if non-scalar" 389 | grad_output = Tensor.make([1.0], (1,), backend=self.backend) 390 | backpropagate(self, grad_output) 391 | 392 | def zero_grad_(self) -> None: # pragma: no cover 393 | """ 394 | Reset the derivative on this variable. 395 | """ 396 | self.grad = None 397 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/tensor_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | from typing import Iterable, Optional, Sequence, Tuple, Union 5 | 6 | import numba 7 | import numpy as np 8 | import numpy.typing as npt 9 | from numpy import array, float64 10 | from typing_extensions import TypeAlias 11 | 12 | from .operators import prod 13 | 14 | MAX_DIMS = 32 15 | 16 | 17 | class IndexingError(RuntimeError): 18 | "Exception raised for indexing errors." 19 | pass 20 | 21 | 22 | Storage: TypeAlias = npt.NDArray[np.float64] 23 | OutIndex: TypeAlias = npt.NDArray[np.int32] 24 | Index: TypeAlias = npt.NDArray[np.int32] 25 | Shape: TypeAlias = npt.NDArray[np.int32] 26 | Strides: TypeAlias = npt.NDArray[np.int32] 27 | 28 | UserIndex: TypeAlias = Sequence[int] 29 | UserShape: TypeAlias = Sequence[int] 30 | UserStrides: TypeAlias = Sequence[int] 31 | 32 | 33 | def index_to_position(index: Index, strides: Strides) -> int: 34 | """ 35 | Converts a multidimensional tensor `index` into a single-dimensional position in 36 | storage based on strides. 37 | 38 | Args: 39 | index : index tuple of ints 40 | strides : tensor strides 41 | 42 | Returns: 43 | Position in storage 44 | """ 45 | 46 | # ASSIGN2.1 47 | position = 0 48 | for ind, stride in zip(index, strides): 49 | position += ind * stride 50 | return position 51 | # END ASSIGN2.1 52 | 53 | 54 | def to_index(ordinal: int, shape: Shape, out_index: OutIndex) -> None: 55 | """ 56 | Convert an `ordinal` to an index in the `shape`. 57 | Should ensure that enumerating position 0 ... size of a 58 | tensor produces every index exactly once. It 59 | may not be the inverse of `index_to_position`. 60 | 61 | Args: 62 | ordinal: ordinal position to convert. 63 | shape : tensor shape. 64 | out_index : return index corresponding to position. 
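    For example, with shape (2, 3) the ordinal 5 unravels to out_index (1, 2),
    since 5 % 3 = 2 and 5 // 3 = 1; with contiguous strides (3, 1),
    index_to_position((1, 2), (3, 1)) = 1 * 3 + 2 * 1 = 5 recovers the ordinal.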
65 | 66 | """ 67 | # ASSIGN2.1 68 | cur_ord = ordinal + 0 69 | for i in range(len(shape) - 1, -1, -1): 70 | sh = shape[i] 71 | out_index[i] = int(cur_ord % sh) 72 | cur_ord = cur_ord // sh 73 | # END ASSIGN2.1 74 | 75 | 76 | def broadcast_index( 77 | big_index: Index, big_shape: Shape, shape: Shape, out_index: OutIndex 78 | ) -> None: 79 | """ 80 | Convert a `big_index` into `big_shape` to a smaller `out_index` 81 | into `shape` following broadcasting rules. In this case 82 | it may be larger or with more dimensions than the `shape` 83 | given. Additional dimensions may need to be mapped to 0 or 84 | removed. 85 | 86 | Args: 87 | big_index : multidimensional index of bigger tensor 88 | big_shape : tensor shape of bigger tensor 89 | shape : tensor shape of smaller tensor 90 | out_index : multidimensional index of smaller tensor 91 | 92 | Returns: 93 | None 94 | """ 95 | # ASSIGN2.2 96 | for i, s in enumerate(shape): 97 | if s > 1: 98 | out_index[i] = big_index[i + (len(big_shape) - len(shape))] 99 | else: 100 | out_index[i] = 0 101 | return None 102 | # END ASSIGN2.2 103 | 104 | 105 | def shape_broadcast(shape1: UserShape, shape2: UserShape) -> UserShape: 106 | """ 107 | Broadcast two shapes to create a new union shape. 108 | 109 | Args: 110 | shape1 : first shape 111 | shape2 : second shape 112 | 113 | Returns: 114 | broadcasted shape 115 | 116 | Raises: 117 | IndexingError : if cannot broadcast 118 | """ 119 | # ASSIGN2.2 120 | a, b = shape1, shape2 121 | m = max(len(a), len(b)) 122 | c_rev = [0] * m 123 | a_rev = list(reversed(a)) 124 | b_rev = list(reversed(b)) 125 | for i in range(m): 126 | if i >= len(a): 127 | c_rev[i] = b_rev[i] 128 | elif i >= len(b): 129 | c_rev[i] = a_rev[i] 130 | else: 131 | c_rev[i] = max(a_rev[i], b_rev[i]) 132 | if a_rev[i] != c_rev[i] and a_rev[i] != 1: 133 | raise IndexingError(f"Broadcast failure {a} {b}") 134 | if b_rev[i] != c_rev[i] and b_rev[i] != 1: 135 | raise IndexingError(f"Broadcast failure {a} {b}") 136 | return tuple(reversed(c_rev)) 137 | # END ASSIGN2.2 138 | 139 | 140 | def strides_from_shape(shape: UserShape) -> UserStrides: 141 | layout = [1] 142 | offset = 1 143 | for s in reversed(shape): 144 | layout.append(s * offset) 145 | offset = s * offset 146 | return tuple(reversed(layout[:-1])) 147 | 148 | 149 | class TensorData: 150 | _storage: Storage 151 | _strides: Strides 152 | _shape: Shape 153 | strides: UserStrides 154 | shape: UserShape 155 | dims: int 156 | 157 | def __init__( 158 | self, 159 | storage: Union[Sequence[float], Storage], 160 | shape: UserShape, 161 | strides: Optional[UserStrides] = None, 162 | ): 163 | if isinstance(storage, np.ndarray): 164 | self._storage = storage 165 | else: 166 | self._storage = array(storage, dtype=float64) 167 | 168 | if strides is None: 169 | strides = strides_from_shape(shape) 170 | 171 | assert isinstance(strides, tuple), "Strides must be tuple" 172 | assert isinstance(shape, tuple), "Shape must be tuple" 173 | if len(strides) != len(shape): 174 | raise IndexingError(f"Len of strides {strides} must match {shape}.") 175 | self._strides = array(strides) 176 | self._shape = array(shape) 177 | self.strides = strides 178 | self.dims = len(strides) 179 | self.size = int(prod(shape)) 180 | self.shape = shape 181 | assert len(self._storage) == self.size 182 | 183 | def to_cuda_(self) -> None: # pragma: no cover 184 | if not numba.cuda.is_cuda_array(self._storage): 185 | self._storage = numba.cuda.to_device(self._storage) 186 | 187 | def is_contiguous(self) -> bool: 188 | """ 189 | Check that the layout 
is contiguous, i.e. outer dimensions have bigger strides than inner dimensions. 190 | 191 | Returns: 192 | bool : True if contiguous 193 | """ 194 | last = 1e9 195 | for stride in self._strides: 196 | if stride > last: 197 | return False 198 | last = stride 199 | return True 200 | 201 | @staticmethod 202 | def shape_broadcast(shape_a: UserShape, shape_b: UserShape) -> UserShape: 203 | return shape_broadcast(shape_a, shape_b) 204 | 205 | def index(self, index: Union[int, UserIndex]) -> int: 206 | if isinstance(index, int): 207 | aindex: Index = array([index]) 208 | if isinstance(index, tuple): 209 | aindex = array(index) 210 | 211 | # Pretend 0-dim shape is 1-dim shape of singleton 212 | shape = self.shape 213 | if len(shape) == 0 and len(aindex) != 0: 214 | shape = (1,) 215 | 216 | # Check for errors 217 | if aindex.shape[0] != len(self.shape): 218 | raise IndexingError(f"Index {aindex} must be size of {self.shape}.") 219 | for i, ind in enumerate(aindex): 220 | if ind >= self.shape[i]: 221 | raise IndexingError(f"Index {aindex} out of range {self.shape}.") 222 | if ind < 0: 223 | raise IndexingError(f"Negative indexing for {aindex} not supported.") 224 | 225 | # Call fast indexing. 226 | return index_to_position(array(index), self._strides) 227 | 228 | def indices(self) -> Iterable[UserIndex]: 229 | lshape: Shape = array(self.shape) 230 | out_index: Index = array(self.shape) 231 | for i in range(self.size): 232 | to_index(i, lshape, out_index) 233 | yield tuple(out_index) 234 | 235 | def sample(self) -> UserIndex: 236 | return tuple((random.randint(0, s - 1) for s in self.shape)) 237 | 238 | def get(self, key: UserIndex) -> float: 239 | x: float = self._storage[self.index(key)] 240 | return x 241 | 242 | def set(self, key: UserIndex, val: float) -> None: 243 | self._storage[self.index(key)] = val 244 | 245 | def tuple(self) -> Tuple[Storage, Shape, Strides]: 246 | return (self._storage, self._shape, self._strides) 247 | 248 | def permute(self, *order: int) -> TensorData: 249 | """ 250 | Permute the dimensions of the tensor. 251 | 252 | Args: 253 | *order: a permutation of the dimensions 254 | 255 | Returns: 256 | New `TensorData` with the same storage and a new dimension order. 257 | """ 258 | assert list(sorted(order)) == list( 259 | range(len(self.shape)) 260 | ), f"Must give a position to each dimension. Shape: {self.shape} Order: {order}" 261 | 262 | # ASSIGN2.1 263 | return TensorData( 264 | self._storage, 265 | tuple([self.shape[o] for o in order]), 266 | tuple([self._strides[o] for o in order]), 267 | ) 268 | # END ASSIGN2.1 269 | 270 | def to_string(self) -> str: 271 | s = "" 272 | for index in self.indices(): 273 | l = "" 274 | for i in range(len(index) - 1, -1, -1): 275 | if index[i] == 0: 276 | l = "\n%s[" % ("\t" * i) + l 277 | else: 278 | break 279 | s += l 280 | v = self.get(index) 281 | s += f"{v:f}" 282 | l = "" 283 | for i in range(len(index) - 1, -1, -1): 284 | if index[i] == self.shape[i] - 1: 285 | l += "]" 286 | else: 287 | break 288 | if l: 289 | s += l 290 | else: 291 | s += " " 292 | return s 293 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/minitorch/tensor_ops.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING, Callable, Optional, Type 4 | 5 | import numpy as np 6 | from typing_extensions import Protocol 7 | 8 | from . 
import operators 9 | from .tensor_data import ( 10 | MAX_DIMS, 11 | broadcast_index, 12 | index_to_position, 13 | shape_broadcast, 14 | to_index, 15 | ) 16 | 17 | if TYPE_CHECKING: 18 | from .tensor import Tensor 19 | from .tensor_data import Index, Shape, Storage, Strides 20 | 21 | 22 | class MapProto(Protocol): 23 | def __call__(self, x: Tensor, out: Optional[Tensor] = ..., /) -> Tensor: 24 | ... 25 | 26 | 27 | class TensorOps: 28 | @staticmethod 29 | def map(fn: Callable[[float], float]) -> MapProto: 30 | pass 31 | 32 | @staticmethod 33 | def cmap(fn: Callable[[float], float]) -> Callable[[Tensor, Tensor], Tensor]: 34 | pass 35 | 36 | @staticmethod 37 | def zip(fn: Callable[[float, float], float]) -> Callable[[Tensor, Tensor], Tensor]: 38 | pass 39 | 40 | @staticmethod 41 | def reduce( 42 | fn: Callable[[float, float], float], start: float = 0.0 43 | ) -> Callable[[Tensor, int], Tensor]: 44 | pass 45 | 46 | @staticmethod 47 | def matrix_multiply(a: Tensor, b: Tensor) -> Tensor: 48 | raise NotImplementedError("Not implemented in this assignment") 49 | 50 | cuda = False 51 | 52 | 53 | class TensorBackend: 54 | def __init__(self, ops: Type[TensorOps]): 55 | """ 56 | Dynamically construct a tensor backend based on a `tensor_ops` object 57 | that implements map, zip, and reduce higher-order functions. 58 | 59 | Args: 60 | ops : tensor operations object see `tensor_ops.py` 61 | 62 | 63 | Returns : 64 | A collection of tensor functions 65 | 66 | """ 67 | 68 | # Maps 69 | self.neg_map = ops.map(operators.neg) 70 | self.sigmoid_map = ops.map(operators.sigmoid) 71 | self.relu_map = ops.map(operators.relu) 72 | self.log_map = ops.map(operators.log) 73 | self.exp_map = ops.map(operators.exp) 74 | self.id_map = ops.map(operators.id) 75 | self.id_cmap = ops.cmap(operators.id) 76 | self.inv_map = ops.map(operators.inv) 77 | 78 | # Zips 79 | self.add_zip = ops.zip(operators.add) 80 | self.mul_zip = ops.zip(operators.mul) 81 | self.lt_zip = ops.zip(operators.lt) 82 | self.eq_zip = ops.zip(operators.eq) 83 | self.is_close_zip = ops.zip(operators.is_close) 84 | self.relu_back_zip = ops.zip(operators.relu_back) 85 | self.log_back_zip = ops.zip(operators.log_back) 86 | self.inv_back_zip = ops.zip(operators.inv_back) 87 | 88 | # Reduce 89 | self.add_reduce = ops.reduce(operators.add, 0.0) 90 | self.mul_reduce = ops.reduce(operators.mul, 1.0) 91 | self.matrix_multiply = ops.matrix_multiply 92 | self.cuda = ops.cuda 93 | 94 | 95 | class SimpleOps(TensorOps): 96 | @staticmethod 97 | def map(fn: Callable[[float], float]) -> MapProto: 98 | """ 99 | Higher-order tensor map function :: 100 | 101 | fn_map = map(fn) 102 | fn_map(a, out) 103 | out 104 | 105 | Simple version:: 106 | 107 | for i: 108 | for j: 109 | out[i, j] = fn(a[i, j]) 110 | 111 | Broadcasted version (`a` might be smaller than `out`) :: 112 | 113 | for i: 114 | for j: 115 | out[i, j] = fn(a[i, 0]) 116 | 117 | Args: 118 | fn: function from float-to-float to apply. 
119 | a (:class:`TensorData`): tensor to map over 120 | out (:class:`TensorData`): optional, tensor data to fill in, 121 | should broadcast with `a` 122 | 123 | Returns: 124 | new tensor data 125 | """ 126 | 127 | f = tensor_map(fn) 128 | 129 | def ret(a: Tensor, out: Optional[Tensor] = None) -> Tensor: 130 | if out is None: 131 | out = a.zeros(a.shape) 132 | f(*out.tuple(), *a.tuple()) 133 | return out 134 | 135 | return ret 136 | 137 | @staticmethod 138 | def zip( 139 | fn: Callable[[float, float], float] 140 | ) -> Callable[["Tensor", "Tensor"], "Tensor"]: 141 | """ 142 | Higher-order tensor zip function :: 143 | 144 | fn_zip = zip(fn) 145 | out = fn_zip(a, b) 146 | 147 | Simple version :: 148 | 149 | for i: 150 | for j: 151 | out[i, j] = fn(a[i, j], b[i, j]) 152 | 153 | Broadcasted version (`a` and `b` might be smaller than `out`) :: 154 | 155 | for i: 156 | for j: 157 | out[i, j] = fn(a[i, 0], b[0, j]) 158 | 159 | 160 | Args: 161 | fn: function from two floats-to-float to apply 162 | a (:class:`TensorData`): tensor to zip over 163 | b (:class:`TensorData`): tensor to zip over 164 | 165 | Returns: 166 | :class:`TensorData` : new tensor data 167 | """ 168 | 169 | f = tensor_zip(fn) 170 | 171 | def ret(a: "Tensor", b: "Tensor") -> "Tensor": 172 | if a.shape != b.shape: 173 | c_shape = shape_broadcast(a.shape, b.shape) 174 | else: 175 | c_shape = a.shape 176 | out = a.zeros(c_shape) 177 | f(*out.tuple(), *a.tuple(), *b.tuple()) 178 | return out 179 | 180 | return ret 181 | 182 | @staticmethod 183 | def reduce( 184 | fn: Callable[[float, float], float], start: float = 0.0 185 | ) -> Callable[["Tensor", int], "Tensor"]: 186 | """ 187 | Higher-order tensor reduce function. :: 188 | 189 | fn_reduce = reduce(fn) 190 | out = fn_reduce(a, dim) 191 | 192 | Simple version :: 193 | 194 | for j: 195 | out[1, j] = start 196 | for i: 197 | out[1, j] = fn(out[1, j], a[i, j]) 198 | 199 | 200 | Args: 201 | fn: function from two floats-to-float to apply 202 | a (:class:`TensorData`): tensor to reduce over 203 | dim (int): int of dim to reduce 204 | 205 | Returns: 206 | :class:`TensorData` : new tensor 207 | """ 208 | f = tensor_reduce(fn) 209 | 210 | def ret(a: "Tensor", dim: int) -> "Tensor": 211 | out_shape = list(a.shape) 212 | out_shape[dim] = 1 213 | 214 | # Other values when not sum. 215 | out = a.zeros(tuple(out_shape)) 216 | out._tensor._storage[:] = start 217 | 218 | f(*out.tuple(), *a.tuple(), dim) 219 | return out 220 | 221 | return ret 222 | 223 | @staticmethod 224 | def matrix_multiply(a: "Tensor", b: "Tensor") -> "Tensor": 225 | raise NotImplementedError("Not implemented in this assignment") 226 | 227 | is_cuda = False 228 | 229 | 230 | # Implementations. 231 | 232 | 233 | def tensor_map( 234 | fn: Callable[[float], float] 235 | ) -> Callable[[Storage, Shape, Strides, Storage, Shape, Strides], None]: 236 | """ 237 | Low-level implementation of tensor map between 238 | tensors with *possibly different strides*. 239 | 240 | Simple version: 241 | 242 | * Fill in the `out` array by applying `fn` to each 243 | value of `in_storage` assuming `out_shape` and `in_shape` 244 | are the same size. 245 | 246 | Broadcasted version: 247 | 248 | * Fill in the `out` array by applying `fn` to each 249 | value of `in_storage` assuming `out_shape` and `in_shape` 250 | broadcast. (`in_shape` must be smaller than `out_shape`). 251 | 252 | Args: 253 | fn: function from float-to-float to apply 254 | 255 | Returns: 256 | Tensor map function. 
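    For instance, mapping fn over an input of shape (1, 3) into an `out` of shape
    (2, 3) applies fn to the same three input values for both output rows: the
    implementation below converts each out ordinal to an out_index, broadcasts it
    back to an in_index, and reads in_storage through in_strides.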
257 | """ 258 | 259 | def _map( 260 | out: Storage, 261 | out_shape: Shape, 262 | out_strides: Strides, 263 | in_storage: Storage, 264 | in_shape: Shape, 265 | in_strides: Strides, 266 | ) -> None: 267 | # ASSIGN2.3 268 | out_index: Index = np.zeros(MAX_DIMS, np.int16) 269 | in_index: Index = np.zeros(MAX_DIMS, np.int16) 270 | for i in range(len(out)): 271 | to_index(i, out_shape, out_index) 272 | broadcast_index(out_index, out_shape, in_shape, in_index) 273 | o = index_to_position(out_index, out_strides) 274 | j = index_to_position(in_index, in_strides) 275 | out[o] = fn(in_storage[j]) 276 | # END ASSIGN2.3 277 | 278 | return _map 279 | 280 | 281 | def tensor_zip( 282 | fn: Callable[[float, float], float] 283 | ) -> Callable[ 284 | [Storage, Shape, Strides, Storage, Shape, Strides, Storage, Shape, Strides], None 285 | ]: 286 | """ 287 | Low-level implementation of tensor zip between 288 | tensors with *possibly different strides*. 289 | 290 | Simple version: 291 | 292 | * Fill in the `out` array by applying `fn` to each 293 | value of `a_storage` and `b_storage` assuming `out_shape` 294 | and `a_shape` are the same size. 295 | 296 | Broadcasted version: 297 | 298 | * Fill in the `out` array by applying `fn` to each 299 | value of `a_storage` and `b_storage` assuming `a_shape` 300 | and `b_shape` broadcast to `out_shape`. 301 | 302 | Args: 303 | fn: function mapping two floats to float to apply 304 | 305 | Returns: 306 | Tensor zip function. 307 | """ 308 | 309 | def _zip( 310 | out: Storage, 311 | out_shape: Shape, 312 | out_strides: Strides, 313 | a_storage: Storage, 314 | a_shape: Shape, 315 | a_strides: Strides, 316 | b_storage: Storage, 317 | b_shape: Shape, 318 | b_strides: Strides, 319 | ) -> None: 320 | # ASSIGN2.3 321 | out_index: Index = np.zeros(MAX_DIMS, np.int32) 322 | a_index: Index = np.zeros(MAX_DIMS, np.int32) 323 | b_index: Index = np.zeros(MAX_DIMS, np.int32) 324 | for i in range(len(out)): 325 | to_index(i, out_shape, out_index) 326 | o = index_to_position(out_index, out_strides) 327 | broadcast_index(out_index, out_shape, a_shape, a_index) 328 | j = index_to_position(a_index, a_strides) 329 | broadcast_index(out_index, out_shape, b_shape, b_index) 330 | k = index_to_position(b_index, b_strides) 331 | out[o] = fn(a_storage[j], b_storage[k]) 332 | # END ASSIGN2.3 333 | 334 | return _zip 335 | 336 | 337 | def tensor_reduce( 338 | fn: Callable[[float, float], float] 339 | ) -> Callable[[Storage, Shape, Strides, Storage, Shape, Strides, int], None]: 340 | """ 341 | Low-level implementation of tensor reduce. 342 | 343 | * `out_shape` will be the same as `a_shape` 344 | except with `reduce_dim` turned to size `1` 345 | 346 | Args: 347 | fn: reduction function mapping two floats to float 348 | 349 | Returns: 350 | Tensor reduce function. 
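    For instance, reducing an `a_shape` of (2, 3) over reduce_dim 1 uses an
    out_shape of (2, 1); for each of the two output positions the inner loop in the
    implementation below visits the three positions along dim 1 of `a_storage` and
    folds them into out[o] with fn.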
351 | """ 352 | 353 | def _reduce( 354 | out: Storage, 355 | out_shape: Shape, 356 | out_strides: Strides, 357 | a_storage: Storage, 358 | a_shape: Shape, 359 | a_strides: Strides, 360 | reduce_dim: int, 361 | ) -> None: 362 | # ASSIGN2.3 363 | out_index: Index = np.zeros(MAX_DIMS, np.int32) 364 | reduce_size = a_shape[reduce_dim] 365 | for i in range(len(out)): 366 | to_index(i, out_shape, out_index) 367 | o = index_to_position(out_index, out_strides) 368 | for s in range(reduce_size): 369 | out_index[reduce_dim] = s 370 | j = index_to_position(out_index, a_strides) 371 | out[o] = fn(out[o], a_storage[j]) 372 | # END ASSIGN2.3 373 | 374 | return _reduce 375 | 1 376 | 377 | SimpleBackend = TensorBackend(SimpleOps) 378 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/requirements.extra.txt: -------------------------------------------------------------------------------- 1 | datasets==2.4.0 2 | embeddings==0.0.8 3 | networkx==2.4 4 | plotly==4.14.3 5 | pydot==1.4.1 6 | python-mnist 7 | streamlit==1.12.0 8 | streamlit-ace 9 | torch 10 | watchdog==1.0.2 11 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/requirements.txt: -------------------------------------------------------------------------------- 1 | colorama==0.4.3 2 | hypothesis == 6.54 3 | mypy == 0.971 4 | numba == 0.58.1 5 | numpy == 1.23.5 6 | pre-commit == 2.20.0 7 | pytest == 7.1.2 8 | pytest-env 9 | pytest-runner == 5.2 10 | typing_extensions -------------------------------------------------------------------------------- /tensor_demo/miniTorch/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name=minitorch 3 | version=0.4 4 | 5 | [files] 6 | packages = 7 | minitorch 8 | [darglint] 9 | ignore_regex=((^_(.*))|(.*map)|(.*zip)|(.*reduce)|(test.*)|(tensor_.*)) 10 | docstring_style=google 11 | strictness=long 12 | 13 | [flake8] 14 | ignore = N801, E203, E266, E501, W503, F812, E741, N803, N802, N806 15 | exclude = .git,__pycache__,docs/slides/*,old,build,dist 16 | 17 | [isort] 18 | profile=black 19 | src_paths=minitorch,test 20 | 21 | [mypy] 22 | strict = True 23 | ignore_missing_imports = True 24 | exclude=^(docs/)|(project/)|(assignments/) 25 | implicit_reexport = True 26 | 27 | [mypy-tests.*] 28 | disallow_untyped_decorators = False 29 | implicit_reexport = True 30 | 31 | [black] 32 | exclude=^(docs/)|(project/)|(assignments/) 33 | 34 | [tool:pytest] 35 | markers = 36 | task0_0 37 | task0_1 38 | task0_2 39 | task0_3 40 | task0_4 41 | task1_0 42 | task1_1 43 | task1_2 44 | task1_3 45 | task1_4 46 | task2_0 47 | task2_1 48 | task2_2 49 | task2_3 50 | task2_4 51 | task3_0 52 | task3_1 53 | task3_2 54 | task3_3 55 | task3_4 56 | task4_0 57 | task4_1 58 | task4_2 59 | task4_3 60 | task4_4 61 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(py_modules=[]) 4 | -------------------------------------------------------------------------------- /tensor_demo/miniTorch/style.sh: -------------------------------------------------------------------------------- 1 | flake8 --ignore "N801, E203, E266, E501, W503, F812, E741, N803, N802, N806" minitorch/ tests/ project/ 2 | -------------------------------------------------------------------------------- /tokenization/tokenization.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re \n", 10 | "from collections import defaultdict \n", 11 | "import string\n", 12 | "\n", 13 | "def get_init_vocab(data): \n", 14 | " \"\"\" \n", 15 | " Given a list of strings, returns a dictionary of words mapping to their frequency \n", 16 | " count in the data. \n", 17 | " Args: \n", 18 | " data: raw text with line breaks\n", 19 | " \n", 20 | " Returns: \n", 21 | " (vocab, tokens) tuple, \n", 22 | " vocab is a dictionary mapping space delimited characters to count (e.g. {'a b c ': 5})\n", 23 | " tokens is a set of basic characters. \n", 24 | " \"\"\"\n", 25 | " vocab = defaultdict(int)\n", 26 | " tokens = set()\n", 27 | " tokens.add('')\n", 28 | " for line in data: \n", 29 | " for word in line.split(): \n", 30 | " vocab[' '.join(list(word)) + ' '] += 1\n", 31 | " tokens.update(list(word))\n", 32 | " return vocab, tokens \n", 33 | " \n", 34 | "def count_cooccurance(vocab): \n", 35 | " \"\"\" \n", 36 | " Given a vocabulary (dictionary mapping words to frequency counts), returns a \n", 37 | " dictionary of tuples representing the frequency count of pairs of characters \n", 38 | " in the vocabulary. \n", 39 | " Args:\n", 40 | " vocab: a dictionary mapping space-delimited tokens to count (e.g. {'a b c ': 5})\n", 41 | " \n", 42 | " Returns: \n", 43 | " a dictionary mapping a tuple of tokens to count\n", 44 | " \"\"\"\n", 45 | " pairs = defaultdict(int) \n", 46 | " for word, freq in vocab.items(): \n", 47 | " chars = word.split() # split the word by any white space\n", 48 | " for i in range(len(chars)-1): \n", 49 | " pairs[chars[i], chars[i+1]] += freq \n", 50 | " return pairs\n", 51 | " \n", 52 | "def merge_vocab(token_pair, vocab_in): \n", 53 | " \"\"\" \n", 54 | " Given a pair of tokens and a vocabulary, returns a new vocabulary with the \n", 55 | " pair of tokens merged together wherever they appear. \n", 56 | " \n", 57 | " e.g. merge_vocab(('a', 'b'), {'a b c ': 5})\n", 58 | " returns {'ab c ': 5}\n", 59 | " \n", 60 | " Args: \n", 61 | " token_pair: a tuple of two tokens\n", 62 | " vocab_in: a dictionary mapping space-delimited tokens to count (e.g. {'a b c ': 5})\n", 63 | " \n", 64 | " Returns: \n", 65 | " a dictionary mapping space-delimited tokens to count (e.g. 
{'a b c ': 5})\n", 66 | " \"\"\"\n", 67 | " vocab_out = defaultdict(int) \n", 68 | " bigram = re.escape(' '.join(token_pair)) \n", 69 | " new_token = ''.join(token_pair)\n", 70 | " # search for every occurance of bigram (token pairs with a space), \n", 71 | " p = re.compile(r'(?'\n", 126 | " last_idx = 0\n", 127 | " idx = len(word)\n", 128 | " while idx > last_idx:\n", 129 | " whole_word = word[last_idx:idx]\n", 130 | " if whole_word in token_dict:\n", 131 | " encoded_ids.append(token_dict[whole_word])\n", 132 | " last_idx = idx\n", 133 | " idx = len(word)\n", 134 | " else:\n", 135 | " idx = idx - 1\n", 136 | " return encoded_ids\n", 137 | " \n", 138 | "\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 4, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "step 1: merging \"e\" and \"r\"\n", 151 | "step 2: merging \"s\" and \"\"\n", 152 | "step 3: merging \"e\" and \"\"\n", 153 | "step 4: merging \"e\" and \"n\"\n", 154 | "step 5: merging \"d\" and \"\"\n", 155 | "step 6: merging \"h\" and \"er\"\n", 156 | "step 7: merging \"en\" and \"t\"\n", 157 | "step 8: merging \"e\" and \"d\"\n", 158 | "step 9: merging \",\" and \"\"\n", 159 | "step 10: merging \"her\" and \"\"\n", 160 | "step 11: merging \"n\" and \"\"\n", 161 | "step 12: merging \"p\" and \"a\"\n", 162 | "step 13: merging \"pa\" and \"r\"\n", 163 | "step 14: merging \"par\" and \"ent\"\n", 164 | "step 15: merging \"en\" and \"\"\n", 165 | "step 16: merging \"h\" and \"e\"\n", 166 | "step 17: merging \"a\" and \"s\"\n", 167 | "step 18: merging \"s\" and \"e\"\n", 168 | "step 19: merging \"e\" and \"a\"\n", 169 | "step 20: merging \"i\" and \"t\"\n", 170 | "The bpe tokens are: \n", 171 | "W: 0\n", 172 | "c: 1\n", 173 | "[: 2\n", 174 | "b: 3\n", 175 | "w: 4\n", 176 | "a: 5\n", 177 | "s: 6\n", 178 | "d: 7\n", 179 | "m: 8\n", 180 | "T: 9\n", 181 | "f: 10\n", 182 | "y: 11\n", 183 | ": 12\n", 184 | "k: 13\n", 185 | "t: 14\n", 186 | "H: 15\n", 187 | "o: 16\n", 188 | "': 17\n", 189 | "O: 18\n", 190 | "p: 19\n", 191 | "D: 20\n", 192 | "e: 21\n", 193 | "B: 22\n", 194 | "n: 23\n", 195 | "i: 24\n", 196 | "]: 25\n", 197 | "h: 26\n", 198 | ",: 27\n", 199 | "u: 28\n", 200 | "6: 29\n", 201 | "l: 30\n", 202 | "g: 31\n", 203 | "v: 32\n", 204 | "r: 33\n", 205 | "er: 34\n", 206 | "s: 35\n", 207 | "e: 36\n", 208 | "en: 37\n", 209 | "d: 38\n", 210 | "her: 39\n", 211 | "ent: 40\n", 212 | "ed: 41\n", 213 | ",: 42\n", 214 | "her: 43\n", 215 | "n: 44\n", 216 | "pa: 45\n", 217 | "par: 46\n", 218 | "parent: 47\n", 219 | "en: 48\n", 220 | "he: 49\n", 221 | "as: 50\n", 222 | "se: 51\n", 223 | "ea: 52\n", 224 | "it: 53\n", 225 | "The ids of the tokenized sequence are: \n", 226 | "[22, 34, 8, 5, 23, 17, 35, 47, 35, 7, 24, 32, 16, 33, 1, 41, 4, 26, 48, 49, 4, 50, 51, 32, 48, 9, 39, 52, 10, 14, 34, 42, 49, 6, 19, 30, 53, 12, 14, 24, 8, 36, 3, 21, 14, 4, 21, 48, 52, 1, 26, 12, 47, 17, 35, 26, 16, 28, 51, 26, 16, 30, 38, 28, 23, 14, 24, 30, 12, 49, 40, 34, 41, 1, 16, 30, 30, 21, 31, 36, 2, 29, 25, 12, 15, 24, 35, 10, 5, 14, 43, 33, 21, 30, 16, 1, 5, 14, 41, 14, 16, 12, 20, 5, 30, 30, 50, 10, 16, 33, 12, 5, 12, 19, 16, 6, 53, 24, 16, 44, 50, 5, 12, 30, 16, 3, 3, 11, 24, 6, 14, 12, 16, 44, 3, 21, 26, 5, 30, 10, 12, 16, 10, 12, 10, 16, 16, 7, 51, 33, 32, 24, 1, 36, 3, 28, 6, 24, 23, 21, 6, 51, 6, 42, 4, 26, 24, 30, 36, 26, 24, 35, 8, 16, 14, 43, 8, 16, 32, 41, 3, 5, 1, 13, 12, 24, 44, 4, 53, 26, 12, 43, 47, 35, 24, 44, 0, 16, 16, 6, 14, 34, 42, 18, 26, 24, 16, 42, 5, 
23, 38, 3, 21, 1, 5, 8, 36, 5, 12, 14, 52, 1, 43, 14, 39, 36]\n", 227 | "\n", 228 | "The sequence corresponding to ids is: \n", 229 | "B er m a n ' s parent s d i v o r c ed w h en he w as se v en T her ea f t er , he s p l it t i m e b e t w e en ea c h parent ' s h o u se h o l d u n t i l he ent er ed c o l l e g e [ 6 ] H i s f a t her r e l o c a t ed t o D a l l as f o r a p o s it i o n as a l o b b y i s t o n b e h a l f o f f o o d se r v i c e b u s i n e s se s , w h i l e h i s m o t her m o v ed b a c k i n w it h her parent s i n W o o s t er , O h i o , a n d b e c a m e a t ea c her t her e\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "# Example usage: \n", 235 | "corpus = '''Berman's parents divorced when he was seven. \n", 236 | "Thereafter, he split time between each parent's household until he entered college.[6] \n", 237 | "His father relocated to Dallas for a position as a lobbyist on behalf of foodservice businesses, \n", 238 | "while his mother moved back in with her parents in Wooster, Ohio, and became a teacher there'''\n", 239 | "data = corpus.split('.') \n", 240 | " \n", 241 | "n = 20 # number of merge operations\n", 242 | "id_to_tokens, token_to_ids = byte_pair_encoding(data, n)\n", 243 | "\n", 244 | "token_ids = tokenize(data, token_to_ids)\n", 245 | "\n", 246 | "print(\"The bpe tokens are: \")\n", 247 | "for tk, tid in token_to_ids.items():\n", 248 | " print(\"{}: {}\".format(tk, tid))\n", 249 | "\n", 250 | "print(\"The ids of the tokenized sequence are: \")\n", 251 | "print(token_ids)\n", 252 | "print()\n", 253 | "print(\"The sequence corresponding to ids is: \")\n", 254 | "print(' '.join(id_to_tokens[tid] for tid in token_ids))" 255 | ] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "base", 261 | "language": "python", 262 | "name": "python3" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 3 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython3", 274 | "version": "3.9.18" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 2 279 | } 280 | --------------------------------------------------------------------------------
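The tokenization notebook above builds BPE over space-delimited strings and merges pairs with a regular expression. As a complement, here is a minimal, self-contained sketch of the same merge loop written over tuples of symbols. It is illustrative only: the name simple_bpe, the toy corpus, and the "</w>" end-of-word marker are choices made for this sketch, not taken from the notebook.

# Minimal BPE merge-loop sketch (illustrative; independent of the notebook's helpers).
from collections import defaultdict

def simple_bpe(words, num_merges):
    """Learn `num_merges` BPE merges from a list of words."""
    # Represent each word as a tuple of symbols, with an end-of-word marker.
    vocab = defaultdict(int)
    for w in words:
        vocab[tuple(w) + ("</w>",)] += 1
    merges = []
    for _ in range(num_merges):
        # Count adjacent symbol pairs, weighted by word frequency.
        pairs = defaultdict(int)
        for symbols, freq in vocab.items():
            for a, b in zip(symbols, symbols[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        merges.append(best)
        # Rewrite every word, replacing each occurrence of the best pair with one merged symbol.
        new_vocab = defaultdict(int)
        for symbols, freq in vocab.items():
            merged = []
            i = 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    merged.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            new_vocab[tuple(merged)] += freq
        vocab = new_vocab
    return merges, vocab

# Toy usage: two merges over a four-word corpus.
merges, vocab = simple_bpe(["low", "low", "lower", "lowest"], num_merges=2)
print(merges)  # [('l', 'o'), ('lo', 'w')]
print(vocab)   # e.g. contains ('low', 'e', 'r', '</w>'): 1

Working on symbol tuples avoids the regex escaping that a space-delimited vocabulary needs, at the cost of a slightly longer rewrite step per merge.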