├── README.md
├── docs
│   ├── Akula_gourani_final_project.pptx
│   └── gourani_akula_initial_report.pdf
└── src
    ├── softmax.cpp
    └── multihead_attn.c

/README.md:
--------------------------------------------------------------------------------
# multihead_attn_accelerator
Accelerating a multi-head attention transformer model using HLS for FPGAs.
--------------------------------------------------------------------------------
/docs/Akula_gourani_final_project.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RakeshUIUC/multihead_attn_accelerator/HEAD/docs/Akula_gourani_final_project.pptx
--------------------------------------------------------------------------------
/docs/gourani_akula_initial_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RakeshUIUC/multihead_attn_accelerator/HEAD/docs/gourani_akula_initial_report.pdf
--------------------------------------------------------------------------------
/src/softmax.cpp:
--------------------------------------------------------------------------------
#include <assert.h>
#include <math.h>
#include <stddef.h>

// Numerically stable in-place softmax using the log-sum-exp trick:
// input[i] = exp(input[i] - (m + log(sum_j exp(input[j] - m)))).
// The caller must guarantee that `input` points to at least `size` floats.
void softmax(float *input, size_t size) {
    assert(input != NULL || size == 0);

    // First pass: find the maximum so the exponentials cannot overflow.
    float m = -INFINITY;
    for (size_t i = 0; i < size; ++i) {
        if (input[i] > m) {
            m = input[i];
        }
    }

    // Second pass: accumulate the sum of shifted exponentials.
    float sum = 0.0f;
    for (size_t i = 0; i < size; ++i) {
        sum += expf(input[i] - m);
    }

    // Third pass: normalize by folding the log of the sum into the exponent.
    float constant = m + logf(sum);
    for (size_t i = 0; i < size; ++i) {
        input[i] = expf(input[i] - constant);
    }
}
--------------------------------------------------------------------------------
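The kernel above never divides by the sum directly: it folds the normalizer into the exponent as constant = m + log(sum), and subtracting the running maximum m keeps expf from overflowing on large logits. For FPGA synthesis the same three passes map onto three pipelined loops. The sketch below is a minimal illustration assuming Vitis HLS; the name softmax_hls, the MAX_LEN bound, and the pragma placement are assumptions made for this sketch, not part of the repository sources.

#include <math.h>

#define MAX_LEN 512 // hypothetical static bound for the on-chip buffer

void softmax_hls(float input[MAX_LEN], int size) {
    float m = -INFINITY;
max_loop:
    for (int i = 0; i < size; ++i) {
#pragma HLS PIPELINE II=1
        if (input[i] > m) m = input[i];
    }

    float sum = 0.0f;
sum_loop:
    for (int i = 0; i < size; ++i) {
#pragma HLS PIPELINE II=1
        sum += expf(input[i] - m);
    }

    float constant = m + logf(sum);
norm_loop:
    for (int i = 0; i < size; ++i) {
#pragma HLS PIPELINE II=1
        input[i] = expf(input[i] - constant);
    }
}

Note that max_loop and sum_loop each carry a dependence through a scalar accumulator, so the tool may not reach an initiation interval of 1 there without a reduction tree; norm_loop has no such dependence and pipelines cleanly.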
/src/multihead_attn.c:
--------------------------------------------------------------------------------
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define INPUT_SIZE 512 // model (embedding) dimension
#define NUM_HEADS 8    // number of attention heads

// Projection weights and biases for the query, key, value, and output linear
// layers. All weight matrices are INPUT_SIZE x INPUT_SIZE, stored row-major.
typedef struct {
    double *W_q, *W_k, *W_v, *W_o;
    double *b_q, *b_k, *b_v, *b_o;
} MultiheadAttention;

void initialize_attention(MultiheadAttention *attention) {
    attention->W_q = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));
    attention->W_k = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));
    attention->W_v = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));
    attention->W_o = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));

    attention->b_q = (double *)malloc(INPUT_SIZE * sizeof(double));
    attention->b_k = (double *)malloc(INPUT_SIZE * sizeof(double));
    attention->b_v = (double *)malloc(INPUT_SIZE * sizeof(double));
    attention->b_o = (double *)malloc(INPUT_SIZE * sizeof(double));

    // Initialize weights and biases uniformly in [-1, 1]
    // (replace this with your own initialization logic as needed).
    for (int i = 0; i < INPUT_SIZE * INPUT_SIZE; ++i) {
        attention->W_q[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->W_k[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->W_v[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->W_o[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
    }

    for (int i = 0; i < INPUT_SIZE; ++i) {
        attention->b_q[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->b_k[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->b_v[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->b_o[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
    }
}

void free_attention(MultiheadAttention *attention) {
    free(attention->W_q);
    free(attention->W_k);
    free(attention->W_v);
    free(attention->W_o);

    free(attention->b_q);
    free(attention->b_k);
    free(attention->b_v);
    free(attention->b_o);
}

// Matrix multiplication: C = A * B, where A is m x n and B is n x p (row-major)
void matrix_multiply(const double *A, const double *B, double *C, int m, int n, int p) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < p; ++j) {
            C[i * p + j] = 0.0;
            for (int k = 0; k < n; ++k) {
                C[i * p + j] += A[i * n + k] * B[k * p + j];
            }
        }
    }
}

// Vector addition: C = A + B
void vector_addition(const double *A, const double *B, double *C, int size) {
    for (int i = 0; i < size; ++i) {
        C[i] = A[i] + B[i];
    }
}

// Numerically stable in-place softmax (double-precision counterpart of
// src/softmax.cpp)
void softmax(double *x, int size) {
    double max_val = x[0];
    for (int i = 1; i < size; ++i) {
        if (x[i] > max_val) {
            max_val = x[i];
        }
    }

    double exp_sum = 0.0;
    for (int i = 0; i < size; ++i) {
        x[i] = exp(x[i] - max_val);
        exp_sum += x[i];
    }

    for (int i = 0; i < size; ++i) {
        x[i] /= exp_sum;
    }
}

void multihead_attention(const double *input, int batch_size, int sequence_length, double *output, MultiheadAttention *attention) {
    int head_size = INPUT_SIZE / NUM_HEADS;
    int i, j;

    // Linear transformations for query, key, and value
    double *Q = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    double *K = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    double *V = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));

    matrix_multiply(input, attention->W_q, Q, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);
    matrix_multiply(input, attention->W_k, K, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);
    matrix_multiply(input, attention->W_v, V, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);

    for (i = 0; i < batch_size * sequence_length; ++i) {
        vector_addition(Q + i * INPUT_SIZE, attention->b_q, Q + i * INPUT_SIZE, INPUT_SIZE);
        vector_addition(K + i * INPUT_SIZE, attention->b_k, K + i * INPUT_SIZE, INPUT_SIZE);
        vector_addition(V + i * INPUT_SIZE, attention->b_v, V + i * INPUT_SIZE, INPUT_SIZE);
    }

    // Split into multiple heads: head i takes columns [i * head_size,
    // (i + 1) * head_size) of each projected token vector.
    double **Q_heads = (double **)malloc(NUM_HEADS * sizeof(double *));
    double **K_heads = (double **)malloc(NUM_HEADS * sizeof(double *));
    double **V_heads = (double **)malloc(NUM_HEADS * sizeof(double *));

    for (i = 0; i < NUM_HEADS; ++i) {
        Q_heads[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));
        K_heads[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));
        V_heads[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));

        for (j = 0; j < batch_size * sequence_length; ++j) {
            for (int k = 0; k < head_size; ++k) {
                Q_heads[i][j * head_size + k] = Q[j * INPUT_SIZE + i * head_size + k];
                K_heads[i][j * head_size + k] = K[j * INPUT_SIZE + i * head_size + k];
                V_heads[i][j * head_size + k] = V[j * INPUT_SIZE + i * head_size + k];
            }
        }
    }
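    /*
     * Each head now applies scaled dot-product attention independently:
     *
     *     Attention(Q_h, K_h, V_h) = softmax(Q_h K_h^T / sqrt(d_k)) V_h
     *
     * with d_k = head_size. Scaling by sqrt(d_k) keeps the dot products from
     * growing with the head dimension, which would otherwise saturate the
     * softmax. A query at position t attends to all sequence_length key
     * positions of the same batch element.
     */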
    // Apply attention for each head
    double **attention_outputs = (double **)malloc(NUM_HEADS * sizeof(double *));
    double *attention_scores = (double *)malloc(sequence_length * sizeof(double));
    for (i = 0; i < NUM_HEADS; ++i) {
        attention_outputs[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));
        for (int b = 0; b < batch_size; ++b) {
            for (int t = 0; t < sequence_length; ++t) {
                const double *q = Q_heads[i] + (b * sequence_length + t) * head_size;

                // Score the query against every key position in this batch
                // element, applying the 1 / sqrt(d_k) scaling factor.
                for (int s = 0; s < sequence_length; ++s) {
                    const double *k_vec = K_heads[i] + (b * sequence_length + s) * head_size;
                    double dot = 0.0;
                    for (int d = 0; d < head_size; ++d) {
                        dot += q[d] * k_vec[d];
                    }
                    attention_scores[s] = dot / sqrt((double)head_size);
                }

                // Apply softmax activation to turn scores into weights
                softmax(attention_scores, sequence_length);

                // Weighted sum of the value vectors
                double *out = attention_outputs[i] + (b * sequence_length + t) * head_size;
                for (int d = 0; d < head_size; ++d) {
                    out[d] = 0.0;
                }
                for (int s = 0; s < sequence_length; ++s) {
                    const double *v_vec = V_heads[i] + (b * sequence_length + s) * head_size;
                    for (int d = 0; d < head_size; ++d) {
                        out[d] += attention_scores[s] * v_vec[d];
                    }
                }
            }
        }
    }
    free(attention_scores);

    // Concatenate the per-head outputs back into INPUT_SIZE-wide vectors
    double *concatenated_output = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    for (i = 0; i < NUM_HEADS; ++i) {
        for (j = 0; j < batch_size * sequence_length; ++j) {
            for (int k = 0; k < head_size; ++k) {
                concatenated_output[j * INPUT_SIZE + i * head_size + k] = attention_outputs[i][j * head_size + k];
            }
        }
        free(attention_outputs[i]);
    }
    free(attention_outputs);

    // Apply final linear transformation
    matrix_multiply(concatenated_output, attention->W_o, output, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);
    for (i = 0; i < batch_size * sequence_length; ++i) {
        vector_addition(output + i * INPUT_SIZE, attention->b_o, output + i * INPUT_SIZE, INPUT_SIZE);
    }

    free(Q);
    free(K);
    free(V);
    free(concatenated_output);

    for (i = 0; i < NUM_HEADS; ++i) {
        free(Q_heads[i]);
        free(K_heads[i]);
        free(V_heads[i]);
    }
    free(Q_heads);
    free(K_heads);
    free(V_heads);
}

int main(void) {
    srand(42); // Set a seed for reproducibility

    int batch_size = 16;
    int sequence_length = 10;
    double *input = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    for (int i = 0; i < batch_size * sequence_length * INPUT_SIZE; ++i) {
        input[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
    }

    MultiheadAttention attention;
    initialize_attention(&attention);

    double *output = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));

    multihead_attention(input, batch_size, sequence_length, output, &attention);

    // Print input and output shapes (for demonstration purposes)
    printf("Input shape: %d x %d\n", batch_size * sequence_length, INPUT_SIZE);
    printf("Output shape: %d x %d\n", batch_size * sequence_length, INPUT_SIZE);

    // Clean up
    free(input);
    free(output);
    free_attention(&attention);

    return 0;
}
--------------------------------------------------------------------------------
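Build note (an assumption, since the repository ships no Makefile): the reference model calls exp and sqrt from <math.h>, so link the math library, e.g. gcc -O2 -o multihead_attn src/multihead_attn.c -lm, then run ./multihead_attn to print the input and output shapes.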