├── README.md
├── docs
│   ├── Akula_gourani_final_project.pptx
│   └── gourani_akula_initial_report.pdf
└── src
    ├── softmax.cpp
    └── multihead_attn.c

/README.md:
--------------------------------------------------------------------------------
# multihead_attn_accelerator
Accelerating a multi-head attention transformer model using HLS for FPGAs.
--------------------------------------------------------------------------------
/docs/Akula_gourani_final_project.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RakeshUIUC/multihead_attn_accelerator/HEAD/docs/Akula_gourani_final_project.pptx
--------------------------------------------------------------------------------
/docs/gourani_akula_initial_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RakeshUIUC/multihead_attn_accelerator/HEAD/docs/gourani_akula_initial_report.pdf
--------------------------------------------------------------------------------
/src/softmax.cpp:
--------------------------------------------------------------------------------
#include <assert.h>
#include <math.h>
#include <stddef.h>

// Numerically stable in-place softmax using the log-sum-exp trick:
// input[i] = exp(input[i] - (m + log(sum_j exp(input[j] - m)))).
// The caller must guarantee that `input` points to at least `size` floats.
void softmax(float *input, size_t size) {
    assert(input != NULL || size == 0);

    // First pass: find the maximum so the exponentials cannot overflow.
    float m = -INFINITY;
    for (size_t i = 0; i < size; ++i) {
        if (input[i] > m) {
            m = input[i];
        }
    }

    // Second pass: accumulate the sum of shifted exponentials.
    float sum = 0.0f;
    for (size_t i = 0; i < size; ++i) {
        sum += expf(input[i] - m);
    }

    // Third pass: normalize by folding the log of the sum into the exponent.
    float constant = m + logf(sum);
    for (size_t i = 0; i < size; ++i) {
        input[i] = expf(input[i] - constant);
    }
}
--------------------------------------------------------------------------------
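The kernel above never divides by the sum directly: it folds the normalizer into the exponent as constant = m + log(sum), and subtracting the running maximum m keeps expf from overflowing on large logits. For FPGA synthesis the same three passes map onto three pipelined loops. The sketch below is a minimal illustration assuming Vitis HLS; the name softmax_hls, the MAX_LEN bound, and the pragma placement are assumptions made for this sketch, not part of the repository sources.

#include <math.h>

#define MAX_LEN 512 // hypothetical static bound for the on-chip buffer

void softmax_hls(float input[MAX_LEN], int size) {
    float m = -INFINITY;
max_loop:
    for (int i = 0; i < size; ++i) {
#pragma HLS PIPELINE II=1
        if (input[i] > m) m = input[i];
    }

    float sum = 0.0f;
sum_loop:
    for (int i = 0; i < size; ++i) {
#pragma HLS PIPELINE II=1
        sum += expf(input[i] - m);
    }

    float constant = m + logf(sum);
norm_loop:
    for (int i = 0; i < size; ++i) {
#pragma HLS PIPELINE II=1
        input[i] = expf(input[i] - constant);
    }
}

Note that max_loop and sum_loop each carry a dependence through a scalar accumulator, so the tool may not reach an initiation interval of 1 there without a reduction tree; norm_loop has no such dependence and pipelines cleanly.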
/src/multihead_attn.c:
--------------------------------------------------------------------------------
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define INPUT_SIZE 512 // model (embedding) dimension
#define NUM_HEADS 8    // number of attention heads

// Projection weights and biases for the query, key, value, and output linear
// layers. All weight matrices are INPUT_SIZE x INPUT_SIZE, stored row-major.
typedef struct {
    double *W_q, *W_k, *W_v, *W_o;
    double *b_q, *b_k, *b_v, *b_o;
} MultiheadAttention;

void initialize_attention(MultiheadAttention *attention) {
    attention->W_q = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));
    attention->W_k = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));
    attention->W_v = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));
    attention->W_o = (double *)malloc(INPUT_SIZE * INPUT_SIZE * sizeof(double));

    attention->b_q = (double *)malloc(INPUT_SIZE * sizeof(double));
    attention->b_k = (double *)malloc(INPUT_SIZE * sizeof(double));
    attention->b_v = (double *)malloc(INPUT_SIZE * sizeof(double));
    attention->b_o = (double *)malloc(INPUT_SIZE * sizeof(double));

    // Initialize weights and biases uniformly in [-1, 1]
    // (replace this with your own initialization logic as needed).
    for (int i = 0; i < INPUT_SIZE * INPUT_SIZE; ++i) {
        attention->W_q[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->W_k[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->W_v[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->W_o[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
    }

    for (int i = 0; i < INPUT_SIZE; ++i) {
        attention->b_q[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->b_k[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->b_v[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
        attention->b_o[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
    }
}

void free_attention(MultiheadAttention *attention) {
    free(attention->W_q);
    free(attention->W_k);
    free(attention->W_v);
    free(attention->W_o);

    free(attention->b_q);
    free(attention->b_k);
    free(attention->b_v);
    free(attention->b_o);
}

// Matrix multiplication: C = A * B, where A is m x n and B is n x p (row-major)
void matrix_multiply(const double *A, const double *B, double *C, int m, int n, int p) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < p; ++j) {
            C[i * p + j] = 0.0;
            for (int k = 0; k < n; ++k) {
                C[i * p + j] += A[i * n + k] * B[k * p + j];
            }
        }
    }
}

// Vector addition: C = A + B
void vector_addition(const double *A, const double *B, double *C, int size) {
    for (int i = 0; i < size; ++i) {
        C[i] = A[i] + B[i];
    }
}

// Numerically stable in-place softmax (double-precision counterpart of
// src/softmax.cpp)
void softmax(double *x, int size) {
    double max_val = x[0];
    for (int i = 1; i < size; ++i) {
        if (x[i] > max_val) {
            max_val = x[i];
        }
    }

    double exp_sum = 0.0;
    for (int i = 0; i < size; ++i) {
        x[i] = exp(x[i] - max_val);
        exp_sum += x[i];
    }

    for (int i = 0; i < size; ++i) {
        x[i] /= exp_sum;
    }
}

void multihead_attention(const double *input, int batch_size, int sequence_length, double *output, MultiheadAttention *attention) {
    int head_size = INPUT_SIZE / NUM_HEADS;
    int i, j;

    // Linear transformations for query, key, and value
    double *Q = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    double *K = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    double *V = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));

    matrix_multiply(input, attention->W_q, Q, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);
    matrix_multiply(input, attention->W_k, K, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);
    matrix_multiply(input, attention->W_v, V, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);

    for (i = 0; i < batch_size * sequence_length; ++i) {
        vector_addition(Q + i * INPUT_SIZE, attention->b_q, Q + i * INPUT_SIZE, INPUT_SIZE);
        vector_addition(K + i * INPUT_SIZE, attention->b_k, K + i * INPUT_SIZE, INPUT_SIZE);
        vector_addition(V + i * INPUT_SIZE, attention->b_v, V + i * INPUT_SIZE, INPUT_SIZE);
    }

    // Split into multiple heads: head i takes columns [i * head_size,
    // (i + 1) * head_size) of each projected token vector.
    double **Q_heads = (double **)malloc(NUM_HEADS * sizeof(double *));
    double **K_heads = (double **)malloc(NUM_HEADS * sizeof(double *));
    double **V_heads = (double **)malloc(NUM_HEADS * sizeof(double *));

    for (i = 0; i < NUM_HEADS; ++i) {
        Q_heads[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));
        K_heads[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));
        V_heads[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));

        for (j = 0; j < batch_size * sequence_length; ++j) {
            for (int k = 0; k < head_size; ++k) {
                Q_heads[i][j * head_size + k] = Q[j * INPUT_SIZE + i * head_size + k];
                K_heads[i][j * head_size + k] = K[j * INPUT_SIZE + i * head_size + k];
                V_heads[i][j * head_size + k] = V[j * INPUT_SIZE + i * head_size + k];
            }
        }
    }
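    /*
     * Each head now applies scaled dot-product attention independently:
     *
     *     Attention(Q_h, K_h, V_h) = softmax(Q_h K_h^T / sqrt(d_k)) V_h
     *
     * with d_k = head_size. Scaling by sqrt(d_k) keeps the dot products from
     * growing with the head dimension, which would otherwise saturate the
     * softmax. A query at position t attends to all sequence_length key
     * positions of the same batch element.
     */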
    // Apply attention for each head
    double **attention_outputs = (double **)malloc(NUM_HEADS * sizeof(double *));
    double *attention_scores = (double *)malloc(sequence_length * sizeof(double));
    for (i = 0; i < NUM_HEADS; ++i) {
        attention_outputs[i] = (double *)malloc(batch_size * sequence_length * head_size * sizeof(double));
        for (int b = 0; b < batch_size; ++b) {
            for (int t = 0; t < sequence_length; ++t) {
                const double *q = Q_heads[i] + (b * sequence_length + t) * head_size;

                // Score the query against every key position in this batch
                // element, applying the 1 / sqrt(d_k) scaling factor.
                for (int s = 0; s < sequence_length; ++s) {
                    const double *k_vec = K_heads[i] + (b * sequence_length + s) * head_size;
                    double dot = 0.0;
                    for (int d = 0; d < head_size; ++d) {
                        dot += q[d] * k_vec[d];
                    }
                    attention_scores[s] = dot / sqrt((double)head_size);
                }

                // Apply softmax activation to turn scores into weights
                softmax(attention_scores, sequence_length);

                // Weighted sum of the value vectors
                double *out = attention_outputs[i] + (b * sequence_length + t) * head_size;
                for (int d = 0; d < head_size; ++d) {
                    out[d] = 0.0;
                }
                for (int s = 0; s < sequence_length; ++s) {
                    const double *v_vec = V_heads[i] + (b * sequence_length + s) * head_size;
                    for (int d = 0; d < head_size; ++d) {
                        out[d] += attention_scores[s] * v_vec[d];
                    }
                }
            }
        }
    }
    free(attention_scores);

    // Concatenate the per-head outputs back into INPUT_SIZE-wide vectors
    double *concatenated_output = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    for (i = 0; i < NUM_HEADS; ++i) {
        for (j = 0; j < batch_size * sequence_length; ++j) {
            for (int k = 0; k < head_size; ++k) {
                concatenated_output[j * INPUT_SIZE + i * head_size + k] = attention_outputs[i][j * head_size + k];
            }
        }
        free(attention_outputs[i]);
    }
    free(attention_outputs);

    // Apply final linear transformation
    matrix_multiply(concatenated_output, attention->W_o, output, batch_size * sequence_length, INPUT_SIZE, INPUT_SIZE);
    for (i = 0; i < batch_size * sequence_length; ++i) {
        vector_addition(output + i * INPUT_SIZE, attention->b_o, output + i * INPUT_SIZE, INPUT_SIZE);
    }

    free(Q);
    free(K);
    free(V);
    free(concatenated_output);

    for (i = 0; i < NUM_HEADS; ++i) {
        free(Q_heads[i]);
        free(K_heads[i]);
        free(V_heads[i]);
    }
    free(Q_heads);
    free(K_heads);
    free(V_heads);
}

int main(void) {
    srand(42); // Set a seed for reproducibility

    int batch_size = 16;
    int sequence_length = 10;
    double *input = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));
    for (int i = 0; i < batch_size * sequence_length * INPUT_SIZE; ++i) {
        input[i] = ((double)rand() / RAND_MAX) * 2.0 - 1.0;
    }

    MultiheadAttention attention;
    initialize_attention(&attention);

    double *output = (double *)malloc(batch_size * sequence_length * INPUT_SIZE * sizeof(double));

    multihead_attention(input, batch_size, sequence_length, output, &attention);

    // Print input and output shapes (for demonstration purposes)
    printf("Input shape: %d x %d\n", batch_size * sequence_length, INPUT_SIZE);
    printf("Output shape: %d x %d\n", batch_size * sequence_length, INPUT_SIZE);

    // Clean up
    free(input);
    free(output);
    free_attention(&attention);

    return 0;
}
--------------------------------------------------------------------------------
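Build note (an assumption, since the repository ships no Makefile): the reference model calls exp and sqrt from <math.h>, so link the math library, e.g. gcc -O2 -o multihead_attn src/multihead_attn.c -lm, then run ./multihead_attn to print the input and output shapes.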