' REPOSITORY DIGEST HEADER (KEPT FOR PROVENANCE):
'   ├── LLAMA2_0.bas
'   └── README.md
'   /LLAMA2_0.bas:
/'

INFERENCE FOR LLAMA-2 TRANSFORMER MODEL IN PURE FREEBASIC

EXAMPLE COMPILE:
]

]
'/

#include "crt/stdio.bi"
#include "crt/stdlib.bi"
#include "crt/time.bi"
#include "crt/math.bi"
#include "crt/string.bi"

'-------------------------------------------------------------------------------------------------
'TRANSFORMER AND RUNSTATE STATE, KEPT AS SHARED (GLOBAL) VARIABLES, AND RELATED MEMORY MANAGER

'MODEL CONFIGURATION. THE MAIN PROGRAM FILLS THESE WITH SEVEN 4-BYTE FREAD CALLS, IN THIS ORDER.
'NOTE(REVIEW): INTEGER IS WIDER THAN 4 BYTES ON 64-BIT TARGETS, SO A NEGATIVE HEADER VALUE
'WOULD NOT SIGN-EXTEND - CONFIRM WHETHER THESE SHOULD BE DECLARED AS LONG.
DIM SHARED AS INTEGER _
    CONFIG_DIM_4B, _
    CONFIG_HIDDEN_DIM_4B, _
    CONFIG_N_LAYERS_4B, _
    CONFIG_N_HEADS_4B, _
    CONFIG_N_KV_HEADS_4B, _
    CONFIG_VOCAB_SIZE_4B, _
    CONFIG_SEQ_LEN_4B

'WEIGHT POINTERS. THE ELEMENT COUNTS QUOTED BELOW ARE THE POINTER ADVANCES USED BY
'SUB_CHECKPOINT_INIT_WEIGHTS WHEN IT WALKS THE CHECKPOINT DATA.

'TOKEN EMBEDDING TABLE: VOCAB_SIZE*DIM FLOATS
DIM SHARED AS SINGLE PTR TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B

'WEIGHTS FOR RMSNORMS: N_LAYERS*DIM FLOATS EACH
DIM SHARED AS SINGLE PTR TransformerWeights_RMS_ATT_WEIGHT_f4B, TransformerWeights_RMS_FFN_WEIGHT_f4B

'WEIGHTS FOR ATTENTION MATMULS: N_LAYERS*DIM*DIM FLOATS EACH
DIM SHARED AS SINGLE PTR TransformerWeights_WQ_f4B, TransformerWeights_WK_f4B
DIM SHARED AS SINGLE PTR TransformerWeights_WV_f4B, TransformerWeights_WO_f4B

'WEIGHTS FOR FFN: N_LAYERS*DIM*HIDDEN_DIM FLOATS EACH
DIM SHARED AS SINGLE PTR TransformerWeights_W1_f4B, TransformerWeights_W2_f4B, TransformerWeights_W3_f4B

'FINAL RMSNORM: DIM FLOATS
DIM SHARED AS SINGLE PTR TransformerWeights_RMS_FINAL_WEIGHT_f4B

'FREQ_CIS FOR ROPE RELATIVE
'POSITIONAL EMBEDDINGS: (SEQ_LEN*HEAD_SIZE)\2 FLOATS EACH, PER SUB_CHECKPOINT_INIT_WEIGHTS
DIM SHARED AS SINGLE PTR TransformerWeights_FREQ_CIS_REAL_f4B
DIM SHARED AS SINGLE PTR TransformerWeights_FREQ_CIS_IMAG_f4B
'(OPTIONAL) CLASSIFIER WEIGHTS FOR THE LOGITS, ON THE LAST LAYER.
'ALIASES THE TOKEN EMBEDDING TABLE WHEN THE WEIGHTS ARE SHARED.
DIM SHARED AS SINGLE PTR TransformerWeights_WCLS_f4B

'RUN STATE BUFFERS. ELEMENT COUNTS ARE THE ONES ALLOCATED BY SUB_MALLOC_RUN_STATE.
DIM SHARED AS SINGLE PTR RunState_X_f4B      'DIM: ACTIVATION AT THE CURRENT TIME STEP
DIM SHARED AS SINGLE PTR RunState_XB_f4B     'DIM: SCRATCH BUFFER
DIM SHARED AS SINGLE PTR RunState_XB2_f4B    'DIM: SECOND SCRATCH BUFFER
DIM SHARED AS SINGLE PTR RunState_HB_f4B     'HIDDEN_DIM: FFN SCRATCH BUFFER
DIM SHARED AS SINGLE PTR RunState_HB2_f4B    'HIDDEN_DIM: SECOND FFN SCRATCH BUFFER
DIM SHARED AS SINGLE PTR RunState_Q_f4B      'DIM: QUERY
DIM SHARED AS SINGLE PTR RunState_K_f4B      'DIM: KEY
DIM SHARED AS SINGLE PTR RunState_V_f4B      'DIM: VALUE
DIM SHARED AS SINGLE PTR RunState_ATT_f4B    'N_HEADS*SEQ_LEN: ATTENTION SCORES
DIM SHARED AS SINGLE PTR RunState_LOGITS_f4B 'VOCAB_SIZE: OUTPUT LOGITS
'KV CACHE
DIM SHARED AS SINGLE PTR RunState_KEY_CACHE_f4B   'N_LAYERS*SEQ_LEN*DIM
DIM SHARED AS SINGLE PTR RunState_VALUE_CACHE_f4B 'N_LAYERS*SEQ_LEN*DIM


'ALLOCATES EVERY RUN STATE BUFFER, ZERO-FILLED, SIZED FROM THE SHARED CONFIG_* VALUES.
'THE CONFIG_* VALUES MUST ALREADY BE LOADED. IF ANY ALLOCATION FAILS THE PROGRAM PRINTS
'A MESSAGE AND TERMINATES WITH EXIT CODE 1 INSTEAD OF CONTINUING WITH A NULL BUFFER.
SUB SUB_MALLOC_RUN_STATE()
    DIM AS INTEGER KV_COUNT_4B=CONFIG_N_LAYERS_4B*CONFIG_SEQ_LEN_4B*CONFIG_DIM_4B

    'CALLOCATE (NOT ALLOCATE) SO THAT EVERY BUFFER STARTS OUT ZEROED
    RunState_X_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_XB_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_XB2_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_HB_f4B=CALLOCATE(CONFIG_HIDDEN_DIM_4B,SIZEOF(SINGLE))
    RunState_HB2_f4B=CALLOCATE(CONFIG_HIDDEN_DIM_4B,SIZEOF(SINGLE))
    RunState_Q_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_K_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_V_f4B=CALLOCATE(CONFIG_DIM_4B,SIZEOF(SINGLE))
    RunState_ATT_f4B=CALLOCATE(CONFIG_N_HEADS_4B*CONFIG_SEQ_LEN_4B,SIZEOF(SINGLE))
    RunState_LOGITS_f4B=CALLOCATE(CONFIG_VOCAB_SIZE_4B,SIZEOF(SINGLE))
    RunState_KEY_CACHE_f4B=CALLOCATE(KV_COUNT_4B,SIZEOF(SINGLE))
    RunState_VALUE_CACHE_f4B=CALLOCATE(KV_COUNT_4B,SIZEOF(SINGLE))

    'COMPARE AGAINST 0 EXPLICITLY: "NOT" IS A BITWISE OPERATOR AND IS NOT A NULL TEST
    IF RunState_X_f4B=0 OR RunState_XB_f4B=0 OR RunState_XB2_f4B=0 _
    OR RunState_HB_f4B=0 OR RunState_HB2_f4B=0 OR RunState_Q_f4B=0 _
    OR RunState_K_f4B=0 OR RunState_V_f4B=0 OR RunState_ATT_f4B=0 _
    OR RunState_LOGITS_f4B=0 OR RunState_KEY_CACHE_f4B=0 OR RunState_VALUE_CACHE_f4B=0 THEN
        ?"MEMORY ALLOCATION FAILED!"
        END 1
    ENDIF
END SUB

'RELEASES EVERY BUFFER OBTAINED BY SUB_MALLOC_RUN_STATE AND CLEARS THE POINTERS,
'SO THAT A STALE POINTER IS NEVER LEFT BEHIND.
SUB SUB_FREE_RUN_STATE()
    DEALLOCATE(RunState_X_f4B):RunState_X_f4B=0
    DEALLOCATE(RunState_XB_f4B):RunState_XB_f4B=0
    DEALLOCATE(RunState_XB2_f4B):RunState_XB2_f4B=0
    DEALLOCATE(RunState_HB_f4B):RunState_HB_f4B=0
    DEALLOCATE(RunState_HB2_f4B):RunState_HB2_f4B=0
    DEALLOCATE(RunState_Q_f4B):RunState_Q_f4B=0
    DEALLOCATE(RunState_K_f4B):RunState_K_f4B=0
    DEALLOCATE(RunState_V_f4B):RunState_V_f4B=0
    DEALLOCATE(RunState_ATT_f4B):RunState_ATT_f4B=0
    DEALLOCATE(RunState_LOGITS_f4B):RunState_LOGITS_f4B=0
    DEALLOCATE(RunState_KEY_CACHE_f4B):RunState_KEY_CACHE_f4B=0
    DEALLOCATE(RunState_VALUE_CACHE_f4B):RunState_VALUE_CACHE_f4B=0
END SUB

'---------------------------------------------------
'INITIALIZATION: READ THE CHECKPOINT

'POINTS EVERY TransformerWeights_* POINTER AT ITS SECTION OF ONE CONTIGUOUS FLOAT BUFFER.
'  F_f4B             - FIRST FLOAT OF THE WEIGHT DATA
'  SHARED_WEIGHTS_4B - 1: THE CLASSIFIER REUSES THE TOKEN EMBEDDING TABLE
'                      OTHERWISE: THE CLASSIFIER WEIGHTS FOLLOW THE FREQ_CIS TABLES
'NO DATA IS COPIED. THE CONFIG_* VALUES MUST ALREADY BE LOADED.
SUB SUB_CHECKPOINT_INIT_WEIGHTS(F_f4B AS SINGLE PTR,SHARED_WEIGHTS_4B AS INTEGER)
    DIM AS SINGLE PTR PTR_f4B
    'ELEMENT COUNTS OF THE REPEATED SECTION SHAPES
    DIM AS INTEGER VEC_ALL_4B=CONFIG_N_LAYERS_4B*CONFIG_DIM_4B
    DIM AS INTEGER SQUARE_ALL_4B=CONFIG_N_LAYERS_4B*CONFIG_DIM_4B*CONFIG_DIM_4B
    DIM AS INTEGER FFN_ALL_4B=CONFIG_N_LAYERS_4B*CONFIG_DIM_4B*CONFIG_HIDDEN_DIM_4B
    'INTEGER DIVISION (\): THE / OPERATOR IS FLOATING POINT AND THE CONVERSION BACK ROUNDS
    DIM AS INTEGER HEAD_SIZE_4B=CONFIG_DIM_4B\CONFIG_N_HEADS_4B
    DIM AS INTEGER FREQ_LEN_4B=(CONFIG_SEQ_LEN_4B*HEAD_SIZE_4B)\2

    'DEBUG TRACE OUTPUT, KEPT FROM THE ORIGINAL
    ?" :116:PRE "
    PTR_f4B=F_f4B
    ?":116: F[0]=";F_f4B[0];" PTR_f4B=";PTR_f4B[0]

    TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B=PTR_f4B
    ?":117:TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B[0]=";TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B[0]
    PTR_f4B+=CONFIG_VOCAB_SIZE_4B*CONFIG_DIM_4B

    TransformerWeights_RMS_ATT_WEIGHT_f4B=PTR_f4B
    PTR_f4B+=VEC_ALL_4B
    TransformerWeights_WQ_f4B=PTR_f4B
    PTR_f4B+=SQUARE_ALL_4B
    TransformerWeights_WK_f4B=PTR_f4B
    PTR_f4B+=SQUARE_ALL_4B
    TransformerWeights_WV_f4B=PTR_f4B
    PTR_f4B+=SQUARE_ALL_4B
    TransformerWeights_WO_f4B=PTR_f4B
    PTR_f4B+=SQUARE_ALL_4B
    TransformerWeights_RMS_FFN_WEIGHT_f4B=PTR_f4B
    PTR_f4B+=VEC_ALL_4B
    TransformerWeights_W1_f4B=PTR_f4B
    PTR_f4B+=FFN_ALL_4B
    TransformerWeights_W2_f4B=PTR_f4B
    PTR_f4B+=FFN_ALL_4B
    TransformerWeights_W3_f4B=PTR_f4B
    PTR_f4B+=FFN_ALL_4B
    TransformerWeights_RMS_FINAL_WEIGHT_f4B=PTR_f4B
    PTR_f4B+=CONFIG_DIM_4B
    TransformerWeights_FREQ_CIS_REAL_f4B=PTR_f4B
    PTR_f4B+=FREQ_LEN_4B
    TransformerWeights_FREQ_CIS_IMAG_f4B=PTR_f4B
    PTR_f4B+=FREQ_LEN_4B

    IF SHARED_WEIGHTS_4B=1 THEN
        TransformerWeights_WCLS_f4B=TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B
    ELSE
        TransformerWeights_WCLS_f4B=PTR_f4B
    ENDIF
END SUB

'---------------------------
'NEURAL NET BLOCKS

'ELEMENTWISE A[I]+=B[I] FOR I=0..SIZE_4B-1
SUB SUB_ACCUM( A_f4B AS SINGLE PTR ,B_f4B AS SINGLE PTR,SIZE_4B AS INTEGER)
    DIM AS INTEGER K
    FOR K=0 TO SIZE_4B-1
        A_f4B[K]=A_f4B[K]+B_f4B[K]
    NEXT K
END SUB

'RMS NORMALIZATION: OUT[J]=WEIGHT[J]*X[J]/SQR(MEAN(X^2)+0.00001).
'OUT_O_f4B MAY BE THE SAME BUFFER AS X_f4B (SUB_TRANSFORMER CALLS IT THAT WAY), BECAUSE
'THE SUM OF SQUARES IS FINISHED BEFORE ANYTHING IS WRITTEN.
SUB SUB_RMSNORM(OUT_O_f4B AS SINGLE PTR,X_f4B AS SINGLE PTR,WEIGHT_f4B AS SINGLE PTR,SIZE_IN_4B AS INTEGER)
    DIM AS INTEGER J
    DIM AS SINGLE SS_f4B=0

    'SUM OF SQUARES
    FOR J=0 TO SIZE_IN_4B-1
        SS_f4B+=X_f4B[J]*X_f4B[J]
    NEXT J

    'MEAN, EPSILON FOR STABILITY, THEN RECIPROCAL SQUARE ROOT
    SS_f4B/=SIZE_IN_4B
    SS_f4B+=0.00001
    SS_f4B=1.0/SQR(SS_f4B)

    'NORMALIZE AND SCALE
    FOR J=0 TO SIZE_IN_4B-1
        OUT_O_f4B[J]=WEIGHT_f4B[J]*(SS_f4B*X_f4B[J])
    NEXT J
END SUB

'IN-PLACE SOFTMAX OVER X[0..IN_SIZE_4B-1]. DOES NOTHING WHEN IN_SIZE_4B<=0.
SUB SUB_SOFTMAX(X_f4B AS SINGLE PTR,IN_SIZE_4B AS INTEGER)
    'GUARD: NOTHING TO READ, AND AN UNSIGNED COUNTER WOULD WRAP ON IN_SIZE_4B-1
    IF IN_SIZE_4B<=0 THEN EXIT SUB

    DIM AS INTEGER I

    'FIND MAX VALUE (FOR NUMERICAL STABILITY)
    DIM AS SINGLE MAX_VAL_f4B=X_f4B[0]
    FOR I=1 TO IN_SIZE_4B-1
        IF X_f4B[I]>MAX_VAL_f4B THEN MAX_VAL_f4B=X_f4B[I]
    NEXT I

    'EXP AND SUM
    DIM AS SINGLE SUM_f4B=0.0
    FOR I=0 TO IN_SIZE_4B-1
        X_f4B[I]=EXP(X_f4B[I]-MAX_VAL_f4B)
        SUM_f4B+=X_f4B[I]
    NEXT I

    'NORMALIZE
    FOR I=0 TO IN_SIZE_4B-1
        X_f4B[I]/=SUM_f4B
    NEXT I
END SUB

'MATRIX-VECTOR PRODUCT: W (D,N) @ X (N,) -> XOUT (D,). W IS READ ROW BY ROW (ROW-MAJOR).
SUB SUB_MATMUL(XOUT_f4B AS SINGLE PTR,X_f4B AS SINGLE PTR,W_f4B AS SINGLE PTR,IN_N_4B AS INTEGER,IN_D_4B AS INTEGER)
    DIM AS INTEGER ROW_4B,COL_4B
    DIM AS SINGLE ACC_f4B
    FOR ROW_4B=0 TO IN_D_4B-1
        ACC_f4B=0
        FOR COL_4B=0 TO IN_N_4B-1
            ACC_f4B+=W_f4B[ROW_4B*IN_N_4B+COL_4B]*X_f4B[COL_4B]
        NEXT COL_4B
        XOUT_f4B[ROW_4B]=ACC_f4B
    NEXT ROW_4B
END SUB

'ONE FORWARD PASS FOR A SINGLE TOKEN AT A SINGLE POSITION.
'  TOKEN_4B - ROW INDEX INTO THE TOKEN EMBEDDING TABLE
'  POS_4B   - TIME STEP; SELECTS THE FREQ_CIS ROW AND THE KV CACHE ROW THAT IS WRITTEN.
'             ATTENTION READS CACHE ROWS 0..POS_4B.
'RESULT: RunState_LOGITS_f4B HOLDS THE CLASSIFIER OUTPUT. ALL WORK IS DONE IN THE
'RunState_* BUFFERS; THIS SUB ALLOCATES NOTHING.
SUB SUB_TRANSFORMER(TOKEN_4B AS INTEGER,POS_4B AS INTEGER)
    'A FEW CONVENIENCE VARIABLES
    DIM AS INTEGER DIM_4B=CONFIG_DIM_4B
    DIM AS INTEGER HIDDEN_DIM_4B=CONFIG_HIDDEN_DIM_4B
    'INTEGER DIVISION (\): THE / OPERATOR IS FLOATING POINT AND THE CONVERSION BACK ROUNDS
    DIM AS INTEGER HEAD_SIZE_4B=DIM_4B\CONFIG_N_HEADS_4B
    'SIGNED COUNTERS SO THAT AN UPPER BOUND OF -1 MEANS "NO ITERATIONS"
    DIM AS INTEGER L,H,I,T

    'COPY THE TOKEN EMBEDDING INTO THE RUN STATE BUFFER X. THE BUFFER FROM
    'SUB_MALLOC_RUN_STATE IS REUSED; ALLOCATING A NEW ONE HERE ON EVERY CALL LEAKED IT.
    DIM AS SINGLE PTR X_f4B=RunState_X_f4B
    DIM AS SINGLE PTR CONTENT_ROW_f4B=@TransformerWeights_TOKEN_EMBEDDING_TABLE_f4B[TOKEN_4B*CONFIG_DIM_4B]
    MEMCPY(X_f4B,CONTENT_ROW_f4B,DIM_4B*SIZEOF(SINGLE))

    'PLUCK OUT THE "POS" ROW OF FREQ_CIS_REAL AND FREQ_CIS_IMAG
    DIM AS INTEGER TEMP_POS_4B=(POS_4B*HEAD_SIZE_4B)\2
    DIM AS SINGLE PTR FREQ_CIS_REAL_ROW_f4B=TransformerWeights_FREQ_CIS_REAL_f4B+TEMP_POS_4B
    DIM AS SINGLE PTR FREQ_CIS_IMAG_ROW_f4B=TransformerWeights_FREQ_CIS_IMAG_f4B+TEMP_POS_4B

    'FORWARD ALL THE LAYERS
    FOR L=0 TO CONFIG_N_LAYERS_4B-1
        'ATTENTION RMSNORM
        SUB_RMSNORM(RunState_XB_f4B,X_f4B,TransformerWeights_RMS_ATT_WEIGHT_f4B+L*DIM_4B,DIM_4B)

        'QKV MATMULS FOR THIS POSITION
        SUB_MATMUL(RunState_Q_f4B,RunState_XB_f4B,TransformerWeights_WQ_f4B+L*DIM_4B*DIM_4B,DIM_4B,DIM_4B)
        SUB_MATMUL(RunState_K_f4B,RunState_XB_f4B,TransformerWeights_WK_f4B+L*DIM_4B*DIM_4B,DIM_4B,DIM_4B)
        SUB_MATMUL(RunState_V_f4B,RunState_XB_f4B,TransformerWeights_WV_f4B+L*DIM_4B*DIM_4B,DIM_4B,DIM_4B)

        'APPLY ROPE ROTATION TO THE Q AND K VECTORS FOR EACH HEAD
        FOR H=0 TO CONFIG_N_HEADS_4B-1
            'GET THE Q AND K VECTORS FOR THIS HEAD
            DIM AS SINGLE PTR Q_f4B=RunState_Q_f4B+H*HEAD_SIZE_4B
            DIM AS SINGLE PTR K_f4B=RunState_K_f4B+H*HEAD_SIZE_4B
            'ROTATE EACH (EVEN,ODD) PAIR BY THE FREQ_CIS_REAL AND FREQ_CIS_IMAG ENTRY
            FOR I=0 TO HEAD_SIZE_4B-1 STEP 2
                DIM AS SINGLE Q0_f4B=Q_f4B[I]
                DIM AS SINGLE Q1_f4B=Q_f4B[I+1]
                DIM AS SINGLE K0_f4B=K_f4B[I]
                DIM AS SINGLE K1_f4B=K_f4B[I+1]
                DIM AS SINGLE FCR_f4B=FREQ_CIS_REAL_ROW_f4B[I\2]
                DIM AS SINGLE FCI_f4B=FREQ_CIS_IMAG_ROW_f4B[I\2]
                Q_f4B[I]=Q0_f4B*FCR_f4B-Q1_f4B*FCI_f4B
                Q_f4B[I+1]=Q0_f4B*FCI_f4B+Q1_f4B*FCR_f4B
                K_f4B[I]=K0_f4B*FCR_f4B-K1_f4B*FCI_f4B
                K_f4B[I+1]=K0_f4B*FCI_f4B+K1_f4B*FCR_f4B
            NEXT I
        NEXT H

        'SAVE KEY,VALUE AT THIS TIME STEP (POS) TO OUR KV CACHE
        DIM AS INTEGER LOFF_4B=L*CONFIG_SEQ_LEN_4B*DIM_4B
        DIM AS SINGLE PTR KEY_CACHE_ROW_f4B=RunState_KEY_CACHE_f4B+LOFF_4B+POS_4B*DIM_4B
        DIM AS SINGLE PTR VALUE_CACHE_ROW_f4B=RunState_VALUE_CACHE_f4B+LOFF_4B+POS_4B*DIM_4B
        MEMCPY(KEY_CACHE_ROW_f4B,RunState_K_f4B,DIM_4B*SIZEOF(SINGLE))
        MEMCPY(VALUE_CACHE_ROW_f4B,RunState_V_f4B,DIM_4B*SIZEOF(SINGLE))

        'MULTIHEAD ATTENTION. ITERATE OVER ALL HEADS
        FOR H=0 TO CONFIG_N_HEADS_4B-1
            'GET THE QUERY VECTOR FOR THIS HEAD
            DIM AS SINGLE PTR Q_f4B=RunState_Q_f4B+H*HEAD_SIZE_4B
            'ATTENTION SCORES FOR THIS HEAD
            DIM AS SINGLE PTR ATT_f4B=RunState_ATT_f4B+H*CONFIG_SEQ_LEN_4B

            'ITERATE OVER ALL TIMESTEPS, INCLUDING THE CURRENT ONE
            FOR T=0 TO POS_4B
                'GET THE KEY VECTOR FOR THIS HEAD AND AT THIS TIMESTEP
                DIM AS SINGLE PTR K_f4B=RunState_KEY_CACHE_f4B+LOFF_4B+T*DIM_4B+H*HEAD_SIZE_4B
                'THE ATTENTION SCORE IS THE DOT PRODUCT OF Q AND K, SCALED BY SQR(HEAD_SIZE)
                DIM AS SINGLE SCORE_f4B=0
                FOR I=0 TO HEAD_SIZE_4B-1
                    SCORE_f4B+=Q_f4B[I]*K_f4B[I]
                NEXT I
                SCORE_f4B/=SQR(HEAD_SIZE_4B)
                'SAVE THE SCORE TO THE ATTENTION BUFFER
                ATT_f4B[T]=SCORE_f4B
            NEXT T

            'SOFTMAX THE SCORES TO GET ATTENTION WEIGHTS, FROM 0..POS INCLUSIVE
            SUB_SOFTMAX(ATT_f4B,POS_4B+1)

            'WEIGHTED SUM OF THE VALUES, STORED BACK INTO XB
            DIM AS SINGLE PTR XB_f4B=RunState_XB_f4B+H*HEAD_SIZE_4B
            MEMSET(XB_f4B,0,HEAD_SIZE_4B*SIZEOF(SINGLE))
            FOR T=0 TO POS_4B
                'GET THE VALUE VECTOR FOR THIS HEAD AND AT THIS TIME STEP
                DIM AS SINGLE PTR V_f4B=RunState_VALUE_CACHE_f4B+LOFF_4B+T*DIM_4B+H*HEAD_SIZE_4B
                'GET THE ATTENTION WEIGHT FOR THIS TIMESTEP
                DIM AS SINGLE A_f4B=ATT_f4B[T]
                'ACCUMULATE THE WEIGHTED VALUE INTO XB
                FOR I=0 TO HEAD_SIZE_4B-1
                    XB_f4B[I]+=A_f4B*V_f4B[I]
                NEXT I
            NEXT T
        NEXT H

        'FINAL MATMUL TO GET THE OUTPUT OF THE ATTENTION
        SUB_MATMUL(RunState_XB2_f4B,RunState_XB_f4B,TransformerWeights_WO_f4B+(L*DIM_4B*DIM_4B),DIM_4B,DIM_4B)

        'RESIDUAL CONNECTION BACK INTO X
        SUB_ACCUM(X_f4B,RunState_XB2_f4B,DIM_4B)

        'FFN RMSNORM
        SUB_RMSNORM(RunState_XB_f4B,X_f4B,TransformerWeights_RMS_FFN_WEIGHT_f4B+L*DIM_4B,DIM_4B)

        'NOW FOR FFN IN PYTORCH WE HAVE: SELF.W2(F.SILU(SELF.W1(X))*SELF.W3(X))
        'FIRST CALCULATE SELF.W1(X) AND SELF.W3(X)
        SUB_MATMUL(RunState_HB_f4B,RunState_XB_f4B,TransformerWeights_W1_f4B+L*DIM_4B*HIDDEN_DIM_4B,DIM_4B,HIDDEN_DIM_4B)
        SUB_MATMUL(RunState_HB2_f4B,RunState_XB_f4B,TransformerWeights_W3_f4B+L*DIM_4B*HIDDEN_DIM_4B,DIM_4B,HIDDEN_DIM_4B)

        FOR I=0 TO HIDDEN_DIM_4B-1
            'F.SILU: SILU(X)=X*S(X), WHERE S(X) IS THE LOGISTIC SIGMOID
            RunState_HB_f4B[I]=RunState_HB_f4B[I]*(1.0/(1.0+EXP(-RunState_HB_f4B[I])))
            'ELEMENTWISE MULTIPLY WITH W3(X)
            RunState_HB_f4B[I]=RunState_HB_f4B[I]*RunState_HB2_f4B[I]
        NEXT I

        'FINAL MATMUL TO GET THE OUTPUT OF THE FFN
        SUB_MATMUL(RunState_XB_f4B,RunState_HB_f4B,TransformerWeights_W2_f4B+L*DIM_4B*HIDDEN_DIM_4B,HIDDEN_DIM_4B,DIM_4B)

        'RESIDUAL CONNECTION
        SUB_ACCUM(X_f4B,RunState_XB_f4B,DIM_4B)
    NEXT L

    'FINAL RMSNORM (IN PLACE)
    SUB_RMSNORM(X_f4B,X_f4B,TransformerWeights_RMS_FINAL_WEIGHT_f4B,DIM_4B)

    'CLASSIFIER INTO LOGITS
    SUB_MATMUL(RunState_LOGITS_f4B,X_f4B,TransformerWeights_WCLS_f4B,CONFIG_DIM_4B,CONFIG_VOCAB_SIZE_4B)
END SUB 'END SUB_TRANSFORMER

'NOTE(REVIEW): FROM HERE ON THE AVAILABLE TEXT IS DAMAGED. EVERYTHING BETWEEN "IF R_f4B" AND
'"[ TEMPERATURE ] [ STEPS ]" (THE REST OF FUNC_SAMPLE AND THE START OF THE MAIN PROGRAM) IS
'MISSING. THE REMAINING LINES ARE CARRIED OVER UNCHANGED AND WILL NOT COMPILE UNTIL THE MISSING
'TEXT IS RESTORED FROM THE ORIGINAL FILE.
FUNCTION FUNC_SAMPLE(PROBABILITIES_f4B AS SINGLE PTR,N_IN_4B AS INTEGER)AS INTEGER
'SAMPLE INDEX FROM PROBALILITIES,THEY MUST SUM TO 1
DIM AS DOUBLE R_f4B=RND'/2147483648.0 'RAND_MAX=&H7FFFFFFF
DIM AS SINGLE CDF_f4B=0:DIM I AS UINTEGER:'?":337: R_f4B=";R_f4B;" N_IN_4B=";N_IN_4B
FOR I=0 TO N_IN_4B-1
CDF_f4B+=PROBABILITIES_f4B[I]
IF R_f4B [ TEMPERATURE ] [ STEPS ]"
?"PRESS ANY KEY TO EXIT.":END
ENDIF
IF COMMAND(1)>"" THEN
CHECKPOINT_STR=COMMAND(1)
ENDIF
IF COMMAND(2)>"" THEN
'OPTIONAL TEMPERATURE. 0.0=(DETERMINISTIC) ARGMAX SAMPLING.1.0=BASELINE
TEMPERATURE_f4B=ATOF(COMMAND(2))
ENDIF
IF COMMAND(3)>"" THEN
STEPS_4B=ATOI(COMMAND(3))
ENDIF

'SEED RNG WITH TIME.
IF YOU WANT DETERMINISTIC BEHAVIOR USE TEMPERATURE 0.0 396 | 'SRAND(TIME(NULL)) 397 | 398 | 'READ IN THE MODEL.BIN FILE 399 | ?":399: CHECKPOINT_STR=";CHECKPOINT_STR:?":399: TEMPERATURE_f4B=";TEMPERATURE_f4B:?":399: STEPS_4B=";STEPS_4B 400 | 401 | DIM FD_4B AS INTEGER 402 | DIM DATA_f4B AS SINGLE PTR 403 | DIM FILE_SIZE_4B AS LONG 404 | DIM AS FILE PTR FILE_PTR 405 | FILE_PTR=FOPEN(CHECKPOINT_STR,"rb") 406 | 407 | IF FILE_PTR=0 THEN COLOR 4:?"FILE '";CHECKPOINT_STR;"' OPENNING ERROR" ELSE COLOR 2:?":407: FILE '";CHECKPOINT_STR;"' OPENED " ENDIF 408 | COLOR 7 409 | 410 | 'READ THE CONFIG HEADER 411 | FREAD(@CONFIG_DIM_4B,4,1,FILE_PTR):FREAD(@CONFIG_HIDDEN_DIM_4B,4,1,FILE_PTR):FREAD(@CONFIG_N_LAYERS_4B,4,1,FILE_PTR):FREAD(@CONFIG_N_HEADS_4B,4,1,FILE_PTR):FREAD(@CONFIG_N_KV_HEADS_4B,4,1,FILE_PTR):FREAD(@CONFIG_VOCAB_SIZE_4B,4,1,FILE_PTR):FREAD(@CONFIG_SEQ_LEN_4B,4,1,FILE_PTR) 412 | ?":412: DIM_4B=";CONFIG_DIM_4B;" HIDDEN_DIM_4B=";CONFIG_HIDDEN_DIM_4B;" N_LAYERS_4B_4B=";CONFIG_N_LAYERS_4B;" N_HEADS_4B=";CONFIG_N_HEADS_4B," N_KV_HEADS_4B=";CONFIG_N_KV_HEADS_4B;" VOCAB_SIZE_4B=";CONFIG_VOCAB_SIZE_4B;" SEQ_LEN_4B=";CONFIG_SEQ_LEN_4B 'NEGATIVE VOCAB SIZE IS HACKY WAY OF SIGNALING UNSHARED WEIGHTS.BIT YIKES 413 | DIM AS INTEGER SHARED_WEIGHTS_4B:IF CONFIG_VOCAB_SIZE_4B>0 THEN SHARED_WEIGHTS_4B=1 ELSE SHARED_WEIGHTS_4B=0 ENDIF 414 | CONFIG_VOCAB_SIZE_4B=ABS(CONFIG_VOCAB_SIZE_4B) 415 | 'FIGURE OUT THE FILE SIZE 416 | FSEEK(FILE_PTR,0,SEEK_END)'MOVE FILE POINTER TO THE END OF FILE 417 | FILE_SIZE_4B=FTELL(FILE_PTR):?":417: FILE_SIZE_4B=";FILE_SIZE_4B'GET FILE SIZE IN BYTES 418 | FCLOSE(FILE_PTR) 419 | 'MEMORY MAP THE TRANSFORMER WEIGHTS INTO THE DATA POINTER 420 | FD_4B=FREEFILE:OPEN CHECKPOINT_STR FOR BINARY AS #FD_4B 421 | IF ERR>0 THEN COLOR 4:? 
"Error opening the '";CHECKPOINT_STR;"' file" ELSE COLOR 2:?":421: FILE '";CHECKPOINT_STR;"' OPENED":COLOR 7 422 | DATA_F4B=Allocate(FILE_SIZE_4B*4):GET #FD_4B, ,*DATA_F4B, FILE_SIZE_4B 423 | 424 | DIM AS SINGLE PTR WEIGHTS_PTR:WEIGHTS_PTR=Allocate(FILE_SIZE_4B*4):WEIGHTS_PTR=DATA_F4B+7 425 | SUB_CHECKPOINT_INIT_WEIGHTS(WEIGHTS_PTR,SHARED_WEIGHTS_4B)':115: 426 | 427 | 'RIGHT NOW WE CANNOT RUN FOR MORE THAN CONFIG.SEQ_LEN STEPS 428 | IF STEPS_4B<=0 OR STEPS_4B>CONFIG_SEQ_LEN_4B THEN STEPS_4B=CONFIG_SEQ_LEN_4B 429 | 430 | 'READ IN THE TOKENIZER.BIN FILE 431 | DIM AS ZSTRING PTR VOCAB_STR(0 TO CONFIG_VOCAB_SIZE_4B) 432 | DIM AS ZSTRING PTR TEMP_VOCAB_STR 433 | FILE_PTR=FOPEN("TOKENIZER.BIN","rb") 434 | IF FILE_PTR=0 THEN COLOR 4:?"UNABLE TO OPEN THE TOKENIZER FILE 'TOKENIZER.BIN'! RUN""PYTHON 'TOKENIZER.PY' TO CONVERT 'TOKENIZER.MODEL' -> 'TOKENIZER.BIN'" ELSE COLOR 2:?":434: FILE 'TOKENIZER.BIN' OPENED " ENDIF:COLOR 7 435 | 436 | 437 | 438 | ?":438: CONFIG_VOCAB_SIZE_4B=";CONFIG_VOCAB_SIZE_4B 439 | DIM AS INTEGER LEN_4B,I 440 | FOR I=0 TO CONFIG_VOCAB_SIZE_4B 441 | FREAD(@LEN_4B,4,1,FILE_PTR) 442 | TEMP_VOCAB_STR=ALLOCATE(LEN_4B) 443 | VOCAB_STR(I)=ALLOCATE(LEN_4B+4) 444 | FREAD(TEMP_VOCAB_STR,LEN_4B,1,FILE_PTR):*VOCAB_STR(I)=LEFT(*TEMP_VOCAB_STR,LEN_4B)+"\0" 445 | NEXT I:?":445:" 446 | FCLOSE(FILE_PTR) 447 | 448 | 449 | ' CREATE AND INIT THE APPLICATION RUN STATE 450 | 451 | SUB_MALLOC_RUN_STATE() 452 | 453 | 'THE CURRENT POSITION WE ARE IN 454 | DIM AS LONG START_4B=FUNC_TIME_IN_MS() 455 | DIM AS INTEGER NEXT_4B 456 | DIM AS INTEGER TOKEN_4B=1 457 | DIM AS INTEGER POS_4B=0 458 | ?"" 'EXPLICIT PRINT THE INITIAL BIS TOKEN (=1),STYLISTICALLY SYMMETRIC 459 | WHILE POS_4B