├── .gitignore ├── benchmark ├── transformer.py └── README.md ├── Makefile ├── LICENSE ├── weight_module.f90 ├── sentence_ex.txt ├── savemodel.py ├── README.md ├── transformer.f90 └── read_ggml.f90 /.gitignore: -------------------------------------------------------------------------------- 1 | /*/ 2 | tx 3 | *.mod 4 | *.bin 5 | *.o 6 | *.swp 7 | !benchmark 8 | *.o 9 | 10 | -------------------------------------------------------------------------------- /benchmark/transformer.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from time import time 3 | import torch 4 | 5 | #change as desired 6 | #torch.set_num_threads(1) 7 | 8 | if __name__ == "__main__": 9 | 10 | start = time() 11 | st_model = SentenceTransformer("msmarco-distilbert-base-dot-prod-v3") 12 | end = time() 13 | 14 | print("load time:", end-start) 15 | 16 | with open("../sentence_ex.txt") as f: 17 | prompts = f.readlines() 18 | 19 | p = [p.strip() for p in prompts] 20 | print(len(p)) 21 | #print(p) 22 | 23 | start = time() 24 | for p in prompts: 25 | #print(p) 26 | st_model.encode(p) 27 | 28 | end = time() 29 | print("inferenece time:", end-start) 30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | FORTRAN = gfortran-10 2 | GCC = gcc-10 3 | 4 | .DEFAULT_GOAL := all 5 | 6 | weight_module.o: weight_module.f90 7 | $(FORTRAN) -c -O3 -march=native -mtune=native -ffast-math -funroll-loops -flto -fPIC weight_module.f90 8 | 9 | transformer.o: transformer.f90 10 | $(FORTRAN) -c -O3 -march=native -mtune=native -ffast-math -funroll-loops -flto -fPIC transformer.f90 11 | read_ggml.o: read_ggml.f90 12 | $(FORTRAN) -c -O3 -march=native -mtune=native -ffast-math -funroll-loops -flto -fPIC read_ggml.f90 13 | 14 | tx: weight_module.o read_ggml.o transformer.o 15 | $(FORTRAN) -O3 -march=native -mtune=native -ffast-math -funroll-loops -flto -fPIC weight_module.o read_ggml.o transformer.o -o tx 16 | 17 | 18 | 19 | all: tx 20 | 21 | clean: 22 | rm *.o 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Andrew Marble 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /weight_module.f90: -------------------------------------------------------------------------------- 1 | module precision_module 2 | implicit none 3 | integer, parameter :: wp = kind(1.0) 4 | end module precision_module 5 | 6 | ! structs for reading weights, config information and state 7 | module weight_module 8 | use precision_module 9 | implicit none 10 | private wp 11 | 12 | type TransformerWeights 13 | real(kind=wp), allocatable :: word_embeddings(:,:) 14 | real(kind=wp), allocatable :: position_embeddings(:,:) 15 | real(kind=wp), allocatable :: emb_layer_norm_w(:) 16 | real(kind=wp), allocatable :: emb_layer_norm_b(:) 17 | real(kind=wp), allocatable :: wq(:,:,:) 18 | real(kind=wp), allocatable :: bq(:,:) 19 | real(kind=wp), allocatable :: wk(:,:,:) 20 | real(kind=wp), allocatable :: bk(:,:) 21 | real(kind=wp), allocatable :: wv(:,:,:) 22 | real(kind=wp), allocatable :: bv(:,:) 23 | real(kind=wp), allocatable :: wo(:,:,:) 24 | real(kind=wp), allocatable :: bo(:,:) 25 | real(kind=wp), allocatable :: sa_layer_norm_w(:,:) 26 | real(kind=wp), allocatable :: sa_layer_norm_b(:,:) 27 | real(kind=wp), allocatable :: w1(:,:,:) 28 | real(kind=wp), allocatable :: b1(:,:) 29 | real(kind=wp), allocatable :: w2(:,:,:) 30 | real(kind=wp), allocatable :: b2(:,:) 31 | real(kind=wp), allocatable :: out_layer_norm_w(:,:) 32 | real(kind=wp), allocatable :: out_layer_norm_b(:,:) 33 | real(kind=wp), allocatable :: linear(:,:) 34 | 35 | end type TransformerWeights 36 | 37 | type Config 38 | INTEGER :: emb_dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len 39 | end type Config 40 | 41 | type RunState 42 | 43 | real(kind=wp), allocatable :: att(:,:) 44 | real(kind=wp), allocatable :: key_cache(:,:,:) 45 | real(kind=wp), allocatable :: value_cache(:,:,:) 46 | real(kind=wp) :: times(5) 47 | 48 | end type RunState 49 | 50 | end module weight_module 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | ## Python "transformers" code for comparison 2 | 3 | Summary: I compared total calculation time for embeddings on the 21 strings in `../sentence_ex.txt` between HF transformers and transformers.f90 on an (old) MacBook and a slightly newer Intel/Linux laptop. This code is heavily dependent on `matmul`, so we expect the linear algebra backend to be important. On the Intel machine, inference runs 4-5 times faster in Python than in the current Fortran implementation with OpenBLAS. On the Mac, the Accelerate framework + Fortran is nearly 2x faster than Python. Loading the weights initially is generally much slower in Python than in Fortran. 4 | 5 | 6 | Comparison on Linux, Intel Core i7 7 | 8 | ```bash 9 | $ time python transformer.py 10 | load time: 3.760972738265991 11 | 21 12 | inferenece time: 0.37131738662719727 13 | 14 | real 0m6.031s 15 | user 0m5.267s 16 | sys 0m2.265s 17 | ``` 18 | 19 | Compiled with `-O3` etc.
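Before the timed runs, a note on threading: the external-BLAS build spawns many threads (note the user/sys times in the OpenBLAS run below), and the Python script contains a commented-out `torch.set_num_threads(1)`. For a single-threaded comparison the thread count can be pinned on both sides; a minimal sketch, assuming `tx` was linked against OpenBLAS (which honors the `OPENBLAS_NUM_THREADS` environment variable):

```bash
# pin OpenBLAS to a single thread for the Fortran binary
OPENBLAS_NUM_THREADS=1 ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time
# and uncomment torch.set_num_threads(1) in transformer.py for the Python side
```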
20 | 21 | ```bash 22 | $ gfortran-10 -O3 -march=native -ffast-math -funroll-loops transformer.f90 -o tx 23 | $ time ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time 24 | Load time in seconds: 9.60000008E-02 25 | Total inference time in seconds: 4.36800003 26 | 27 | real 0m4.476s 28 | user 0m4.376s 29 | sys 0m0.100s 30 | ``` 31 | 32 | With external blas library 33 | 34 | ```bash 35 | $ gfortran-10 -O3 -march=native -ffast-math -funroll-loops transformer.f90 -fexternal-blas -lopenblas -o tx 36 | $ time ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time 37 | Load time in seconds: 0.128000006 38 | Total inference time in seconds: 1.28000009 39 | 40 | real 0m1.416s 41 | user 0m9.457s 42 | sys 0m7.029s 43 | ``` 44 | 45 | MacOS (intel, 2017 MBP) 46 | 47 | ```bash 48 | % python transformer.py 49 | load time: 0.9851226806640625 50 | 21 51 | inferenece time: 2.890425205230713 52 | ``` 53 | 54 | ```bash 55 | % gfortran -O3 -march=native -ffast-math -funroll-loops transformer.f90 -o tx 56 | % time ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time 57 | Load time in seconds: 0.224000007 58 | Total inference time in seconds: 7.64800024 59 | ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time 7.27s user 0.25s system 94% cpu 7.976 total 60 | ``` 61 | 62 | ```bash 63 | % gfortran -O3 -march=native -ffast-math -funroll-loops transformer.f90 -fexternal-blas -framework Accelerate -o tx 64 | % time ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time 65 | Load time in seconds: 0.352000028 66 | Total inference time in seconds: 1.50400007 67 | ./tx -m msmarco-distilbert-base-dot-prod-v3.bin -q -f sentence_ex.txt --time 1.47s user 0.19s system 76% cpu 2.178 total 68 | ``` 69 | -------------------------------------------------------------------------------- /sentence_ex.txt: -------------------------------------------------------------------------------- 1 | Europe is a continent located entirely in the Northern Hemisphere and mostly in the Eastern Hemisphere. 2 | It comprises the westernmost part of Eurasia and is bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, the Mediterranean Sea to the south, and Asia to the east. 3 | Europe is commonly considered to be separated from Asia by the watershed of the Ural Mountains, the Ural River, the Caspian Sea, the Greater Caucasus, the Black Sea, and the waterways of the Turkish Straits. 4 | Although some of this border is over land, Europe is generally accorded the status of a full continent because of its great physical size and the weight of history and tradition. 5 | Europe covers about 10,180,000 square kilometres (3,930,000 sq mi), or 2% of the Earth's surface (6.8% of land area), making it the second smallest continent. 6 | Politically, Europe is divided into about fifty sovereign states, of which Russia is the largest and most populous, spanning 39% of the continent and comprising 15% of its population. 7 | Europe had a total population of about 741 million (about 11% of the world population) as of 2018. 8 | The European climate is largely affected by warm Atlantic currents that temper winters and summers on much of the continent, even at latitudes along which the climate in Asia and North America is severe. 9 | Further from the sea, seasonal differences are more noticeable than close to the coast. 
10 | European culture is the root of Western civilization, which traces its lineage back to ancient Greece and ancient Rome. 11 | The fall of the Western Roman Empire in 476 AD and the subsequent Migration Period marked the end of Europe's ancient history and the beginning of the Middle Ages. 12 | A saxophone is a type of musical instrument in the woodwind family. 13 | The saxophone uses a piece of wood, called a reed, to make sound. 14 | The player blows air into the mouthpiece, which vibrates the reed. 15 | The saxophone also uses keys to change pitch, and the player closes or opens holes to choose the note. 16 | Commonly, saxophones have about 22 keys. 17 | The saxophone is most commonly found in four voices: soprano, alto, tenor, and baritone saxophones. 18 | However, uncommon saxophones include the bass and contrabass saxophones (lower than a baritone saxophone), the C-melody saxophone (between the tenor and alto saxophones), and the sopranino saxophone (higher than a soprano saxophone). 19 | It was invented in 1840 by Adolphe Sax and is used in classical, jazz, and occasionally in rock, pop, and other styles. 20 | The saxophone was originally created for military bands, but was commonly used in jazz big bands in the 1940s and 1950s. 21 | Famous saxophone players include Marcel Mule (classical music), John Coltrane (jazz music), and Charlie Parker (jazz music). 22 | -------------------------------------------------------------------------------- /savemodel.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import struct 3 | import json 4 | import torch 5 | import numpy as np 6 | 7 | #from transformers import AutoModel, AutoTokenizer 8 | from sentence_transformers import SentenceTransformer 9 | import re 10 | 11 | if len(sys.argv) > 1: 12 | dir_model = sys.argv[1] 13 | else: 14 | dir_model = "msmarco-distilbert-base-dot-prod-v3" 15 | 16 | with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: 17 | encoder = json.load(f) 18 | 19 | with open(dir_model + "/config.json", "r", encoding="utf-8") as f: 20 | hparams = json.load(f) 21 | 22 | with open(dir_model + "/modules.json", "r", encoding="utf-8") as f: 23 | modules = json.load(f) 24 | 25 | st_model = SentenceTransformer(dir_model) 26 | 27 | list_vars = st_model[0].state_dict() # transformer 28 | 29 | def strip(x: str): 30 | x = "auto_model." 
+ x 31 | print(x) 32 | y = list_vars[x] 33 | assert y.view(-1)[0].dtype == torch.float32 34 | return y.numpy() 35 | 36 | if len(sys.argv) > 2: 37 | outfile = sys.argv[2] 38 | else: 39 | outfile = "msmarco-distilbert-base-dot-prod-v3_converted_full.bin" 40 | 41 | with open(outfile,mode='wb') as of: 42 | #write up front stuff 43 | header = struct.pack( 44 | 'iiiiiii', 45 | hparams['dim'], hparams['hidden_dim'], hparams['n_layers'], 46 | hparams['n_heads'], 0, len(encoder['model']['vocab']), 47 | hparams['max_position_embeddings'], 48 | ) 49 | of.write(header) 50 | 51 | w = strip('embeddings.word_embeddings.weight') 52 | of.write(memoryview(w)) 53 | 54 | w = strip('embeddings.position_embeddings.weight') 55 | of.write(memoryview(w)) 56 | 57 | w = strip('embeddings.LayerNorm.weight') 58 | of.write(memoryview(w)) 59 | 60 | w = strip('embeddings.LayerNorm.bias') 61 | of.write(memoryview(w)) 62 | 63 | layers = hparams['n_layers'] 64 | 65 | for l in range(layers): 66 | w = strip(f'transformer.layer.{l}.attention.q_lin.weight') 67 | of.write(memoryview(w)) 68 | 69 | for l in range(layers): 70 | w = strip(f'transformer.layer.{l}.attention.q_lin.bias') 71 | of.write(memoryview(w)) 72 | 73 | for l in range(layers): 74 | w = strip(f'transformer.layer.{l}.attention.k_lin.weight') 75 | of.write(memoryview(w)) 76 | 77 | for l in range(layers): 78 | w = strip(f'transformer.layer.{l}.attention.k_lin.bias') 79 | of.write(memoryview(w)) 80 | 81 | for l in range(layers): 82 | w = strip(f'transformer.layer.{l}.attention.v_lin.weight') 83 | of.write(memoryview(w)) 84 | 85 | for l in range(layers): 86 | w = strip(f'transformer.layer.{l}.attention.v_lin.bias') 87 | of.write(memoryview(w)) 88 | 89 | for l in range(layers): 90 | w = strip(f'transformer.layer.{l}.attention.out_lin.weight') 91 | of.write(memoryview(w)) 92 | 93 | for l in range(layers): 94 | w = strip(f'transformer.layer.{l}.attention.out_lin.bias') 95 | of.write(memoryview(w)) 96 | 97 | for l in range(layers): 98 | w = strip(f'transformer.layer.{l}.sa_layer_norm.weight') 99 | of.write(memoryview(w)) 100 | 101 | for l in range(layers): 102 | w = strip(f'transformer.layer.{l}.sa_layer_norm.bias') 103 | of.write(memoryview(w)) 104 | 105 | for l in range(layers): 106 | w = strip(f'transformer.layer.{l}.ffn.lin1.weight') 107 | of.write(memoryview(w)) 108 | 109 | for l in range(layers): 110 | w = strip(f'transformer.layer.{l}.ffn.lin1.bias') 111 | of.write(memoryview(w)) 112 | 113 | for l in range(layers): 114 | w = strip(f'transformer.layer.{l}.ffn.lin2.weight') 115 | of.write(memoryview(w)) 116 | 117 | for l in range(layers): 118 | w = strip(f'transformer.layer.{l}.ffn.lin2.bias') 119 | of.write(memoryview(w)) 120 | 121 | for l in range(layers): 122 | w = strip(f'transformer.layer.{l}.output_layer_norm.weight') 123 | of.write(memoryview(w)) 124 | 125 | for l in range(layers): 126 | w = strip(f'transformer.layer.{l}.output_layer_norm.bias') 127 | of.write(memoryview(w)) 128 | 129 | # just stick the linear weights at the end 130 | print("linear.weight") 131 | y = st_model[2].state_dict()['linear.weight'] 132 | assert y.view(-1)[0].dtype == torch.float32 133 | of.write(memoryview(y.numpy())) 134 | 135 | if len(sys.argv) > 3: 136 | vname = sys.argv[3] 137 | else: 138 | vname = "tokenizer.bin" 139 | 140 | vocab = encoder["model"]["vocab"] 141 | # write out vocab 142 | max_len = max([len(bytes(v,"utf-8")) for v in vocab]) 143 | print("Maximum word size: ", max_len) 144 | with open(vname, "wb") as f: 145 | f.write(struct.pack("i", max_len)) 146 | 147 | for v in 
vocab: 148 | vb = bytes(v,"utf-8") 149 | f.write(struct.pack("ii", 0, len(vb))) 150 | f.write(struct.pack(f"{len(vb)}s",vb)) 151 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Ferrite - Simple, lightweight transformers in Fortran 2 | 3 | Modern ML frameworks like HF transformers are easy to use but extremely abstract. There are times the abstraction makes sense, particularly for model training. For inference using transformers, the "real" inference code is less complex than the abstraction, and it can be faster and more transparent to just write the code. That way you can plainly see what your model is doing under the hood and adapt it to your use case, rather than picking through layer after layer of abstraction over what is effectively a for-loop with some matrix multiplications inside. 4 | 5 | To that end, as a complement to the [llama.f90](https://github.com/rbitr/llama.f90) Fortran LLM, this project demonstrates a [Sentence Transformer](https://www.sbert.net/index.html) in "pure" Fortran with no dependencies (you still need Python to convert PyTorch models if you want to use them). 6 | 7 | I plan to evolve this to make sure it can work with general transformer models, and add performance optimization as required. That said, I don't want to add any abstraction, so I only want to add generalizations that don't obscure what is going on. The code can easily be adapted for architectural variations. 8 | 9 | ## Setup and running 10 | 11 | ```bash 12 | # clone the repo 13 | git clone https://github.com/rbitr/ferrite 14 | cd ferrite 15 | # download a model 16 | wget https://huggingface.co/SDFASDGA/llm/resolve/main/msmarco-distilbert-base-dot-prod-v3-f32.gguf 17 | # compile 18 | make 19 | #run 20 | ./tx -m msmarco-distilbert-base-dot-prod-v3-f32.gguf -v -p "I alwas feel like somebody's watching me" # (sic) 21 | 22 | GGUF Header Info 23 | Magic number: 1179993927 24 | Version: 3 25 | Tensor Count: 101 26 | Key-Value Pairs: 15 27 | general.architecture 28 | distilbert 29 | general.name 30 | DistilBert 31 | distilbert.context_length 32 | 512 33 | distilbert.embedding_length 34 | 768 35 | distilbert.feed_forward_length 36 | 3072 37 | distilbert.block_count 38 | 6 39 | distilbert.attention.head_count 40 | 12 41 | distilbert.attention.head_count_kv 42 | 1 43 | general.file_type 44 | 0 45 | tokenizer.ggml.model 46 | gpt2 47 | tokenizer.ggml.tokens 48 | 30522 49 | tokenizer.ggml.token_type 50 | 30522 51 | tokenizer.ggml.unknown_token_id 52 | 100 53 | tokenizer.ggml.seperator_token_id 54 | 102 55 | tokenizer.ggml.padding_token_id 56 | 0 57 | Position 573471 58 | Deficit 30 59 | data offset 573473 60 | Embedding dimension: 768 61 | Hidden dimension: 3072 62 | Layers: 6 63 | Heads: 12 64 | kv Heads: 1 65 | Vocabulary Size: 30522 66 | Sequence Length: 512 67 | head size 64 68 | kv head Size 64 69 | loaded word embedding weights: 23440896 70 | loaded position embedding weights: 393216 71 | loaded embedding layernorm weights: 768 72 | loaded embedding layernorm bias: 768 73 | loaded wq weights: 3538944 74 | loaded wq bias: 4608 75 | loaded wk weights: 3538944 76 | loaded wk bias: 4608 77 | loaded wv weights: 3538944 78 | loaded wv bias: 4608 79 | loaded wo weights: 3538944 80 | loaded wo bias: 4608 81 | loaded sa layernorm weights: 4608 82 | loaded sa layernorm bias: 4608 83 | loaded w1 weights: 14155776 84 | loaded w1 bias: 18432 85 | loaded w2 (down) weights: 14155776 86 | loaded w2 (down) bias:
4608 87 | loaded output norm weights: 4608 88 | loaded output norm bias: 4608 89 | loaded classifier weights: 589824 90 | loading tokens 91 | found 30522 tokens 92 | maximum token length 18 93 | Token 4081 is andrew 94 | simple token: i 95 | wordpiece tokens: i 96 | simple token: alwas 97 | wordpiece tokens: al ##was 98 | simple token: feel 99 | wordpiece tokens: feel 100 | simple token: like 101 | wordpiece tokens: like 102 | simple token: somebody 103 | wordpiece tokens: somebody 104 | simple token: ' 105 | wordpiece tokens: ' 106 | simple token: s 107 | wordpiece tokens: s 108 | simple token: watching 109 | wordpiece tokens: watching 110 | simple token: me 111 | wordpiece tokens: me 112 | 102 1046 2633 17312 2515 2067 8308 1006 1056 3667 2034 103 113 | 0.117702775 0.268108070 -0.412374288 -0.684159577 -0.272519588 -0.633238137 ... 114 | ``` 115 | 116 | Right now I've only tested it with the `msmarco-distilbert-base-dot-prod-v3` model from sbert.net. This is a DistilBERT transformer with a pooling and linear layer used for generating embeddings for semantic search. See https://www.sbert.net/docs/pretrained-models/msmarco-v3.html for more information. 117 | 118 | Command line arguments are as follows: 119 | 120 | ```bash 121 | case ('-m', '--model') 122 | ! path to model file 123 | -- 124 | case ('-p', '--prompt') 125 | ! prompt string 126 | -- 127 | case ('-s', '--tokenizer') 128 | ! path to custom tokenizer 129 | -- 130 | case ('-t', '--temperature') 131 | ! temperature scaling (not used) 132 | -- 133 | case ('-n', '--num_tokens') 134 | ! number of tokens to generate, including prompt (not used) 135 | -- 136 | case ('-f', '--filename') 137 | ! text file with a prompt on each line 138 | -- 139 | case ('-v', '--verbose') 140 | ! print additional information 141 | -- 142 | case ('-1', '--single_line') 143 | ! print each element on single line 144 | -- 145 | case ('-q', '--quiet') 146 | ! don't print embedding 147 | -- 148 | case ('--time') 149 | ! display timings 150 | 151 | ``` 152 | 153 | ## Getting models 154 | 155 | Models are in GGUF format; see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md 156 | 157 | You can use the `convert-hf-to-gguf.py` file from https://github.com/rbitr/llama.cpp to convert HF model files, i.e. 158 | 159 | ```bash 160 | git clone https://github.com/rbitr/llama.cpp 161 | # get the model 162 | git clone https://huggingface.co/sentence-transformers/msmarco-distilbert-base-dot-prod-v3 163 | # convert 164 | python ./llama.cpp/convert-hf-to-gguf.py msmarco-distilbert-base-dot-prod-v3 --outtype f32 165 | ``` 166 | 167 | Note that only DistilBERT models are supported and this has not been extensively tested. Support is currently limited to the fork referenced above; it is not part of the original repo. 168 | 169 | 170 | ## Examples (currently using the old model file format, adjust accordingly) 171 | 172 | Included in the repo is a file `sentence_ex.txt` made up of some sentences from Wikipedia about Europe and the saxophone. We save temporary embeddings for each sentence with a bash one-liner: 173 | 174 | ```bash 175 | x=1; while read s; do echo $x $s; ./tx -m msmarco-distilbert-base-dot-prod-v3_converted_full.bin -1 -p "$s" > tmp/emb${x}.txt; x=$((x+1)); done < sentence_ex.txt 176 | 1 Europe is a continent located entirely in the Northern Hemisphere and mostly in the Eastern Hemisphere. 177 | 2 It comprises the westernmost part of Eurasia and is bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, the Mediterranean Sea to the south, and Asia to the east.
172 | 3 Europe is commonly considered to be separated from Asia by the watershed of the Ural Mountains, the Ural River, the Caspian Sea, the Greater Caucasus, the Black Sea, and the waterways of the Turkish Straits. 173 | 4 Although some of this border is over land, Europe is generally accorded the status of a full continent because of its great physical size and the weight of history and tradition. 174 | 5 Europe covers about 10,180,000 square kilometres (3,930,000 sq mi), or 2% of the Earth's surface (6.8% of land area), making it the second smallest continent. 175 | 6 Politically, Europe is divided into about fifty sovereign states, of which Russia is the largest and most populous, spanning 39% of the continent and comprising 15% of its population. 176 | 7 Europe had a total population of about 741 million (about 11% of the world population) as of 2018. 177 | 8 The European climate is largely affected by warm Atlantic currents that temper winters and summers on much of the continent, even at latitudes along which the climate in Asia and North America is severe. 178 | 9 Further from the sea, seasonal differences are more noticeable than close to the coast. 179 | 10 European culture is the root of Western civilization, which traces its lineage back to ancient Greece and ancient Rome. 180 | 11 The fall of the Western Roman Empire in 476 AD and the subsequent Migration Period marked the end of Europe's ancient history and the beginning of the Middle Ages. 181 | 12 A saxophone is a type of musical instrument in the woodwind family. 182 | 13 The saxophone uses a piece of wood, called a reed, to make sound. 183 | 14 The player blows air into the mouthpiece, which vibrates the reed. 184 | 15 The saxophone also uses keys to change pitch, and the player closes or opens holes to choose the note. 185 | 16 Commonly, saxophones have about 22 keys. 186 | 17 The saxophone is most commonly found in four voices: soprano, alto, tenor, and baritone saxophones. 187 | 18 However, uncommon saxophones include the bass and contrabass saxophones (lower than a baritone saxophone), the C-melody saxophone (between the tenor and alto saxophones), and the sopranino saxophone (higher than a soprano saxophone). 188 | 19 It was invented in 1840 by Adolphe Sax and is used in classical, jazz, and occasionally in rock, pop, and other styles. 189 | 20 The saxophone was originally created for military bands, but was commonly used in jazz big bands in the 1940s and 1950s. 190 | 21 Famous saxophone players include Marcel Mule (classical music), John Coltrane (jazz music), and Charlie Parker (jazz music). 191 | ``` 192 | 193 | Then we can lookup queries by making an embedding and finding the entry with the largest dot-product (computed here in awk) 194 | 195 | ```bash 196 | ./tx -m msmarco-distilbert-base-dot-prod-v3_converted_full.bin -1 -p "What bodies of water are in europe?" 
> tmp/embq.txt 197 | for x in {1..21}; do echo $x; paste tmp/emb${x}.txt tmp/embq.txt | awk '{dp+=$1*$2} END {print dp}'; done 198 | 1 199 | 35.505 200 | 2 201 | 33.9245 202 | 3 203 | 37.5551 204 | 4 205 | 29.1835 206 | 5 207 | 36.0957 208 | 6 209 | 31.6795 210 | 7 211 | 29.0034 212 | 8 213 | 31.7701 214 | 9 215 | 17.0193 216 | 10 217 | 26.859 218 | 11 219 | 20.4201 220 | 12 221 | 10.0551 222 | 13 223 | 9.95383 224 | 14 225 | 14.5428 226 | 15 227 | 8.84668 228 | 16 229 | 10.5251 230 | 17 231 | 10.2478 232 | 18 233 | 8.90325 234 | 19 235 | 12.1863 236 | 20 237 | 6.15891 238 | 21 239 | 7.62652 240 | ``` 241 | 242 | The question was about Europe so the scores are higher on the first 11 entries, and the maximum is #3 which talks about waterways. 243 | 244 | Below we ask who invented the saxophone and get the highest score at sentence 19 which contains the answer. (Note I misspelled saxophone in the query and it still works). 245 | 246 | ```bash 247 | ./tx -m msmarco-distilbert-base-dot-prod-v3_converted_full.bin -1 -p "Who invented the saxaphone?" > tmp/embq.txt 248 | for x in {1..21}; do echo $x; paste tmp/emb${x}.txt tmp/embq.txt | awk '{dp+=$1*$2} END {print dp}'; done 249 | 1 250 | 8.78761 251 | 2 252 | 11.0401 253 | 3 254 | 11.4972 255 | 4 256 | 5.93544 257 | 5 258 | 3.17357 259 | 6 260 | 6.38081 261 | 7 262 | 9.9643 263 | 8 264 | 16.3048 265 | 9 266 | 12.8389 267 | 10 268 | 22.387 269 | 11 270 | 22.8647 271 | 12 272 | 31.1579 273 | 13 274 | 32.949 275 | 14 276 | 24.6756 277 | 15 278 | 28.0059 279 | 16 280 | 24.3043 281 | 17 282 | 25.446 283 | 18 284 | 29.0274 285 | 19 286 | 42.3414 287 | 20 288 | 30.9246 289 | 21 290 | 33.6924 291 | ``` 292 | 293 | 294 | -------------------------------------------------------------------------------- /transformer.f90: -------------------------------------------------------------------------------- 1 | module arg_parse 2 | implicit none 3 | 4 | type args 5 | real :: temperature 6 | character(:), allocatable :: model_file 7 | character(:), allocatable :: prompt 8 | character(:), allocatable :: tokenizer 9 | character(:), allocatable :: filename 10 | logical :: verbose 11 | integer :: n 12 | logical :: single_line, quiet, time 13 | end type args 14 | 15 | contains 16 | 17 | subroutine parse_args(arg_values) 18 | type(args) :: arg_values 19 | integer :: i, num_args 20 | character(256) :: arg 21 | 22 | 23 | 24 | !defaults 25 | arg_values%temperature = 0 26 | arg_values%model_file = "" 27 | arg_values%prompt = "" 28 | arg_values%verbose = .false. 29 | arg_values%n = 256 30 | arg_values%tokenizer = "tokenizer.bin" 31 | arg_values%single_line = .false. 32 | arg_values%quiet = .false. 33 | arg_values%filename = "" 34 | arg_values%time = .false. 35 | 36 | num_args = command_argument_count() 37 | 38 | i = 1 39 | do while (i <= num_args) 40 | call get_command_argument(i, arg) 41 | select case (arg) 42 | case ('-m', '--model') 43 | ! path to model file 44 | call get_command_argument(i+1, arg) 45 | arg_values%model_file = trim(arg) 46 | i = i + 2 47 | case ('-p', '--prompt') 48 | ! prompt string 49 | call get_command_argument(i+1, arg) 50 | arg_values%prompt = trim(arg) 51 | i = i + 2 52 | case ('-s', '--tokenizer') 53 | ! path to custom tokenizer 54 | call get_command_argument(i+1, arg) 55 | arg_values%tokenizer = trim(arg) 56 | i = i + 2 57 | case ('-t', '--temperature') 58 | ! temperature scaling 59 | call get_command_argument(i+1, arg) 60 | read(arg,*) arg_values%temperature 61 | i = i + 2 62 | case ('-n', '--num_tokens') 63 | ! 
number of tokens to generate, including prompt 64 | call get_command_argument(i+1, arg) 65 | read(arg,*) arg_values%n 66 | i = i + 2 67 | case ('-f', '--filename') 68 | ! text file with a prompt on each line 69 | call get_command_argument(i+1, arg) 70 | arg_values%filename = trim(arg) 71 | 72 | i = i + 2 73 | case ('-v', '--verbose') 74 | ! print additional information 75 | arg_values%verbose = .true. 76 | i = i + 1 77 | case ('-1', '--single_line') 78 | ! print each element on single line 79 | arg_values%single_line = .true. 80 | i = i + 1 81 | case ('-q', '--quiet') 82 | ! don't print embedding 83 | arg_values%quiet = .true. 84 | i = i + 1 85 | case ('--time') 86 | ! display timings 87 | arg_values%time = .true. 88 | i = i + 1 89 | case default 90 | print *, 'Unrecognized option:', trim(arg) 91 | stop 92 | end select 93 | end do 94 | 95 | ! check for arguments 96 | 97 | 98 | end subroutine 99 | 100 | end module arg_parse 101 | 102 | program transformer 103 | 104 | use iso_c_binding 105 | use precision_module 106 | use weight_module 107 | use arg_parse 108 | use read_ggml, only: load_ggml 109 | implicit none 110 | 111 | type(TransformerWeights) :: weights 112 | type(Config) :: cfg 113 | 114 | integer(4) :: emb_dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, seq_len 115 | integer(4) :: itmp, msize 116 | 117 | type (args) :: arg_values 118 | character(:), allocatable :: prompt 119 | logical :: verbose, time 120 | 121 | real(kind=wp) :: score 122 | integer :: tok_len, max_len, n, p, l 123 | !integer :: vocab_size = 32000 124 | character(:), allocatable :: tmpstr 125 | character(:), dimension(:), allocatable :: vocab 126 | real(kind=wp),allocatable :: y(:), scores(:) 127 | integer, allocatable :: prompt_tokens(:) 128 | integer, allocatable :: vocab_len(:) 129 | integer, parameter :: max_prompt_len = 1024 130 | character(:), dimension(:), allocatable :: prompts 131 | character(len=max_prompt_len) :: temp_prompt 132 | integer :: tfid, ierr, num_lines 133 | real(kind=wp) :: t_start, t_end 134 | 135 | character(:), dimension(:), allocatable :: simple_tokens 136 | !integer, allocatable :: prompt_tokens 137 | 138 | 139 | call parse_args(arg_values) 140 | 141 | if (arg_values%prompt == "" .and. arg_values%filename == "") then 142 | !print *, arg_values%filename 143 | print *, "prompt required" 144 | stop 145 | end if 146 | 147 | verbose = arg_values%verbose 148 | time = arg_values%time 149 | 150 | msize = 0 151 | 152 | t_start = time_ms() 153 | 154 | call load_ggml(arg_values%model_file, weights, cfg, vocab, vocab_len, verbose) 155 | emb_dim = cfg%emb_dim 156 | hidden_dim = cfg%hidden_dim 157 | n_layers = cfg%n_layers 158 | n_heads = cfg%n_heads 159 | vocab_size = cfg%vocab_size 160 | seq_len = cfg%seq_len 161 | max_len = maxval(vocab_len) 162 | 163 | 164 | if (verbose) then 165 | !write(*,"(A,I0,A)") "Read ", vocab_size, " tokens" 166 | write(*,"(A,A)") "Token 4081 is ", vocab(4081) 167 | end if 168 | 169 | 170 | ! if there is a prompt, read the prompt and make a length 1 list 171 | ! 
if there is a file, read the lines into a list 172 | 173 | if (arg_values%prompt /= "") then 174 | allocate(character(len=max_prompt_len) :: prompts(1)) 175 | prompts(1) = arg_values%prompt 176 | 177 | else if (arg_values%filename /= "") then 178 | 179 | tfid = 5 180 | open(unit=tfid,file=arg_values%filename) 181 | ierr = 0 182 | num_lines = -1 183 | do while (ierr == 0) 184 | num_lines = num_lines + 1 185 | read(tfid,*,iostat=ierr) temp_prompt 186 | end do 187 | 188 | if (verbose) then 189 | write(*,'(A,I0,A)') "Read ", num_lines, " lines" 190 | end if 191 | 192 | allocate(character(len=max_prompt_len) :: prompts(num_lines)) 193 | 194 | rewind(tfid) 195 | do p = 1,num_lines 196 | read(tfid, '(A)') prompts(p) 197 | end do 198 | 199 | end if 200 | 201 | t_end = time_ms() 202 | 203 | if (time) then 204 | print *, "Load time in seconds: ", (t_end-t_start)/1000 205 | end if 206 | 207 | ! tokenize prompt 208 | !simple_tokens = simple_tokenize(arg_values%prompt) 209 | 210 | t_start = time_ms() 211 | do p=1,size(prompts) 212 | prompt_tokens = sp_tokenize(trim(prompts(p))) 213 | 214 | 215 | if (verbose) then 216 | simple_tokens = simple_tokenize(trim(prompts(p))) 217 | do n=1,size(simple_tokens) 218 | print *, "simple token: ", simple_tokens(n) 219 | print *, "wordpiece tokens: ", encode_word(simple_tokens(n)) 220 | end do 221 | 222 | print *, prompt_tokens 223 | 224 | end if 225 | 226 | !run through transformer 227 | y = dbert(prompt_tokens,weights,cfg) 228 | 229 | if (arg_values%quiet) then 230 | cycle 231 | end if 232 | 233 | if (arg_values%single_line) then 234 | do n=1,emb_dim 235 | write (*,"(F10.5)") y(n) 236 | end do 237 | else 238 | print *, y 239 | end if 240 | 241 | end do 242 | t_end = time_ms() 243 | 244 | if (time) then 245 | print *, "Total inference time in seconds: ", (t_end-t_start)/1000 246 | end if 247 | 248 | 249 | contains 250 | 251 | 252 | function layer_norm(x,w,b) result(xr) 253 | real(kind=wp) :: x(:,:), w(:), b(:) 254 | real(kind=wp) :: xr(size(x,1), size(x,2)) 255 | real(kind=wp) :: xmean(size(x,1),size(x,2)), xvar(size(x,1),size(x,2)) 256 | real(kind=wp) :: xn 257 | !print *, "A" 258 | xmean = spread(sum(x,dim=1)/size(x,1),1,size(x,1)) 259 | xvar = spread(sum( (x-xmean)*(x-xmean),dim=1 ) / size(x,1), 1, size(x,1)) 260 | xr = (x - xmean) / sqrt(xvar + 1e-12) 261 | !print *, "B" 262 | xr = xr*spread(w,2,size(x,2)) + spread(b,2,size(x,2)) 263 | end function 264 | 265 | function softmax(x) result(y) 266 | real(kind=wp), intent(in) :: x(:,:) 267 | real(kind=wp) :: y(size(x,1),size(x,2)) 268 | 269 | y = exp(x - spread(maxval(x,dim=1),1,size(x,1))) 270 | y = y / spread(sum(y,dim=1),1,size(x,1) ) 271 | 272 | end function 273 | 274 | function attention(q,k,v) result(y) 275 | real(kind=wp), intent(in) :: q(:,:), k(:,:), v(:,:) 276 | real(kind=wp) :: y(size(q,1),size(q,2)) 277 | real(kind=wp), allocatable :: y_int(:,:) 278 | y = matmul(v,(softmax(matmul(transpose(k),q) / sqrt(1.0*size(q,1))))) 279 | end function 280 | 281 | function gelu(x) result(y) 282 | real(kind=wp), intent(in) :: x(:,:) 283 | real(kind=wp) :: y(size(x,1),size(x,2)) 284 | y = 0.5 * x * (1 + tanh(sqrt(2 / 3.1415926536) * (x + 0.044715 * x**3))) 285 | end function 286 | 287 | function dbert(toks,w,c) result(y) 288 | integer, intent(in) :: toks(:) 289 | type(TransformerWeights) :: w 290 | type(Config) :: c 291 | real(kind=wp), allocatable :: y(:) 292 | integer :: i,j,l,h,nt,hsize 293 | real(kind=wp), allocatable :: x(:,:) 294 | 295 | real(kind=wp), allocatable :: q(:,:), k(:,:), v(:,:) 296 | real(kind=wp), 
allocatable :: qs(:,:,:), ks(:,:,:), vs(:,:,:) 297 | real(kind=wp), allocatable :: xb(:,:), attn_out(:,:), xbup(:,:) 298 | nt = size(toks) 299 | allocate(x(c%emb_dim, nt)) 300 | allocate(y(c%emb_dim)) 301 | allocate(xb(c%emb_dim, nt)) 302 | allocate(attn_out(c%emb_dim,nt)) 303 | allocate(xbup(c%hidden_dim,nt)) 304 | 305 | hsize = c%emb_dim/c%n_heads 306 | 307 | do i=1,nt 308 | x(:,i) = w%word_embeddings(:,toks(i)) 309 | x(:,i) = x(:,i) + w%position_embeddings(:,i) 310 | end do 311 | 312 | x = layer_norm(x,w%emb_layer_norm_w, w%emb_layer_norm_b) 313 | 314 | 315 | do l=1,c%n_layers 316 | 317 | q = matmul(transpose(w%wq(:,:,l)),x) + spread(w%bq(:,l),2,nt) 318 | k = matmul(transpose(w%wk(:,:,l)),x) + spread(w%bk(:,l),2,nt) 319 | v = matmul(transpose(w%wv(:,:,l)),x) + spread(w%bv(:,l),2,nt) 320 | 321 | ! split along embedding dim 322 | do h = 1,c%n_heads 323 | 324 | xb(((h-1)*hsize+1):(h*hsize),:) = attention( q(((h-1)*hsize+1):(h*hsize),:),& 325 | &k(((h-1)*hsize+1):(h*hsize),:), v(((h-1)*hsize+1):(h*hsize),:)) 326 | 327 | end do 328 | 329 | xb = matmul(transpose(w%wo(:,:,l)),xb) + spread(w%bo(:,l),2,nt) 330 | xb = xb + x 331 | 332 | xb = layer_norm(xb,w%sa_layer_norm_w(:,l), w%sa_layer_norm_b(:,l)) 333 | 334 | attn_out = xb 335 | 336 | xbup = matmul(transpose(w%w1(:,:,l)),xb) + spread(w%b1(:,l),2,nt) 337 | 338 | xbup = gelu(xbup) 339 | 340 | xb = matmul(transpose(w%w2(:,:,l)),xbup) + spread(w%b2(:,l),2,nt) 341 | 342 | xb = xb + attn_out 343 | 344 | x = layer_norm(xb,w%out_layer_norm_w(:,l), w%out_layer_norm_b(:,l)) 345 | 346 | end do 347 | 348 | ! "pooling" average 349 | y = sum(x,dim=2) / size(x,2) 350 | 351 | ! linear 352 | y = matmul(transpose(w%linear), y) 353 | 354 | end function 355 | 356 | function sp_tokenize(text) result(inds) 357 | character(len=*) :: text 358 | integer, allocatable :: inds(:) 359 | character(:), dimension(:), allocatable :: tokens, wpe 360 | integer :: m, n 361 | 362 | allocate(inds(1)) 363 | 364 | inds(1) = 102 ! bos (1 added because 1 based indices) 365 | 366 | tokens = simple_tokenize(text) 367 | 368 | do m=1,size(tokens) 369 | wpe = encode_word(tokens(m)) 370 | do n = 1,size(wpe) 371 | inds = [inds, lookup(wpe(n),len_trim(wpe(n)))] 372 | end do 373 | end do 374 | 375 | inds = [inds, 103] 376 | 377 | end function 378 | 379 | function lookup(s,l) result(ind) 380 | character(len=*) :: s 381 | integer :: l 382 | integer :: i, ind 383 | 384 | do i = 1,size(vocab) 385 | if (vocab(i) == s .and. vocab_len(i)==l) then 386 | ind = i 387 | return 388 | end if 389 | end do 390 | ind = -1 391 | end function 392 | 393 | function encode_word(word) result(tokens) 394 | character(len=*) :: word 395 | character(:), dimension(:), allocatable :: tokens 396 | integer :: i 397 | 398 | allocate(character(len=max_len) :: tokens(0)) 399 | 400 | do while(len_trim(word) > 0) 401 | i = len_trim(word) 402 | do while ( (i > 0) .and. 
(lookup(word(:i),i) <= 0)) 403 | i = i - 1 404 | end do 405 | 406 | if ( i == 0) then 407 | deallocate(tokens) 408 | tokens = ["UNK"] 409 | return 410 | end if 411 | tokens = [tokens, word(:i)] 412 | !print *, tokens 413 | word = word((i+1):) 414 | if (len_trim(word) > 0) then 415 | word = "##" // word 416 | end if 417 | 418 | 419 | end do 420 | 421 | end function 422 | 423 | 424 | function simple_tokenize(text) result(tokens) 425 | character(len=*) :: text 426 | character(:), dimension(:), allocatable :: tokens 427 | character(:), allocatable :: ltext, allc 428 | character(len=max_len) :: next_token 429 | integer :: pos 430 | 431 | character(26), parameter :: alpha = 'abcdefghijklmnopqrstuvwxyz' 432 | character(35) :: punct = '[!"#$%&\()*+,-./:;<=>?@\\^_`{|}~])x' 433 | character(10) :: numbers = '0123456789' 434 | 435 | ! is there another way to add the single quote? 436 | punct(35:35) = "'" 437 | !print *, punct 438 | allc = alpha // punct // numbers 439 | 440 | allocate(character(len=max_len) :: tokens(0)) 441 | 442 | ltext = to_lower(text) 443 | 444 | do while (len_trim(ltext) > 0) 445 | pos = 1 446 | 447 | next_token = "" 448 | 449 | 450 | 451 | do while(index(allc,ltext(pos:pos)) <= 0) 452 | pos = pos + 1 453 | end do 454 | 455 | ltext = ltext(pos:) 456 | 457 | pos = 1 458 | 459 | if (index(punct,ltext(pos:pos)) > 0 .and. pos <= len_trim(ltext)) then 460 | !print *, index(punct,ltext(pos:pos)) 461 | next_token = ltext(pos:pos) 462 | !if (verbose) then 463 | ! print *, next_token 464 | !end if 465 | tokens = [tokens, next_token] 466 | ltext = ltext((pos+1):) 467 | cycle 468 | end if 469 | 470 | if (index(alpha,ltext(pos:pos)) > 0 .and. pos <= len_trim(ltext)) then !next char is alphabet 471 | 472 | do while(index(alpha,ltext(pos:pos)) > 0 .and. pos <= len_trim(ltext)) 473 | pos = pos + 1 474 | end do 475 | 476 | next_token = ltext(1:(pos-1)) 477 | ltext = ltext(pos:) 478 | 479 | !if (verbose) then 480 | !print *, "control" 481 | !print *, pos 482 | ! print *, next_token 483 | !print *, ltext 484 | !end if 485 | 486 | ! fortran 2003? 487 | tokens = [tokens, next_token] 488 | 489 | else if (index(numbers,ltext(pos:pos)) > 0 .and. pos <= len_trim(ltext)) then ! next char is number 490 | do while(index(numbers,ltext(pos:pos)) > 0 .and. pos <= len_trim(ltext)) 491 | pos = pos + 1 492 | end do 493 | 494 | next_token = ltext(1:(pos-1)) 495 | ltext = ltext(pos:) 496 | 497 | !if (verbose) then 498 | !print *, "control" 499 | !print *, pos 500 | ! print *, next_token 501 | !print *, ltext 502 | !end if 503 | 504 | ! fortran 2003? 
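! yes - appending with [tokens, next_token] relies on automatic reallocation of
! allocatable arrays on assignment, a Fortran 2003 feature (same pattern as the
! alphabetic branch above)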
505 | tokens = [tokens, next_token] 506 | 507 | 508 | end if 509 | 510 | end do 511 | 512 | 513 | end function 514 | 515 | !stackoverflow.com/questions/10759375/how-can-i-write-a-to-upper-or-to-lower-function-in-f90 516 | function to_lower (str) result (string) 517 | 518 | 519 | implicit None 520 | character(*), intent(in) :: str 521 | character(len(str)) :: string 522 | 523 | integer :: ic, i 524 | 525 | character(26), parameter :: cap = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 526 | character(26), parameter :: low = 'abcdefghijklmnopqrstuvwxyz' 527 | 528 | string = str 529 | do i = 1, len_trim(str) 530 | ic = index(cap, str(i:i)) 531 | if (ic > 0) string(i:i) = low(ic:ic) 532 | end do 533 | 534 | end function to_lower 535 | 536 | function time_ms() result(t_ms) 537 | real(kind=wp) :: t_ms 538 | integer(4) :: ms 539 | !call cpu_time(t_ms) 540 | call system_clock(ms) 541 | t_ms = real(ms) 542 | end function 543 | 544 | end program 545 | -------------------------------------------------------------------------------- /read_ggml.f90: -------------------------------------------------------------------------------- 1 | ! load.f90 2 | 3 | module mixed_type_module 4 | use precision_module 5 | implicit none 6 | type mixed_type 7 | class(*), allocatable :: item 8 | end type mixed_type 9 | 10 | type multi_type 11 | integer :: type_num 12 | integer(4) :: i32 13 | !integer(2) :: i16 14 | real(4) :: f32 15 | character(64) :: string 16 | type(multi_type), allocatable :: a(:) 17 | end type 18 | 19 | type ggml_tensor_info 20 | character(64) :: tname 21 | integer(4) :: ndim, ttype 22 | integer(8) :: offset 23 | integer(8), allocatable :: dims(:) 24 | end type 25 | 26 | type generic_tensor 27 | integer :: ndims 28 | integer :: ttype 29 | integer(2), allocatable :: f161d(:) 30 | integer(2), allocatable :: f162d(:,:) 31 | real(kind=wp), allocatable :: f321d(:) 32 | real(kind=wp), allocatable :: f322d(:,:) 33 | ! can add fp4 34 | end type 35 | 36 | 37 | end module 38 | 39 | 40 | module read_ggml 41 | 42 | use precision_module 43 | use mixed_type_module 44 | use weight_module 45 | implicit none 46 | 47 | type(ggml_tensor_info), allocatable :: tensors(:) 48 | logical :: verbose 49 | !integer :: file_pos 50 | integer(8) :: tensor_count 51 | logical, parameter :: verbose2 = .false. 
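! What load_ggml reads from the GGUF file, in order (see the gguf.md spec linked in the README):
!   1. header: magic (int32), version (int32), tensor count (int64), kv-pair count (int64)
!   2. key-value pairs: length-prefixed key string, value type id, then the value itself
!   3. tensor infos: name, number of dimensions, dims, data type, and offset into the data section
!   4. zero padding up to the next multiple of general.alignment (assumed 32 when not given)
!   5. tensor data, read by seeking to the data start plus each tensor's offset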
52 | contains 53 | subroutine load_ggml(filename, w, c, vocab, token_lengths, v) 54 | character(len=*), intent(in) :: filename 55 | type(TransformerWeights), intent(out) :: w 56 | type(Config), intent(out) :: c 57 | real(kind=wp), allocatable :: scores(:) 58 | character(:), dimension(:), allocatable, intent(out) :: vocab 59 | integer(4), allocatable, intent(out) :: token_lengths(:) 60 | logical, intent(in) :: v 61 | 62 | character(:), dimension(:), allocatable :: vocab_swp 63 | integer(4) :: magic, version 64 | integer(8) :: kv_pairs 65 | !class(*), allocatable :: demo 66 | integer :: max_len = 64 67 | integer :: i, j, val_type,file_pos, alignment, deficit 68 | integer(4) :: num_layers, emb_length, context_length, head_count, ffn_length, kv_heads, vocab_size 69 | type(multi_type), allocatable :: values(:) 70 | type(multi_type) :: multi_temp 71 | character(:), dimension(:), allocatable :: keys 72 | !type(multi_type), allocatable :: x(:) 73 | !type(ggml_tensor_info), allocatable :: tensors(:) 74 | type(ggml_tensor_info) :: t0 75 | !demo = 3 76 | integer(1) :: tbyte 77 | integer(1) :: tbytes(3) 78 | integer(2) :: f16 79 | integer(2), allocatable :: temp2f16(:,:) 80 | integer(2), allocatable :: tempf16(:) 81 | real(kind=wp), allocatable :: tempf32(:) 82 | real(kind=wp), allocatable :: temp2f32(:,:) 83 | character(:), allocatable :: tempstr 84 | type(generic_tensor) :: temp_gt 85 | !type (args) :: arg_values 86 | 87 | 88 | !real(kind=wp), allocatable :: scores(:) 89 | !character(:), dimension(:), allocatable :: vocab 90 | !integer(4), allocatable :: token_lengths(:) 91 | integer(8) :: tmp_vocab_size 92 | integer(4) :: temp_int, maxlen 93 | 94 | integer(8) :: strlen 95 | 96 | character(:), allocatable :: loaded_str 97 | 98 | integer :: head_size, kv_head_size 99 | 100 | allocate(character(len=max_len) :: tempstr) 101 | verbose = v 102 | 103 | ! assumed to be 32 if not specified 104 | alignment = 32 105 | num_layers = 0 106 | 107 | open(UNIT=5, FILE=filename, FORM="UNFORMATTED",& 108 | &ACCESS="STREAM", STATUS="OLD", POSITION="REWIND", ACTION="READ") 109 | 110 | ! config 111 | 112 | read(5) magic, version, tensor_count, kv_pairs 113 | 114 | if (verbose) then 115 | print *, "GGUF Header Info" 116 | print *, "Magic number: ", magic 117 | print *, "Version: ", version 118 | print *, "Tensor Count: ", tensor_count 119 | print *, "Key-Value Pairs: ", kv_pairs 120 | end if 121 | 122 | if (magic .ne. 1179993927) then 123 | print *, "Magic numbers do not match, exiting" 124 | stop 125 | end if 126 | 127 | allocate(character(len=max_len) :: keys(kv_pairs)) 128 | allocate(values(kv_pairs)) 129 | do i = 1,kv_pairs 130 | keys(i) = read_str(5) 131 | read(5) val_type 132 | values(i) = read_val(5,val_type) 133 | if (keys(i) .eq. "general.alignment") then 134 | alignment = values(i)%i32 135 | if (verbose) then 136 | print *, "alignment set to", alignment 137 | end if 138 | else if (keys(i) .eq. "distilbert.block_count") then 139 | num_layers = values(i)%i32 !assume it's int(4) 140 | else if (keys(i) .eq. "distilbert.embedding_length") then 141 | emb_length = values(i)%i32 142 | else if (keys(i) .eq. "distilbert.attention.head_count") then 143 | head_count = values(i)%i32 144 | else if (keys(i) .eq. "distilbert.context_length") then 145 | context_length = values(i)%i32 146 | else if (keys(i) .eq. "tokenizer.ggml.tokens") then 147 | vocab_size = (size(values(i)%a)) 148 | else if (keys(i) .eq. "distilbert.attention.head_count_kv") then 149 | kv_heads = values(i)%i32 150 | else if (keys(i) .eq. 
"distilbert.feed_forward_length") then 151 | ffn_length = values(i)%i32 152 | end if 153 | 154 | if (verbose) then 155 | print *, keys(i) 156 | call print_multi(values(i)) 157 | end if 158 | end do 159 | 160 | allocate(tensors(tensor_count)) 161 | do i = 1,tensor_count 162 | tensors(i) = read_tensor_info(5) 163 | end do 164 | 165 | ! "level 2 verbose" 166 | if (verbose2) then 167 | do i = 1, tensor_count 168 | write (*, fmt="(A20,I2)",advance="no") tensors(i)%tname, tensors(i)%ndim 169 | do j=1,tensors(i)%ndim 170 | write (*, fmt="(I6)", advance="no") tensors(i)%dims(j) 171 | end do 172 | write (*, fmt="(I2,I11)") tensors(i)%ttype, tensors(i)%offset 173 | end do 174 | end if 175 | 176 | inquire(unit=5,pos=file_pos) 177 | 178 | deficit = mod(file_pos-1,alignment) ! -1 179 | 180 | if (verbose) then 181 | print *, "Position", file_pos 182 | print *, "Deficit", deficit 183 | end if 184 | 185 | if (deficit > 0) then 186 | do i = 1,(alignment-deficit) 187 | read (5) tbyte 188 | if (tbyte /= 0) then 189 | print *, "padding error", tbyte 190 | end if 191 | end do 192 | end if 193 | 194 | inquire(unit=5,pos=file_pos) 195 | 196 | print *, "data offset", file_pos 197 | 198 | !read(5) f16 199 | 200 | !print *, "First value", half_to_float_c(f16) 201 | 202 | 203 | !if (outfile /= "") then 204 | !open(unit=8, file=outfile, form='unformatted', status='unknown', ACCESS="STREAM", action="write") 205 | ! write the header: 206 | if (verbose) then 207 | if (verbose) then 208 | print *, "Embedding dimension: ", emb_length 209 | print *, "Hidden dimension: ", ffn_length 210 | print *, "Layers: ", num_layers 211 | print *, "Heads: ", head_count 212 | print *, "kv Heads: ", kv_heads 213 | print *, "Vocabulary Size: ", vocab_size 214 | print *, "Sequence Length: ", context_length 215 | 216 | end if 217 | 218 | !print *, "Header:" 219 | !print *, emb_length, ffn_length, num_layers, head_count, kv_heads, vocab_size, context_length 220 | end if 221 | !write(8) emb_length, ffn_length, num_layers, head_count, kv_heads, vocab_size, context_length 222 | c%emb_dim = emb_length 223 | c%hidden_dim = ffn_length 224 | c%n_layers = num_layers 225 | c%n_heads = head_count 226 | c%n_kv_heads = kv_heads 227 | c%vocab_size = vocab_size 228 | c%seq_len = context_length 229 | 230 | head_size = emb_length / head_count 231 | kv_head_size = kv_heads * head_size 232 | 233 | if (verbose) then 234 | print *, "head size ", head_size 235 | print *, "kv head Size ", kv_head_size 236 | end if 237 | 238 | t0 = tensor_by_name("token_embd.weight") 239 | temp_gt = read_layer(5,t0,file_pos) 240 | 241 | !call write_tensor(8,temp_gt) 242 | w%word_embeddings = temp_gt%f322d 243 | 244 | if (verbose) then 245 | print *, "loaded word embedding weights:", size(w%word_embeddings) 246 | end if 247 | 248 | t0 = tensor_by_name("position_embd.weight") 249 | temp_gt = read_layer(5,t0,file_pos) 250 | 251 | !call write_tensor(8,temp_gt) 252 | w%position_embeddings = temp_gt%f322d 253 | 254 | if (verbose) then 255 | print *, "loaded position embedding weights:", size(w%position_embeddings) 256 | end if 257 | 258 | 259 | !print *, temp_gt%ttype 260 | !print *, temp_gt%ndims 261 | !print *, w%token_embedding_table(1:10,1) 262 | !print *, "embed sum: ", sum(w%token_embedding_table(1:10,1:10)) 263 | 264 | t0 = tensor_by_name("token_embd_norm.weight") 265 | temp_gt = read_layer(5,t0,file_pos) 266 | 267 | !call write_tensor(8,temp_gt) 268 | w%emb_layer_norm_w = temp_gt%f321d 269 | 270 | if (verbose) then 271 | print *, "loaded embedding layernorm weights:", 
size(w%emb_layer_norm_w)
272 | end if
273 | 
274 | t0 = tensor_by_name("token_embd_norm.bias")
275 | temp_gt = read_layer(5,t0,file_pos)
276 | 
277 | !call write_tensor(8,temp_gt)
278 | w%emb_layer_norm_b = temp_gt%f321d
279 | 
280 | if (verbose) then
281 |     print *, "loaded embedding layernorm bias:", size(w%emb_layer_norm_b)
282 | end if
283 | 
284 | 
285 | allocate(w%wq(emb_length,emb_length,num_layers))
286 | do i = 1,num_layers
287 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_q.weight"
288 |     t0 = tensor_by_name(tempstr)
289 |     temp_gt = read_layer(5,t0,file_pos)
290 |     ! f16
291 |     !call write_tensor(8,temp_gt)
292 |     w%wq(:,:,i) = temp_gt%f322d
293 | end do
294 | 
295 | if (verbose) then
296 |     print *, "loaded wq weights:", size(w%wq)
297 | end if
298 | 
299 | 
300 | allocate(w%bq(emb_length,num_layers))
301 | do i = 1,num_layers
302 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_q.bias"
303 |     t0 = tensor_by_name(tempstr)
304 |     temp_gt = read_layer(5,t0,file_pos)
305 |     ! f16
306 |     !call write_tensor(8,temp_gt)
307 |     w%bq(:,i) = temp_gt%f321d
308 | end do
309 | 
310 | 
311 | if (verbose) then
312 |     print *, "loaded wq bias:", size(w%bq)
313 | end if
314 | 
315 | allocate(w%wk(emb_length,emb_length,num_layers))
316 | do i = 1,num_layers
317 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_k.weight"
318 |     t0 = tensor_by_name(tempstr)
319 |     temp_gt = read_layer(5,t0,file_pos)
320 |     ! f16
321 |     !call write_tensor(8,temp_gt)
322 |     w%wk(:,:,i) = temp_gt%f322d
323 | end do
324 | 
325 | if (verbose) then
326 |     print *, "loaded wk weights:", size(w%wk)
327 | end if
328 | 
329 | allocate(w%bk(emb_length,num_layers))
330 | do i = 1,num_layers
331 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_k.bias"
332 |     t0 = tensor_by_name(tempstr)
333 |     temp_gt = read_layer(5,t0,file_pos)
334 |     ! f16
335 |     !call write_tensor(8,temp_gt)
336 |     w%bk(:,i) = temp_gt%f321d
337 | end do
338 | 
339 | 
340 | if (verbose) then
341 |     print *, "loaded wk bias:", size(w%bk)
342 | end if
343 | 
344 | 
345 | allocate(w%wv(emb_length,emb_length,num_layers))
346 | do i = 1,num_layers
347 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_v.weight"
348 |     t0 = tensor_by_name(tempstr)
349 |     temp_gt = read_layer(5,t0,file_pos)
350 |     ! f16
351 |     !call write_tensor(8,temp_gt)
352 |     w%wv(:,:,i) = temp_gt%f322d
353 | end do
354 | 
355 | !print *, "qkv sum: ", sum(w%wqkv)
356 | if (verbose) then
357 |     print *, "loaded wv weights:", size(w%wv)
358 | end if
359 | 
360 | allocate(w%bv(emb_length,num_layers))
361 | do i = 1,num_layers
362 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_v.bias"
363 |     t0 = tensor_by_name(tempstr)
364 |     temp_gt = read_layer(5,t0,file_pos)
365 |     ! f16
366 |     !call write_tensor(8,temp_gt)
367 |     w%bv(:,i) = temp_gt%f321d
368 | end do
369 | 
370 | 
371 | if (verbose) then
372 |     print *, "loaded wv bias:", size(w%bv)
373 | end if
374 | 
375 | 
376 | 
377 | 
378 | allocate(w%wo(emb_length,emb_length,num_layers))
379 | do i = 1,num_layers
380 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_output.weight"
381 |     t0 = tensor_by_name(tempstr)
382 |     temp_gt = read_layer(5,t0,file_pos)
383 |     ! f16
384 |     !call write_tensor(8,temp_gt)
385 |     w%wo(:,:,i) = temp_gt%f322d
386 | end do
387 | 
388 | if (verbose) then
389 |     print *, "loaded wo weights:", size(w%wo)
390 | end if
391 | 
392 | allocate(w%bo(emb_length,num_layers))
393 | do i = 1,num_layers
394 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".attn_output.bias"
395 |     t0 = tensor_by_name(tempstr)
396 |     temp_gt = read_layer(5,t0,file_pos)
397 |     ! f16
398 |     !call write_tensor(8,temp_gt)
399 |     w%bo(:,i) = temp_gt%f321d
400 | end do
401 | 
402 | 
403 | if (verbose) then
404 |     print *, "loaded wo bias:", size(w%bo)
405 | end if
406 | 
407 | 
408 | 
409 | allocate(w%sa_layer_norm_w(emb_length,num_layers))
410 | do i = 1,num_layers
411 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".ffn_norm.weight"
412 |     t0 = tensor_by_name(tempstr)
413 |     temp_gt = read_layer(5,t0,file_pos)
414 |     ! should be f32
415 |     !call write_tensor(8,temp_gt)
416 |     w%sa_layer_norm_w(:,i) = temp_gt%f321d
417 | end do
418 | 
419 | if (verbose) then
420 |     print *, "loaded sa layernorm weights:", size(w%sa_layer_norm_w)
421 | end if
422 | 
423 | allocate(w%sa_layer_norm_b(emb_length,num_layers))
424 | do i = 1,num_layers
425 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".ffn_norm.bias"
426 |     t0 = tensor_by_name(tempstr)
427 |     temp_gt = read_layer(5,t0,file_pos)
428 |     ! should be f32
429 |     !call write_tensor(8,temp_gt)
430 |     w%sa_layer_norm_b(:,i) = temp_gt%f321d
431 | end do
432 | 
433 | if (verbose) then
434 |     print *, "loaded sa layernorm bias:", size(w%sa_layer_norm_b)
435 | end if
436 | 
437 | 
438 | 
439 | allocate(w%w1(emb_length,ffn_length,num_layers))
440 | do i = 1,num_layers
441 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".ffn_up.weight"
442 |     t0 = tensor_by_name(tempstr)
443 |     temp_gt = read_layer(5,t0,file_pos)
444 |     ! f16
445 |     !call write_tensor(8,temp_gt)
446 |     w%w1(:,:,i) = temp_gt%f322d
447 | end do
448 | 
449 | if (verbose) then
450 |     print *, "loaded w1 weights:", size(w%w1)
451 | end if
452 | 
453 | allocate(w%b1(ffn_length,num_layers))
454 | do i = 1,num_layers
455 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".ffn_up.bias"
456 |     t0 = tensor_by_name(tempstr)
457 |     temp_gt = read_layer(5,t0,file_pos)
458 |     ! f16
459 |     !call write_tensor(8,temp_gt)
460 |     w%b1(:,i) = temp_gt%f321d
461 | end do
462 | 
463 | if (verbose) then
464 |     print *, "loaded w1 bias:", size(w%b1)
465 | end if
466 | 
467 | 
468 | allocate(w%w2(ffn_length,emb_length,num_layers))
469 | do i = 1,num_layers
470 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".ffn_down.weight"
471 |     t0 = tensor_by_name(tempstr)
472 |     temp_gt = read_layer(5,t0,file_pos)
473 |     ! f16
474 |     !call write_tensor(8,temp_gt)
475 |     w%w2(:,:,i) = temp_gt%f322d
476 | end do
477 | 
478 | if (verbose) then
479 |     print *, "loaded w2 (down) weights:", size(w%w2)
480 | end if
481 | 
482 | allocate(w%b2(emb_length,num_layers))
483 | do i = 1,num_layers
484 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".ffn_down.bias"
485 |     t0 = tensor_by_name(tempstr)
486 |     temp_gt = read_layer(5,t0,file_pos)
487 |     ! f16
488 |     !call write_tensor(8,temp_gt)
489 |     w%b2(:,i) = temp_gt%f321d
490 | end do
491 | 
492 | if (verbose) then
493 |     print *, "loaded w2 (down) bias:", size(w%b2)
494 | end if
495 | 
496 | allocate(w%out_layer_norm_w(emb_length,num_layers))
497 | do i = 1,num_layers
498 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".output_norm.weight"
499 |     t0 = tensor_by_name(tempstr)
500 |     temp_gt = read_layer(5,t0,file_pos)
501 |     ! should be f32
502 |     !call write_tensor(8,temp_gt)
503 |     w%out_layer_norm_w(:,i) = temp_gt%f321d
504 | end do
505 | 
506 | if (verbose) then
507 |     print *, "loaded output norm weights:", size(w%out_layer_norm_w)
508 | end if
509 | 
510 | allocate(w%out_layer_norm_b(emb_length,num_layers))
511 | do i = 1,num_layers
512 |     write(tempstr,"(A,I0,A)") "blk.", i-1, ".output_norm.bias"
513 |     t0 = tensor_by_name(tempstr)
514 |     temp_gt = read_layer(5,t0,file_pos)
515 |     ! should be f32
516 |     !call write_tensor(8,temp_gt)
517 |     w%out_layer_norm_b(:,i) = temp_gt%f321d
518 | end do
519 | 
520 | if (verbose) then
521 |     print *, "loaded output norm bias:", size(w%out_layer_norm_b)
522 | end if
523 | 
524 | 
525 | 
526 | !temp2f32 = get_rope_freqs(emb_length/head_count,context_length,10000.0)
527 | !if (verbose) then
528 | !write(*,"(A)") "rope cos: writing float32"
529 | !end if
530 | !write(8) cos(temp2f32(:,:context_length))
531 | !if (verbose) then
532 | !write(*,"(A)") "rope sin: writing float32"
533 | !end if
534 | !write(8) sin(temp2f32(:,:context_length))
535 | ! cos and sin of the above are the cos/sin respectively (f32)
536 | 
537 | t0 = tensor_by_name("output")
538 | temp_gt = read_layer(5,t0,file_pos)
539 | ! f16
540 | !call write_tensor(8,temp_gt)
541 | w%linear = temp_gt%f322d
542 | 
543 | if (verbose) then
544 |     print *, "loaded classifier weights:", size(w%linear)
545 | end if
546 | 
547 | 
548 | !close(8)
549 | !end if ! writing outfile
550 | 
551 | if (.true.) then
552 |     ! re-read the header key/value pairs to extract the tokenizer vocab and scores
553 |     call fseek(5,0,0)
554 |     read(5) magic, version, tensor_count, kv_pairs
555 | 
556 |     if (magic .ne. 1179993927) then
557 |         print *, "Magic numbers do not match, exiting"
558 |         stop
559 |     end if
560 | 
561 |     do i = 1,kv_pairs
562 |         tempstr = read_str(5)
563 |         read(5) val_type
564 |         if (verbose2) then
565 |             print *, "scanning ", tempstr
566 |         end if
567 |         if (tempstr .eq. "tokenizer.ggml.tokens") then
568 |             if (verbose) then
569 |                 print *, "loading tokens"
570 |             end if
571 |             ! allocate
572 |             read(5) temp_int, tmp_vocab_size
573 |             !allocate(val%a(alen))
574 |             !do i = 1,alen
575 |             ! val%a(i) = read_val(handle, atype)
576 |             !end do
577 |             allocate(character(len=max_len) :: vocab(tmp_vocab_size))
578 |             allocate(token_lengths(tmp_vocab_size))
579 |             do j=1,int(tmp_vocab_size,4)
580 |                 read(5) strlen
581 |                 allocate(character(strlen) :: loaded_str)
582 |                 read(5) loaded_str
583 |                 token_lengths(j) = int(strlen,4)
584 |                 vocab(j) = loaded_str
585 |                 deallocate(loaded_str)
586 |             end do
587 |             if (verbose) then
588 |                 write (*,"(A,I0,A)") "found ", size(vocab), " tokens"
589 |             end if
590 | 
591 |         else if (tempstr .eq. "tokenizer.ggml.scores") then
592 |             multi_temp = read_val(5,val_type)
593 |             allocate(scores(size(multi_temp%a)))
594 |             do j = 1,size(multi_temp%a)
595 |                 scores(j) = multi_temp%a(j)%f32
596 |             end do
597 |             if (verbose) then
598 |                 write (*,"(A,I0,A)") "found ", size(multi_temp%a), " scores"
599 |             end if
600 |         else
601 |             multi_temp = read_val(5,val_type)
602 |         end if
603 |     end do
604 | 
605 |     !open(unit=8, file="", form='unformatted', status='unknown', ACCESS="STREAM", action="write")
606 |     maxlen = maxval(token_lengths)
607 | 
608 |     allocate(character(len=max_len) :: vocab_swp(tmp_vocab_size))
609 |     if (verbose) then
610 |         print *, "maximum token length ", maxlen
611 |     end if
612 |     !temp_int = 10
613 |     !write(8) maxlen
614 |     do i=1,size(vocab)
615 |         read(vocab(i)(1:1), "(A)") tbytes(1)
616 |         read(vocab(i)(2:2), "(A)") tbytes(2)
617 |         read(vocab(i)(3:3), "(A)") tbytes(3)
618 | 
619 |         ! bytes -30,-106,-127 (0xE2 0x96 0x81) are the sentencepiece "▁" word-boundary marker
620 |         if ( (tbytes(1) .eq. -30) .and.&
621 |             &(tbytes(2) .eq. -106) .and.&
622 |             &(tbytes(3) .eq. -127) ) then
623 |             allocate(character(token_lengths(i)-2) :: loaded_str)
624 |             loaded_str(1:1) = " "
625 |             loaded_str(2:) = vocab(i)(4:token_lengths(i))
626 |             !write(8) scores(i),token_lengths(i)-2,loaded_str
627 |             token_lengths(i) = token_lengths(i)-2
628 |             vocab_swp(i) = loaded_str
629 |             deallocate(loaded_str)
630 |         else
631 |             !write(8) scores(i),token_lengths(i),vocab(i)(1:token_lengths(i))
632 |             vocab_swp(i) = vocab(i)(1:token_lengths(i))
633 |         end if
634 |     end do
635 | 
636 | end if
637 | 
638 | !close(8)
639 | 
640 | close(5)
641 | vocab = vocab_swp
642 | end subroutine
643 | 
644 | 
645 | subroutine write_tensor(handle, t)
646 | integer :: handle
647 | type(generic_tensor) :: t
648 | 
649 | if (t%ttype .eq. 0) then
650 |     if (verbose) then
651 |         write(*,"(A)") "writing float32"
652 |     end if
653 |     if (t%ndims .eq. 1) then
654 |         write(handle) t%f321d
655 |     else if (t%ndims .eq. 2) then
656 |         write(handle) t%f322d
657 |     end if
658 | else if (t%ttype .eq. 1) then
659 |     if (verbose) then
660 |         write(*,"(A)") "writing fp16"
661 |     end if
662 |     if (t%ndims .eq. 1) then
663 |         write(handle) t%f161d
664 |     else if (t%ndims .eq. 2) then
665 |         write(handle) t%f162d
666 |     end if
667 | end if
668 | 
669 | 
670 | end subroutine
671 | 
672 | function get_rope_freqs(i_dim, i_end, theta) result(freq_array)
673 | integer :: i_dim, i_end
674 | real(kind=wp) :: theta
675 | !real(kind=wp) :: cis(i_end/2,2)
676 | real(kind=wp),allocatable :: freqs(:)
677 | real(kind=wp),allocatable :: freq_array(:,:)
678 | real(kind=wp) :: irange(i_dim/2)
679 | integer :: i
680 | do i = 1,i_dim/2
681 |     irange(i) = 2.0*(i-1) / i_dim
682 | end do
683 | freqs = 1.0 / (theta ** irange)
684 | 
685 | allocate(freq_array(size(freqs),i_end)) ! may need transposing
686 | do i = 0,(i_end-1)
687 |     freq_array(:,i+1) = i*freqs
688 | end do
689 | 
690 | end function
691 | 
692 | function tensor_by_name(s)
693 | character(len=*) :: s
694 | integer :: i
695 | type(ggml_tensor_info) :: tensor_by_name
696 | do i=1,tensor_count
697 |     if (tensors(i)%tname .eq. s) then
698 |         tensor_by_name = tensors(i)
699 |         return
700 |     end if
701 | end do
702 | print *, "key not found ", s
703 | stop
704 | end
705 | function prod(a)
706 | integer(8) :: a(:)
707 | integer :: i
708 | integer(8) :: prod
709 | prod = 1
710 | do i = 1,size(a)
711 |     prod = prod * a(i)
712 | end do
713 | end function
714 | 
715 | function read_layer_fp16(handle, layer) result(d)
716 | integer :: handle
717 | type(ggml_tensor_info) :: layer
718 | integer(2), allocatable :: d(:)
719 | if (verbose) then
720 |     write(*,"(A,A26)",advance="no") "reading",layer%tname
721 | end if
722 | !call fseek(handle,layer%offset+file_pos,0)
723 | allocate(d(prod(layer%dims)))
724 | read(handle) d
725 | if (verbose) then
726 |     write(*,"(A)") "... done"
727 | end if
728 | 
729 | end function
730 | 
731 | function read_layer(handle, layer,file_pos) result(d)
732 | integer :: handle
733 | type(ggml_tensor_info) :: layer
734 | type(generic_tensor) :: d
735 | integer :: file_pos
736 | !integer(2), allocatable :: d(:)
737 | !if (verbose) then
738 | ! write(*,"(A,A26)",advance="no") "reading",layer%tname
739 | !end if
740 | call fseek(handle,layer%offset+file_pos-1,0)
741 | d%ttype = layer%ttype
742 | d%ndims = layer%ndim
743 | 
744 | if (d%ttype .eq. 0) then
745 |     if (d%ndims .eq. 1) then
746 |         allocate(d%f321d(layer%dims(1)))
747 |         read(handle) d%f321d
748 |     else if (d%ndims .eq. 2) then
749 |         allocate(d%f322d(layer%dims(1),layer%dims(2)))
750 |         read(handle) d%f322d
751 |     else
752 |         print *, "Ndims not supported", layer%dims
753 |     end if
754 | else if (d%ttype .eq. 1) then
755 |     if (d%ndims .eq. 1) then
756 |         allocate(d%f161d(layer%dims(1)))
757 |         read(handle) d%f161d
758 |     else if (d%ndims .eq. 2) then
759 |         allocate(d%f162d(layer%dims(1),layer%dims(2)))
760 |         read(handle) d%f162d
761 |     else
762 |         print *, "Ndims not supported", layer%dims
763 |     end if
764 | else
765 |     print *, "Type not supported", layer%ttype
766 | end if
767 | 
768 | !if (verbose) then
769 | ! write(*,"(A)") "... done"
770 | !end if
771 | 
772 | end function
773 | 
774 | function read_str(handle)
775 | integer :: handle
776 | integer(8) :: strlen
777 | 
778 | character(:), allocatable :: read_str
779 | read(handle) strlen
780 | allocate(character(strlen) :: read_str)
781 | read(handle) read_str
782 | 
783 | end function
784 | 
785 | recursive function read_val(handle, val_type) result (val)
786 | integer :: handle, val_type, i
787 | character (:), allocatable :: temp
788 | type(multi_type) :: val
789 | integer(4) :: atype
790 | integer(8) :: alen
791 | 
792 | val%type_num = val_type
793 | 
794 | if (val_type .eq. 8) then
795 |     temp = read_str(handle)
796 |     !print *, temp
797 |     val%string = temp
798 | 
799 | else if (val_type .eq. 4) then
800 |     ! read in an int32
801 |     read(handle) val%i32
802 | else if (val_type .eq. 6) then
803 |     read(handle) val%f32
804 | else if (val_type .eq. 5) then
805 |     read(handle) val%i32
806 | else if (val_type .eq. 9) then
807 |     read(handle) atype, alen
808 |     allocate(val%a(alen))
809 |     do i = 1,alen
810 |         val%a(i) = read_val(handle, atype)
811 |     end do
812 | 
813 | else
814 |     print *, "Not implemented", val_type
815 |     stop
816 | end if
817 | 
818 | 
819 | end function
820 | 
821 | subroutine print_multi(m)
822 | type(multi_type) :: m
823 | if (m%type_num .eq. 8) then
824 |     print *, m%string
825 | else if (m%type_num .eq. 4) then
826 |     print *, m%i32
827 | else if (m%type_num .eq. 5) then
828 |     print *, m%i32
829 | else if (m%type_num .eq. 6) then
830 |     print *, m%f32
831 | else if (m%type_num .eq. 9) then
832 |     print *, size(m%a)
833 | end if
834 | 
835 | end subroutine
836 | 
837 | function read_tensor_info(handle) result(info)
838 | integer :: handle, i
839 | type(ggml_tensor_info) :: info
840 | info%tname = read_str(handle)
841 | read(handle) info%ndim
842 | allocate(info%dims(info%ndim))
843 | do i = 1,info%ndim
844 |     read(handle) info%dims(i)
845 | end do
846 | read(handle) info%ttype
847 | read(handle) info%offset
848 | 
849 | end function
850 | 
851 | 
852 | end module
853 | 
--------------------------------------------------------------------------------