├── requirements.txt ├── img ├── jam-img.jpg ├── jam-img.png └── jam-img.svg ├── docs ├── 1706.03762.pdf ├── jam-convo.md └── jam-gpt.md ├── jam_gpt ├── __init__.py ├── data.py ├── tokenizer.py ├── config.py ├── model.py └── lm.py ├── .gitignore ├── note.md ├── data-set ├── data_graber.py └── jam-data.json ├── setup.py ├── test-drive └── test_drive.py ├── LICENSE ├── README.md ├── test-gptLM.ipynb ├── test-old.demo.ipynb └── test.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | torch>=2.0.0 3 | tiktoken -------------------------------------------------------------------------------- /img/jam-img.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loke-x/jam-gpt/HEAD/img/jam-img.jpg -------------------------------------------------------------------------------- /img/jam-img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loke-x/jam-gpt/HEAD/img/jam-img.png -------------------------------------------------------------------------------- /docs/1706.03762.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loke-x/jam-gpt/HEAD/docs/1706.03762.pdf -------------------------------------------------------------------------------- /jam_gpt/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .data import Data as Data 3 | from .tokenizer import Tokenizer as Tokenizer 4 | from .model import Model as Model 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # models 3 | # md-test-01/ 4 | 5 | # model 6 | # *.bin 7 | 8 | __pycache__/ 9 | *.pyc 10 | .pypirc 11 | dist/ 12 | 13 | 14 | # datasets 15 | 16 | 17 | *.egg-info/ -------------------------------------------------------------------------------- /docs/jam-convo.md: -------------------------------------------------------------------------------- 1 | ```py 2 | eos_token = tok.encode(""" [eos]""") 3 | 4 | eos = tok.encode(" [eos]") 5 | print(tok.decode(model.generate(pmt,max_new_tokens=3000,eos_token=eos))) 6 | ``` 7 | 8 | -------------------------------------------------------------------------------- /note.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## to publish next release in pip jam-gpt-0.0.5 4 | 5 | ### run : 6 | 7 | ``` 8 | python setup.py sdist 9 | ``` 10 | >include requirements.txt inside the ziped 11 | ./dist/jam-gpt-0.0.5.tar.gz/jam-gpt-0.0.5 12 | ./dist/jam-gpt-0.0.5.tar.gz/jam-gpt-0.0.5/jam_gpt.egg-info/SOURCES.txt 13 | 14 | ### then run 15 | 16 | ``` 17 | twine upload dist/jam-gpt-0.0.5.tar.gz 18 | ``` -------------------------------------------------------------------------------- /data-set/data_graber.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datasets 3 | from datasets import load_dataset 4 | 5 | # Load the daily dialogue dataset. 6 | dataset = load_dataset("daily_dialog") 7 | 8 | # Select the first 5000 dialogues from the dataset. 9 | dialogues = dataset["train"]["dialog"][:5000] 10 | 11 | # # Create the file if it does not exist. 12 | # os.makedirs("dialogues.txt", exist_ok=True) 13 | 14 | # Write the dialogues to the text file. 
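# note: each entry in `dialogues` is expected to be a list of utterance strings
# (the daily_dialog "dialog" field), so str(dialogue) below writes the Python
# repr of that whole list as a single line of the output file.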
15 | with open("dialogues.txt", "w") as f: 16 | for dialogue in [str(dialogue) + "\n" for dialogue in dialogues]: 17 | f.write(dialogue) 18 | print("writing ....") -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name='jam-gpt', 6 | version='0.0.4', 7 | description='A reimplementation of large language model (LLM) architectures designed for research and development processes', 8 | author='Lokeshwaran M', 9 | author_email='lokeshwaran.m23072003@gmail.com', 10 | url="https://github.com/Lokeshwaran-M/jam-gpt.git", 11 | license="MIT", 12 | packages=find_packages(), 13 | package_data={'': ['requirements.txt', 'README.md']}, 14 | install_requires=open('requirements.txt').readlines(), 15 | keywords='jam-gpt Jam-AI Jam-AGI', 16 | ) 17 | 18 | 19 | # install_requires=["setuptools==67.8.0","torch==2.0.1","tiktoken"] 20 | 21 | -------------------------------------------------------------------------------- /test-drive/test_drive.py: -------------------------------------------------------------------------------- 1 | # testing Data and Tokenizer calsses 2 | 3 | from jam_gpt import Data, Tokenizer, config 4 | 5 | tok = Tokenizer() 6 | 7 | def t_tokenizer(path,text_input = "test sample data"): 8 | 9 | text = Data.get(path) 10 | tok.set_encoding("md-test-01",text) 11 | tok.get_encoding("md-test-01") 12 | vocab_size = tok.n_vocab 13 | print(vocab_size) 14 | enc = tok.encode(text_input) 15 | print(enc) 16 | dec = tok.decode(enc) 17 | print(dec) 18 | 19 | # # output :: 20 | # 2 21 | # [75, 60, 74, 75, 1, 74, 56, 68, 71, 67, 60, 1, 59, 56, 75, 56] 22 | # test sample data 23 | 24 | # # to run : 25 | # from jam_gpt.test import test_drive as td 26 | 27 | # td.t_tokenizer("data.txt") 28 | 29 | def t_config(): 30 | x = config.pass_args() 31 | print(x) 32 | x[0] = 96 33 | config.set_args(x) 34 | y = config.pass_args() 35 | print(x) 36 | print(y) 37 | 38 | # [0, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0] 39 | # [96, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0] 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Lokeshwaran M 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /jam_gpt/data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from . import config 4 | 5 | 6 | class Data: 7 | """ just a data preprocessor """ 8 | def __init__(self) -> None: 9 | pass 10 | 11 | @classmethod 12 | def get(cls, path: str) -> str: 13 | """ 14 | text data file -> string data 15 | 16 | """ 17 | with open(path, "r", encoding="utf-8") as f: 18 | text_data = f.read() 19 | return text_data 20 | 21 | @classmethod 22 | def set(cls, path: str, data: str) -> None: 23 | """ 24 | string data -> text data file 25 | """ 26 | with open(path, "w", encoding="utf-8") as f: 27 | f.write(data) 28 | print("writen data : ", len(data)) 29 | 30 | @classmethod 31 | def train_test_split(cls, data, split_percent: int = 90): 32 | """ 33 | split the data into train ans test based on split percentage 34 | """ 35 | 36 | tensor_data = torch.tensor(data, dtype=torch.long, device=config.device) 37 | n = int((split_percent/100)*len(data)) 38 | train_data = tensor_data[:n] 39 | test_data = tensor_data[n:] 40 | 41 | return [train_data, test_data] 42 | 43 | @classmethod 44 | def chat_formater(cls,context=None, prompt=None, response=None): 45 | """Creates a JSON object from the given context, prompt, and output. 46 | 47 | Args: 48 | context: A string containing the context 49 | prompt: A string containing the prompt 50 | Response: A string containing the Response 51 | 52 | Returns: 53 | A JSON object containing the context, prompt, and Response 54 | """ 55 | 56 | if response : 57 | data = f"### context:\n{context}\n\n### prompt:\n{prompt}\n\n### response:\n{response}\n [eos] \n" 58 | elif context and prompt : 59 | data = f"### context:\n{context}\n\n### prompt:\n{prompt}\n\n### response:\n" 60 | elif not context : 61 | data = f"### prompt:\n{prompt}\n\n### response:\n" 62 | 63 | return data 64 | 65 | @classmethod 66 | def chat_JsonToTxt(cls,path_json,path_txt): 67 | 68 | # Read data from the JSON file 69 | with open(path_json, 'r') as json_file: 70 | data = json.load(json_file) 71 | 72 | # Create a text file to save the data 73 | with open(path_txt, 'w') as txt_file: 74 | # Iterate through the data and write context, prompt, and response to the text file 75 | for chat in data: 76 | context = chat['context'] 77 | prompt = chat['prompt'] 78 | response = chat['response'] 79 | 80 | # Write to the text file 81 | formated_chat = cls.chat_formater(context, prompt, response) 82 | txt_file.write(formated_chat) 83 | txt_file.write('\n') # Add an empty line to separate entries 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /jam_gpt/tokenizer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | 5 | class Tokenizer: 6 | """ 7 | tokenizer class will tokenize the input interms of charater level 8 | based on the the given text_data to get_encoding function it will use vacobalory 9 | if not it will use the default encodings to encode and decode data 10 | """ 11 | 12 | def __init__(self): 13 | pass 14 | 15 | def get_encoding(self, model: str = None): 16 | 17 | self.vocab = Tokenizer.get_char_vocab(model) 18 | self.stoi, self.itos = self.vocab 19 | self.n_vocab = len(self.stoi) 20 | # print(self.vocab)) 21 | # print(self.n_vocab) 22 | 23 | def set_encoding(self, model: str, data: str): 24 | """ 25 | toake text or string data and 
segregate it into vocab and store it in model 26 | """ 27 | # handelling folder not existerror 28 | 29 | Tokenizer.set_char_vocab(model, data) 30 | 31 | def encode(self, s: str) -> list[int]: 32 | # encoder: take a string char , output a list of integers 33 | enc_list = [] 34 | for c in s: 35 | if c not in self.stoi: 36 | self.stoi[c] = max(self.stoi.values())+1 37 | self.n_vocab += 1 38 | enc_list.append(self.stoi[c]) 39 | return enc_list 40 | 41 | def decode(self, l: list[int]) -> str: 42 | # decoder: take a list of integers, output a string 43 | return ''.join([self.itos[str(i)] for i in l]) 44 | 45 | @classmethod 46 | def store_vocab(cls,smodel,md_name: str): 47 | """ 48 | args : 49 | smodel = source model name 50 | md_name = destination model name 51 | source model -> destination model 52 | """ 53 | if not os.path.exists(f"./{md_name}"): 54 | os.makedirs(f"./{md_name}") 55 | spath = f"{smodel}/vocab.json" 56 | dpath = f"{md_name}/vocab.json" 57 | shutil.copy(spath, dpath) 58 | 59 | @classmethod 60 | def get_char_vocab(cls, model: str): 61 | """ 62 | json file -> dict -> dict,dict 63 | """ 64 | path =f"{model}/vocab.json" 65 | with open(path, "r", encoding="utf-8") as f: 66 | data = json.load(f) 67 | stoi = data["stoi"] 68 | itos = data["itos"] 69 | return stoi, itos 70 | 71 | @classmethod 72 | def set_char_vocab(cls, model: str, data: str) -> None: 73 | """ 74 | string data -> vocab -> dict,dict -> dict -> json file 75 | """ 76 | if not os.path.exists(f"./{model}"): 77 | os.makedirs(f"./{model}") 78 | path = f"{model}/vocab.json" 79 | data_chars = sorted(list(set(data))) 80 | stoi = {ch: i for i, ch in enumerate(data_chars)} 81 | itos = {i: ch for i, ch in enumerate(data_chars)} 82 | vocab = {"stoi": stoi, "itos": itos} 83 | with open(path, "w", encoding="utf-8") as f: 84 | json.dump(vocab, f) 85 | # print("writen data string : ",len(data_string)) 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jam-gpt 2 | 3 | ## about : 4 | 5 | > note : its a simple toy llm model builder framework to play around llm models and understand its working principles 6 | 7 | its not a llm model but you can put ur data and train it to build llm models 8 | 9 | An Experimental implementation of **language model (LLM)** architecture for research and development of it architectures, design process to build, training, and fine-tuning efficient **Generative Pretrained Transformers (GPT)** models 10 | 11 | for more ai related tools and framework look into **[OX-AI](https://github.com/ox-ai)** an open source AI project 12 | 13 | 14 | 15 |
16 | 17 |
18 |
19 | 20 | ## Installation : 21 | 22 | ### latest version 23 | > github pull will be clean if encountered with bugs please report issues 24 | ```bash 25 | pip install git+https://github.com/Lokeshwaran-M/jam-gpt.git 26 | ``` 27 | 28 | ### stable release 29 | > jam-gpt==0.0.4 may not have fine tuning as its still under development and may contain bug please report issues if any 30 | 31 | ```bash 32 | pip install jam-gpt 33 | ``` 34 | 35 | 36 | 37 | ## Usage : 38 | 39 | Refere [Docs](./docs/jam-gpt.md) and [test-gptLM.ipynb](test-gptLM.ipynb) for code examples 40 | 41 | ```python 42 | 43 | from jam_gpt.tokenizer import Tokenizer 44 | from jam_gpt import config 45 | from jam_gpt import lm 46 | from jam_gpt.model import Model 47 | 48 | md_name = "md-name" 49 | 50 | tok = Tokenizer() 51 | tok.get_encoding(md_name) 52 | 53 | # model initilization 54 | model = Model() 55 | 56 | # load pretrined model 57 | model.load_model(md_name) 58 | 59 | # Generate data using Model 60 | pmt = tok.encode("user prompt") 61 | res = tok.decode(model.generate(pmt)) 62 | print(res) 63 | 64 | ``` 65 | 66 | ## Docs : 67 | 68 | [Jam-gpt docs](./docs/jam-gpt.md) will give you the complete useage and explanation of the jam-gpt library 69 | 70 | 1 [ setup](./docs/jam-gpt.md#1-setup) 71 | 2 [ Collecting data](./docs/jam-gpt.md#2-collecting-data) 72 | 3 [ Tokenization](./docs/jam-gpt.md#3-tokenization) 73 | 4 [ configuration](./docs/jam-gpt.md#4-configuration) 74 | 5 [ Language Model ( LM , Model )](./docs/jam-gpt.md#5-language-model--lm--model) 75 | 6 [ Model Fine Tuning](./docs/jam-gpt.md#6-model-fine-tuning) 76 | 77 | ## Contribution : 78 | 79 | for contribution guidelines and terms and condition to contribute refere [jam-contribution](https://github.com/Lokeshwaran-M/jam-contribution.git) by rasing the PR you are accepting the terms and condition 80 | 81 | Any form of contribution is accepted here 82 | 83 | Submitting : 84 | Issues 85 | pull requests 86 | feature requests 87 | bug reports 88 | documentation 89 | 90 | ## credits : 91 | 92 | * kudos to [Andrej karpathy](https://github.com/karpathy) for his lectures on deep learning 93 | * [Open AI](https://github.com/openai) for GPT-2 94 | * paper ["Attention Is All You Need"](https://arxiv.org/pdf/1706.03762.pdf) 95 | 96 | -------------------------------------------------------------------------------- /jam_gpt/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import torch 4 | 5 | 6 | # -----------------------------------------------------------# 7 | # hyperparameters 8 | 9 | vocab_size = 0 10 | batch_size = 32 11 | block_size = 256 12 | max_iters = 5000 13 | eval_interval = 250 14 | learning_rate = 1e-3 15 | device = "cuda" if torch.cuda.is_available() else "cpu" 16 | eval_iters = 200 17 | n_embd = 384 18 | n_head = 6 19 | n_layer = 6 20 | dropout = 0.2 21 | model_architecture = None 22 | 23 | # -----------------------------------------------------------# 24 | 25 | 26 | def pass_args(): 27 | return [vocab_size, batch_size, block_size, max_iters, eval_interval, learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture] 28 | 29 | 30 | def get_args(): 31 | args = pass_args() 32 | arg_names = ['vocab_size', 'batch_size', 'block_size', 'max_iters', 'eval_interval', 33 | 'learning_rate', 'device', 'eval_iters', 'n_embd', 'n_head', 'n_layer', 'dropout', 'model_architecture'] 34 | print("# -------------------------------------#\n# hyperparameters") 35 | max_arg_name_length = 
max(len(arg) for arg in arg_names) 36 | for arg_name, arg_value in zip(arg_names, args): 37 | padding = ' ' * (max_arg_name_length - len(arg_name)) 38 | print(f"{arg_name} {padding} : {arg_value}") 39 | print("# -------------------------------------#") 40 | return args 41 | 42 | 43 | def set_args(args: list): 44 | global vocab_size, batch_size, block_size, max_iters, eval_interval, learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture 45 | [vocab_size, batch_size, block_size, max_iters, eval_interval, 46 | learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture] = args 47 | 48 | 49 | def variables_to_dict(): 50 | config_dict = { 51 | "vocab_size": vocab_size, 52 | "batch_size": batch_size, 53 | "block_size": block_size, 54 | "max_iters": max_iters, 55 | "eval_interval": eval_interval, 56 | "learning_rate": learning_rate, 57 | "device": device, 58 | "eval_iters": eval_iters, 59 | "n_embd": n_embd, 60 | "n_head": n_head, 61 | "n_layer": n_layer, 62 | "dropout": dropout, 63 | "model_architecture": model_architecture 64 | } 65 | return config_dict 66 | 67 | 68 | def store(model_name, args=pass_args()): 69 | if not os.path.exists(f"./{model_name}"): 70 | os.makedirs(f"./{model_name}") 71 | path = f"{model_name}/config.json" 72 | set_args(args) 73 | config_dict = variables_to_dict() 74 | with open(path, "w") as config_file: 75 | config_data = {"config_args": args, "config_dict": config_dict} 76 | json.dump(config_data, config_file) 77 | 78 | 79 | def retrive(model_name): 80 | """ 81 | args : model_name 82 | return : config.json 83 | """ 84 | path = f"{model_name}/config.json" 85 | try: 86 | with open(path, "r") as config_file: 87 | config_data = json.load(config_file) 88 | config_data["config_args"][6] = "cuda" if torch.cuda.is_available() else "cpu" 89 | config_data["config_dict"]["device"] = "cuda" if torch.cuda.is_available() else "cpu" 90 | set_args(config_data["config_args"]) 91 | return config_data 92 | except FileNotFoundError: 93 | print("warning : config.json not found using default args") 94 | config_data = {"config_args": pass_args(), "config_dict": variables_to_dict()} 95 | return config_data 96 | 97 | 98 | get_args() 99 | -------------------------------------------------------------------------------- /data-set/jam-data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "context": "Introduction", 4 | "prompt": "Hello, who are you?", 5 | "response": "I am Jam-AGI, your AI assistant." 6 | }, 7 | { 8 | "context": "Introduction", 9 | "prompt": "What can you do?", 10 | "response": "I can answer questions, provide information, and assist with various tasks." 11 | }, 12 | { 13 | "context": "Introduction", 14 | "prompt": "Tell me about yourself.", 15 | "response": "I am an AI assistant built using GPT-3. I'm here to help you with your questions and tasks." 16 | }, 17 | { 18 | "context": "General Question", 19 | "prompt": "What's the weather like today?", 20 | "response": "I'm sorry, I don't have access to real-time information. Please check a weather website or app for the latest updates." 21 | }, 22 | { 23 | "context": "General Question", 24 | "prompt": "Can you recommend a good book to read?", 25 | "response": "Of course! What genre are you interested in?" 
26 | }, 27 | { 28 | "context": "General Question", 29 | "prompt": "How do I cook a perfect omelette?", 30 | "response": "Here's a simple recipe: [Provide omelette recipe]" 31 | }, 32 | { 33 | "context": "General Question", 34 | "prompt": "What's the capital of France?", 35 | "response": "The capital of France is Paris." 36 | }, 37 | { 38 | "context": "General Question", 39 | "prompt": "Tell me a joke.", 40 | "response": "Why did the computer catch a cold? Because it had too many windows open!" 41 | }, 42 | { 43 | "context": "General Question", 44 | "prompt": "What's your favorite color?", 45 | "response": "I don't have personal preferences, but I can help you find information on colors." 46 | }, 47 | { 48 | "context": "Technology", 49 | "prompt": "Explain artificial intelligence.", 50 | "response": "Artificial intelligence (AI) is the simulation of human intelligence processes by machines, typically computer systems. It involves tasks such as learning, reasoning, problem-solving, and understanding natural language." 51 | }, 52 | { 53 | "context": "Technology", 54 | "prompt": "Tell me about machine learning.", 55 | "response": "Machine learning is a subset of AI that focuses on the development of algorithms that allow computers to learn from and make predictions or decisions based on data without being explicitly programmed." 56 | }, 57 | { 58 | "context": "Science", 59 | "prompt": "What is the theory of relativity?", 60 | "response": "The theory of relativity, developed by Albert Einstein, consists of two main parts: special relativity and general relativity. It revolutionized our understanding of space, time, and gravity." 61 | }, 62 | { 63 | "context": "Science", 64 | "prompt": "Can you explain quantum mechanics?", 65 | "response": "Quantum mechanics is a fundamental theory in physics that describes the behavior of matter and energy at the smallest scales, typically at the level of atoms and subatomic particles." 66 | }, 67 | { 68 | "context": "Entertainment", 69 | "prompt": "Who is your favorite actor?", 70 | "response": "I don't have personal preferences, but I can provide information about various actors and their work." 71 | }, 72 | { 73 | "context": "Entertainment", 74 | "prompt": "Recommend a classic movie.", 75 | "response": "How about watching 'Casablanca'? It's a timeless classic with great storytelling." 76 | } 77 | ] -------------------------------------------------------------------------------- /docs/jam-gpt.md: -------------------------------------------------------------------------------- 1 | # Jam-gpt 2 | 3 | documentation for using jam-gpt library 4 | 5 | ## 1 Setup : 6 | 7 | ### Installization 8 | 9 | #### insatll from pip relese 10 | 11 | ```bash 12 | pip insatll jam-gpt 13 | ``` 14 | 15 | #### install it in your local site-packages directory 16 | 17 | ```bash 18 | pip install git+https://github.com/Lokeshwaran-M/jam-gpt.git 19 | ``` 20 | 21 | ### Modified installization 22 | 23 | To add your modification and install in your local site-packages directory 24 | 25 | ```bash 26 | # clone in your project directory 27 | git clone https://github.com/Lokeshwaran-M/jam-gpt.git 28 | cd jam-gpt 29 | 30 | # run pip to move inside your local site-packages directory 31 | pip install . 32 | 33 | # to do modification editable mode 34 | pip install -e . 
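# note: the -e flag installs in editable mode, so local changes to the cloned
# source take effect without reinstalling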
35 | ``` 36 | 37 | ## 2 Collecting Data : 38 | 39 | ```python 40 | from jam_gpt import Data 41 | 42 | # data collection 43 | data=Data.get("path-to-textfile") 44 | ``` 45 | 46 | just to get data from a text data file and return as one large single string for furthere pre processing 47 | 48 | ## 3 Tokenization : 49 | 50 | ```python 51 | from jam_gpt import Tokenizer 52 | 53 | tok = Tokenizer() 54 | 55 | model_name = "md-test" 56 | # tokanization 57 | tok.set_encoding(model_name, data) 58 | tok.get_encoding(model_name) 59 | 60 | vocab_size = tok.n_vocab 61 | 62 | enc = tok.encode("test sample $^&~~data") 63 | dec = tok.decode(enc) 64 | 65 | # out : 66 | # [81, 66, 80, 81, 1, 80, 62, 74, 77, 73, 66, 1, 4, 60, 6, 90, 90, 65, 62, 81, 62] 67 | # test sample $^&~~data 68 | 69 | #to store a pretrained model vocab to finetuned model or other model 70 | tok.store_vocab("source_md-name","md-name") 71 | ``` 72 | 73 | ```python 74 | import tiktoken 75 | 76 | # tokanization using tiktoken 77 | tok = tiktoken.get_encoding("gpt2") 78 | 79 | vocab_size = 50257 80 | 81 | enc = tok.encode("test sample $^&~~data") 82 | dec = tok.decode(enc) 83 | 84 | # out : 85 | # [9288, 6291, 720, 61, 5, 4907, 7890] 86 | # test sample $^&~~data 87 | ``` 88 | 89 | A tokenizer is a tool that breaks down text into smaller units called tokens These tokens can then be processed by an LLM. The tokens can be words, characters, subwords, or other segments of text, depending on the type of LLM and the desired granularity of the text representation. 90 | 91 | ## 4 configuration : 92 | 93 | ```python 94 | from jam_gpt import config 95 | 96 | # customizing parameter settings before initializing model 97 | 98 | args = config.pass_args() 99 | config.vocab_size = 50257 100 | 101 | print(args) 102 | print(config.pass_args()) 103 | 104 | # out : 105 | # [0, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0] 106 | # [50257, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0] 107 | 108 | # To store the customized config setting into model/config.json 109 | config.store(model_name,args) 110 | # To retrive the config settings from model/config.json 111 | config.retrive(model_name) 112 | ``` 113 | The config custamization need to be done before initializing the model 114 | 115 | ## 5 Language Model ( LM , Model ) : 116 | 117 | ### Initilizing model 118 | 119 | ```python 120 | from jam_gpt import lm 121 | from jam_gpt import Model 122 | 123 | # model instantiation 124 | model = Model() 125 | 126 | # setting model architecture 127 | 128 | # GPT Language Model 129 | model.set_model(lm.GPTLM()) 130 | ``` 131 | 132 | ### Traning 133 | 134 | ```python 135 | # prepare data for training ( train , test ) 136 | model.set_data(Data.train_test_split(enc_data)) 137 | 138 | # traning 139 | model.optimize() 140 | model.train() 141 | ``` 142 | 143 | ### Saving model 144 | 145 | ```python 146 | # default bin 147 | model.save_model(model_name) 148 | # can edit model_format 149 | # model_format = pt or pkl 150 | model.save_model(model_name,model_format) 151 | ``` 152 | 153 | ### load model 154 | 155 | ```python 156 | # retrive model parameter settings 157 | config.retrive(md_name) 158 | 159 | # model instantiation 160 | model = Model() 161 | 162 | model.load_model(model_name) 163 | ``` 164 | 165 | ### Generate data using Model 166 | 167 | ```python 168 | pmt = tok.encode("user prompt") 169 | eos = tok.encode(" [eos] ") 170 | res = tok.decode(model.generate(pmt,3000,eos)) 171 | 172 | print(res) 173 | ``` 174 | 175 | ## 6 Model Fine Tuning : 176 | 177 | 
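A built-in fine-tuning API is still under development (see the placeholder below). Until it lands, one possible workaround is to treat fine-tuning as continuing the training of a pretrained checkpoint on new data, reusing only the APIs documented in the sections above. The following is a minimal sketch of that idea; `md-base`, `md-finetuned`, the data path, and the iteration count are placeholders, not part of the library.

```python
from jam_gpt import Data, Tokenizer, config, Model

base_md = "md-base"        # placeholder: directory of an already trained model
tuned_md = "md-finetuned"  # placeholder: name for the fine-tuned copy

# reuse the pretrained vocab and config so token ids keep the same meaning
tok = Tokenizer()
tok.store_vocab(base_md, tuned_md)
tok.get_encoding(tuned_md)
config.retrive(base_md)

# encode the new fine-tuning text with the same vocab
enc_data = tok.encode(Data.get("path-to-finetune-data.txt"))

# load the pretrained weights and continue training on the new data
model = Model()
model.load_model(base_md)
model.set_data(Data.train_test_split(enc_data))
model.optimize()
model.train(max_iters=1000)  # placeholder: usually fewer steps than pretraining

# save under the new model name (also stores its config.json)
model.save_model(tuned_md)
```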
```python 178 | 179 | Coming Soon ........ 180 | 181 | ``` 182 | -------------------------------------------------------------------------------- /test-gptLM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "# -------------------------------------#\n", 13 | "# hyperparameters\n", 14 | "vocab_size : 0\n", 15 | "batch_size : 32\n", 16 | "block_size : 256\n", 17 | "max_iters : 5000\n", 18 | "eval_interval : 250\n", 19 | "learning_rate : 0.001\n", 20 | "device : cuda\n", 21 | "eval_iters : 200\n", 22 | "n_embd : 384\n", 23 | "n_head : 6\n", 24 | "n_layer : 6\n", 25 | "dropout : 0.2\n", 26 | "model_architecture : None\n", 27 | "# -------------------------------------#\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "from jam_gpt import Data, Tokenizer, config, lm, Model\n", 33 | "import tiktoken # type: ignore" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "\n", 43 | "path = \"./data-set/md-to.txt\"\n", 44 | "data = Data.get(path)\n", 45 | "\n", 46 | "config.vocab_size = 50257\n", 47 | "tok = tiktoken.get_encoding(\"gpt2\")\n", 48 | "enc_data = tok.encode(data)\n", 49 | "\n", 50 | "model_name = \"md-t0\"\n", 51 | "\n", 52 | "# model genration\n", 53 | "model = Model()\n", 54 | "\n", 55 | "model.set_model(lm.GPTLM())\n", 56 | "model.set_data(Data.train_test_split(enc_data))\n", 57 | "\n", 58 | "model.optimize()\n", 59 | "model.train()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# model.save_model(model_name)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# model.load_model(model_name)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "I love you read, as overI-52, you are all, a loved are\n", 90 | "THE THEORY OF LOVE 117 whohas him.\n", 91 | "The same emphasis in Both views on aspects, but wearehuman all types of exchange. In the opposite is not that sheent connection between their child, she must it. Hence thinking should receive its veryess Kali; when he does notaches to love; the childanxietyis the child are of giving must love the than to love.Inasmuch when we mean that using all one can do miracles, not do miracles, not liveser to be as much as everything, to receive unconditional—all persons dis nature, is a need to follow the child or mother; he promises he also she remainsutter her, sees only when they belong to her bodily sensitivity toward theattitudes.\n", 92 | "In all of fixation usually occurs after man not thinkingbut as others, but discipline, this need something apart; mother only one has nots, symbolized?\n", 93 | "As more since these viewsis that she\n", 94 | "sto take him, explores the evening, of it remains superficial they are affected by their bliss anxiety. At this path to receiving.\n", 95 | "than the insane. 
One premise,the otherm,the ability more conducive to mother, so on to be an\n", 96 | "poss discussion of the one's own human functioning, and for the object of duty, and we are all shareother with God,the prepondervercoming the mostkft grow from the center of truth. Tosecret is essentially the individualical character, the only one makes them on the children. Sheis that~oved/ is suffer is to be loved by a welove, is impossible for that of in two systems to do not one's breathing;not, but it\n", 97 | "it is now to be taught, such an alienated from whichimwardsity is taken rich. Theillusiones that for external I assume a pattern of the consciousness of this goal of this lack of separateism, it,fyou only the following discmanent and again his value; in our fellowman, and so a \"For\n", 98 | "than the book explores which one/ and to destroy. For the one's development: for the two individuals of the use of meaning of God. The mostele—alienable emphasis on the concept of\n", 99 | "THE THEORY OF LOVE 45phis trend of thought and complain bitterly about the earlieststage in thought. In contrast to live separate words, that all this I aml\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "pmt = tok.encode(\"I love you\")\n", 105 | "print(tok.decode(model.generate(pmt)))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 3", 119 | "language": "python", 120 | "name": "python3" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 3 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython3", 132 | "version": "3.11.9" 133 | }, 134 | "orig_nbformat": 4 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 2 138 | } 139 | -------------------------------------------------------------------------------- /test-old.demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "# -------------------------------------#\n", 13 | "# hyperparameters\n", 14 | "vocab_size : 0\n", 15 | "batch_size : 32\n", 16 | "block_size : 256\n", 17 | "max_iters : 5000\n", 18 | "eval_interval : 250\n", 19 | "learning_rate : 0.001\n", 20 | "device : cuda\n", 21 | "eval_iters : 200\n", 22 | "n_embd : 384\n", 23 | "n_head : 6\n", 24 | "n_layer : 6\n", 25 | "dropout : 0.2\n", 26 | "model_architecture : None\n", 27 | "# -------------------------------------#\n", 28 | "CPU times: user 1.34 s, sys: 1.33 s, total: 2.68 s\n", 29 | "Wall time: 1.22 s\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "%%time\n", 35 | "from jam_gpt import Data, Tokenizer, config, lm, Model" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Time taken to load library :\n", 43 | "\n", 44 | " time : 15.9s" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "vocab size : 101\n", 57 | "parameters : 0.214373 M\n", 58 | "# -------------------------------------#\n", 59 | "# hyperparameters\n", 60 | "vocab_size : 101\n", 
61 | "batch_size : 16\n", 62 | "block_size : 32\n", 63 | "max_iters : 5000\n", 64 | "eval_interval : 100\n", 65 | "learning_rate : 0.001\n", 66 | "device : cpu\n", 67 | "eval_iters : 200\n", 68 | "n_embd : 64\n", 69 | "n_head : 4\n", 70 | "n_layer : 4\n", 71 | "dropout : 0.0\n", 72 | "model_architecture : lm.BigramLM\n", 73 | "# -------------------------------------#\n", 74 | "CPU times: user 955 ms, sys: 347 ms, total: 1.3 s\n", 75 | "Wall time: 1.22 s\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "%%time\n", 81 | "tok = Tokenizer()\n", 82 | "\n", 83 | "\n", 84 | "# path = \"./data-set/linuxsourcecodesnippets.txt\"\n", 85 | "\n", 86 | "# # data collection\n", 87 | "# data = Data.get(path)\n", 88 | "\n", 89 | "# tokanization\n", 90 | "model_name = \"md-t02-bglm\"\n", 91 | "# tok.set_encoding(model_name, data)\n", 92 | "tok.get_encoding(model_name)\n", 93 | "# enc_data = tok.encode(data)\n", 94 | "\n", 95 | "# # setting parameters\n", 96 | "# config.vocab_size = tok.n_vocab\n", 97 | "config.retrive(\"md-t02-bglm\")\n", 98 | "config.device=\"cpu\"\n", 99 | "\n", 100 | "# model genration\n", 101 | "test_model = Model()\n", 102 | "# test_model.set_model(lm.BigramLM())\n", 103 | "# test_model.set_data(Data.train_test_split(enc_data))\n", 104 | "# test_model.optimize()\n", 105 | "# test_model.train()\n", 106 | "test_model.load_model(model_name,args=config.pass_args())\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Traning detials : \n", 114 | " step 4900 : train loss 1.6238, val loss 1.7738\n", 115 | " traning time : 7m 54.9s" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# test_model.save_model(model_name)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 4, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "linux engine\n", 137 | "/*\n", 138 | " * Find a physical address of a virtual object..\n", 139 | " *\n", 140 | " * This is easy using the virtual page table address.\n", 141 | " */\n", 142 | "aread_entry_signd_flags - &&\n", 143 | "numq_backlog_copy(blkcg.list_entry);\n", 144 | "}\n", 145 | "#uf CONFIL_MMPIN)\n", 146 | "\n", 147 | "/*\n", 148 | " * Eewarity_idx_init_of(wq, info, entry, struct filt *info, \"after\\n\", true;\n", 149 | "\tstimct eventpol_shm_fs(char->show,final_entry_chunk_bytes &&\n", 150 | "#inc 0,\n", 151 | "\t.iostic\t\t= bio_put_current_lock();\n", 152 | "\n", 153 | "\treturn long rb_lock++)\n", 154 | "\t\treturn alg;\n", 155 | "\t\tif (struct epitem; i err)\n", 156 | "{\n", 157 | "\tsize = kmq_wait_work(rq->wq);\n", 158 | "\n", 159 | "\tif (info->wait_country(CONFIG_NO *, 0)\n", 160 | "\t\treturn -ENOEMEC(\"Cown of thrating listrimitions, the last begup han it IPC__ENOMEM t\n", 161 | "CPU times: user 8.74 s, sys: 27 ms, total: 8.76 s\n", 162 | "Wall time: 3.52 s\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "%%time\n", 168 | "pmt = tok.encode(\"\"\"linux engine\n", 169 | "/*\n", 170 | " * Find a physical address of a virtual object..\n", 171 | " *\n", 172 | " * This is easy using the virtual page table address.\n", 173 | " */\n", 174 | "\"\"\")\n", 175 | "print(tok.decode(test_model.generate(pmt)))" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | 
"codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.11.9" 196 | }, 197 | "orig_nbformat": 4 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 2 201 | } 202 | -------------------------------------------------------------------------------- /jam_gpt/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import torch 4 | from . import config 5 | from . import lm 6 | 7 | torch.manual_seed(1337) 8 | 9 | 10 | class Model: 11 | """class model to work on lm models""" 12 | 13 | def __init__(self): 14 | [ 15 | self.vocab_size, 16 | self.batch_size, 17 | self.block_size, 18 | self.max_iters, 19 | self.eval_interval, 20 | self.learning_rate, 21 | self.device, 22 | self.eval_iters, 23 | self.n_embd, 24 | self.n_head, 25 | self.n_layer, 26 | self.dropout, 27 | self.model_architecture, 28 | ] = config.pass_args() 29 | 30 | self.model = None 31 | 32 | self.train_data = None 33 | self.test_data = None 34 | 35 | def set_parameters(self, args: list): 36 | [ 37 | self.vocab_size, 38 | self.batch_size, 39 | self.block_size, 40 | self.max_iters, 41 | self.eval_interval, 42 | self.learning_rate, 43 | self.device, 44 | self.eval_iters, 45 | self.n_embd, 46 | self.n_head, 47 | self.n_layer, 48 | self.dropout, 49 | self.model_architecture, 50 | ] = args 51 | 52 | def get_parameters(self): 53 | return [ 54 | self.vocab_size, 55 | self.batch_size, 56 | self.block_size, 57 | self.max_iters, 58 | self.eval_interval, 59 | self.learning_rate, 60 | self.device, 61 | self.eval_iters, 62 | self.n_embd, 63 | self.n_head, 64 | self.n_layer, 65 | self.dropout, 66 | self.model_architecture, 67 | ] 68 | 69 | def set_model(self, model): 70 | self.model_architecture = f"lm.{model.__class__.__name__}" 71 | self.model = model 72 | self.m = self.model.to(self.device) 73 | # print the number of parameters in the model 74 | print("vocab size : ", self.vocab_size) 75 | print("parameters : ", sum(p.numel() for p in self.m.parameters()) / 1e6, " M") 76 | config.get_args() 77 | return self.m 78 | 79 | def set_data(self, data): 80 | self.train_data = data[0] 81 | self.test_data = data[1] 82 | 83 | def get_batch(self, split): 84 | # generate small batch of data of input -> x and targets -> y 85 | data = self.train_data if split == "train" else self.test_data 86 | ix = torch.randint(len(data) - self.block_size, (self.batch_size,)) 87 | x = torch.stack([data[i : i + self.block_size] for i in ix]) 88 | y = torch.stack([data[i + 1 : i + self.block_size + 1] for i in ix]) 89 | x, y = x.to(self.device), y.to(self.device) 90 | return x, y 91 | 92 | @torch.no_grad() 93 | def estimate_loss(self): 94 | # estimates the loss of model by eval using test data 95 | out = {} 96 | self.model.eval() 97 | for split in ["train", "val"]: 98 | losses = torch.zeros(self.eval_iters) 99 | for k in range(self.eval_iters): 100 | X, Y = self.get_batch(split) 101 | logits, loss = self.model(X, Y) 102 | losses[k] = loss.item() 103 | out[split] = losses.mean() 104 | self.model.train() 105 | return out 106 | 107 | def optimize(self): 108 | self.optimizer = torch.optim.AdamW( 109 | self.model.parameters(), lr=self.learning_rate 110 | ) 111 | 112 | def train(self, max_iters=None): 113 | if not max_iters: 114 | max_iters = self.max_iters 115 | for iter in range(max_iters): 116 | # every once in a 
while evaluate the loss on train and val sets 117 | if iter % self.eval_interval == 0: 118 | losses = self.estimate_loss() 119 | print( 120 | f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}" 121 | ) 122 | 123 | # sample a batch of data 124 | xb, yb = self.get_batch("train") 125 | 126 | # evaluate the loss 127 | logits, loss = self.model(xb, yb) 128 | self.optimizer.zero_grad(set_to_none=True) 129 | loss.backward() 130 | self.optimizer.step() 131 | 132 | def generate(self, prompt, max_new_tokens=500, eos_token=None): 133 | """ 134 | TO generate response from model 135 | """ 136 | # generate from the model 137 | # context = torch.zeros((1, 1), dtype=torch.long, device=self.device) 138 | tensor_prompt = torch.tensor(prompt, dtype=torch.long, device=self.device)[ 139 | None, ... 140 | ] 141 | if eos_token: 142 | tensor_eos_token = torch.tensor(eos_token, dtype=torch.long, device=self.device)[ 143 | None, ... 144 | ] 145 | else : 146 | tensor_eos_token = None 147 | 148 | return self.m.generate(tensor_prompt, max_new_tokens, tensor_eos_token)[ 149 | 0 150 | ].tolist() 151 | 152 | def save_model(self, model_name, model_format="bin"): 153 | # to save model 154 | if not os.path.exists(f"./{model_name}"): 155 | os.makedirs(f"./{model_name}") 156 | path = f"{model_name}/{model_name}.{model_format}" 157 | if model_format == "bin" or model_format == "pt": 158 | torch.save(self.model.state_dict(), path) 159 | elif model_format == "pkl": 160 | with open(path, "wb") as f: 161 | pickle.dump(self.model, f) 162 | else: 163 | print(f"given model format : {model_format} is not supported") 164 | 165 | # to save config info 166 | config.store(model_name, self.get_parameters()) 167 | 168 | def load_model(self, model_name, model_format="bin", args=None): 169 | if args: 170 | self.set_parameters(args) 171 | else: 172 | # to load config info 173 | config_data = config.retrive(model_name) 174 | self.set_parameters(config_data["config_args"]) 175 | 176 | # to load model 177 | path = f"{model_name}/{model_name}.{model_format}" 178 | if model_format == "bin" or model_format == "pt": 179 | cls_model_architecture = eval(self.model_architecture) 180 | self.set_model(cls_model_architecture()) 181 | self.model.load_state_dict(torch.load(path,map_location=torch.device(config_data["config_dict"]["device"]))) 182 | elif model_format == "pkl": 183 | with open(path, "rb") as f: 184 | loaded_model = pickle.load(f) 185 | self.set_model(loaded_model) 186 | 187 | self.model.eval() 188 | # return self.model 189 | -------------------------------------------------------------------------------- /jam_gpt/lm.py: -------------------------------------------------------------------------------- 1 | """ 2 | B0rrowed form Andreg karpathy : karpathy/nanoGPT 3 | https://github.com/karpathy/nanoGPT/blob/master/model.py 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn import functional as F 9 | from . 
import config 10 | 11 | torch.manual_seed(1337) 12 | 13 | 14 | """ 15 | defines a language model using PyTorch 16 | based on the Transformer architecture attention Mechanism 17 | """ 18 | [ 19 | vocab_size, 20 | batch_size, 21 | block_size, 22 | max_iters, 23 | eval_interval, 24 | learning_rate, 25 | device, 26 | eval_iters, 27 | n_embd, 28 | n_head, 29 | n_layer, 30 | dropout, 31 | model_architecture, 32 | ] = config.pass_args() 33 | 34 | 35 | def set_parameters(args): 36 | global vocab_size, batch_size, block_size, max_iters, eval_interval, learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture 37 | [ 38 | vocab_size, 39 | batch_size, 40 | block_size, 41 | max_iters, 42 | eval_interval, 43 | learning_rate, 44 | device, 45 | eval_iters, 46 | n_embd, 47 | n_head, 48 | n_layer, 49 | dropout, 50 | model_architecture, 51 | ] = args 52 | 53 | 54 | class Head(nn.Module): 55 | """one head of self-attention""" 56 | 57 | def __init__(self, head_size): 58 | super().__init__() 59 | self.key = nn.Linear(n_embd, head_size, bias=False) 60 | self.query = nn.Linear(n_embd, head_size, bias=False) 61 | self.value = nn.Linear(n_embd, head_size, bias=False) 62 | self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size))) 63 | 64 | self.dropout = nn.Dropout(dropout) 65 | 66 | def forward(self, x): 67 | B, T, C = x.shape 68 | k = self.key(x) # (B,T,C) 69 | q = self.query(x) # (B,T,C) 70 | # compute attention scores ("affinities") 71 | # (B, T, C) @ (B, C, T) -> (B, T, T) 72 | wei = q @ k.transpose(-2, -1) * C**-0.5 73 | wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf")) # (B, T, T) 74 | wei = F.softmax(wei, dim=-1) # (B, T, T) 75 | wei = self.dropout(wei) 76 | # perform the weighted aggregation of the values 77 | v = self.value(x) # (B,T,C) 78 | out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C) 79 | return out 80 | 81 | 82 | class MultiHeadAttention(nn.Module): 83 | """multiple heads of self-attention in parallel""" 84 | 85 | def __init__(self, num_heads, head_size): 86 | super().__init__() 87 | self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) 88 | self.proj = nn.Linear(n_embd, n_embd) 89 | self.dropout = nn.Dropout(dropout) 90 | 91 | def forward(self, x): 92 | out = torch.cat([h(x) for h in self.heads], dim=-1) 93 | out = self.dropout(self.proj(out)) 94 | return out 95 | 96 | 97 | class FeedFoward(nn.Module): 98 | """a simple linear layer followed by a non-linearity""" 99 | 100 | def __init__(self, n_embd): 101 | super().__init__() 102 | self.net = nn.Sequential( 103 | nn.Linear(n_embd, 4 * n_embd), 104 | nn.ReLU(), 105 | nn.Linear(4 * n_embd, n_embd), 106 | nn.Dropout(dropout), 107 | ) 108 | 109 | def forward(self, x): 110 | return self.net(x) 111 | 112 | 113 | class Block(nn.Module): 114 | """Transformer block: communication followed by computation""" 115 | 116 | def __init__(self, n_embd, n_head): 117 | # n_embd: embedding dimension, n_head: the number of heads we'd like 118 | super().__init__() 119 | head_size = n_embd // n_head 120 | self.sa = MultiHeadAttention(n_head, head_size) 121 | self.ffwd = FeedFoward(n_embd) 122 | self.ln1 = nn.LayerNorm(n_embd) 123 | self.ln2 = nn.LayerNorm(n_embd) 124 | 125 | def forward(self, x): 126 | x = x + self.sa(self.ln1(x)) 127 | x = x + self.ffwd(self.ln2(x)) 128 | return x 129 | 130 | 131 | # ---------------------------models------------------------------- 132 | 133 | 134 | class GPTLM(nn.Module): 135 | """ 136 | GPT-style approach where each token predicts the next token 137 | """ 
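    # shape flow through the model:
    #   idx (B, T) token ids
    #     -> token_embedding_table + position_embedding_table -> (B, T, n_embd)
    #     -> n_layer x Block (masked multi-head self-attention + feed-forward)
    #     -> ln_f (LayerNorm) -> lm_head -> (B, T, vocab_size) logits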
138 | 139 | def __init__(self): 140 | super().__init__() 141 | # setting updated parameters 142 | set_parameters(config.pass_args()) 143 | 144 | # each token directly reads off the logits for the next token from a lookup table 145 | self.token_embedding_table = nn.Embedding(vocab_size, n_embd) 146 | self.position_embedding_table = nn.Embedding(block_size, n_embd) 147 | self.blocks = nn.Sequential( 148 | *[Block(n_embd, n_head=n_head) for _ in range(n_layer)] 149 | ) 150 | self.ln_f = nn.LayerNorm(n_embd) # final layer norm 151 | self.lm_head = nn.Linear(n_embd, vocab_size) 152 | 153 | # better init, not covered in the original GPT video, but important, will cover in followup video 154 | self.apply(self._init_weights) 155 | 156 | def _init_weights(self, module): 157 | if isinstance(module, nn.Linear): 158 | torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) 159 | if module.bias is not None: 160 | torch.nn.init.zeros_(module.bias) 161 | elif isinstance(module, nn.Embedding): 162 | torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) 163 | 164 | def forward(self, idx, targets=None): 165 | B, T = idx.shape 166 | 167 | # idx and targets are both (B,T) tensor of integers 168 | tok_emb = self.token_embedding_table(idx) # (B,T,C) 169 | pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C) 170 | x = tok_emb + pos_emb # (B,T,C) 171 | x = self.blocks(x) # (B,T,C) 172 | x = self.ln_f(x) # (B,T,C) 173 | logits = self.lm_head(x) # (B,T,vocab_size) 174 | 175 | if targets is None: 176 | loss = None 177 | else: 178 | B, T, C = logits.shape 179 | logits = logits.view(B * T, C) 180 | targets = targets.view(B * T) 181 | loss = F.cross_entropy(logits, targets) 182 | 183 | return logits, loss 184 | 185 | def generate(self, idx, max_new_tokens, eos_token=None): 186 | # idx is (B, T) array of indices in the current context 187 | for _ in range(max_new_tokens): 188 | # crop idx to the last block_size tokens 189 | # idx_cond = idx[:, -block_size:] 190 | idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:] 191 | # get the predictions 192 | logits, loss = self(idx_cond) 193 | # focus only on the last time step 194 | logits = logits[:, -1, :] # becomes (B, C) 195 | # apply softmax to get probabilities 196 | probs = F.softmax(logits, dim=-1) # (B, C) 197 | # sample from the distribution 198 | idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) 199 | # append sampled index to the running sequence 200 | idx = torch.cat((idx, idx_next), dim=1) # (B, T+1) 201 | 202 | # Check if the last tokens in idx match eos_token 203 | if eos_token!=None: 204 | if idx.size(1) >= eos_token.size(1) and torch.equal( 205 | idx[:, -eos_token.size(1) :], eos_token 206 | ): 207 | return idx 208 | 209 | return idx 210 | 211 | 212 | class JamLM(nn.Module): 213 | """ 214 | a new neural schema 215 | """ 216 | 217 | def __init__(self): 218 | super().__init__() 219 | pass 220 | -------------------------------------------------------------------------------- /test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "# -------------------------------------#\n", 13 | "# hyperparameters\n", 14 | "vocab_size : 0\n", 15 | "batch_size : 32\n", 16 | "block_size : 256\n", 17 | "max_iters : 5000\n", 18 | "eval_interval : 250\n", 19 | "learning_rate : 0.001\n", 20 | "device : 
cuda\n", 21 | "eval_iters : 200\n", 22 | "n_embd : 384\n", 23 | "n_head : 6\n", 24 | "n_layer : 6\n", 25 | "dropout : 0.2\n", 26 | "model_architecture : None\n", 27 | "# -------------------------------------#\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "from jam_gpt import Data, Tokenizer, config, lm, Model" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "# -------------------------------------#\n", 45 | "# hyperparameters\n", 46 | "vocab_size : 101\n", 47 | "batch_size : 16\n", 48 | "block_size : 32\n", 49 | "max_iters : 5000\n", 50 | "eval_interval : 100\n", 51 | "learning_rate : 0.001\n", 52 | "device : cuda\n", 53 | "eval_iters : 200\n", 54 | "n_embd : 64\n", 55 | "n_head : 4\n", 56 | "n_layer : 4\n", 57 | "dropout : 0.0\n", 58 | "model_architecture : lm.BigramLM\n", 59 | "# -------------------------------------#\n", 60 | "[101, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0, 'lm.BigramLM']\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "tok = Tokenizer()\n", 66 | "\n", 67 | "\n", 68 | "# path = \"./data-set/linuxsourcecodesnippets.txt\"\n", 69 | "\n", 70 | "# # data collection\n", 71 | "# data = Data.get(path)\n", 72 | "\n", 73 | "# tokanization\n", 74 | "model_name = \"md-t02-bglm-rerun\"\n", 75 | "# tok.set_encoding(model_name, data)\n", 76 | "tok.get_encoding(model_name)\n", 77 | "# enc_data = tok.encode(data)\n", 78 | "\n", 79 | "# setting parameters\n", 80 | "# config.vocab_size = tok.n_vocab\n", 81 | "\n", 82 | "\n", 83 | "# model genration\n", 84 | "model = Model()\n", 85 | "# model.set_model(lm.BigramLM())\n", 86 | "# model.set_data(Data.train_test_split(enc_data))\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "vocab size : 101\n", 99 | "parameters : 0.214373 M\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "model.load_model(model_name)\n", 105 | "# model.optimize()\n", 106 | "# model.train()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "### Traning detials : \n", 114 | " step 4900: train loss 1.3497, val loss 1.6127\n", 115 | " step 4900: train loss 1.5836, val loss 1.7533\n", 116 | " step 4900: train loss 1.4185, val loss 1.6682\n", 117 | " step 4900: train loss 1.3792, val loss 1.6416" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "* what is the linux file system command lstaign, Onlyss this atomically per itempt. 
Sause is not\n", 130 | "\t\t * drop end.\n", 131 | "\t\t\t new_bio_bio_cause(BLK_SECINIT(mmdelk_current_ffd(fmt, SEC, aead-addr, thimum);\n", 132 | " * Meter/sending seforminal12/init, this name texply in\n", 133 | " * the procsronally to\n", 134 | " * base default can orcharry and, as only all finally rismet and propirs.\n", 135 | " */\n", 136 | "void bfq_addrt_note(void);\n", 137 | "\n", 138 | "/*\n", 139 | " * The write it initializy in rate virariations ->samplete {\n", 140 | "\t\tdong *from;\n", 141 | "\n", 142 | "\t\tif (wbc->alg->bacck_lzsib(iop)) {\n", 143 | "\t\t\tinfo->dun_info->cval_info->nsigned || \n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "pmt = tok.encode(\"\"\"* what is the linux file system command ls\"\"\")\n", 149 | "print(tok.decode(model.generate(pmt)))" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# model.save_model(\"md-t02-bglm-rerun\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 1, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "# -------------------------------------#\n", 171 | "# hyperparameters\n", 172 | "vocab_size : 0\n", 173 | "batch_size : 32\n", 174 | "block_size : 256\n", 175 | "max_iters : 5000\n", 176 | "eval_interval : 250\n", 177 | "learning_rate : 0.001\n", 178 | "device : cuda\n", 179 | "eval_iters : 200\n", 180 | "n_embd : 384\n", 181 | "n_head : 6\n", 182 | "n_layer : 6\n", 183 | "dropout : 0.2\n", 184 | "model_architecture : None\n", 185 | "# -------------------------------------#\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "\n", 191 | "\n", 192 | "from jam_gpt.data import Data\n", 193 | "\n", 194 | "\n", 195 | "d = Data.formater(\n", 196 | "\"\"\"\n", 197 | "4927\n", 198 | "\"\"\n", 199 | "\n", 200 | "import os\n", 201 | "import pickle\n", 202 | "import torch\n", 203 | "from . 
import config\n", 204 | "\n", 205 | "\n", 206 | "torch.manual_seed(1337)\n", 207 | "\n", 208 | "\n", 209 | "class Model:\n", 210 | " \n", 211 | "\n", 212 | " def __init__(self):\n", 213 | " [self.vocab_size, self.batch_size, self.block_size, self.max_iters, self.eval_interval, self.learning_rate,\n", 214 | " self.device, self.eval_iters, self.n_embd, self.n_head, self.n_layer, self.dropout] = config.pass_args()\n", 215 | "\n", 216 | " self.model = None\n", 217 | "\n", 218 | " self.train_data = None\n", 219 | " self.test_data = None\n", 220 | "\n", 221 | " def set_parameters(self, args: list):\n", 222 | " [self.vocab_size, self.batch_size, self.block_size, self.max_iters, self.eval_interval, self.learning_rate,\n", 223 | " self.device, self.eval_iters, self.n_embd, self.n_head, self.n_layer, self.dropout] = args\n", 224 | "\n", 225 | " def get_parameters(self):\n", 226 | " return [self.vocab_size, self.batch_size, self.block_size, self.max_iters, self.eval_interval, self.learning_rate, self.device, self.eval_iters, self.n_embd, self.n_head, self.n_layer, self.dropout]\n", 227 | "\n", 228 | " def set_model(self, model):\n", 229 | " self.model = model\n", 230 | " self.m = self.model.to(self.device)\n", 231 | " # print the number of parameters in the model\n", 232 | " print(\"vocab size : \", self.vocab_s\n", 233 | "\n", 234 | "\"\"\",\n", 235 | "\"\"\"\n", 236 | "write a function that takes three string context prompt output as input and returns output as json object\n", 237 | "{context: \"context string\" ,prompt: \"prompt sting\",output: \"output string \" }\n", 238 | "View other drafts\n", 239 | "\n", 240 | "\n", 241 | "\n", 242 | "\n", 243 | "\n", 244 | "\n", 245 | "\n", 246 | "\n", 247 | "\n", 248 | "\n", 249 | "\n", 250 | "Sure, here is a Python function that takes three strings (context, prompt, and output) as input and returns an output JSON object:\n", 251 | "\n", 252 | "Python\n", 253 | "import json\n", 254 | "\n", 255 | "def create_json_object(context, prompt, output):\n", 256 | " \n", 257 | "\"\"\"\n", 258 | ")\n", 259 | "\n", 260 | "print(d)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 2, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "# -------------------------------------#\n", 273 | "# hyperparameters\n", 274 | "vocab_size : 0\n", 275 | "batch_size : 32\n", 276 | "block_size : 256\n", 277 | "max_iters : 5000\n", 278 | "eval_interval : 250\n", 279 | "learning_rate : 0.001\n", 280 | "device : cuda\n", 281 | "eval_iters : 200\n", 282 | "n_embd : 384\n", 283 | "n_head : 6\n", 284 | "n_layer : 6\n", 285 | "dropout : 0.2\n", 286 | "model_architecture : None\n", 287 | "# -------------------------------------#\n", 288 | "# -------------------------------------#\n", 289 | "# hyperparameters\n", 290 | "vocab_size : 101\n", 291 | "batch_size : 16\n", 292 | "block_size : 32\n", 293 | "max_iters : 5000\n", 294 | "eval_interval : 100\n", 295 | "learning_rate : 0.001\n", 296 | "device : cuda\n", 297 | "eval_iters : 50000\n", 298 | "n_embd : 64\n", 299 | "n_head : 4\n", 300 | "n_layer : 4\n", 301 | "dropout : 0.0\n", 302 | "model_architecture : lm.BigramLM\n", 303 | "# -------------------------------------#\n" 304 | ] 305 | }, 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "[101, 16, 32, 5000, 100, 0.001, 'cuda', 50000, 64, 4, 4, 0.0, 'lm.BigramLM']" 310 | ] 311 | }, 312 | "execution_count": 2, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | 
"source": [ 318 | "config.get_args()\n", 319 | "\n", 320 | "config.retrive(\"md-t02-bglm\")\n", 321 | "\n", 322 | "config.eval_iters = 50000\n", 323 | "\n", 324 | "\n", 325 | "config.get_args()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 4, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "[2, 30, 17, 71, 81, 85, 32, 2]\n", 338 | "[2, 61, 71, 81, 85, 63]\n", 339 | "[2, 61, 71, 81, 85, 63, 2]\n", 340 | "[61, 71, 81, 85, 63, 2]\n", 341 | "[76, 70, 2, 75, 85, 2, 86, 74, 71, 84, 71, 2, 61, 71, 81, 85, 63, 2, 75, 86, 2, 75, 85]\n", 342 | "[76, 70, 2, 75, 85, 2, 86, 74, 71, 84, 71, 2, 61, 71, 81, 85, 63, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 75, 86, 2, 75, 85]\n", 343 | "[70, 67, 86, 67, 2, 31, 2, 72, 4, 5, 5, 5, 2, 69, 81, 80, 86, 71, 90, 86, 28, 1, 93, 69, 81, 80, 86, 71, 90, 86, 95, 1, 1, 5, 5, 5, 2, 82, 84, 81, 79, 82, 86, 28, 1, 93, 82, 84, 81, 79, 82, 86, 95, 1, 1, 5, 5, 5, 2, 84, 71, 85, 82, 81, 80, 85, 71, 28, 1, 93, 84, 71, 85, 82, 81, 80, 85, 71, 95, 1, 1, 2, 61, 71, 81, 85, 63, 2, 4, 1]\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "import tiktoken\n", 349 | "import torch\n", 350 | "# tokanization using tiktoken\n", 351 | "# tok = tiktoken.get_encoding(\"gpt2\")\n", 352 | "tok = Tokenizer()\n", 353 | "tok.get_encoding(\"md-t02-bglm\")\n", 354 | "\n", 355 | "vocab_size = 50257\n", 356 | "\n", 357 | "\n", 358 | "enc1 = tok.encode(\"\"\"data = f\"### context:\\n{context}\\n\\n### prompt:\\n{prompt}\\n\\n### response:\\n{response}\\n\\n [eos] \"\n", 359 | "\"\"\")\n", 360 | "print(tok.encode(\"\"\" \"\"\"))\n", 361 | "print(tok.encode(\" [eos]\"))\n", 362 | "\n", 363 | "print(tok.encode(\"\"\" [eos] \"\"\"))\n", 364 | "print(tok.encode(\"\"\"[eos] \"\"\"))\n", 365 | "print(tok.encode(\"\"\"jd is there [eos] it is\"\"\"))\n", 366 | "print(tok.encode(\"\"\"jd is there [eos]\n", 367 | " it is\"\"\"))\n", 368 | "print(enc1)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 37, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "name": "stdout", 378 | "output_type": "stream", 379 | "text": [ 380 | "[7359, 68, 418, 29, 220]\n", 381 | "[58, 68, 418, 60]\n", 382 | "[685, 68, 418, 60]\n", 383 | "[685, 68, 418, 60, 220]\n", 384 | "[58, 68, 418, 60, 220]\n", 385 | "[7890, 796, 277, 1, 21017, 4732, 25, 198, 90, 22866, 92, 198, 198, 21017, 6152, 25, 198, 90, 16963, 457, 92, 198, 198, 21017, 2882, 25, 198, 90, 26209, 92, 628, 685, 68, 418, 60, 366, 198]\n", 386 | "end\n" 387 | ] 388 | } 389 | ], 390 | "source": [ 391 | "\n", 392 | "\n", 393 | "idx = [7890, 796, 277, 1, 21017, 4732, 25, 198, 90, 22866, 92, 198, 198, 21017, 6152, 25, 198, 90, 16963, 457, 92, 198, 198, 21017, 2882, 25, 198, 90, 26209, 92, 628, 685, 68, 418, 60]\n", 394 | "\n", 395 | "eos_token = [685, 68, 418, 60]\n", 396 | "\n", 397 | "\n", 398 | "\n", 399 | "idx = (torch.tensor(\n", 400 | " idx, dtype=torch.long, device=\"cuda\")[None, ...])\n", 401 | "eos_token = (torch.tensor(\n", 402 | " eos_token, dtype=torch.long, device=\"cuda\")[None, ...])\n", 403 | "def gen():\n", 404 | "\n", 405 | " # Check if the last 4 tokens in idx match eos_token\n", 406 | " if idx.size(1) >= eos_token.size(1) and torch.equal(idx[:, -eos_token.size(1):], eos_token):\n", 407 | " print(\"end\")\n", 408 | "\n", 409 | "gen()\n", 410 | "\n" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 19, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | 
"text/plain": [ 421 | "(tensor([[ 7890, 796, 277, 1, 21017, 4732, 25, 198, 90, 22866,\n", 422 | " 92, 198, 198, 21017, 6152, 25, 198, 90, 16963, 457,\n", 423 | " 92, 198, 198, 21017, 2882, 25, 198, 90, 26209, 92,\n", 424 | " 628, 685, 68, 418, 60]], device='cuda:0'),\n", 425 | " tensor([[685, 68, 418, 60]], device='cuda:0'),\n", 426 | " 35,\n", 427 | " tensor([[685, 68, 418, 60]], device='cuda:0'),\n", 428 | " 4)" 429 | ] 430 | }, 431 | "execution_count": 19, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "idx , eos_token , idx.size(1) , idx[:, -4:] , eos_token.size(1)" 438 | ] 439 | } 440 | ], 441 | "metadata": { 442 | "kernelspec": { 443 | "display_name": "Python 3", 444 | "language": "python", 445 | "name": "python3" 446 | }, 447 | "language_info": { 448 | "codemirror_mode": { 449 | "name": "ipython", 450 | "version": 3 451 | }, 452 | "file_extension": ".py", 453 | "mimetype": "text/x-python", 454 | "name": "python", 455 | "nbconvert_exporter": "python", 456 | "pygments_lexer": "ipython3", 457 | "version": "3.11.6" 458 | }, 459 | "orig_nbformat": 4 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 2 463 | } 464 | -------------------------------------------------------------------------------- /img/jam-img.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | --------------------------------------------------------------------------------