├── requirements.txt
├── img
│   ├── jam-img.jpg
│   ├── jam-img.png
│   └── jam-img.svg
├── docs
│   ├── 1706.03762.pdf
│   ├── jam-convo.md
│   └── jam-gpt.md
├── jam_gpt
│   ├── __init__.py
│   ├── data.py
│   ├── tokenizer.py
│   ├── config.py
│   ├── model.py
│   └── lm.py
├── .gitignore
├── note.md
├── data-set
│   ├── data_graber.py
│   └── jam-data.json
├── setup.py
├── test-drive
│   └── test_drive.py
├── LICENSE
├── README.md
├── test-gptLM.ipynb
├── test-old.demo.ipynb
└── test.ipynb
/requirements.txt:
--------------------------------------------------------------------------------
1 | setuptools
2 | torch>=2.0.0
3 | tiktoken
--------------------------------------------------------------------------------
/img/jam-img.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loke-x/jam-gpt/HEAD/img/jam-img.jpg
--------------------------------------------------------------------------------
/img/jam-img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loke-x/jam-gpt/HEAD/img/jam-img.png
--------------------------------------------------------------------------------
/docs/1706.03762.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loke-x/jam-gpt/HEAD/docs/1706.03762.pdf
--------------------------------------------------------------------------------
/jam_gpt/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .data import Data as Data
3 | from .tokenizer import Tokenizer as Tokenizer
4 | from .model import Model as Model
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # models
3 | # md-test-01/
4 |
5 | # model
6 | # *.bin
7 |
8 | __pycache__/
9 | *.pyc
10 | .pypirc
11 | dist/
12 |
13 |
14 | # datasets
15 |
16 |
17 | *.egg-info/
--------------------------------------------------------------------------------
/docs/jam-convo.md:
--------------------------------------------------------------------------------
1 | ```py
2 | eos_token = tok.encode(""" [eos]""")
3 |
4 | eos = tok.encode(" [eos]")
5 | print(tok.decode(model.generate(pmt,max_new_tokens=3000,eos_token=eos)))
6 | ```
7 |
8 |
--------------------------------------------------------------------------------
/note.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## to publish the next release on PyPI : jam-gpt-0.0.5
4 |
5 | ### run :
6 |
7 | ```
8 | python setup.py sdist
9 | ```
10 | > make sure requirements.txt is included inside the zipped sdist, e.g. at :
11 | ./dist/jam-gpt-0.0.5.tar.gz/jam-gpt-0.0.5
12 | ./dist/jam-gpt-0.0.5.tar.gz/jam-gpt-0.0.5/jam_gpt.egg-info/SOURCES.txt
13 |
14 | ### then run
15 |
16 | ```
17 | twine upload dist/jam-gpt-0.0.5.tar.gz
18 | ```
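
### optionally verify the sdist before upload

a quick sanity check (a minimal sketch in python, assuming the 0.0.5 paths above) that requirements.txt really got bundled into the tarball :

```
import tarfile

# list the sdist contents and confirm requirements.txt is bundled
with tarfile.open("dist/jam-gpt-0.0.5.tar.gz") as sdist:
    names = sdist.getnames()
    print("requirements.txt included :", any(n.endswith("requirements.txt") for n in names))
```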
--------------------------------------------------------------------------------
/data-set/data_graber.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | 
3 | # Load the DailyDialog dataset.
4 | dataset = load_dataset("daily_dialog")
5 | 
6 | # Select the first 5000 dialogues from the training split.
7 | dialogues = dataset["train"]["dialog"][:5000]
8 | 
9 | # Write the dialogues to a text file, one dialogue per line.
10 | print("writing ....")
11 | with open("dialogues.txt", "w") as f:
12 | for dialogue in dialogues:
13 | f.write(str(dialogue) + "\n")
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | from setuptools import setup, find_packages
3 |
4 | setup(
5 | name='jam-gpt',
6 | version='0.0.4',
7 | description='A reimplementation of large language model (LLM) architectures designed for research and development processes',
8 | author='Lokeshwaran M',
9 | author_email='lokeshwaran.m23072003@gmail.com',
10 | url="https://github.com/Lokeshwaran-M/jam-gpt.git",
11 | license="MIT",
12 | packages=find_packages(),
13 | package_data={'': ['requirements.txt', 'README.md']},
14 | install_requires=open('requirements.txt').read().splitlines(),
15 | keywords='jam-gpt Jam-AI Jam-AGI',
16 | )
17 |
18 |
19 | # install_requires=["setuptools==67.8.0","torch==2.0.1","tiktoken"]
20 |
21 |
--------------------------------------------------------------------------------
/test-drive/test_drive.py:
--------------------------------------------------------------------------------
1 | # testing the Data, Tokenizer, and config classes
2 |
3 | from jam_gpt import Data, Tokenizer, config
4 |
5 | tok = Tokenizer()
6 |
7 | def t_tokenizer(path,text_input = "test sample data"):
8 |
9 | text = Data.get(path)
10 | tok.set_encoding("md-test-01",text)
11 | tok.get_encoding("md-test-01")
12 | vocab_size = tok.n_vocab
13 | print(vocab_size)
14 | enc = tok.encode(text_input)
15 | print(enc)
16 | dec = tok.decode(enc)
17 | print(dec)
18 |
19 | # # output ::
20 | # 2
21 | # [75, 60, 74, 75, 1, 74, 56, 68, 71, 67, 60, 1, 59, 56, 75, 56]
22 | # test sample data
23 |
24 | # # to run :
25 | # from jam_gpt.test import test_drive as td
26 |
27 | # td.t_tokenizer("data.txt")
28 |
29 | def t_config():
30 | x = config.pass_args()
31 | print(x)
32 | x[0] = 96
33 | config.set_args(x)
34 | y = config.pass_args()
35 | print(x)
36 | print(y)
37 |
38 | # [0, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0]
39 | # [96, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0]
40 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Lokeshwaran M
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/jam_gpt/data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import torch
3 | from . import config
4 |
5 |
6 | class Data:
7 | """ just a data preprocessor """
8 | def __init__(self) -> None:
9 | pass
10 |
11 | @classmethod
12 | def get(cls, path: str) -> str:
13 | """
14 | text data file -> string data
15 |
16 | """
17 | with open(path, "r", encoding="utf-8") as f:
18 | text_data = f.read()
19 | return text_data
20 |
21 | @classmethod
22 | def set(cls, path: str, data: str) -> None:
23 | """
24 | string data -> text data file
25 | """
26 | with open(path, "w", encoding="utf-8") as f:
27 | f.write(data)
28 | print("writen data : ", len(data))
29 |
30 | @classmethod
31 | def train_test_split(cls, data, split_percent: int = 90):
32 | """
33 | split the data into train and test sets based on the split percentage
34 | """
35 |
36 | tensor_data = torch.tensor(data, dtype=torch.long, device=config.device)
37 | n = int((split_percent/100)*len(data))
38 | train_data = tensor_data[:n]
39 | test_data = tensor_data[n:]
40 |
41 | return [train_data, test_data]
42 |
43 | @classmethod
44 | def chat_formater(cls,context=None, prompt=None, response=None):
45 | """Creates a JSON object from the given context, prompt, and output.
46 |
47 | Args:
48 | context: A string containing the context
49 | prompt: A string containing the prompt
50 | Response: A string containing the Response
51 |
52 | Returns:
53 | A JSON object containing the context, prompt, and Response
54 | """
55 |
56 | if response :
57 | data = f"### context:\n{context}\n\n### prompt:\n{prompt}\n\n### response:\n{response}\n [eos] \n"
58 | elif context and prompt :
59 | data = f"### context:\n{context}\n\n### prompt:\n{prompt}\n\n### response:\n"
60 | elif not context :
61 | data = f"### prompt:\n{prompt}\n\n### response:\n"
62 |
63 | return data
64 |
65 | @classmethod
66 | def chat_JsonToTxt(cls,path_json,path_txt):
67 |
68 | # Read data from the JSON file
69 | with open(path_json, 'r') as json_file:
70 | data = json.load(json_file)
71 |
72 | # Create a text file to save the data
73 | with open(path_txt, 'w') as txt_file:
74 | # Iterate through the data and write context, prompt, and response to the text file
75 | for chat in data:
76 | context = chat['context']
77 | prompt = chat['prompt']
78 | response = chat['response']
79 |
80 | # Write to the text file
81 | formated_chat = cls.chat_formater(context, prompt, response)
82 | txt_file.write(formated_chat)
83 | txt_file.write('\n') # Add an empty line to separate entries
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/jam_gpt/tokenizer.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 |
5 | class Tokenizer:
6 | """
7 | character-level tokenizer :
8 | set_encoding builds a vocabulary from the given text data and stores it under the model name,
9 | get_encoding loads that stored vocabulary, which is then used to encode and decode data
10 | """
11 |
12 | def __init__(self):
13 | pass
14 |
15 | def get_encoding(self, model: str = None):
16 |
17 | self.vocab = Tokenizer.get_char_vocab(model)
18 | self.stoi, self.itos = self.vocab
19 | self.n_vocab = len(self.stoi)
20 | # print(self.vocab)
21 | # print(self.n_vocab)
22 |
23 | def set_encoding(self, model: str, data: str):
24 | """
25 | take text or string data, segregate it into a vocab, and store it under the model name
26 | """
27 | # handling of a missing model folder is done inside set_char_vocab
28 |
29 | Tokenizer.set_char_vocab(model, data)
30 |
31 | def encode(self, s: str) -> list[int]:
32 | # encoder: take a string, output a list of integers
33 | enc_list = []
34 | for c in s:
35 | if c not in self.stoi:
36 | self.stoi[c] = new_id = max(self.stoi.values()) + 1
37 | self.itos[str(new_id)] = c  # keep itos in sync so decode can reverse the new id
38 | self.n_vocab += 1
39 | enc_list.append(self.stoi[c])
40 | return enc_list
40 |
41 | def decode(self, l: list[int]) -> str:
42 | # decoder: take a list of integers, output a string
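# itos keys are strings because the vocab is round-tripped through vocab.json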
43 | return ''.join([self.itos[str(i)] for i in l])
44 |
45 | @classmethod
46 | def store_vocab(cls,smodel,md_name: str):
47 | """
48 | args :
49 | smodel = source model name
50 | md_name = destination model name
51 | source model -> destination model
52 | """
53 | if not os.path.exists(f"./{md_name}"):
54 | os.makedirs(f"./{md_name}")
55 | spath = f"{smodel}/vocab.json"
56 | dpath = f"{md_name}/vocab.json"
57 | shutil.copy(spath, dpath)
58 |
59 | @classmethod
60 | def get_char_vocab(cls, model: str):
61 | """
62 | json file -> dict -> dict,dict
63 | """
64 | path =f"{model}/vocab.json"
65 | with open(path, "r", encoding="utf-8") as f:
66 | data = json.load(f)
67 | stoi = data["stoi"]
68 | itos = data["itos"]
69 | return stoi, itos
70 |
71 | @classmethod
72 | def set_char_vocab(cls, model: str, data: str) -> None:
73 | """
74 | string data -> vocab -> dict,dict -> dict -> json file
75 | """
76 | if not os.path.exists(f"./{model}"):
77 | os.makedirs(f"./{model}")
78 | path = f"{model}/vocab.json"
79 | data_chars = sorted(list(set(data)))
80 | stoi = {ch: i for i, ch in enumerate(data_chars)}
81 | itos = {i: ch for i, ch in enumerate(data_chars)}
82 | vocab = {"stoi": stoi, "itos": itos}
83 | with open(path, "w", encoding="utf-8") as f:
84 | json.dump(vocab, f)
85 | # print("writen data string : ",len(data_string))
86 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Jam-gpt
2 |
3 | ## about :
4 |
5 | > note : it's a simple toy LLM builder framework to play around with language models and understand their working principles
6 | 
7 | it's not a pretrained LLM itself, but you can bring your own data and train it to build LLM models
8 | 
9 | An experimental implementation of **language model (LLM)** architectures for research and development : the design process of building, training, and fine-tuning efficient **Generative Pretrained Transformer (GPT)** models
10 | 
11 | for more AI-related tools and frameworks, look into **[OX-AI](https://github.com/ox-ai)**, an open-source AI project
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | ## Installation :
21 |
22 | ### latest version
23 | > installing from GitHub gives the latest code ; if you encounter bugs please report issues
24 | ```bash
25 | pip install git+https://github.com/Lokeshwaran-M/jam-gpt.git
26 | ```
27 |
28 | ### stable release
29 | > jam-gpt==0.0.4 may not include fine-tuning as it is still under development and may contain bugs ; please report issues if any
30 |
31 | ```bash
32 | pip install jam-gpt
33 | ```
34 |
35 |
36 |
37 | ## Usage :
38 |
39 | Refer to [Docs](./docs/jam-gpt.md) and [test-gptLM.ipynb](test-gptLM.ipynb) for code examples
40 |
41 | ```python
42 |
43 | from jam_gpt.tokenizer import Tokenizer
44 | from jam_gpt import config
45 | from jam_gpt import lm
46 | from jam_gpt.model import Model
47 |
48 | md_name = "md-name"
49 |
50 | tok = Tokenizer()
51 | tok.get_encoding(md_name)
52 |
53 | # model initialization
54 | model = Model()
55 | 
56 | # load pretrained model
57 | model.load_model(md_name)
58 |
59 | # Generate data using Model
60 | pmt = tok.encode("user prompt")
61 | res = tok.decode(model.generate(pmt))
62 | print(res)
63 |
64 | ```
65 |
66 | ## Docs :
67 |
68 | [Jam-gpt docs](./docs/jam-gpt.md) will give you the complete usage and explanation of the jam-gpt library
69 |
70 | 1. [setup](./docs/jam-gpt.md#1-setup)
71 | 2. [Collecting data](./docs/jam-gpt.md#2-collecting-data)
72 | 3. [Tokenization](./docs/jam-gpt.md#3-tokenization)
73 | 4. [configuration](./docs/jam-gpt.md#4-configuration)
74 | 5. [Language Model ( LM , Model )](./docs/jam-gpt.md#5-language-model--lm--model)
75 | 6. [Model Fine Tuning](./docs/jam-gpt.md#6-model-fine-tuning)
76 |
77 | ## Contribution :
78 |
79 | for contribution guidelines and the terms and conditions of contributing, refer to [jam-contribution](https://github.com/Lokeshwaran-M/jam-contribution.git) ; by raising a PR you are accepting the terms and conditions
80 |
81 | Any form of contribution is accepted here
82 |
83 | Submitting :
84 | - Issues
85 | - pull requests
86 | - feature requests
87 | - bug reports
88 | - documentation
89 |
90 | ## credits :
91 |
92 | * kudos to [Andrej Karpathy](https://github.com/karpathy) for his lectures on deep learning
93 | * [Open AI](https://github.com/openai) for GPT-2
94 | * paper ["Attention Is All You Need"](https://arxiv.org/pdf/1706.03762.pdf)
95 |
96 |
--------------------------------------------------------------------------------
/jam_gpt/config.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import torch
4 |
5 |
6 | # -----------------------------------------------------------#
7 | # hyperparameters
8 |
9 | vocab_size = 0
10 | batch_size = 32
11 | block_size = 256
12 | max_iters = 5000
13 | eval_interval = 250
14 | learning_rate = 1e-3
15 | device = "cuda" if torch.cuda.is_available() else "cpu"
16 | eval_iters = 200
17 | n_embd = 384
18 | n_head = 6
19 | n_layer = 6
20 | dropout = 0.2
21 | model_architecture = None
22 |
23 | # -----------------------------------------------------------#
24 |
25 |
26 | def pass_args():
27 | return [vocab_size, batch_size, block_size, max_iters, eval_interval, learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture]
28 |
29 |
30 | def get_args():
31 | args = pass_args()
32 | arg_names = ['vocab_size', 'batch_size', 'block_size', 'max_iters', 'eval_interval',
33 | 'learning_rate', 'device', 'eval_iters', 'n_embd', 'n_head', 'n_layer', 'dropout', 'model_architecture']
34 | print("# -------------------------------------#\n# hyperparameters")
35 | max_arg_name_length = max(len(arg) for arg in arg_names)
36 | for arg_name, arg_value in zip(arg_names, args):
37 | padding = ' ' * (max_arg_name_length - len(arg_name))
38 | print(f"{arg_name} {padding} : {arg_value}")
39 | print("# -------------------------------------#")
40 | return args
41 |
42 |
43 | def set_args(args: list):
44 | global vocab_size, batch_size, block_size, max_iters, eval_interval, learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture
45 | [vocab_size, batch_size, block_size, max_iters, eval_interval,
46 | learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture] = args
47 |
48 |
49 | def variables_to_dict():
50 | config_dict = {
51 | "vocab_size": vocab_size,
52 | "batch_size": batch_size,
53 | "block_size": block_size,
54 | "max_iters": max_iters,
55 | "eval_interval": eval_interval,
56 | "learning_rate": learning_rate,
57 | "device": device,
58 | "eval_iters": eval_iters,
59 | "n_embd": n_embd,
60 | "n_head": n_head,
61 | "n_layer": n_layer,
62 | "dropout": dropout,
63 | "model_architecture": model_architecture
64 | }
65 | return config_dict
66 |
67 |
68 | def store(model_name, args=pass_args()):
69 | if not os.path.exists(f"./{model_name}"):
70 | os.makedirs(f"./{model_name}")
71 | path = f"{model_name}/config.json"
72 | set_args(args)
73 | config_dict = variables_to_dict()
74 | with open(path, "w") as config_file:
75 | config_data = {"config_args": args, "config_dict": config_dict}
76 | json.dump(config_data, config_file)
77 |
78 |
79 | def retrive(model_name):
80 | """
81 | args : model_name
82 | return : config.json
83 | """
84 | path = f"{model_name}/config.json"
85 | try:
86 | with open(path, "r") as config_file:
87 | config_data = json.load(config_file)
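# override the stored device with whatever hardware is actually available on this machine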
88 | config_data["config_args"][6] = "cuda" if torch.cuda.is_available() else "cpu"
89 | config_data["config_dict"]["device"] = "cuda" if torch.cuda.is_available() else "cpu"
90 | set_args(config_data["config_args"])
91 | return config_data
92 | except FileNotFoundError:
93 | print("warning : config.json not found using default args")
94 | config_data = {"config_args": pass_args(), "config_dict": variables_to_dict()}
95 | return config_data
96 |
97 |
98 | get_args()
99 |
--------------------------------------------------------------------------------
/data-set/jam-data.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "context": "Introduction",
4 | "prompt": "Hello, who are you?",
5 | "response": "I am Jam-AGI, your AI assistant."
6 | },
7 | {
8 | "context": "Introduction",
9 | "prompt": "What can you do?",
10 | "response": "I can answer questions, provide information, and assist with various tasks."
11 | },
12 | {
13 | "context": "Introduction",
14 | "prompt": "Tell me about yourself.",
15 | "response": "I am an AI assistant built using GPT-3. I'm here to help you with your questions and tasks."
16 | },
17 | {
18 | "context": "General Question",
19 | "prompt": "What's the weather like today?",
20 | "response": "I'm sorry, I don't have access to real-time information. Please check a weather website or app for the latest updates."
21 | },
22 | {
23 | "context": "General Question",
24 | "prompt": "Can you recommend a good book to read?",
25 | "response": "Of course! What genre are you interested in?"
26 | },
27 | {
28 | "context": "General Question",
29 | "prompt": "How do I cook a perfect omelette?",
30 | "response": "Here's a simple recipe: [Provide omelette recipe]"
31 | },
32 | {
33 | "context": "General Question",
34 | "prompt": "What's the capital of France?",
35 | "response": "The capital of France is Paris."
36 | },
37 | {
38 | "context": "General Question",
39 | "prompt": "Tell me a joke.",
40 | "response": "Why did the computer catch a cold? Because it had too many windows open!"
41 | },
42 | {
43 | "context": "General Question",
44 | "prompt": "What's your favorite color?",
45 | "response": "I don't have personal preferences, but I can help you find information on colors."
46 | },
47 | {
48 | "context": "Technology",
49 | "prompt": "Explain artificial intelligence.",
50 | "response": "Artificial intelligence (AI) is the simulation of human intelligence processes by machines, typically computer systems. It involves tasks such as learning, reasoning, problem-solving, and understanding natural language."
51 | },
52 | {
53 | "context": "Technology",
54 | "prompt": "Tell me about machine learning.",
55 | "response": "Machine learning is a subset of AI that focuses on the development of algorithms that allow computers to learn from and make predictions or decisions based on data without being explicitly programmed."
56 | },
57 | {
58 | "context": "Science",
59 | "prompt": "What is the theory of relativity?",
60 | "response": "The theory of relativity, developed by Albert Einstein, consists of two main parts: special relativity and general relativity. It revolutionized our understanding of space, time, and gravity."
61 | },
62 | {
63 | "context": "Science",
64 | "prompt": "Can you explain quantum mechanics?",
65 | "response": "Quantum mechanics is a fundamental theory in physics that describes the behavior of matter and energy at the smallest scales, typically at the level of atoms and subatomic particles."
66 | },
67 | {
68 | "context": "Entertainment",
69 | "prompt": "Who is your favorite actor?",
70 | "response": "I don't have personal preferences, but I can provide information about various actors and their work."
71 | },
72 | {
73 | "context": "Entertainment",
74 | "prompt": "Recommend a classic movie.",
75 | "response": "How about watching 'Casablanca'? It's a timeless classic with great storytelling."
76 | }
77 | ]
--------------------------------------------------------------------------------
/docs/jam-gpt.md:
--------------------------------------------------------------------------------
1 | # Jam-gpt
2 |
3 | documentation for using the jam-gpt library
4 |
5 | ## 1 Setup :
6 |
7 | ### Installation
8 | 
9 | #### install from the pip release
10 | 
11 | ```bash
12 | pip install jam-gpt
13 | ```
14 |
15 | #### install it in your local site-packages directory
16 |
17 | ```bash
18 | pip install git+https://github.com/Lokeshwaran-M/jam-gpt.git
19 | ```
20 |
21 | ### Modified installation
22 |
23 | To add your modification and install in your local site-packages directory
24 |
25 | ```bash
26 | # clone in your project directory
27 | git clone https://github.com/Lokeshwaran-M/jam-gpt.git
28 | cd jam-gpt
29 |
30 | # run pip to install it into your local site-packages directory
31 | pip install .
32 | 
33 | # or install in editable mode to make modifications
34 | pip install -e .
35 | ```
36 |
37 | ## 2 Collecting Data :
38 |
39 | ```python
40 | from jam_gpt import Data
41 |
42 | # data collection
43 | data=Data.get("path-to-textfile")
44 | ```
45 |
46 | this just reads data from a text file and returns it as one large string for further preprocessing
47 |
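If the raw data is a chat-style JSON file (like `data-set/jam-data.json`), `Data.chat_JsonToTxt` can flatten it into a plain text file first, and `Data.set` writes any processed string back to disk. A minimal sketch (the output paths here are just examples) :

```python
from jam_gpt import Data

# convert a context / prompt / response JSON dataset into one training text file
Data.chat_JsonToTxt("data-set/jam-data.json", "data-set/jam-data.txt")

# write any preprocessed string back to a text file
Data.set("path-to-processed-textfile", data)
```
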
48 | ## 3 Tokenization :
49 |
50 | ```python
51 | from jam_gpt import Tokenizer
52 |
53 | tok = Tokenizer()
54 |
55 | model_name = "md-test"
56 | # tokenization
57 | tok.set_encoding(model_name, data)
58 | tok.get_encoding(model_name)
59 |
60 | vocab_size = tok.n_vocab
61 |
62 | enc = tok.encode("test sample $^&~~data")
63 | dec = tok.decode(enc)
64 |
65 | # out :
66 | # [81, 66, 80, 81, 1, 80, 62, 74, 77, 73, 66, 1, 4, 60, 6, 90, 90, 65, 62, 81, 62]
67 | # test sample $^&~~data
68 |
69 | # to copy a pretrained model's vocab to a fine-tuned or other model
70 | tok.store_vocab("source_md-name","md-name")
71 | ```
72 |
73 | ```python
74 | import tiktoken
75 |
76 | # tokenization using tiktoken
77 | tok = tiktoken.get_encoding("gpt2")
78 |
79 | vocab_size = 50257
80 |
81 | enc = tok.encode("test sample $^&~~data")
82 | dec = tok.decode(enc)
83 |
84 | # out :
85 | # [9288, 6291, 720, 61, 5, 4907, 7890]
86 | # test sample $^&~~data
87 | ```
88 |
89 | A tokenizer is a tool that breaks down text into smaller units called tokens. These tokens can then be processed by an LLM. The tokens can be words, characters, subwords, or other segments of text, depending on the type of LLM and the desired granularity of the text representation.
90 |
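The built-in character tokenizer also extends its vocabulary on the fly : when `encode` meets a character that is not in the stored vocab it assigns it a new id and bumps `n_vocab` (the new ids live only in memory and are not written back to `vocab.json`). A small sketch of that behaviour, assuming the sample characters were absent from the original text :

```python
tok = Tokenizer()
tok.get_encoding(model_name)  # load the stored character vocab

before = tok.n_vocab
enc = tok.encode("naïve ☕")   # characters assumed to be unseen during set_encoding
print(tok.n_vocab - before)   # number of new ids added to the in-memory vocab
```
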
91 | ## 4 configuration :
92 |
93 | ```python
94 | from jam_gpt import config
95 |
96 | # customizing parameter settings before initializing model
97 |
98 | args = config.pass_args()
99 | config.vocab_size = 50257
100 |
101 | print(args)
102 | print(config.pass_args())
103 |
104 | # out :
105 | # [0, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0]
106 | # [50257, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0]
107 |
108 | # To store the customized config setting into model/config.json
109 | config.store(model_name,args)
110 | # To retrive the config settings from model/config.json
111 | config.retrive(model_name)
112 | ```
113 | The config customization needs to be done before initializing the model
114 |
115 | ## 5 Language Model ( LM , Model ) :
116 |
117 | ### Initializing the model
118 |
119 | ```python
120 | from jam_gpt import lm
121 | from jam_gpt import Model
122 |
123 | # model instantiation
124 | model = Model()
125 |
126 | # setting model architecture
127 |
128 | # GPT Language Model
129 | model.set_model(lm.GPTLM())
130 | ```
131 |
132 | ### Training
133 |
134 | ```python
135 | # prepare data for training ( train , test )
136 | model.set_data(Data.train_test_split(enc_data))
137 |
138 | # training
139 | model.optimize()
140 | model.train()
141 | ```
142 |
143 | ### Saving model
144 |
145 | ```python
146 | # default bin
147 | model.save_model(model_name)
148 | # can edit model_format
149 | # model_format = pt or pkl
150 | model.save_model(model_name,model_format)
151 | ```
152 |
153 | ### Loading model
154 |
155 | ```python
156 | # retrieve model parameter settings
157 | config.retrive(model_name)
158 |
159 | # model instantiation
160 | model = Model()
161 |
162 | model.load_model(model_name)
163 | ```
164 |
165 | ### Generate data using Model
166 |
167 | ```python
168 | pmt = tok.encode("user prompt")
169 | eos = tok.encode(" [eos] ")
170 | res = tok.decode(model.generate(pmt,3000,eos))
171 |
172 | print(res)
173 | ```
174 |
175 | ## 6 Model Fine Tuning :
176 |
177 | ```python
178 |
179 | Coming Soon ........
180 |
181 | ```
182 |
--------------------------------------------------------------------------------
/test-gptLM.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "# -------------------------------------#\n",
13 | "# hyperparameters\n",
14 | "vocab_size : 0\n",
15 | "batch_size : 32\n",
16 | "block_size : 256\n",
17 | "max_iters : 5000\n",
18 | "eval_interval : 250\n",
19 | "learning_rate : 0.001\n",
20 | "device : cuda\n",
21 | "eval_iters : 200\n",
22 | "n_embd : 384\n",
23 | "n_head : 6\n",
24 | "n_layer : 6\n",
25 | "dropout : 0.2\n",
26 | "model_architecture : None\n",
27 | "# -------------------------------------#\n"
28 | ]
29 | }
30 | ],
31 | "source": [
32 | "from jam_gpt import Data, Tokenizer, config, lm, Model\n",
33 | "import tiktoken # type: ignore"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": null,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "\n",
43 | "path = \"./data-set/md-to.txt\"\n",
44 | "data = Data.get(path)\n",
45 | "\n",
46 | "config.vocab_size = 50257\n",
47 | "tok = tiktoken.get_encoding(\"gpt2\")\n",
48 | "enc_data = tok.encode(data)\n",
49 | "\n",
50 | "model_name = \"md-t0\"\n",
51 | "\n",
52 | "# model genration\n",
53 | "model = Model()\n",
54 | "\n",
55 | "model.set_model(lm.GPTLM())\n",
56 | "model.set_data(Data.train_test_split(enc_data))\n",
57 | "\n",
58 | "model.optimize()\n",
59 | "model.train()"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# model.save_model(model_name)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": [
77 | "# model.load_model(model_name)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "I love you read, as overI-52, you are all, a loved are\n",
90 | "THE THEORY OF LOVE 117 whohas him.\n",
91 | "The same emphasis in Both views on aspects, but wearehuman all types of exchange. In the opposite is not that sheent connection between their child, she must it. Hence thinking should receive its veryess Kali; when he does notaches to love; the childanxietyis the child are of giving must love the than to love.Inasmuch when we mean that using all one can do miracles, not do miracles, not liveser to be as much as everything, to receive unconditional—all persons dis nature, is a need to follow the child or mother; he promises he also she remainsutter her, sees only when they belong to her bodily sensitivity toward theattitudes.\n",
92 | "In all of fixation usually occurs after man not thinkingbut as others, but discipline, this need something apart; mother only one has nots, symbolized?\n",
93 | "As more since these viewsis that she\n",
94 | "sto take him, explores the evening, of it remains superficial they are affected by their bliss anxiety. At this path to receiving.\n",
95 | "than the insane. One premise,the otherm,the ability more conducive to mother, so on to be an\n",
96 | "poss discussion of the one's own human functioning, and for the object of duty, and we are all shareother with God,the prepondervercoming the mostkft grow from the center of truth. Tosecret is essentially the individualical character, the only one makes them on the children. Sheis that~oved/ is suffer is to be loved by a welove, is impossible for that of in two systems to do not one's breathing;not, but it\n",
97 | "it is now to be taught, such an alienated from whichimwardsity is taken rich. Theillusiones that for external I assume a pattern of the consciousness of this goal of this lack of separateism, it,fyou only the following discmanent and again his value; in our fellowman, and so a \"For\n",
98 | "than the book explores which one/ and to destroy. For the one's development: for the two individuals of the use of meaning of God. The mostele—alienable emphasis on the concept of\n",
99 | "THE THEORY OF LOVE 45phis trend of thought and complain bitterly about the earlieststage in thought. In contrast to live separate words, that all this I aml\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "pmt = tok.encode(\"I love you\")\n",
105 | "print(tok.decode(model.generate(pmt)))"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": []
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 3",
119 | "language": "python",
120 | "name": "python3"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 3
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython3",
132 | "version": "3.11.9"
133 | },
134 | "orig_nbformat": 4
135 | },
136 | "nbformat": 4,
137 | "nbformat_minor": 2
138 | }
139 |
--------------------------------------------------------------------------------
/test-old.demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "# -------------------------------------#\n",
13 | "# hyperparameters\n",
14 | "vocab_size : 0\n",
15 | "batch_size : 32\n",
16 | "block_size : 256\n",
17 | "max_iters : 5000\n",
18 | "eval_interval : 250\n",
19 | "learning_rate : 0.001\n",
20 | "device : cuda\n",
21 | "eval_iters : 200\n",
22 | "n_embd : 384\n",
23 | "n_head : 6\n",
24 | "n_layer : 6\n",
25 | "dropout : 0.2\n",
26 | "model_architecture : None\n",
27 | "# -------------------------------------#\n",
28 | "CPU times: user 1.34 s, sys: 1.33 s, total: 2.68 s\n",
29 | "Wall time: 1.22 s\n"
30 | ]
31 | }
32 | ],
33 | "source": [
34 | "%%time\n",
35 | "from jam_gpt import Data, Tokenizer, config, lm, Model"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Time taken to load library :\n",
43 | "\n",
44 | " time : 15.9s"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 2,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "vocab size : 101\n",
57 | "parameters : 0.214373 M\n",
58 | "# -------------------------------------#\n",
59 | "# hyperparameters\n",
60 | "vocab_size : 101\n",
61 | "batch_size : 16\n",
62 | "block_size : 32\n",
63 | "max_iters : 5000\n",
64 | "eval_interval : 100\n",
65 | "learning_rate : 0.001\n",
66 | "device : cpu\n",
67 | "eval_iters : 200\n",
68 | "n_embd : 64\n",
69 | "n_head : 4\n",
70 | "n_layer : 4\n",
71 | "dropout : 0.0\n",
72 | "model_architecture : lm.BigramLM\n",
73 | "# -------------------------------------#\n",
74 | "CPU times: user 955 ms, sys: 347 ms, total: 1.3 s\n",
75 | "Wall time: 1.22 s\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "%%time\n",
81 | "tok = Tokenizer()\n",
82 | "\n",
83 | "\n",
84 | "# path = \"./data-set/linuxsourcecodesnippets.txt\"\n",
85 | "\n",
86 | "# # data collection\n",
87 | "# data = Data.get(path)\n",
88 | "\n",
89 | "# tokanization\n",
90 | "model_name = \"md-t02-bglm\"\n",
91 | "# tok.set_encoding(model_name, data)\n",
92 | "tok.get_encoding(model_name)\n",
93 | "# enc_data = tok.encode(data)\n",
94 | "\n",
95 | "# # setting parameters\n",
96 | "# config.vocab_size = tok.n_vocab\n",
97 | "config.retrive(\"md-t02-bglm\")\n",
98 | "config.device=\"cpu\"\n",
99 | "\n",
100 | "# model genration\n",
101 | "test_model = Model()\n",
102 | "# test_model.set_model(lm.BigramLM())\n",
103 | "# test_model.set_data(Data.train_test_split(enc_data))\n",
104 | "# test_model.optimize()\n",
105 | "# test_model.train()\n",
106 | "test_model.load_model(model_name,args=config.pass_args())\n"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Traning detials : \n",
114 | " step 4900 : train loss 1.6238, val loss 1.7738\n",
115 | " traning time : 7m 54.9s"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 3,
121 | "metadata": {},
122 | "outputs": [],
123 | "source": [
124 | "# test_model.save_model(model_name)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 4,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "linux engine\n",
137 | "/*\n",
138 | " * Find a physical address of a virtual object..\n",
139 | " *\n",
140 | " * This is easy using the virtual page table address.\n",
141 | " */\n",
142 | "aread_entry_signd_flags - &&\n",
143 | "numq_backlog_copy(blkcg.list_entry);\n",
144 | "}\n",
145 | "#uf CONFIL_MMPIN)\n",
146 | "\n",
147 | "/*\n",
148 | " * Eewarity_idx_init_of(wq, info, entry, struct filt *info, \"after\\n\", true;\n",
149 | "\tstimct eventpol_shm_fs(char->show,final_entry_chunk_bytes &&\n",
150 | "#inc 0,\n",
151 | "\t.iostic\t\t= bio_put_current_lock();\n",
152 | "\n",
153 | "\treturn long rb_lock++)\n",
154 | "\t\treturn alg;\n",
155 | "\t\tif (struct epitem; i err)\n",
156 | "{\n",
157 | "\tsize = kmq_wait_work(rq->wq);\n",
158 | "\n",
159 | "\tif (info->wait_country(CONFIG_NO *, 0)\n",
160 | "\t\treturn -ENOEMEC(\"Cown of thrating listrimitions, the last begup han it IPC__ENOMEM t\n",
161 | "CPU times: user 8.74 s, sys: 27 ms, total: 8.76 s\n",
162 | "Wall time: 3.52 s\n"
163 | ]
164 | }
165 | ],
166 | "source": [
167 | "%%time\n",
168 | "pmt = tok.encode(\"\"\"linux engine\n",
169 | "/*\n",
170 | " * Find a physical address of a virtual object..\n",
171 | " *\n",
172 | " * This is easy using the virtual page table address.\n",
173 | " */\n",
174 | "\"\"\")\n",
175 | "print(tok.decode(test_model.generate(pmt)))"
176 | ]
177 | }
178 | ],
179 | "metadata": {
180 | "kernelspec": {
181 | "display_name": "Python 3",
182 | "language": "python",
183 | "name": "python3"
184 | },
185 | "language_info": {
186 | "codemirror_mode": {
187 | "name": "ipython",
188 | "version": 3
189 | },
190 | "file_extension": ".py",
191 | "mimetype": "text/x-python",
192 | "name": "python",
193 | "nbconvert_exporter": "python",
194 | "pygments_lexer": "ipython3",
195 | "version": "3.11.9"
196 | },
197 | "orig_nbformat": 4
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 2
201 | }
202 |
--------------------------------------------------------------------------------
/jam_gpt/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import torch
4 | from . import config
5 | from . import lm
6 |
7 | torch.manual_seed(1337)
8 |
9 |
10 | class Model:
11 | """class model to work on lm models"""
12 |
13 | def __init__(self):
14 | [
15 | self.vocab_size,
16 | self.batch_size,
17 | self.block_size,
18 | self.max_iters,
19 | self.eval_interval,
20 | self.learning_rate,
21 | self.device,
22 | self.eval_iters,
23 | self.n_embd,
24 | self.n_head,
25 | self.n_layer,
26 | self.dropout,
27 | self.model_architecture,
28 | ] = config.pass_args()
29 |
30 | self.model = None
31 |
32 | self.train_data = None
33 | self.test_data = None
34 |
35 | def set_parameters(self, args: list):
36 | [
37 | self.vocab_size,
38 | self.batch_size,
39 | self.block_size,
40 | self.max_iters,
41 | self.eval_interval,
42 | self.learning_rate,
43 | self.device,
44 | self.eval_iters,
45 | self.n_embd,
46 | self.n_head,
47 | self.n_layer,
48 | self.dropout,
49 | self.model_architecture,
50 | ] = args
51 |
52 | def get_parameters(self):
53 | return [
54 | self.vocab_size,
55 | self.batch_size,
56 | self.block_size,
57 | self.max_iters,
58 | self.eval_interval,
59 | self.learning_rate,
60 | self.device,
61 | self.eval_iters,
62 | self.n_embd,
63 | self.n_head,
64 | self.n_layer,
65 | self.dropout,
66 | self.model_architecture,
67 | ]
68 |
69 | def set_model(self, model):
70 | self.model_architecture = f"lm.{model.__class__.__name__}"
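# stored as a string so load_model can re-instantiate the same class via eval(self.model_architecture)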
71 | self.model = model
72 | self.m = self.model.to(self.device)
73 | # print the number of parameters in the model
74 | print("vocab size : ", self.vocab_size)
75 | print("parameters : ", sum(p.numel() for p in self.m.parameters()) / 1e6, " M")
76 | config.get_args()
77 | return self.m
78 |
79 | def set_data(self, data):
80 | self.train_data = data[0]
81 | self.test_data = data[1]
82 |
83 | def get_batch(self, split):
84 | # generate small batch of data of input -> x and targets -> y
85 | data = self.train_data if split == "train" else self.test_data
86 | ix = torch.randint(len(data) - self.block_size, (self.batch_size,))
87 | x = torch.stack([data[i : i + self.block_size] for i in ix])
88 | y = torch.stack([data[i + 1 : i + self.block_size + 1] for i in ix])
89 | x, y = x.to(self.device), y.to(self.device)
90 | return x, y
91 |
92 | @torch.no_grad()
93 | def estimate_loss(self):
94 | # estimates the loss of model by eval using test data
95 | out = {}
96 | self.model.eval()
97 | for split in ["train", "val"]:
98 | losses = torch.zeros(self.eval_iters)
99 | for k in range(self.eval_iters):
100 | X, Y = self.get_batch(split)
101 | logits, loss = self.model(X, Y)
102 | losses[k] = loss.item()
103 | out[split] = losses.mean()
104 | self.model.train()
105 | return out
106 |
107 | def optimize(self):
108 | self.optimizer = torch.optim.AdamW(
109 | self.model.parameters(), lr=self.learning_rate
110 | )
111 |
112 | def train(self, max_iters=None):
113 | if not max_iters:
114 | max_iters = self.max_iters
115 | for iter in range(max_iters):
116 | # every once in a while evaluate the loss on train and val sets
117 | if iter % self.eval_interval == 0:
118 | losses = self.estimate_loss()
119 | print(
120 | f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
121 | )
122 |
123 | # sample a batch of data
124 | xb, yb = self.get_batch("train")
125 |
126 | # evaluate the loss
127 | logits, loss = self.model(xb, yb)
128 | self.optimizer.zero_grad(set_to_none=True)
129 | loss.backward()
130 | self.optimizer.step()
131 |
132 | def generate(self, prompt, max_new_tokens=500, eos_token=None):
133 | """
134 | to generate a response from the model
135 | """
136 | # generate from the model
137 | # context = torch.zeros((1, 1), dtype=torch.long, device=self.device)
138 | tensor_prompt = torch.tensor(prompt, dtype=torch.long, device=self.device)[
139 | None, ...
140 | ]
141 | if eos_token:
142 | tensor_eos_token = torch.tensor(eos_token, dtype=torch.long, device=self.device)[
143 | None, ...
144 | ]
145 | else :
146 | tensor_eos_token = None
147 |
148 | return self.m.generate(tensor_prompt, max_new_tokens, tensor_eos_token)[
149 | 0
150 | ].tolist()
151 |
152 | def save_model(self, model_name, model_format="bin"):
153 | # to save model
154 | if not os.path.exists(f"./{model_name}"):
155 | os.makedirs(f"./{model_name}")
156 | path = f"{model_name}/{model_name}.{model_format}"
157 | if model_format == "bin" or model_format == "pt":
158 | torch.save(self.model.state_dict(), path)
159 | elif model_format == "pkl":
160 | with open(path, "wb") as f:
161 | pickle.dump(self.model, f)
162 | else:
163 | print(f"given model format : {model_format} is not supported")
164 |
165 | # to save config info
166 | config.store(model_name, self.get_parameters())
167 |
168 | def load_model(self, model_name, model_format="bin", args=None):
169 | if args:
170 | self.set_parameters(args)
171 | else:
172 | # to load config info
173 | config_data = config.retrive(model_name)
174 | self.set_parameters(config_data["config_args"])
175 |
176 | # to load model
177 | path = f"{model_name}/{model_name}.{model_format}"
178 | if model_format == "bin" or model_format == "pt":
179 | cls_model_architecture = eval(self.model_architecture)
180 | self.set_model(cls_model_architecture())
181 | self.model.load_state_dict(torch.load(path, map_location=torch.device(self.device)))  # use self.device : config_data is not defined when args are passed in
182 | elif model_format == "pkl":
183 | with open(path, "rb") as f:
184 | loaded_model = pickle.load(f)
185 | self.set_model(loaded_model)
186 |
187 | self.model.eval()
188 | # return self.model
189 |
--------------------------------------------------------------------------------
/jam_gpt/lm.py:
--------------------------------------------------------------------------------
1 | """
2 | Borrowed from Andrej Karpathy : karpathy/nanoGPT
3 | https://github.com/karpathy/nanoGPT/blob/master/model.py
4 | """
5 |
6 | import torch
7 | import torch.nn as nn
8 | from torch.nn import functional as F
9 | from . import config
10 |
11 | torch.manual_seed(1337)
12 |
13 |
14 | """
15 | defines a language model using PyTorch
16 | based on the Transformer architecture attention Mechanism
17 | """
18 | [
19 | vocab_size,
20 | batch_size,
21 | block_size,
22 | max_iters,
23 | eval_interval,
24 | learning_rate,
25 | device,
26 | eval_iters,
27 | n_embd,
28 | n_head,
29 | n_layer,
30 | dropout,
31 | model_architecture,
32 | ] = config.pass_args()
33 |
34 |
35 | def set_parameters(args):
36 | global vocab_size, batch_size, block_size, max_iters, eval_interval, learning_rate, device, eval_iters, n_embd, n_head, n_layer, dropout, model_architecture
37 | [
38 | vocab_size,
39 | batch_size,
40 | block_size,
41 | max_iters,
42 | eval_interval,
43 | learning_rate,
44 | device,
45 | eval_iters,
46 | n_embd,
47 | n_head,
48 | n_layer,
49 | dropout,
50 | model_architecture,
51 | ] = args
52 |
53 |
54 | class Head(nn.Module):
55 | """one head of self-attention"""
56 |
57 | def __init__(self, head_size):
58 | super().__init__()
59 | self.key = nn.Linear(n_embd, head_size, bias=False)
60 | self.query = nn.Linear(n_embd, head_size, bias=False)
61 | self.value = nn.Linear(n_embd, head_size, bias=False)
62 | self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
63 |
64 | self.dropout = nn.Dropout(dropout)
65 |
66 | def forward(self, x):
67 | B, T, C = x.shape
68 | k = self.key(x) # (B,T,C)
69 | q = self.query(x) # (B,T,C)
70 | # compute attention scores ("affinities")
71 | # (B, T, C) @ (B, C, T) -> (B, T, T)
72 | wei = q @ k.transpose(-2, -1) * C**-0.5
73 | wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf")) # (B, T, T)
74 | wei = F.softmax(wei, dim=-1) # (B, T, T)
75 | wei = self.dropout(wei)
76 | # perform the weighted aggregation of the values
77 | v = self.value(x) # (B,T,C)
78 | out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
79 | return out
80 |
81 |
82 | class MultiHeadAttention(nn.Module):
83 | """multiple heads of self-attention in parallel"""
84 |
85 | def __init__(self, num_heads, head_size):
86 | super().__init__()
87 | self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
88 | self.proj = nn.Linear(n_embd, n_embd)
89 | self.dropout = nn.Dropout(dropout)
90 |
91 | def forward(self, x):
92 | out = torch.cat([h(x) for h in self.heads], dim=-1)
93 | out = self.dropout(self.proj(out))
94 | return out
95 |
96 |
97 | class FeedFoward(nn.Module):
98 | """a simple linear layer followed by a non-linearity"""
99 |
100 | def __init__(self, n_embd):
101 | super().__init__()
102 | self.net = nn.Sequential(
103 | nn.Linear(n_embd, 4 * n_embd),
104 | nn.ReLU(),
105 | nn.Linear(4 * n_embd, n_embd),
106 | nn.Dropout(dropout),
107 | )
108 |
109 | def forward(self, x):
110 | return self.net(x)
111 |
112 |
113 | class Block(nn.Module):
114 | """Transformer block: communication followed by computation"""
115 |
116 | def __init__(self, n_embd, n_head):
117 | # n_embd: embedding dimension, n_head: the number of heads we'd like
118 | super().__init__()
119 | head_size = n_embd // n_head
120 | self.sa = MultiHeadAttention(n_head, head_size)
121 | self.ffwd = FeedFoward(n_embd)
122 | self.ln1 = nn.LayerNorm(n_embd)
123 | self.ln2 = nn.LayerNorm(n_embd)
124 |
125 | def forward(self, x):
126 | x = x + self.sa(self.ln1(x))
127 | x = x + self.ffwd(self.ln2(x))
128 | return x
129 |
130 |
131 | # ---------------------------models-------------------------------
132 |
133 |
134 | class GPTLM(nn.Module):
135 | """
136 | GPT-style approach where each token predicts the next token
137 | """
138 |
139 | def __init__(self):
140 | super().__init__()
141 | # setting updated parameters
142 | set_parameters(config.pass_args())
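# refreshes the module-level globals above so they reflect any config changes made before model creation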
143 |
144 | # each token directly reads off the logits for the next token from a lookup table
145 | self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
146 | self.position_embedding_table = nn.Embedding(block_size, n_embd)
147 | self.blocks = nn.Sequential(
148 | *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
149 | )
150 | self.ln_f = nn.LayerNorm(n_embd) # final layer norm
151 | self.lm_head = nn.Linear(n_embd, vocab_size)
152 |
153 | # better init, not covered in the original GPT video, but important, will cover in followup video
154 | self.apply(self._init_weights)
155 |
156 | def _init_weights(self, module):
157 | if isinstance(module, nn.Linear):
158 | torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
159 | if module.bias is not None:
160 | torch.nn.init.zeros_(module.bias)
161 | elif isinstance(module, nn.Embedding):
162 | torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
163 |
164 | def forward(self, idx, targets=None):
165 | B, T = idx.shape
166 |
167 | # idx and targets are both (B,T) tensor of integers
168 | tok_emb = self.token_embedding_table(idx) # (B,T,C)
169 | pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
170 | x = tok_emb + pos_emb # (B,T,C)
171 | x = self.blocks(x) # (B,T,C)
172 | x = self.ln_f(x) # (B,T,C)
173 | logits = self.lm_head(x) # (B,T,vocab_size)
174 |
175 | if targets is None:
176 | loss = None
177 | else:
178 | B, T, C = logits.shape
179 | logits = logits.view(B * T, C)
180 | targets = targets.view(B * T)
181 | loss = F.cross_entropy(logits, targets)
182 |
183 | return logits, loss
184 |
185 | def generate(self, idx, max_new_tokens, eos_token=None):
186 | # idx is (B, T) array of indices in the current context
187 | for _ in range(max_new_tokens):
188 | # crop idx to the last block_size tokens
189 | # idx_cond = idx[:, -block_size:]
190 | idx_cond = idx if idx.size(1) <= block_size else idx[:, -block_size:]
191 | # get the predictions
192 | logits, loss = self(idx_cond)
193 | # focus only on the last time step
194 | logits = logits[:, -1, :] # becomes (B, C)
195 | # apply softmax to get probabilities
196 | probs = F.softmax(logits, dim=-1) # (B, C)
197 | # sample from the distribution
198 | idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
199 | # append sampled index to the running sequence
200 | idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
201 |
202 | # Check if the last tokens in idx match eos_token
203 | if eos_token is not None:
204 | if idx.size(1) >= eos_token.size(1) and torch.equal(
205 | idx[:, -eos_token.size(1) :], eos_token
206 | ):
207 | return idx
208 |
209 | return idx
210 |
211 |
212 | class JamLM(nn.Module):
213 | """
214 | a new neural schema
215 | """
216 |
217 | def __init__(self):
218 | super().__init__()
219 | pass
220 |
--------------------------------------------------------------------------------
/test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "# -------------------------------------#\n",
13 | "# hyperparameters\n",
14 | "vocab_size : 0\n",
15 | "batch_size : 32\n",
16 | "block_size : 256\n",
17 | "max_iters : 5000\n",
18 | "eval_interval : 250\n",
19 | "learning_rate : 0.001\n",
20 | "device : cuda\n",
21 | "eval_iters : 200\n",
22 | "n_embd : 384\n",
23 | "n_head : 6\n",
24 | "n_layer : 6\n",
25 | "dropout : 0.2\n",
26 | "model_architecture : None\n",
27 | "# -------------------------------------#\n"
28 | ]
29 | }
30 | ],
31 | "source": [
32 | "from jam_gpt import Data, Tokenizer, config, lm, Model"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 3,
38 | "metadata": {},
39 | "outputs": [
40 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "# -------------------------------------#\n",
45 | "# hyperparameters\n",
46 | "vocab_size : 101\n",
47 | "batch_size : 16\n",
48 | "block_size : 32\n",
49 | "max_iters : 5000\n",
50 | "eval_interval : 100\n",
51 | "learning_rate : 0.001\n",
52 | "device : cuda\n",
53 | "eval_iters : 200\n",
54 | "n_embd : 64\n",
55 | "n_head : 4\n",
56 | "n_layer : 4\n",
57 | "dropout : 0.0\n",
58 | "model_architecture : lm.BigramLM\n",
59 | "# -------------------------------------#\n",
60 | "[101, 16, 32, 5000, 100, 0.001, 'cuda', 200, 64, 4, 4, 0.0, 'lm.BigramLM']\n"
61 | ]
62 | }
63 | ],
64 | "source": [
65 | "tok = Tokenizer()\n",
66 | "\n",
67 | "\n",
68 | "# path = \"./data-set/linuxsourcecodesnippets.txt\"\n",
69 | "\n",
70 | "# # data collection\n",
71 | "# data = Data.get(path)\n",
72 | "\n",
73 | "# tokanization\n",
74 | "model_name = \"md-t02-bglm-rerun\"\n",
75 | "# tok.set_encoding(model_name, data)\n",
76 | "tok.get_encoding(model_name)\n",
77 | "# enc_data = tok.encode(data)\n",
78 | "\n",
79 | "# setting parameters\n",
80 | "# config.vocab_size = tok.n_vocab\n",
81 | "\n",
82 | "\n",
83 | "# model genration\n",
84 | "model = Model()\n",
85 | "# model.set_model(lm.BigramLM())\n",
86 | "# model.set_data(Data.train_test_split(enc_data))\n"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 4,
92 | "metadata": {},
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | "vocab size : 101\n",
99 | "parameters : 0.214373 M\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "model.load_model(model_name)\n",
105 | "# model.optimize()\n",
106 | "# model.train()"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "### Traning detials : \n",
114 | " step 4900: train loss 1.3497, val loss 1.6127\n",
115 | " step 4900: train loss 1.5836, val loss 1.7533\n",
116 | " step 4900: train loss 1.4185, val loss 1.6682\n",
117 | " step 4900: train loss 1.3792, val loss 1.6416"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 5,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "* what is the linux file system command lstaign, Onlyss this atomically per itempt. Sause is not\n",
130 | "\t\t * drop end.\n",
131 | "\t\t\t new_bio_bio_cause(BLK_SECINIT(mmdelk_current_ffd(fmt, SEC, aead-addr, thimum);\n",
132 | " * Meter/sending seforminal12/init, this name texply in\n",
133 | " * the procsronally to\n",
134 | " * base default can orcharry and, as only all finally rismet and propirs.\n",
135 | " */\n",
136 | "void bfq_addrt_note(void);\n",
137 | "\n",
138 | "/*\n",
139 | " * The write it initializy in rate virariations ->samplete {\n",
140 | "\t\tdong *from;\n",
141 | "\n",
142 | "\t\tif (wbc->alg->bacck_lzsib(iop)) {\n",
143 | "\t\t\tinfo->dun_info->cval_info->nsigned || \n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "pmt = tok.encode(\"\"\"* what is the linux file system command ls\"\"\")\n",
149 | "print(tok.decode(model.generate(pmt)))"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "# model.save_model(\"md-t02-bglm-rerun\")"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 1,
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "# -------------------------------------#\n",
171 | "# hyperparameters\n",
172 | "vocab_size : 0\n",
173 | "batch_size : 32\n",
174 | "block_size : 256\n",
175 | "max_iters : 5000\n",
176 | "eval_interval : 250\n",
177 | "learning_rate : 0.001\n",
178 | "device : cuda\n",
179 | "eval_iters : 200\n",
180 | "n_embd : 384\n",
181 | "n_head : 6\n",
182 | "n_layer : 6\n",
183 | "dropout : 0.2\n",
184 | "model_architecture : None\n",
185 | "# -------------------------------------#\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "\n",
191 | "\n",
192 | "from jam_gpt.data import Data\n",
193 | "\n",
194 | "\n",
195 | "d = Data.formater(\n",
196 | "\"\"\"\n",
197 | "4927\n",
198 | "\"\"\n",
199 | "\n",
200 | "import os\n",
201 | "import pickle\n",
202 | "import torch\n",
203 | "from . import config\n",
204 | "\n",
205 | "\n",
206 | "torch.manual_seed(1337)\n",
207 | "\n",
208 | "\n",
209 | "class Model:\n",
210 | " \n",
211 | "\n",
212 | " def __init__(self):\n",
213 | " [self.vocab_size, self.batch_size, self.block_size, self.max_iters, self.eval_interval, self.learning_rate,\n",
214 | " self.device, self.eval_iters, self.n_embd, self.n_head, self.n_layer, self.dropout] = config.pass_args()\n",
215 | "\n",
216 | " self.model = None\n",
217 | "\n",
218 | " self.train_data = None\n",
219 | " self.test_data = None\n",
220 | "\n",
221 | " def set_parameters(self, args: list):\n",
222 | " [self.vocab_size, self.batch_size, self.block_size, self.max_iters, self.eval_interval, self.learning_rate,\n",
223 | " self.device, self.eval_iters, self.n_embd, self.n_head, self.n_layer, self.dropout] = args\n",
224 | "\n",
225 | " def get_parameters(self):\n",
226 | " return [self.vocab_size, self.batch_size, self.block_size, self.max_iters, self.eval_interval, self.learning_rate, self.device, self.eval_iters, self.n_embd, self.n_head, self.n_layer, self.dropout]\n",
227 | "\n",
228 | " def set_model(self, model):\n",
229 | " self.model = model\n",
230 | " self.m = self.model.to(self.device)\n",
231 | " # print the number of parameters in the model\n",
232 | " print(\"vocab size : \", self.vocab_s\n",
233 | "\n",
234 | "\"\"\",\n",
235 | "\"\"\"\n",
236 | "write a function that takes three string context prompt output as input and returns output as json object\n",
237 | "{context: \"context string\" ,prompt: \"prompt sting\",output: \"output string \" }\n",
238 | "View other drafts\n",
239 | "\n",
240 | "\n",
241 | "\n",
242 | "\n",
243 | "\n",
244 | "\n",
245 | "\n",
246 | "\n",
247 | "\n",
248 | "\n",
249 | "\n",
250 | "Sure, here is a Python function that takes three strings (context, prompt, and output) as input and returns an output JSON object:\n",
251 | "\n",
252 | "Python\n",
253 | "import json\n",
254 | "\n",
255 | "def create_json_object(context, prompt, output):\n",
256 | " \n",
257 | "\"\"\"\n",
258 | ")\n",
259 | "\n",
260 | "print(d)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 2,
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "name": "stdout",
270 | "output_type": "stream",
271 | "text": [
272 | "# -------------------------------------#\n",
273 | "# hyperparameters\n",
274 | "vocab_size : 0\n",
275 | "batch_size : 32\n",
276 | "block_size : 256\n",
277 | "max_iters : 5000\n",
278 | "eval_interval : 250\n",
279 | "learning_rate : 0.001\n",
280 | "device : cuda\n",
281 | "eval_iters : 200\n",
282 | "n_embd : 384\n",
283 | "n_head : 6\n",
284 | "n_layer : 6\n",
285 | "dropout : 0.2\n",
286 | "model_architecture : None\n",
287 | "# -------------------------------------#\n",
288 | "# -------------------------------------#\n",
289 | "# hyperparameters\n",
290 | "vocab_size : 101\n",
291 | "batch_size : 16\n",
292 | "block_size : 32\n",
293 | "max_iters : 5000\n",
294 | "eval_interval : 100\n",
295 | "learning_rate : 0.001\n",
296 | "device : cuda\n",
297 | "eval_iters : 50000\n",
298 | "n_embd : 64\n",
299 | "n_head : 4\n",
300 | "n_layer : 4\n",
301 | "dropout : 0.0\n",
302 | "model_architecture : lm.BigramLM\n",
303 | "# -------------------------------------#\n"
304 | ]
305 | },
306 | {
307 | "data": {
308 | "text/plain": [
309 | "[101, 16, 32, 5000, 100, 0.001, 'cuda', 50000, 64, 4, 4, 0.0, 'lm.BigramLM']"
310 | ]
311 | },
312 | "execution_count": 2,
313 | "metadata": {},
314 | "output_type": "execute_result"
315 | }
316 | ],
317 | "source": [
318 | "config.get_args()\n",
319 | "\n",
320 | "config.retrive(\"md-t02-bglm\")\n",
321 | "\n",
322 | "config.eval_iters = 50000\n",
323 | "\n",
324 | "\n",
325 | "config.get_args()"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 4,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "[2, 30, 17, 71, 81, 85, 32, 2]\n",
338 | "[2, 61, 71, 81, 85, 63]\n",
339 | "[2, 61, 71, 81, 85, 63, 2]\n",
340 | "[61, 71, 81, 85, 63, 2]\n",
341 | "[76, 70, 2, 75, 85, 2, 86, 74, 71, 84, 71, 2, 61, 71, 81, 85, 63, 2, 75, 86, 2, 75, 85]\n",
342 | "[76, 70, 2, 75, 85, 2, 86, 74, 71, 84, 71, 2, 61, 71, 81, 85, 63, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 75, 86, 2, 75, 85]\n",
343 | "[70, 67, 86, 67, 2, 31, 2, 72, 4, 5, 5, 5, 2, 69, 81, 80, 86, 71, 90, 86, 28, 1, 93, 69, 81, 80, 86, 71, 90, 86, 95, 1, 1, 5, 5, 5, 2, 82, 84, 81, 79, 82, 86, 28, 1, 93, 82, 84, 81, 79, 82, 86, 95, 1, 1, 5, 5, 5, 2, 84, 71, 85, 82, 81, 80, 85, 71, 28, 1, 93, 84, 71, 85, 82, 81, 80, 85, 71, 95, 1, 1, 2, 61, 71, 81, 85, 63, 2, 4, 1]\n"
344 | ]
345 | }
346 | ],
347 | "source": [
348 | "import tiktoken\n",
349 | "import torch\n",
350 | "# tokanization using tiktoken\n",
351 | "# tok = tiktoken.get_encoding(\"gpt2\")\n",
352 | "tok = Tokenizer()\n",
353 | "tok.get_encoding(\"md-t02-bglm\")\n",
354 | "\n",
355 | "vocab_size = 50257\n",
356 | "\n",
357 | "\n",
358 | "enc1 = tok.encode(\"\"\"data = f\"### context:\\n{context}\\n\\n### prompt:\\n{prompt}\\n\\n### response:\\n{response}\\n\\n [eos] \"\n",
359 | "\"\"\")\n",
360 | "print(tok.encode(\"\"\" \"\"\"))\n",
361 | "print(tok.encode(\" [eos]\"))\n",
362 | "\n",
363 | "print(tok.encode(\"\"\" [eos] \"\"\"))\n",
364 | "print(tok.encode(\"\"\"[eos] \"\"\"))\n",
365 | "print(tok.encode(\"\"\"jd is there [eos] it is\"\"\"))\n",
366 | "print(tok.encode(\"\"\"jd is there [eos]\n",
367 | " it is\"\"\"))\n",
368 | "print(enc1)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 37,
374 | "metadata": {},
375 | "outputs": [
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "[7359, 68, 418, 29, 220]\n",
381 | "[58, 68, 418, 60]\n",
382 | "[685, 68, 418, 60]\n",
383 | "[685, 68, 418, 60, 220]\n",
384 | "[58, 68, 418, 60, 220]\n",
385 | "[7890, 796, 277, 1, 21017, 4732, 25, 198, 90, 22866, 92, 198, 198, 21017, 6152, 25, 198, 90, 16963, 457, 92, 198, 198, 21017, 2882, 25, 198, 90, 26209, 92, 628, 685, 68, 418, 60, 366, 198]\n",
386 | "end\n"
387 | ]
388 | }
389 | ],
390 | "source": [
391 | "\n",
392 | "\n",
393 | "idx = [7890, 796, 277, 1, 21017, 4732, 25, 198, 90, 22866, 92, 198, 198, 21017, 6152, 25, 198, 90, 16963, 457, 92, 198, 198, 21017, 2882, 25, 198, 90, 26209, 92, 628, 685, 68, 418, 60]\n",
394 | "\n",
395 | "eos_token = [685, 68, 418, 60]\n",
396 | "\n",
397 | "\n",
398 | "\n",
399 | "idx = (torch.tensor(\n",
400 | " idx, dtype=torch.long, device=\"cuda\")[None, ...])\n",
401 | "eos_token = (torch.tensor(\n",
402 | " eos_token, dtype=torch.long, device=\"cuda\")[None, ...])\n",
403 | "def gen():\n",
404 | "\n",
405 | " # Check if the last 4 tokens in idx match eos_token\n",
406 | " if idx.size(1) >= eos_token.size(1) and torch.equal(idx[:, -eos_token.size(1):], eos_token):\n",
407 | " print(\"end\")\n",
408 | "\n",
409 | "gen()\n",
410 | "\n"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 19,
416 | "metadata": {},
417 | "outputs": [
418 | {
419 | "data": {
420 | "text/plain": [
421 | "(tensor([[ 7890, 796, 277, 1, 21017, 4732, 25, 198, 90, 22866,\n",
422 | " 92, 198, 198, 21017, 6152, 25, 198, 90, 16963, 457,\n",
423 | " 92, 198, 198, 21017, 2882, 25, 198, 90, 26209, 92,\n",
424 | " 628, 685, 68, 418, 60]], device='cuda:0'),\n",
425 | " tensor([[685, 68, 418, 60]], device='cuda:0'),\n",
426 | " 35,\n",
427 | " tensor([[685, 68, 418, 60]], device='cuda:0'),\n",
428 | " 4)"
429 | ]
430 | },
431 | "execution_count": 19,
432 | "metadata": {},
433 | "output_type": "execute_result"
434 | }
435 | ],
436 | "source": [
437 | "idx , eos_token , idx.size(1) , idx[:, -4:] , eos_token.size(1)"
438 | ]
439 | }
440 | ],
441 | "metadata": {
442 | "kernelspec": {
443 | "display_name": "Python 3",
444 | "language": "python",
445 | "name": "python3"
446 | },
447 | "language_info": {
448 | "codemirror_mode": {
449 | "name": "ipython",
450 | "version": 3
451 | },
452 | "file_extension": ".py",
453 | "mimetype": "text/x-python",
454 | "name": "python",
455 | "nbconvert_exporter": "python",
456 | "pygments_lexer": "ipython3",
457 | "version": "3.11.6"
458 | },
459 | "orig_nbformat": 4
460 | },
461 | "nbformat": 4,
462 | "nbformat_minor": 2
463 | }
464 |
--------------------------------------------------------------------------------
/img/jam-img.svg:
--------------------------------------------------------------------------------
1 |
118 |
--------------------------------------------------------------------------------