├── .gitignore
├── README.md
├── create_dataset.ipynb
├── filter_and_push.ipynb
├── image.png
├── synthetic_data.jsonl
└── synthetic_data_filtered.jsonl

/.gitignore:
--------------------------------------------------------------------------------
1 | /.env

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Creating a Synthetic Dataset Using Llama 3.1 405B and Nemotron 4
 2 | 
 3 | In this notebook we will use the following structure to create a synthetic dataset of instructions and Git commands.
 4 | 
 5 | 
 6 | ![alt text](image.png)
 7 | 
 8 | We will first create a set of natural-language instructions related to Git, then generate a response for each instruction.
 9 | 
10 | The instruction/response pairs will then be passed to a reward model, Nemotron 4, to filter out any bad pairs.
11 | 
12 | Finally, the filtered dataset will be pushed to HuggingFace.
13 | 
14 | ## Dataset
15 | The final dataset can be viewed here:
16 | https://huggingface.co/datasets/hesamsheikh/git-prompt

--------------------------------------------------------------------------------
/filter_and_push.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/plain": [
 11 |        "True"
 12 |       ]
 13 |      },
 14 |      "execution_count": 1,
 15 |      "metadata": {},
 16 |      "output_type": "execute_result"
 17 |     }
 18 |    ],
 19 |    "source": [
 20 |     "from rich import print\n",
 21 |     "import os\n",
 22 |     "from openai import OpenAI\n",
 23 |     "from datasets import Dataset, DatasetDict, load_dataset\n",
 24 |     "import json\n",
 25 |     "from dotenv import load_dotenv\n",
 26 |     "\n",
 27 |     "# Load environment variables from .env file\n",
 28 |     "load_dotenv()"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 2,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "client = OpenAI(\n",
 38 |     "    base_url=\"https://integrate.api.nvidia.com/v1\",\n",
 39 |     "    api_key=os.environ[\"NVIDIA_API_KEY\"]\n",
 40 |     ")"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 3,
 46 |    "metadata": {},
 47 |    "outputs": [
 48 |     {
 49 |      "data": {
 50 |       "text/html": [
 51 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">[\n",
 52 |        "    ChatCompletionTokenLogprob(token='helpfulness', bytes=None, logprob=4.15625, top_logprobs=[]),\n",
 53 |        "    ChatCompletionTokenLogprob(token='correctness', bytes=None, logprob=4.125, top_logprobs=[]),\n",
 54 |        "    ChatCompletionTokenLogprob(token='coherence', bytes=None, logprob=4.09375, top_logprobs=[]),\n",
 55 |        "    ChatCompletionTokenLogprob(token='complexity', bytes=None, logprob=0.55078125, top_logprobs=[]),\n",
 56 |        "    ChatCompletionTokenLogprob(token='verbosity', bytes=None, logprob=0.52734375, top_logprobs=[])\n",
 57 |        "]\n",
 58 |        "</pre>\n"
 59 |       ],
 60 |       "text/plain": [
 61 |        "\u001b[1m[\u001b[0m\n",
 62 |        "    \u001b[1;35mChatCompletionTokenLogprob\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtoken\u001b[0m=\u001b[32m'helpfulness'\u001b[0m, \u001b[33mbytes\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mlogprob\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1;36m.15625\u001b[0m, \u001b[33mtop_logprobs\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m,\n",
 63 |        "    \u001b[1;35mChatCompletionTokenLogprob\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtoken\u001b[0m=\u001b[32m'correctness'\u001b[0m, \u001b[33mbytes\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mlogprob\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1;36m.125\u001b[0m, \u001b[33mtop_logprobs\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m,\n",
 64 |        "    \u001b[1;35mChatCompletionTokenLogprob\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtoken\u001b[0m=\u001b[32m'coherence'\u001b[0m, \u001b[33mbytes\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mlogprob\u001b[0m=\u001b[1;36m4\u001b[0m\u001b[1;36m.09375\u001b[0m, \u001b[33mtop_logprobs\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m,\n",
 65 |        "    \u001b[1;35mChatCompletionTokenLogprob\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtoken\u001b[0m=\u001b[32m'complexity'\u001b[0m, \u001b[33mbytes\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mlogprob\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.55078125\u001b[0m, \u001b[33mtop_logprobs\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m,\n",
 66 |        "    \u001b[1;35mChatCompletionTokenLogprob\u001b[0m\u001b[1m(\u001b[0m\u001b[33mtoken\u001b[0m=\u001b[32m'verbosity'\u001b[0m, \u001b[33mbytes\u001b[0m=\u001b[3;35mNone\u001b[0m, \u001b[33mlogprob\u001b[0m=\u001b[1;36m0\u001b[0m\u001b[1;36m.52734375\u001b[0m, \u001b[33mtop_logprobs\u001b[0m=\u001b[1m[\u001b[0m\u001b[1m]\u001b[0m\u001b[1m)\u001b[0m\n",
 67 |        "\u001b[1m]\u001b[0m\n"
 68 |       ]
 69 |      },
 70 |      "metadata": {},
 71 |      "output_type": "display_data"
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "messages = [\n",
 76 |     "    {\n",
 77 |     "        \"role\": \"user\",\n",
 78 |     "        \"content\": \"What is the capital of Iran?\"\n",
 79 |     "    },\n",
 80 |     "    {\n",
 81 |     "        \"role\": \"assistant\",\n",
 82 |     "        \"content\": \"The capital of Iran is Tehran.\"\n",
 83 |     "    },\n",
 84 |     "]\n",
 85 |     "\n",
 86 |     "response = client.chat.completions.create(\n",
 87 |     "    model=\"nvidia/nemotron-4-340b-reward\",\n",
 88 |     "    messages=messages,\n",
 89 |     ")\n",
 90 |     "\n",
 91 |     "print(response.choices[0].logprobs.content)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 4,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "data": {
101 |       "text/html": [
102 |        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">{\n",
103 |        "    'helpfulness': 4.15625,\n",
104 |        "    'correctness': 4.125,\n",
105 |        "    'coherence': 4.09375,\n",
106 |        "    'complexity': 0.55078125,\n",
107 |        "    'verbosity': 0.52734375\n",
108 |        "}\n",
109 |        "</pre>\n"
110 |       ],
111 |       "text/plain": [
112 |        "\u001b[1m{\u001b[0m\n",
113 |        "    \u001b[32m'helpfulness'\u001b[0m: \u001b[1;36m4.15625\u001b[0m,\n",
114 |        "    \u001b[32m'correctness'\u001b[0m: \u001b[1;36m4.125\u001b[0m,\n",
115 |        "    \u001b[32m'coherence'\u001b[0m: \u001b[1;36m4.09375\u001b[0m,\n",
116 |        "    \u001b[32m'complexity'\u001b[0m: \u001b[1;36m0.55078125\u001b[0m,\n",
117 |        "    \u001b[32m'verbosity'\u001b[0m: \u001b[1;36m0.52734375\u001b[0m\n",
118 |        "\u001b[1m}\u001b[0m\n"
119 |       ]
120 |      },
121 |      "metadata": {},
122 |      "output_type": "display_data"
123 |     }
124 |    ],
125 |    "source": [
126 |     "def get_scores_from_response(score_response_template):\n",
127 |     "    logprobs = score_response_template.choices[0].logprobs.content\n",
128 |     "    score_dict = {}\n",
129 |     "    for score in logprobs:\n",
130 |     "        score_dict[score.token] = score.logprob\n",
131 |     "    return score_dict\n",
132 |     "\n",
133 |     "print(get_scores_from_response(response))"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 5,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "def get_response_and_scores(client, model, question, response_content):\n",
143 |     "    messages = [\n",
144 |     "        {\n",
145 |     "            \"role\": \"user\",\n",
146 |     "            \"content\": question\n",
147 |     "        },\n",
148 |     "        {\n",
149 |     "            \"role\": \"assistant\",\n",
150 |     "            \"content\": response_content\n",
151 |     "        },\n",
152 |     "    ]\n",
153 |     "\n",
154 |     "    response = client.chat.completions.create(\n",
155 |     "        model=model,\n",
156 |     "        messages=messages,\n",
157 |     "    )\n",
158 |     "\n",
159 |     "    scores = get_scores_from_response(response)\n",
160 |     "    return scores"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 64,
166 |    "metadata": {},
167 |    "outputs": [
168 |     {
169 |      "data": {
170 |       "text/plain": [
171 |        "335"
172 |       ]
173 |      },
174 |      "execution_count": 64,
175 |      "metadata": {},
176 |      "output_type": "execute_result"
177 |     }
178 |    ],
179 |    "source": [
180 |     "import json\n",
181 |     "\n",
182 |     "def read_synthetic_data(file_path):\n",
183 |     "    data_list = []\n",
184 |     "    with open(file_path, 'r') as file:\n",
185 |     "        for line in file:\n",
186 |     "            entry = json.loads(line)\n",
187 |     "            one_line_dict = {}\n",
188 |     "            one_line_dict[\"instruction\"] = entry['instruction']\n",
189 |     "            one_line_dict[\"response\"] = entry['responses']\n",
190 |     "            data_list.append(one_line_dict)\n",
191 |     "    return data_list\n",
192 |     "\n",
193 |     "synthetic_data = read_synthetic_data(\"synthetic_data.jsonl\")\n",
194 |     "len(synthetic_data)"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 7,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/plain": [
205 |        "{'instruction': 'Create a new branch for my feature',\n",
206 |        " 'response': \"To create a new branch for your feature, use the following Git command:\\n\\n`git branch feature/your-feature-name`\\n\\nReplace `your-feature-name` with a descriptive name for your feature. This will create a new branch, but you'll still be on the current branch.\\n\\nTo switch to the new branch, use:\\n\\n`git checkout feature/your-feature-name`\\n\\nAlternatively, you can create and switch to the new branch in one step with:\\n\\n`git checkout -b feature/your-feature-name`\"}"
207 |       ]
208 |      },
209 |      "execution_count": 7,
210 |      "metadata": {},
211 |      "output_type": "execute_result"
212 |     }
213 |    ],
214 |    "source": [
215 |     "synthetic_data[0]"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 8,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "def process_instruction_response_score(client, model, synthetic_data):\n",
225 |     "    score_list = []\n",
226 |     "    for instruction_response_pair in synthetic_data:\n",
227 |     "        instruction = instruction_response_pair[\"instruction\"]\n",
228 |     "        response = instruction_response_pair[\"response\"]\n",
229 |     "        score = get_response_and_scores(client, model, instruction, response)\n",
230 |     "        score_list.append(score)\n",
231 |     "    return score_list"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 9,
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "score_list = process_instruction_response_score(client, \"nvidia/nemotron-4-340b-reward\", synthetic_data)"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 65,
246 |    "metadata": {},
247 |    "outputs": [
248 |     {
249 |      "data": {
250 |       "text/plain": [
251 |        "322"
252 |       ]
253 |      },
254 |      "execution_count": 65,
255 |      "metadata": {},
256 |      "output_type": "execute_result"
257 |     }
258 |    ],
259 |    "source": [
260 |     "helpfulness_THRESHOLD = 3\n",
261 |     "verbosity_THRESHOLD = 2.5\n",
262 |     "synthetic_data = [data for i, data in enumerate(synthetic_data) \n",
263 |     "                  if not (score_list[i][\"helpfulness\"] < helpfulness_THRESHOLD or \n",
264 |     "                          score_list[i][\"verbosity\"] > verbosity_THRESHOLD)]\n",
265 |     "len(synthetic_data)"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 67,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": [
274 |     "with open('synthetic_data_filtered.jsonl', 'w') as f:\n",
275 |     "    for item in synthetic_data:\n",
276 |     "        f.write(json.dumps(item))\n",
277 |     "        f.write('\\n')"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": 71,
283 |    "metadata": {},
284 |    "outputs": [
285 |     {
286 |      "data": {
287 |       "application/vnd.jupyter.widget-view+json": {
288 |        "model_id": "56874f3fc89e443f97b34bf2e01600b8",
289 |        "version_major": 2,
290 |        "version_minor": 0
291 |       },
292 |       "text/plain": [
293 |        "VBox(children=(HTML(value='<center> <img