├── .gitignore ├── LICENSE ├── README.md ├── data-generation ├── dbrx_script.py ├── gemini_script.py ├── llama_script.py ├── mixtral_script.py ├── run-dbrx.sbatch ├── run-llama.sbatch └── run-mixtral.sbatch ├── fine-tuning ├── arguments.py ├── data_utils.py ├── ft.py ├── get_rank.sh ├── graph.py ├── run_ft.sbatch └── test_hf_model.py └── v1 ├── README.md ├── analysis ├── README.md ├── clean-data.bash ├── ds_config_zero1.json ├── ds_config_zero2.json ├── ds_config_zero3.json ├── generate_text-zaratan.sbatch ├── generate_text.py ├── omp_tests-zaratan.sbatch ├── omp_tests.py ├── parse_losses.py ├── plot_training_results.py ├── prepare-data.bash ├── run_clm-evaluate-zaratan.sbatch ├── run_clm-zaratan.sbatch ├── run_clm.py ├── train-tokenizer.py ├── train.py ├── train.sbatch └── training-results │ ├── gpt-neo-eval-results.csv │ ├── gpt-neo-training-results.csv │ ├── gpt2-medium-eval-results.csv │ ├── gpt2-medium-training-results.csv │ ├── polycoder-eval-results.csv │ └── polycoder-training-results.csv ├── code-gen-tests ├── codegen_tests-zaratan.sbatch └── codegen_tests.py └── data ├── README.md ├── clone-repos.py ├── collect-dataset.py ├── collect-repo-metadata.bash ├── collect-repo-metadata.py ├── copy-repos.bash ├── create-omp-dataset.py ├── dataset_utils.py ├── edit-metadata.py ├── generate-all-repo-plots.bash ├── repo-plots.py ├── repos-gt3.csv └── repos-gt5.csv /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .env 3 | *.png 4 | *.pkl 5 | slurm-*.out 6 | hpc-tok -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2022-2024, Parallel Software and Systems Group, University of 2 | Maryland. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a 5 | copy of this software and associated documentation files (the "Software"), 6 | to deal in the Software without restriction, including without limitation 7 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 | and/or sell copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 | DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HPC-Coder-v2 2 | 3 | The HPC-Coder-v2-6.7b model is an HPC code LLM fine-tuned on an instruction 4 | dataset catered to common HPC topics such as parallelism, optimization, 5 | accelerator porting, etc. This version is a fine-tuning of the [Deepseek Coder 6 | 6.7b](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base) model. 
It is 7 | fine-tuned on the 8 | [hpc-instruct](https://huggingface.co/datasets/hpcgroup/hpc-instruct), 9 | [oss-instruct](https://huggingface.co/datasets/ise-uiuc/Magicoder-OSS-Instruct-75K), 10 | and 11 | [evol-instruct](https://huggingface.co/datasets/nickrosh/Evol-Instruct-Code-80k-v1) 12 | datasets. We used the distributed training library 13 | [AxoNN](https://github.com/axonn-ai/axonn) to fine-tune in parallel across many 14 | GPUs. 15 | 16 | HPC-Coder-v2-6.7b is the best-performing LLM under 30B parameters on the 17 | [ParEval](https://github.com/parallelcodefoundry/ParEval) parallel code 18 | generation benchmark in terms of _correctness_ and _performance_. It scores 19 | similarly to 34B models such as Phind-V2 and to commercial models such as GPT-4 on parallel code 20 | generation. 21 | 22 | ## Using HPC-Coder-v2 23 | 24 | The model is provided as a standard huggingface model with safetensors weights. 25 | The weights are available on 26 | [huggingface](https://huggingface.co/hpcgroup/hpc-coder-v2-6.7b). It can be used 27 | with [transformers 28 | pipelines](https://huggingface.co/docs/transformers/en/main_classes/pipelines), 29 | [vllm](https://github.com/vllm-project/vllm), or any other standard model 30 | inference framework. HPC-Coder-v2 is an instruct model, so prompts need to be 31 | formatted as instructions for best results. It was trained with the following 32 | instruct template: 33 | 34 | ```md 35 | Below is an instruction that describes a task. Write a response that appropriately completes the request. 36 | 37 | ### Instruction: 38 | {instruction} 39 | 40 | ### Response: 41 | 42 | ``` 43 | 44 | ## Quantized Models 45 | 46 | 4-bit and 8-bit quantized weights are available in the GGUF format for use with 47 | [llama.cpp](https://github.com/ggerganov/llama.cpp). The 4-bit model requires 48 | ~3.8 GB of memory and can be found 49 | [here](https://huggingface.co/hpcgroup/hpc-coder-v2-6.7b-Q4_K_S-GGUF). The 8-bit 50 | model requires ~7.1 GB of memory and can be found 51 | [here](https://huggingface.co/hpcgroup/hpc-coder-v2-6.7b-Q8_0-GGUF). Further 52 | information on how to use them with llama.cpp can be found in [its 53 | documentation](https://github.com/ggerganov/llama.cpp). 54 | 55 | ## Evaluation 56 | 57 | We evaluated the model on the ParEval benchmark for parallel code generation. It 58 | scores a pass@1 of 31.17 on parallel code generation tasks including OpenMP, 59 | MPI, MPI+OpenMP, CUDA, HIP, and Kokkos. This makes it the best-performing open-source 60 | model on ParEval under 30B parameters. Furthermore, it performs similarly to the 61 | 34B-parameter Phind-V2-34B (pass@1 = 32.12) and to GPT-4 (pass@1 = 37.75). 62 | Check out [ParEval](https://github.com/parallelcodefoundry/ParEval) for more 63 | information.
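## Example Usage

As a minimal sketch of how the model might be queried, the snippet below formats a request with the instruct template shown above and generates a response through a `transformers` text-generation pipeline. The model id comes from the weights link above; the example instruction and the sampling settings (temperature, top-p, token budget) are illustrative assumptions rather than recommended values.

```python
# Minimal usage sketch; sampling settings are illustrative, not tuned.
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="hpcgroup/hpc-coder-v2-6.7b",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The instruct template the model was fine-tuned with (see above).
TEMPLATE = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{instruction}\n\n### Response:\n"
)

prompt = TEMPLATE.format(
    instruction="Parallelize this loop with OpenMP: for (int i = 0; i < n; i++) y[i] = a * x[i] + y[i];"
)

outputs = pipe(
    prompt,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    return_full_text=False,  # only return the generated response
)
print(outputs[0]["generated_text"])
```

The same prompt formatting applies when serving the model with vllm or llama.cpp; only the inference backend changes.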
64 | -------------------------------------------------------------------------------- /data-generation/dbrx_script.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ray 3 | from vllm import LLM, SamplingParams 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | # std imports 6 | from collections import Counter 7 | import json 8 | import os 9 | import time 10 | from typing import Optional 11 | # tpl imports 12 | from alive_progress import alive_bar 13 | import datasets 14 | 15 | seed_dataset_id = 'hpcgroup/hpc-stack-seeds' 16 | max_new_tokens = 2048 17 | random_seed = 42 18 | language_counter = Counter() 19 | total_samples = 40000 20 | prompt_template_1 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 21 | Please gain inspiration from the random code snippet below to create a high-quality programming problem. Be creative. Present your output in two distinct sections: **Problem Description** and **Solution**. 22 | Provide code in the **Solution** section that solve the problem you describe in the **Problem Description** section. 23 | You must use the following code snippet as inspiration for the problem you describe in the **Problem Description** section: 24 | {seed} 25 | Lines for each section of output: 26 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 27 | 2. **Solution**: Offer a comprehensive, **correct**, **optimal** solution that accurately addresses the **Problem Description** you provided. The provided code should be **fast** and **efficient**.""" 28 | 29 | prompt_template_2 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 30 | Please gain inspiration from the following random code snippet to create a high-quality code optimization problem. Be creative.Present your output in two distinct sections: **Problem Description** and **Solution**. 31 | Code snippet for inspiration: 32 | {seed} 33 | lines for each section: 34 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require optimizing a piece of code. Provide this code to optimize in the problem description. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 35 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and optimizes the code. Include the optimized code.""" 36 | 37 | prompt_template_3 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 38 | Please gain inspiration from the following random code snippet to create a high-quality code translation problem. Present your output in two distinct sections: **Problem Description** and **Solution**. 39 | Code snippet for inspiration: 40 | {seed} 41 | lines for each section: 42 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. 
The problem should require translating code between execution models (i.e. translating cuda to openmp or openmp to mpi or mpi to cuda or cuda to raja or raja to mpi or mpi to kokkos). Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 43 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and translates the code. Include the translated code.""" 44 | 45 | prompt_template_4 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 46 | Please gain inspiration from the following random code snippet to create a high-quality code parallelization problem.Present your output in two distinct sections: **Problem Description** and **Solution**. 47 | Code snippet for inspiration: 48 | {seed} 49 | lines for each section: 50 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require parallelizing a piece of code. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 51 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and parallelizes the code. Include the parallel code.""" 52 | results=[] 53 | model_id = "databricks/dbrx-instruct" 54 | """tokenizer = AutoTokenizer.from_pretrained(model_id,trust_remote_code=True, token="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") 55 | model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto",trust_remote_code=True, token="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")""" 56 | ray.init(num_gpus=4) 57 | if ray.is_initialized(): 58 | ray.shutdown() 59 | llm = LLM(model="databricks/dbrx-instruct",gpu_memory_utilization=0.95, max_num_seqs=1,swap_space=1,tensor_parallel_size=4) 60 | print ("model loaded") 61 | tokenizer = llm.get_tokenizer() 62 | def postprocess(input_text: str) -> str: 63 | """ Postprocess the model output to return the text from each section. 64 | This is accomplished by finding lines that contain each section header. 
65 | """ 66 | lines = input_text.splitlines() 67 | problem_keyword = "**Problem Description:**" 68 | solution_keyword = "**Solution:**" 69 | 70 | if(input_text.find(problem_keyword) == -1 or input_text.find(solution_keyword) == -1): 71 | raise ValueError(f"All sections not present") 72 | 73 | # Find the starting index of each section 74 | problem_start = input_text.find(problem_keyword) + len(problem_keyword) 75 | solution_start = input_text.find(solution_keyword) + len(solution_keyword) 76 | 77 | # Extract the sections 78 | problem_description = input_text[problem_start:solution_start - len(solution_keyword)].strip() 79 | solution = input_text[solution_start:].strip() 80 | return problem_description, solution 81 | 82 | def generate_output(prompts,total,sum) -> str: 83 | conversations = tokenizer.apply_chat_template( 84 | prompts, 85 | tokenize=False, 86 | add_generation_prompt=True, 87 | ) 88 | outputs = llm.generate(conversations, 89 | SamplingParams( 90 | temperature=0.8, 91 | top_p=0.95, 92 | max_tokens=4096, 93 | stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|im_end|>")], 94 | ) 95 | ) 96 | for output in outputs: 97 | time = (output.metrics.finished_time)-(output.metrics.first_token_time) 98 | total+=len(output.outputs[0].token_ids)/time 99 | sum.append(len(output.outputs[0].token_ids)/time) 100 | print(f"Tokens/second = ",len(output.outputs[0].token_ids)/time) 101 | return output.outputs[0].text 102 | 103 | seed_dataset = datasets.load_dataset(seed_dataset_id, split='train', streaming=True).shuffle(seed=random_seed, buffer_size=50) 104 | """ 105 | langs=[] 106 | seeds=[]""" 107 | i = 0 108 | total=0 109 | sum=[] 110 | with alive_bar(total_samples) as bar: 111 | bar(len(results), skipped=True) 112 | for element in seed_dataset: 113 | prompts=[] 114 | if(i > 46000): 115 | seed = element['text'] 116 | try: 117 | if(i % 4 == 0): 118 | prompt = [{"role": "user", "content":prompt_template_4.format(seed=seed)}] 119 | prompts.append(prompt) 120 | generated_text = generate_output(prompts,total,sum) 121 | problem_statement, solution = postprocess(generated_text) 122 | elif(i % 3 == 0): 123 | prompt = [{"role": "user", "content":prompt_template_3.format(seed=seed)}] 124 | prompts.append(prompt) 125 | generated_text = generate_output(prompts,total,sum) 126 | problem_statement, solution = postprocess(generated_text) 127 | elif(i % 2 == 0): 128 | prompt = [{"role": "user", "content":prompt_template_2.format(seed=seed)}] 129 | prompts.append(prompt) 130 | generated_text = generate_output(prompts,total,sum) 131 | problem_statement, solution = postprocess(generated_text) 132 | else: 133 | prompt = [{"role": "user", "content":prompt_template_1.format(seed=seed)}] 134 | prompts.append(prompt) 135 | generated_text = generate_output(prompts,total,sum) 136 | problem_statement, solution = postprocess(generated_text) 137 | except Exception as e: 138 | print("Error:{e}") 139 | continue 140 | bar() 141 | results.append({ 142 | "language": element['lang'], 143 | "seed": seed, 144 | "problem statement": problem_statement, 145 | "solution": solution, 146 | "model": "dbrx-instruct" 147 | }) 148 | if(i % 100 == 0): 149 | with open('dbrx-outputs-2.json', 'w') as fp: 150 | json.dump(results, fp) 151 | if(i == 50000): 152 | break 153 | i=i+1 154 | with open('dbrx-outputs-2.json', 'w') as fp: 155 | json.dump(results, fp) 156 | datasets.Dataset.from_list(results).push_to_hub('hpcgroup/hpc-synthetic-dbrx-2', token='HF write token') 157 | 
-------------------------------------------------------------------------------- /data-generation/gemini_script.py: -------------------------------------------------------------------------------- 1 | """ Use Gemini API to create a synthetic dataset of performance data. 2 | The model be given an inspiration prompt and be asked to write a 3 | problem statement and one solution. 4 | Prompts are divided into 4 sections: 5 | original(where we just get a problem and solution based on the code snippet), 6 | optimization(where we ask the model to provide an optimization problem and solution), 7 | translation(translating from one language to another eg. CUDA to openmp), 8 | parallelization(parallelize serial code) 9 | """ 10 | 11 | # std imports 12 | from collections import Counter 13 | import json 14 | import os 15 | import time 16 | from typing import Optional 17 | 18 | # tpl imports 19 | from alive_progress import alive_bar 20 | import datasets 21 | import google.generativeai as genai 22 | 23 | seed_dataset_id = 'hpcgroup/hpc-stack-seeds' 24 | 25 | max_new_tokens = 2048 26 | random_seed = 42 27 | language_counter = Counter() 28 | total_samples = 14000 29 | prompt_template_1 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 30 | Please gain inspiration from the random code snippet below to create a high-quality programming problem. Be creative. Present your output in two distinct sections: [Problem Description] and [Solution]. 31 | Provide code in the [Solution] section that solve the problem you describe in the [Problem Description] section. 32 | You must use the following code snippet as inspiration for the problem you describe in the [Problem Description] section: 33 | {seed} 34 | Lines for each section of output: 35 | 1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 36 | 2. [Solution]: Offer a comprehensive, **correct**, **optimal** solution that accurately addresses the [Problem Description] you provided. The provided code should be **fast** and **efficient**. 37 | """ 38 | 39 | prompt_template_2 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 40 | Please gain inspiration from the following random code snippet to create a high-quality code optimization problem. Be creative.Present your output in two distinct sections: [Problem Description] and [Solution]. 41 | Code snippet for inspiration: 42 | {seed} 43 | lines for each section: 44 | 1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require optimizing a piece of code. Provide this code to optimize in the problem description. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 45 | 2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided and optimizes the code. Include the optimized code. 46 | """ 47 | 48 | prompt_template_3 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 
49 | Please gain inspiration from the following random code snippet to create a high-quality code translation problem. Present your output in two distinct sections: [Problem Description] and [Solution]. 50 | Code snippet for inspiration: 51 | {seed} 52 | lines for each section: 53 | 1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require translating code between execution models (i.e. translating cuda to openmp or openmp to mpi). Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 54 | 2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided and translates the code. Include the translated code. 55 | """ 56 | 57 | prompt_template_4 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 58 | Please gain inspiration from the following random code snippet to create a high-quality code parallelization problem. Present your output in two distinct sections: [Problem Description] and [Solution]. 59 | Code snippet for inspiration: 60 | {seed} 61 | lines for each section: 62 | 1. [Problem Description]: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require parallelizing a piece of code. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 63 | 2. [Solution]: Offer a comprehensive, **correct** solution that accurately addresses the [Problem Description] you provided and parallelizes the code. Include the parallel code. 64 | """ 65 | 66 | genai.configure(api_key='AIzaSyDoeDru21t5NPzLgiM5IMTsJ1FgOVz6MSw') 67 | config = genai.types.GenerationConfig( 68 | candidate_count=1, 69 | temperature=0.6, 70 | max_output_tokens=max_new_tokens, 71 | ) 72 | model = genai.GenerativeModel("gemini-1.0-pro", generation_config=config) 73 | 74 | def get_gemini_model_output(model, prompt: str) -> Optional[str]: 75 | """ Query the Gemini API to get the model output for the given prompt. 76 | """ 77 | completion = model.generate_content(prompt) 78 | if completion.candidates[0].finish_reason == 1: 79 | return completion.text.strip() 80 | else: 81 | return None 82 | 83 | def postprocess(output: str) -> str: 84 | """ Postprocess the model output to return the text from each section. 85 | This is accomplished by finding lines that contain each section header. 
86 | """ 87 | lines = output.splitlines() 88 | problem_statement_line_idx = [idx for idx, line in enumerate(lines) if '[Problem Description]' in line][0] 89 | solution_line_idx = [idx for idx, line in enumerate(lines) if '[Solution]' in line][0] 90 | 91 | problem_statement_str = '\n'.join(lines[problem_statement_line_idx+1:solution_line_idx]) 92 | solution_str = '\n'.join(lines[solution_line_idx+1:]) 93 | return problem_statement_str, solution_str 94 | 95 | seed_dataset = datasets.load_dataset(seed_dataset_id, split='train', streaming=True).shuffle(seed=random_seed, buffer_size=50) 96 | i = 1 97 | results = [] 98 | with alive_bar(total_samples) as bar: 99 | bar(len(results), skipped=True) 100 | for element in seed_dataset: 101 | if(i > 26000): 102 | seed = element['text'] 103 | 104 | try: 105 | if(i % 4 == 0): 106 | prompt = prompt_template_4.format(seed=seed) 107 | output = get_gemini_model_output(model, prompt) 108 | problem_statement, solution = postprocess(output) 109 | elif(i % 3 == 0): 110 | prompt = prompt_template_3.format(seed=seed) 111 | output = get_gemini_model_output(model, prompt) 112 | problem_statement, solution = postprocess(output) 113 | elif(i % 2 == 0): 114 | prompt = prompt_template_2.format(seed=seed) 115 | output = get_gemini_model_output(model, prompt) 116 | problem_statement, solution = postprocess(output) 117 | else: 118 | prompt = prompt_template_1.format(seed=seed) 119 | output = get_gemini_model_output(model, prompt) 120 | problem_statement, solution = postprocess(output) 121 | except Exception as e: 122 | print("Error:{e}") 123 | print('Sleeping for 5 seconds...') 124 | time.sleep(5) 125 | continue 126 | results.append({ 127 | "language": element['lang'], 128 | "seed": seed, 129 | "problem statement": problem_statement, 130 | "solution": solution, 131 | "model": "gemini-1.0-pro" 132 | }) 133 | bar() 134 | if i % 100 == 0: 135 | print(i) 136 | time.sleep(5) 137 | 138 | # cache it intermittently in case something fails, so we don't lose 139 | # expensive API calls 140 | with open('gemini-outputs-2.json', 'w') as fp: 141 | json.dump(results, fp) 142 | i=i+1 143 | 144 | datasets.Dataset.from_list(results).push_to_hub('hpcgroup/hpc-synthetic-gemini',token='HF write token') 145 | -------------------------------------------------------------------------------- /data-generation/llama_script.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ray 3 | from vllm import LLM, SamplingParams 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | # std imports 6 | from collections import Counter 7 | import json 8 | import os 9 | import time 10 | from typing import Optional 11 | # tpl imports 12 | from alive_progress import alive_bar 13 | import datasets 14 | 15 | seed_dataset_id = 'hpcgroup/hpc-stack-seeds' 16 | max_new_tokens = 2048 17 | random_seed = 42 18 | language_counter = Counter() 19 | total_samples = 40000 20 | prompt_template_1 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 21 | Please gain inspiration from the random code snippet below to create a high-quality programming problem. Be creative. Present your output in two distinct sections: **Problem Description** and **Solution**. 22 | Provide code in the **Solution** section that solve the problem you describe in the **Problem Description** section. 
23 | You must use the following code snippet as inspiration for the problem you describe in the **Problem Description** section: 24 | {seed} 25 | Lines for each section of output: 26 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 27 | 2. **Solution**: Offer a comprehensive, **correct**, **optimal** solution that accurately addresses the **Problem Description** you provided. The provided code should be **fast** and **efficient**.""" 28 | 29 | prompt_template_2 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 30 | Please gain inspiration from the following random code snippet to create a high-quality code optimization problem. Be creative.Present your output in two distinct sections: **Problem Description** and **Solution**. 31 | Code snippet for inspiration: 32 | {seed} 33 | lines for each section: 34 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require optimizing a piece of code. Provide this code to optimize in the problem description. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 35 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and optimizes the code. Include the optimized code.""" 36 | 37 | prompt_template_3 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 38 | Please gain inspiration from the following random code snippet to create a high-quality code translation problem. Present your output in two distinct sections: **Problem Description** and **Solution**. 39 | Code snippet for inspiration: 40 | {seed} 41 | lines for each section: 42 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require translating code between execution models (i.e. translating cuda to openmp or openmp to mpi or mpi to cuda or cuda to raja or raja to mpi or mpi to kokkos). Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 43 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and translates the code. Include the translated code.""" 44 | 45 | prompt_template_4 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 46 | Please gain inspiration from the following random code snippet to create a high-quality code parallelization problem.Present your output in two distinct sections: **Problem Description** and **Solution**. 47 | Code snippet for inspiration: 48 | {seed} 49 | lines for each section: 50 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require parallelizing a piece of code. 
Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 51 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and parallelizes the code. Include the parallel code.""" 52 | results=[] 53 | model_id = "meta-llama/Meta-Llama-3-70B-Instruct" 54 | ray.init(num_gpus=2) 55 | if ray.is_initialized(): 56 | ray.shutdown() 57 | llm = LLM(model="meta-llama/Meta-Llama-3-70B-Instruct",gpu_memory_utilization=0.95, max_num_seqs=1,swap_space=1,tensor_parallel_size=2) 58 | print ("model loaded") 59 | tokenizer = llm.get_tokenizer() 60 | def postprocess(input_text: str) -> str: 61 | """ Postprocess the model output to return the text from each section. 62 | This is accomplished by finding lines that contain each section header. 63 | """ 64 | lines = input_text.splitlines() 65 | problem_keyword = "**Problem Description**" 66 | solution_keyword = "**Solution**" 67 | if(input_text.find(problem_keyword) == -1): 68 | problem_keyword = "**Problem Description:**" 69 | if(input_text.find(solution_keyword) == -1): 70 | solution_keyword = "**Solution:**" 71 | 72 | if(input_text.find(problem_keyword) == -1 or input_text.find(solution_keyword) == -1): 73 | raise ValueError(f"All sections not present") 74 | 75 | # Find the starting index of each section 76 | problem_start = input_text.find(problem_keyword) + len(problem_keyword) 77 | solution_start = input_text.find(solution_keyword) + len(solution_keyword) 78 | 79 | # Extract the sections 80 | problem_description = input_text[problem_start:solution_start - len(solution_keyword)].strip() 81 | solution = input_text[solution_start:].strip() 82 | return problem_description, solution 83 | 84 | def generate_output(prompts,total,sum) -> str: 85 | conversations = tokenizer.apply_chat_template( 86 | prompts, 87 | tokenize=False, 88 | add_generation_prompt=True, 89 | ) 90 | outputs = llm.generate(conversations, 91 | SamplingParams( 92 | temperature=0.8, 93 | top_p=0.95, 94 | max_tokens=4096, 95 | stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")], 96 | ) 97 | ) 98 | for output in outputs: 99 | time = (output.metrics.finished_time)-(output.metrics.first_token_time) 100 | total+=len(output.outputs[0].token_ids)/time 101 | sum.append(len(output.outputs[0].token_ids)/time) 102 | print(f"Tokens/second = ",len(output.outputs[0].token_ids)/time) 103 | return output.outputs[0].text 104 | 105 | seed_dataset = datasets.load_dataset(seed_dataset_id, split='train', streaming=True).shuffle(seed=random_seed, buffer_size=50) 106 | i = 0 107 | total=0 108 | sum=[] 109 | with alive_bar(total_samples) as bar: 110 | bar(len(results), skipped=True) 111 | for element in seed_dataset: 112 | prompts=[] 113 | if(i > 57000): 114 | seed = element['text'] 115 | try: 116 | if(i % 4 == 0): 117 | prompt = [{"role": "user", "content":prompt_template_4.format(seed=seed)}] 118 | elif(i % 3 == 0): 119 | prompt = [{"role": "user", "content":prompt_template_3.format(seed=seed)}] 120 | elif(i % 2 == 0): 121 | prompt = [{"role": "user", "content":prompt_template_2.format(seed=seed)}] 122 | else: 123 | prompt = [{"role": "user", "content":prompt_template_1.format(seed=seed)}] 124 | prompts.append(prompt) 125 | generated_text = generate_output(prompts,total,sum) 126 | problem_statement, solution = postprocess(generated_text) 127 | except Exception as e: 128 | print("Error:{e}") 129 | continue 130 | bar() 131 | 
results.append({ 132 | "language": element['lang'], 133 | "seed": seed, 134 | "problem statement": problem_statement, 135 | "solution": solution, 136 | "model": "Meta-Llama-3-70B-Instruct" 137 | }) 138 | if(i % 100 == 0): 139 | with open('llama-outputs-3.json', 'w') as fp: 140 | json.dump(results, fp) 141 | if(i == 59400): 142 | break 143 | i=i+1 144 | with open('llama-outputs-3.json', 'w') as fp: 145 | json.dump(results, fp) 146 | -------------------------------------------------------------------------------- /data-generation/mixtral_script.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import ray 3 | from vllm import LLM, SamplingParams 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | # std imports 6 | from collections import Counter 7 | import json 8 | import os 9 | import time 10 | from typing import Optional 11 | # tpl imports 12 | from alive_progress import alive_bar 13 | import datasets 14 | 15 | seed_dataset_id = 'hpcgroup/hpc-stack-seeds' 16 | max_new_tokens = 2048 17 | random_seed = 42 18 | language_counter = Counter() 19 | total_samples = 40000 20 | prompt_template_1 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 21 | Please gain inspiration from the random code snippet below to create a high-quality programming problem. Be creative. Present your output in two distinct sections: **Problem Description** and **Solution**. 22 | Provide code in the **Solution** section that solve the problem you describe in the **Problem Description** section. 23 | You must use the following code snippet as inspiration for the problem you describe in the **Problem Description** section: 24 | {seed} 25 | Lines for each section of output: 26 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 27 | 2. **Solution**: Offer a comprehensive, **correct**, **optimal** solution that accurately addresses the **Problem Description** you provided. The provided code should be **fast** and **efficient**.""" 28 | 29 | prompt_template_2 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 30 | Please gain inspiration from the following random code snippet to create a high-quality code optimization problem. Be creative.Present your output in two distinct sections: **Problem Description** and **Solution**. 31 | Code snippet for inspiration: 32 | {seed} 33 | lines for each section: 34 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require optimizing a piece of code. Provide this code to optimize in the problem description. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 35 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and optimizes the code. Include the optimized code.""" 36 | 37 | prompt_template_3 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 
38 | Please gain inspiration from the following random code snippet to create a high-quality code translation problem. Present your output in two distinct sections: **Problem Description** and **Solution**. 39 | Code snippet for inspiration: 40 | {seed} 41 | lines for each section: 42 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require translating code between execution models (i.e. translating cuda to openmp or openmp to mpi or mpi to cuda or cuda to raja or raja to mpi or mpi to kokkos). Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 43 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and translates the code. Include the translated code.""" 44 | 45 | prompt_template_4 = """You are exceptionally skilled at crafting high-quality programming problems and offering precise solutions. 46 | Please gain inspiration from the following random code snippet to create a high-quality code parallelization problem.Present your output in two distinct sections: **Problem Description** and **Solution**. 47 | Code snippet for inspiration: 48 | {seed} 49 | lines for each section: 50 | 1. **Problem Description**: This should be **completely self-contained**, providing all the contextual information one needs to understand and solve the problem. The problem should require parallelizing a piece of code. Assume common programming knowledge, but ensure that any specific context, variables, or code snippets pertinent to this problem are explicitly included. 51 | 2. **Solution**: Offer a comprehensive, **correct** solution that accurately addresses the **Problem Description** you provided and parallelizes the code. Include the parallel code.""" 52 | results=[] 53 | model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1" 54 | """tokenizer = AutoTokenizer.from_pretrained(model_id,trust_remote_code=True, token="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") 55 | model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto",trust_remote_code=True, token="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")""" 56 | ray.init(num_gpus=2) 57 | if ray.is_initialized(): 58 | ray.shutdown() 59 | llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1",gpu_memory_utilization=0.95, max_num_seqs=1,swap_space=1,tensor_parallel_size=2) 60 | print ("model loaded") 61 | tokenizer = llm.get_tokenizer() 62 | def postprocess(input_text: str) -> str: 63 | """ Postprocess the model output to return the text from each section. 64 | This is accomplished by finding lines that contain each section header. 
65 | """ 66 | lines = input_text.splitlines() 67 | problem_keyword = "**Problem Description**" 68 | solution_keyword = "**Solution**" 69 | if(input_text.find(problem_keyword) == -1): 70 | problem_keyword = "**Problem Description:**" 71 | if(input_text.find(solution_keyword) == -1): 72 | solution_keyword = "**Solution:**" 73 | 74 | if(input_text.find(problem_keyword) == -1 or input_text.find(solution_keyword) == -1): 75 | raise ValueError(f"All sections not present") 76 | 77 | # Find the starting index of each section 78 | problem_start = input_text.find(problem_keyword) + len(problem_keyword) 79 | solution_start = input_text.find(solution_keyword) + len(solution_keyword) 80 | 81 | # Extract the sections 82 | problem_description = input_text[problem_start:solution_start - len(solution_keyword)].strip() 83 | solution = input_text[solution_start:].strip() 84 | return problem_description, solution 85 | 86 | def generate_output(prompts,total,sum) -> str: 87 | conversations = tokenizer.apply_chat_template( 88 | prompts, 89 | tokenize=False, 90 | add_generation_prompt=True, 91 | ) 92 | outputs = llm.generate(conversations, 93 | SamplingParams( 94 | temperature=0.8, 95 | top_p=0.95, 96 | max_tokens=4096, 97 | stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|im_end|>")], 98 | ) 99 | ) 100 | for output in outputs: 101 | time = (output.metrics.finished_time)-(output.metrics.first_token_time) 102 | total+=len(output.outputs[0].token_ids)/time 103 | sum.append(len(output.outputs[0].token_ids)/time) 104 | print(f"Tokens/second = ",len(output.outputs[0].token_ids)/time) 105 | return output.outputs[0].text 106 | 107 | seed_dataset = datasets.load_dataset(seed_dataset_id, split='train', streaming=True).shuffle(seed=random_seed, buffer_size=50) 108 | """ 109 | langs=[] 110 | seeds=[]""" 111 | i = 0 112 | total=0 113 | sum=[] 114 | with alive_bar(total_samples) as bar: 115 | bar(len(results), skipped=True) 116 | for element in seed_dataset: 117 | prompts=[] 118 | if(i > 50720): 119 | seed = element['text'] 120 | try: 121 | if(i % 4 == 0): 122 | prompt = [{"role": "user", "content":prompt_template_4.format(seed=seed)}] 123 | elif(i % 3 == 0): 124 | prompt = [{"role": "user", "content":prompt_template_3.format(seed=seed)}] 125 | elif(i % 2 == 0): 126 | prompt = [{"role": "user", "content":prompt_template_2.format(seed=seed)}] 127 | else: 128 | prompt = [{"role": "user", "content":prompt_template_1.format(seed=seed)}] 129 | prompts.append(prompt) 130 | generated_text = generate_output(prompts,total,sum) 131 | problem_statement, solution = postprocess(generated_text) 132 | except Exception as e: 133 | print("Error:{e}") 134 | continue 135 | bar() 136 | results.append({ 137 | "language": element['lang'], 138 | "seed": seed, 139 | "problem statement": problem_statement, 140 | "solution": solution, 141 | "model": "Mixtral-8x7B-Instruct-v0.1" 142 | }) 143 | if(i % 100 == 0): 144 | with open('mixtral-outputs-2.json', 'w') as fp: 145 | json.dump(results, fp) 146 | if(i == 60000): 147 | break 148 | i=i+1 149 | with open('mixtral-outputs-2.json', 'w') as fp: 150 | json.dump(results, fp) 151 | datasets.Dataset.from_list(results).push_to_hub('hpcgroup/hpc-synthetic-mixtral', token='HF write token') 152 | -------------------------------------------------------------------------------- /data-generation/run-dbrx.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH --ntasks-per-node=1 4 | #SBATCH --gpus-per-task=4 5 | 
#SBATCH -t 20:00:00 6 | #SBATCH -A m2404 7 | #SBATCH -C gpu&hbm80g 8 | #SBATCH -q regular 9 | 10 | source ~/dbrx/.env/bin/activate 11 | 12 | python -u dbrx_script.py 13 | -------------------------------------------------------------------------------- /data-generation/run-llama.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH --ntasks-per-node=1 4 | #SBATCH --gpus-per-task=2 5 | #SBATCH -t 24:00:00 6 | #SBATCH -A m2404 7 | #SBATCH -C gpu&hbm80g 8 | #SBATCH -q regular 9 | 10 | source ~/dbrx/.env/bin/activate 11 | 12 | python -u llama_script.py 13 | -------------------------------------------------------------------------------- /data-generation/run-mixtral.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH --ntasks-per-node=1 4 | #SBATCH --gpus-per-task=2 5 | #SBATCH -t 00:30:00 6 | #SBATCH -A m2404 7 | #SBATCH -C gpu&hbm80g 8 | #SBATCH -q regular 9 | 10 | source ~/dbrx/.env/bin/activate 11 | 12 | python -u mixtral_script.py 13 | -------------------------------------------------------------------------------- /fine-tuning/arguments.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | def create_parser(): 4 | parser = ArgumentParser() 5 | parser.add_argument("--model_id", default="deepseek-ai/deepseek-coder-6.7b-base", type=str, 6 | help="name of huggingface transformers model you want to run") 7 | parser.add_argument("--seed", type=int, default=123456, 8 | help="random seed") 9 | parser.add_argument("--dtype", choices=["bf16", "fp16", "fp32"], 10 | help="data type for running inference", default="fp16") 11 | parser.add_argument("--use-flash-attention", action='store_true', 12 | help="Use Flash Attention for faster training") 13 | parser.add_argument("--global-batch-size", type=int, default=16, 14 | help="Global Batch Size") 15 | parser.add_argument("--gradient-acc-steps", type=int, default=1, 16 | help="Gradient Accumulation Steps") 17 | parser.add_argument("--sequence-length", type=int, default=256, 18 | help="Sequence Length") 19 | parser.add_argument("--disable-axonn", action='store_false', dest='use_axonn', 20 | help="Disable AxoNN's Tensor Paralellism") 21 | parser.add_argument("--log-interval", type=int, default=10, 22 | help="Interval for logging train loss") 23 | parser.add_argument("--num-epochs", type=int, default=3, 24 | help="Number of epochs") 25 | parser.add_argument("--save-every", type=int, default=100, 26 | help="Save model weights after every --save-every iterations") 27 | parser.add_argument("--check-max-mem-usage", action='store_true', 28 | help="Pad all sequences to the --sequence-length to get the maximum memory usage.") 29 | return parser 30 | -------------------------------------------------------------------------------- /fine-tuning/data_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path as osp 3 | from typing import Union 4 | import torch 5 | 6 | alpaca_template = { 7 | "description": "Template used by Alpaca-LoRA.", 8 | "prompt_no_input": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", 9 | "response_split": "### Response:" 10 | } 11 | 12 | class Prompter(object): 13 | __slots__ = ("template", "_verbose") 14 | 15 | def __init__(self, template_name: str = "", verbose: bool = False): 16 | self._verbose = verbose 17 | if not template_name: 18 | # Enforce the default here, so the constructor can be called with '' and will not break. 19 | template_name = "alpaca" 20 | assert template_name == "alpaca" 21 | self.template = alpaca_template 22 | if self._verbose: 23 | print( 24 | f"Using prompt template {template_name}: {self.template['description']}" 25 | ) 26 | 27 | def generate_prompt( 28 | self, 29 | instruction: str, 30 | context: Union[None, str] = None, 31 | response: Union[None, str] = None, 32 | ) -> str: 33 | # returns the full prompt from instruction and optional input 34 | # if a label (=response, =output) is provided, it's also appended. 35 | """ if input: 36 | res = self.template["prompt_input"].format( 37 | instruction=instruction, context=context 38 | ) 39 | else: 40 | res = self.template["prompt_no_input"].format( 41 | instruction=instruction 42 | )""" 43 | res = self.template["prompt_no_input"].format( 44 | instruction=instruction 45 | ) 46 | if response: 47 | res = f"{res}{response}" 48 | if self._verbose: 49 | print(res) 50 | return res 51 | 52 | def get_response(self, output: str) -> str: 53 | return output.split(self.template["response_split"])[1].strip() 54 | 55 | prompter = Prompter() 56 | 57 | 58 | def tokenize(prompt, tokenizer, cutoff_len, add_eos_token=True): 59 | # there's probably a way to do this with the tokenizer settings 60 | # but again, gotta move fast 61 | result = tokenizer( 62 | prompt, 63 | truncation=True, 64 | max_length=cutoff_len, 65 | padding=False, 66 | return_tensors=None, 67 | ) 68 | if ( 69 | result["input_ids"][-1] != tokenizer.eos_token_id 70 | and len(result["input_ids"]) < cutoff_len 71 | and add_eos_token 72 | ): 73 | result["input_ids"].append(tokenizer.eos_token_id) 74 | result["attention_mask"].append(1) 75 | 76 | result["labels"] = result["input_ids"].copy() 77 | return result 78 | 79 | 80 | def get_tokenizer_mapping_fn(tokenizer, cutoff_len, train_on_inputs=True, add_eos_token=True): 81 | def generate_and_tokenize_prompt(data_point): 82 | full_prompt = prompter.generate_prompt( 83 | instruction = data_point["instruction"], 84 | response = data_point["response"], 85 | ) 86 | """if torch.distributed.get_rank() == 0: 87 | print ("Generated full prompt:", full_prompt)""" 88 | tokenized_full_prompt = tokenize(full_prompt, tokenizer, cutoff_len, add_eos_token) 89 | if not train_on_inputs: 90 | user_prompt = prompter.generate_prompt( 91 | data_point["instruction"] 92 | ) 93 | tokenized_user_prompt = tokenize( 94 | user_prompt, tokenizer, cutoff_len, add_eos_token 95 | ) 96 | user_prompt_len = len(tokenized_user_prompt["input_ids"]) 97 | 98 | if add_eos_token: 99 | user_prompt_len -= 1 100 | 101 | tokenized_full_prompt["labels"] = [ 102 | -100 103 | ] * user_prompt_len + tokenized_full_prompt["labels"][ 104 | user_prompt_len: 105 | ] # could be sped up, probably 106 | return tokenized_full_prompt 107 | return generate_and_tokenize_prompt 108 | -------------------------------------------------------------------------------- /fine-tuning/ft.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import AutoTokenizer, 
AutoModelForCausalLM, AutoConfig, DataCollatorForSeq2Seq 3 | from datasets import load_dataset 4 | from axonn.models.transformers import parallelize 5 | from axonn import axonn as ax 6 | import torch 7 | import random 8 | import numpy as np 9 | from arguments import create_parser 10 | from contextlib import nullcontext 11 | from data_utils import get_tokenizer_mapping_fn 12 | from torch.utils.data import DataLoader 13 | from axonn.checkpoint import save 14 | from axonn.intra_layer import sync_gradients, optimize_communication, clear_weights_cache, clip_grad_norm_ 15 | from axonn import axonn as ax 16 | 17 | def init_everything(): 18 | torch.distributed.init_process_group(backend='nccl') 19 | world_size = torch.distributed.get_world_size() 20 | rank = torch.distributed.get_rank() 21 | if rank == 0: 22 | print(f"Going to distribute the model over {world_size} GPUs") 23 | ax.init(G_data=1, G_inter=1, G_intra_r=1, G_intra_c=1, G_intra_d=world_size) 24 | 25 | def set_seed(seed=123456): 26 | random.seed(seed) 27 | np.random.seed(seed) 28 | torch.manual_seed(seed) 29 | torch.cuda.manual_seed_all(seed) 30 | 31 | 32 | dtype_map = { 33 | "bf16": torch.bfloat16, 34 | "fp16": torch.float16, 35 | "fp32": torch.float32 36 | } 37 | 38 | 39 | def get_tokenized_dataset(tokenizer, sequence_length=256): 40 | data = load_dataset("hpcgroup/hpc-data") 41 | mapping_fn = get_tokenizer_mapping_fn(tokenizer, cutoff_len=sequence_length, train_on_inputs=False) 42 | train_data = data["train"].shuffle().map(mapping_fn, remove_columns=data["train"].column_names) 43 | return train_data 44 | 45 | def pretty_log(iteration, 46 | total_train_iters, 47 | train_loss, 48 | elapsed_time_per_iteration, 49 | learning_rate, 50 | grad_norm, 51 | ): 52 | 53 | log_string = '> global batch {:8d}/{:8d} |'.format( 54 | iteration, total_train_iters) 55 | log_string += ' elapsed time per global batch (ms): {:.1f} |'.format( 56 | elapsed_time_per_iteration) 57 | log_string += ' learning rate: {:.3E} |'.format(learning_rate) 58 | log_string += ' loss: {:.5f} |'.format(train_loss) 59 | curr_mem = torch.cuda.memory_allocated() / 1024 / 1024 / 1024 60 | peak_mem = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024 61 | log_string += ' memory used by tensors {:.3f} GB (peak {:.3f} GB) |'.format(curr_mem, peak_mem) 62 | log_string += f' grad norm: {grad_norm:.5f}' 63 | return log_string 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = create_parser() 68 | args = parser.parse_args() 69 | init_everything() 70 | set_seed(args.seed) 71 | dtype = dtype_map[args.dtype] 72 | 73 | if args.use_axonn: 74 | with parallelize(args.model_id): 75 | model = AutoModelForCausalLM.from_pretrained(args.model_id, 76 | torch_dtype=dtype, 77 | attn_implementation='eager' if not args.use_flash_attention else "flash_attention_2").to('cuda').float() 78 | else: 79 | model = AutoModelForCausalLM.from_pretrained(args.model_id, 80 | torch_dtype=dtype, 81 | attn_implementation='eager' if not args.use_flash_attention else "flash_attention_2").to('cuda').float() 82 | 83 | model.train() 84 | model.gradient_checkpointing_enable() 85 | tokenizer = AutoTokenizer.from_pretrained(args.model_id) 86 | tokenizer.pad_token_id = ( 87 | 0 88 | ) 89 | tokenizer.padding_side = "left" 90 | tokenized_dataset = get_tokenized_dataset(tokenizer, args.sequence_length) 91 | sampler = torch.utils.data.distributed.DistributedSampler( 92 | tokenized_dataset 93 | ) 94 | assert args.global_batch_size % (args.gradient_acc_steps * torch.distributed.get_world_size()) == 0 95 | dataloader = 
DataLoader( 96 | tokenized_dataset, 97 | batch_size=args.global_batch_size // args.gradient_acc_steps // torch.distributed.get_world_size(), 98 | collate_fn=DataCollatorForSeq2Seq( 99 | tokenizer, 100 | max_length=args.sequence_length if args.check_max_mem_usage else None, 101 | pad_to_multiple_of=8 if not args.check_max_mem_usage else None, 102 | return_tensors="pt", 103 | padding='max_length' if args.check_max_mem_usage else True 104 | ), 105 | sampler=sampler 106 | ) 107 | optimizer = torch.optim.AdamW(model.parameters(), 108 | lr=1e-5, 109 | betas=(0.9, 0.95), 110 | eps=1e-5, 111 | weight_decay=0.0) 112 | 113 | iters_per_epoch = len(dataloader) // args.gradient_acc_steps 114 | main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=iters_per_epoch * args.num_epochs) 115 | warmup_iters = 100 116 | warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( 117 | optimizer, start_factor=0.01, total_iters=warmup_iters 118 | ) 119 | lr_scheduler = torch.optim.lr_scheduler.SequentialLR( 120 | optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[warmup_iters] 121 | ) 122 | 123 | scaler = torch.cuda.amp.GradScaler(enabled=(dtype == torch.float16)) 124 | loss_fn = torch.nn.CrossEntropyLoss() 125 | 126 | start_event = torch.cuda.Event(enable_timing=True) 127 | end_event = torch.cuda.Event(enable_timing=True) 128 | iter_no = 0 129 | for epoch_no in range(args.num_epochs): 130 | microbatch_no = 0 131 | start_event.record() 132 | batch_loss = 0 133 | for batch in dataloader: 134 | input_ids, labels, attention_mask = batch["input_ids"], batch["labels"], batch["attention_mask"] 135 | input_ids, labels, attention_mask = input_ids.cuda(), labels.cuda(), attention_mask.cuda() 136 | with optimize_communication(True, True, True, model): 137 | with torch.amp.autocast(device_type='cuda', dtype=dtype): 138 | input_ids = input_ids[:, :-1] 139 | attention_mask = attention_mask[:, :-1] 140 | labels = labels[:, 1:] 141 | output = model(input_ids = input_ids, attention_mask=attention_mask) 142 | logits = output["logits"] 143 | loss = loss_fn(logits.reshape(-1, logits.shape[-1]), 144 | labels.reshape(-1)) 145 | scaler.scale(loss / args.gradient_acc_steps / torch.distributed.get_world_size()).backward() 146 | clear_weights_cache() 147 | global_loss = loss / args.gradient_acc_steps / torch.distributed.get_world_size() 148 | torch.distributed.all_reduce(global_loss) 149 | batch_loss += global_loss.item() 150 | microbatch_no += 1 151 | 152 | if microbatch_no == args.gradient_acc_steps: 153 | scaler.unscale_(optimizer) 154 | sync_gradients(model) 155 | grad_norm = clip_grad_norm_(model.parameters(), 1.0) 156 | scaler.step(optimizer) 157 | scaler.update() 158 | optimizer.zero_grad(set_to_none=True) 159 | lr_scheduler.step() 160 | iter_no += 1 161 | end_event.record() 162 | if torch.distributed.get_rank() == 0 and (iter_no % args.log_interval==0): 163 | torch.cuda.synchronize() 164 | elapsed_time = start_event.elapsed_time(end_event) 165 | log_string = pretty_log(iter_no, len(dataloader)*args.num_epochs // args.gradient_acc_steps, 166 | batch_loss, elapsed_time, learning_rate=optimizer.param_groups[0]['lr'], 167 | grad_norm=grad_norm) 168 | print(log_string) 169 | 170 | microbatch_no = 0 171 | batch_loss = 0 172 | start_event.record() 173 | state = { 174 | "iter_no": iter_no, 175 | "optimizer": optimizer.state_dict(), 176 | "model": model.state_dict() 177 | } 178 | save(state, checkpoint_folder="/pscratch/sd/a/amanc/ckpt", checkpoint_name=f"epoch_{epoch_no}") 179 | 
-------------------------------------------------------------------------------- /fine-tuning/get_rank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # select_gpu_device wrapper script 3 | export RANK=${SLURM_PROCID} 4 | exec $* 5 | -------------------------------------------------------------------------------- /fine-tuning/graph.py: -------------------------------------------------------------------------------- 1 | import re 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | # Read the data from the file 6 | file_path = 'run_ft.out' 7 | try: 8 | with open(file_path, 'r') as file: 9 | data = file.read() 10 | except FileNotFoundError: 11 | print(f"File {file_path} not found.") 12 | exit(1) 13 | 14 | # Regular expression to extract the required values 15 | pattern = re.compile(r'global batch\s+(\d+)/\s+\d+\s+\|.*?\| loss: ([\d.]+)') 16 | 17 | # Lists to hold extracted values 18 | global_batches = [] 19 | losses = [] 20 | 21 | # Extracting data from the file 22 | for match in pattern.finditer(data): 23 | global_batches.append(int(match.group(1))) 24 | losses.append(float(match.group(2))) 25 | 26 | # Check if data was extracted 27 | if not global_batches or not losses: 28 | print("No data was extracted. Please check the file format and ensure it contains the expected information.") 29 | exit(1) 30 | 31 | # Print the first few extracted data points for verification 32 | print(f"Extracted {len(global_batches)} data points.") 33 | print("Sample data points:") 34 | for i in range(min(5, len(global_batches))): 35 | print(f"Global Batch: {global_batches[i]}, Loss: {losses[i]}") 36 | 37 | # Function to smooth the data using moving average 38 | def moving_average(data, window_size): 39 | return np.convolve(data, np.ones(window_size)/window_size, mode='valid') 40 | 41 | # Set the window size for smoothing 42 | window_size = 10 43 | 44 | # Apply smoothing 45 | smoothed_losses = moving_average(losses, window_size) 46 | 47 | # Adjust the x-axis to match the length of the smoothed data 48 | smoothed_global_batches = global_batches[window_size - 1:] 49 | 50 | # Plotting the data 51 | plt.figure(figsize=(10, 6)) 52 | plt.plot(global_batches, losses, label='Original Loss') 53 | plt.plot(smoothed_global_batches, smoothed_losses, label='Smoothed Loss', color='orange') 54 | plt.xlabel('Global Batch') 55 | plt.ylabel('Loss') 56 | plt.title('Loss per Global Batch') 57 | plt.legend() 58 | plt.savefig('loss_per_global_batch.png') 59 | plt.show() 60 | -------------------------------------------------------------------------------- /fine-tuning/run_ft.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --gpus-per-node=4 3 | #SBATCH -t 11:00:00 4 | #SBATCH -A m2404 5 | #SBATCH -C gpu&hbm80g 6 | #SBATCH -q regular 7 | #SBATCH --nodes=4 8 | #SBATCH --qos=regular 9 | #SBATCH --ntasks-per-node=4 10 | #SBATCH --output=run_ft.out 11 | 12 | # Getting number of nodes and GPUs 13 | NNODES=$SLURM_JOB_NUM_NODES 14 | GPUS=$(( NNODES * 4 )) 15 | 16 | # ENV variables for torch.distributed 17 | export MASTER_ADDR=$(hostname) 18 | export MASTER_PORT=29500 19 | export WORLD_SIZE=16 20 | 21 | # ENV variables for fast NCCL on perlmutter 22 | # remove for other clusters 23 | export NCCL_NET_GDR_LEVEL=PHB 24 | export CUDA_DEVICE_MAX_CONNECTIONS=1 25 | export CUDA_VISIBLE_DEVICES=3,2,1,0 26 | export NCCL_CROSS_NIC=1 27 | export NCCL_SOCKET_IFNAME=hsn 28 | export NCCL_NET="AWS Libfabric" 29 | export 
FI_CXI_RDZV_THRESHOLD=0 30 | export FI_CXI_RDZV_GET_MIN=0 31 | export FI_CXI_OFLOW_BUF_SIZE=1073741824 32 | export FI_CXI_OFLOW_BUF_COUNT=1 33 | 34 | # hf env variables, remove/change if needed 35 | export HF_HOME="${SCRATCH}/.cache/huggingface" 36 | export HF_TRANSFORMERS_CACHE="${HF_HOME}" 37 | export HF_DATASETS_CACHE="${HF_HOME}/datasets" 38 | 39 | 40 | module load pytorch 41 | source ${SCRATCH}/axonn_venv/bin/activate 42 | 43 | SCRIPT="python -u ft.py --dtype bf16 --global-batch-size 128 --gradient-acc-steps 2 --log-interval 1 --sequence-length 8192 --use-flash-attention" 44 | 45 | #Uncomment if you want to check max memory usage 46 | #SCRIPT="$SCRIPT --check-max-mem-usage" 47 | 48 | run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 ./get_rank.sh $SCRIPT" 49 | 50 | echo $run_cmd 51 | eval $run_cmd 52 | -------------------------------------------------------------------------------- /fine-tuning/test_hf_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, DataCollatorForSeq2Seq 3 | from data_utils import alpaca_template 4 | 5 | use_flash_attention = True 6 | model_id = "/global/cfs/cdirs/m2404/ckpt/hpc-coder-v2-hf/" 7 | model = AutoModelForCausalLM.from_pretrained(model_id, 8 | torch_dtype=torch.bfloat16, 9 | attn_implementation='eager' if not use_flash_attention else "flash_attention_2").to('cuda') 10 | 11 | model.eval() 12 | template = alpaca_template["prompt_no_input"] 13 | 14 | instructions = ['\nGiven a distributed file system where files are divided into blocks of equal size, design an algorithm to efficiently allocate and deallocate blocks across multiple storage nodes. Consider the following requirements:\n\n* Each block must be assigned to exactly one storage node.\n* The distribution of blocks should be balanced across the storage nodes to optimize performance.\n* The algorithm should minimize the number of block reassignments when a new block is allocated or an existing block is deallocated.\n',] 15 | 16 | tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-base") 17 | 18 | with torch.no_grad(): 19 | for instruction in instructions: 20 | with torch.amp.autocast(device_type='cuda', dtype=torch.bfloat16): 21 | prompt = template.format(instruction=instruction) 22 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 23 | outputs = model.generate(input_ids.cuda(), 24 | do_sample=True, max_new_tokens=1024) 25 | 26 | print(tokenizer.batch_decode(outputs)[0]) 27 | -------------------------------------------------------------------------------- /v1/README.md: -------------------------------------------------------------------------------- 1 | # HPC LLM Training 2 | 3 | > [!WARNING] 4 | > This directory contains the old hpc-coder-v1 scripts. See the [main repo](/) 5 | > for more up-to-date scripts, data, and models. 6 | 7 | This repo contains scripts for training LLMs on HPC source code data. 8 | It is organized as follows: 9 | 10 | - ***data:*** scripts and utilities for collecting and preprocessing the dataset [[README]](data/README.md) 11 | - ***analysis:*** scripts related to analyzing the dataset and training LLMs [[README]](analysis/README.md) 12 | 13 | An overview of the workflow from start to finish is as follows.
14 | Use `data/collect-repo-metadata.py` and `data/edit-metadata.py` to create a dataset of GitHub repositories, if desired, 15 | otherwise use the existing `repos-gt3.csv` dataset. 16 | Run `data/clone-repos.py` to clone the repositories to a desired location. 17 | The `data/collect-dataset.py` script can then be used to create a json lines dataset with all the textual data. 18 | The `analysis/run_clm-*.sbatch` scripts can then be used to train the models on the data. 19 | 20 | ## Notes and Misc. 21 | 22 | *Weird bug fix #1:* 23 | I had to change the default value of `max_workers` from 64 to 32 in the parameter list of 24 | `_get_origin_metadata_locally_or_by_urls` in `datasets/data_files.py#L708`. 25 | Zaratan's CPUs have 64 cores, but tqdm errors when trying to start 64 threads for some reason. 26 | Similarly, I also generally have to `export TOKENIZERS_PARALLELISM=false` to prevent 27 | huggingface from spinning up threads in forked processes. 28 | 29 | *Weird bug fix #2:* 30 | I had to change the following line in `torch/distributed/distributed_c10d.py` (line 2068). 31 | The PolyCoder model seems to output non-contiguous logits 32 | and there is a current bug in PyTorch where the NCCL backend errors if passed 33 | non-contiguous tensors to `all_gather`. This bug is documented [here](https://github.com/pytorch/pytorch/issues/73515) 34 | and [here](https://github.com/pytorch/pytorch/pull/75276). 35 | With a future release of torch this likely won't be necessary. 36 | 37 | ```python 38 | work = default_pg.allgather([tensor_list], [tensor]) 39 | # to --> 40 | work = default_pg.allgather([[t.contiguous() for t in tensor_list]], [tensor.contiguous()]) 41 | ``` -------------------------------------------------------------------------------- /v1/analysis/README.md: -------------------------------------------------------------------------------- 1 | # analysis 2 | 3 | This directory contains scripts for training LLMs on the dataset. 4 | 5 | ## run_clm scripts 6 | `run_clm.py` is an extension of the default huggingface script for training 7 | causal language models. 8 | We use it here to train the different models on the dataset. 9 | They are called by the sbatch scripts (e.g. `run_clm-zaratan.sbatch`), which 10 | also take care of the environment setup. 11 | These are fairly specific to my Zaratan environment. 12 | 13 | ## generate_text.py 14 | A script for using the model to generate code based on a prompt. 15 | Run `python generate_text.py -h` to see how to pass args. 16 | 17 | ## parse_losses.py 18 | If not using tensorboard or other monitoring software, then you can use 19 | this to parse the training output text for loss/accuracy values. 20 | Simply give it a list of files to parse and where to output the training and 21 | validation csv files. 22 | Run `python parse_losses.py -h` for more options. 23 | 24 | ## plot_training_results.py 25 | Uses the CSVs output by `parse_losses.py` to create training and validation 26 | loss+perplexity curves. 27 | By default this will save them in the `figs/` directory. 28 | 29 | ## train-tokenizer.py 30 | Trains a tokenizer on the dataset. 31 | Taken from the HuggingFace repo (see script for URL and how to run). 32 | 33 | -------------------------------------------------------------------------------- /v1/analysis/clean-data.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Data cannot persist in scratch space after the training job. Clear these out from a login node. 3 | # THIS SCRIPT IS SPECIFIC TO ZARATAN.
4 | # author: Daniel Nichols 5 | # date: October 2022 6 | 7 | # remove dataset 8 | echo "Removing dataset..." 9 | rm /scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset.json 10 | if [ $? -ne 0 ]; then 11 | echo "Error deleting dataset!" 12 | fi 13 | 14 | # remove cache 15 | echo "Saving and removing cache..." 16 | cp -r /scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface \ 17 | /afs/shell.umd.edu/project/bhatele-lab/user/dnicho/.cache 18 | if [ $? -eq 0 ]; then 19 | rm -r /scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface 20 | else 21 | echo "Error copying .cache directory!" 22 | fi 23 | 24 | # move model checkpoints 25 | echo "Saving and removing saved models..." 26 | cp -r /scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/models \ 27 | /afs/shell.umd.edu/project/bhatele-lab/user/dnicho/code-ml/ 28 | if [ $? -eq 0 ]; then 29 | rm -r /scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/models 30 | else 31 | echo "Error copying saved models!" 32 | fi 33 | -------------------------------------------------------------------------------- /v1/analysis/ds_config_zero1.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/parallelcodefoundry/HPC-Coder/e12b8b949a3b6a6a2eeca565d8cb3c520eb8eb70/v1/analysis/ds_config_zero1.json -------------------------------------------------------------------------------- /v1/analysis/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "optimizer": { 12 | "type": "AdamW", 13 | "params": { 14 | "lr": "auto", 15 | "betas": "auto", 16 | "eps": "auto", 17 | "weight_decay": "auto" 18 | } 19 | }, 20 | 21 | "scheduler": { 22 | "type": "WarmupLR", 23 | "params": { 24 | "warmup_min_lr": "auto", 25 | "warmup_max_lr": "auto", 26 | "warmup_num_steps": "auto" 27 | } 28 | }, 29 | 30 | "zero_optimization": { 31 | "stage": 2, 32 | "allgather_partitions": true, 33 | "allgather_bucket_size": 2e8, 34 | "overlap_comm": true, 35 | "reduce_scatter": true, 36 | "reduce_bucket_size": 2e8, 37 | "contiguous_gradients": true 38 | }, 39 | 40 | "gradient_accumulation_steps": "auto", 41 | "gradient_clipping": "auto", 42 | "steps_per_print": 1000, 43 | "train_batch_size": "auto", 44 | "train_micro_batch_size_per_gpu": "auto", 45 | "wall_clock_breakdown": false 46 | } -------------------------------------------------------------------------------- /v1/analysis/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "optimizer": { 12 | "type": "AdamW", 13 | "params": { 14 | "lr": "auto", 15 | "betas": "auto", 16 | "eps": "auto", 17 | "weight_decay": "auto" 18 | } 19 | }, 20 | 21 | "scheduler": { 22 | "type": "WarmupLR", 23 | "params": { 24 | "warmup_min_lr": "auto", 25 | "warmup_max_lr": "auto", 26 | "warmup_num_steps": "auto" 27 | } 28 | }, 29 | 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | 
"sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "stage3_gather_16bit_weights_on_model_save": true 49 | }, 50 | 51 | "gradient_accumulation_steps": "auto", 52 | "gradient_clipping": "auto", 53 | "steps_per_print": 2000, 54 | "train_batch_size": "auto", 55 | "train_micro_batch_size_per_gpu": "auto", 56 | "wall_clock_breakdown": false 57 | } -------------------------------------------------------------------------------- /v1/analysis/generate_text-zaratan.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH -t 00:10:00 4 | #SBATCH -J model-inference 5 | #SBATCH -p gpu 6 | #SBATCH --gres=gpu:a100 7 | #SBATCH --mem=16384 8 | 9 | module load python/3.8.12/zen2 git-lfs/zen2/3.1.2 openmpi/4.1.1/gcc/9.4.0/zen2 cuda/11.6.2/gcc 10 | source .env/bin/activate 11 | export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64" 12 | 13 | echo "device(s): $CUDA_VISIBLE_DEVICES" 14 | 15 | #MODEL="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/polycoder-hpc-source-ckpt/checkpoint-200" 16 | MODEL="hpcgroup/gpt2-medium-hpc-source" 17 | TOKENIZER="./hpc-tok" 18 | DEVICE="$CUDA_VISIBLE_DEVICES" 19 | PROMPT="/* saxpy -- multiply scalar float a by vector x and add to y */ void saxpy(float *x, float *y, float a, int N) { for (int i = 0; i < N; i++) {" 20 | #PROMPT="cuda-prompt.txt" 21 | 22 | echo "Prompt: \"${PROMPT}\"" 23 | 24 | python generate_text.py \ 25 | --model $MODEL \ 26 | --tokenizer $TOKENIZER \ 27 | --device $DEVICE \ 28 | --min-len 25 \ 29 | --max-len 350 \ 30 | --num-samples 15 \ 31 | --text "${PROMPT}" 32 | -------------------------------------------------------------------------------- /v1/analysis/generate_text.py: -------------------------------------------------------------------------------- 1 | ''' Given one of the models generate some text from a prompt. 
2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from os import environ 8 | 9 | # tpl imports 10 | from transformers import pipeline 11 | 12 | 13 | def main(): 14 | parser = ArgumentParser(description='Generate text from a prompt using a LLM.') 15 | input_group = parser.add_mutually_exclusive_group() 16 | input_group.add_argument('--text', type=str, help='Input text to generate new text from.') 17 | input_group.add_argument('--text-file', type=str, help='File to get text contents from.') 18 | parser.add_argument('-n', '--num-samples', type=int, default=10, help='How many times to sample a particular input.') 19 | parser.add_argument('--model', type=str, required=True, help='Huggingface hub model name or path to model.') 20 | parser.add_argument('--tokenizer', type=str, required=True, help='Tokenizer to use on data.') 21 | parser.add_argument('--min-len', type=int, default=50, help='Minimum length to generate.') 22 | parser.add_argument('--max-len', type=int, default=150, help='Maximum length to generate.') 23 | parser.add_argument('--top-k', type=int, default=50, help='Number of samples to use in top-k sampling.') 24 | parser.add_argument('--top-p', type=float, default=0.95, help='Fraction to use in nucleus sampling.') 25 | parser.add_argument('--temperature', type=float, default=0.5, help='Sampling temperature.') 26 | parser.add_argument('--device', type=int, default=-1, help='Where to run the model.') 27 | parser.add_argument('-o', '--output', type=str, default='-', help='Output location. Omit or \'-\' for stdout.') 28 | args = parser.parse_args() 29 | 30 | # environment setup 31 | environ['TOKENIZERS_PARALLELISM'] = '0' 32 | environ['OMP_NUM_THREADS'] = '64' 33 | 34 | # get text data 35 | prompt = '' 36 | reprompt = False 37 | if args.text: 38 | prompt = args.text 39 | elif args.text_file: 40 | with open(args.text_file, 'r', errors='ignore') as fp: 41 | prompt = fp.read() 42 | else: 43 | reprompt = True 44 | prompt = input('prompt: ') 45 | 46 | # create pipeline and generate 47 | generator = pipeline('text-generation', model=args.model, tokenizer=args.tokenizer, framework='pt', device=args.device) 48 | #generator = pipeline('text-generation', model=args.model, framework='pt', device=args.device) 49 | 50 | if reprompt: 51 | while prompt not in ['q', 'quit', 'exit']: 52 | prompt = prompt.strip() 53 | if prompt == '': 54 | prompt = input('prompt: ') 55 | continue 56 | 57 | result = generator( 58 | prompt, 59 | do_sample=True, 60 | max_new_tokens=args.max_len, 61 | top_k=args.top_k, 62 | top_p=args.top_p, 63 | num_return_sequences=args.num_samples, 64 | temperature=args.temperature 65 | ) 66 | print(result) 67 | 68 | prompt = input('prompt: ') 69 | else: 70 | result = generator( 71 | prompt, 72 | do_sample=True, 73 | max_new_tokens=args.max_len, 74 | top_k=args.top_k, 75 | top_p=args.top_p, 76 | num_return_sequences=args.num_samples, 77 | temperature=args.temperature 78 | ) 79 | 80 | # output 81 | response = result 82 | for idx, resp in enumerate(response, start=1): 83 | gen_text = resp['generated_text'] 84 | if args.output is None or args.output == '-': 85 | print('Sample {}: \'{}\'\n'.format(idx, gen_text)) 86 | else: 87 | with open(args.output, 'w' if idx == 1 else 'a') as fp:  # truncate on the first sample, then append so later samples are kept 88 | fp.write('Sample {}: \'{}\'\n'.format(idx, gen_text)) 89 | 90 | 91 | if __name__ == '__main__': 92 | main() -------------------------------------------------------------------------------- /v1/analysis/omp_tests-zaratan.sbatch:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH -t 02:00:00 4 | #SBATCH -J train-causal 5 | #SBATCH -A bhatele-lab-cmsc 6 | #SBATCH -p gpu 7 | #SBATCH --gres=gpu:a100 8 | #SBATCH --mem=16384 9 | #SBATCH --mail-type=FAIL 10 | 11 | MODEL="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/gpt-neo-omp-for-loops-ckpt" 12 | TOKENIZER="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/gpt-neo-omp-for-loops-ckpt" 13 | DEVICE="$CUDA_VISIBLE_DEVICES" 14 | CACHE_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface" 15 | TEMPERATURE="0.2" 16 | 17 | module load python/3.8.12/zen2 git-lfs/zen2/3.1.2 openmpi/4.1.1/gcc/9.4.0/zen2 cuda/11.6.2/gcc 18 | source .env/bin/activate 19 | export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64" 20 | export HF_HOME="${CACHE_DIR}" 21 | 22 | echo "device(s): $CUDA_VISIBLE_DEVICES" 23 | 24 | 25 | python omp_tests.py \ 26 | --model $MODEL \ 27 | --tokenizer $TOKENIZER \ 28 | --cache-dir $CACHE_DIR \ 29 | --temperature $TEMPERATURE \ 30 | --num-samples 10 \ 31 | --device $DEVICE 32 | -------------------------------------------------------------------------------- /v1/analysis/omp_tests.py: -------------------------------------------------------------------------------- 1 | ''' Run the OpenMP auto-complete tests. 2 | author: Daniel Nichols 3 | date: November 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from os import environ 8 | from typing import Tuple 9 | 10 | # tpl imports 11 | from datasets import load_dataset 12 | from transformers import pipeline, AutoTokenizer, set_seed 13 | 14 | 15 | def get_loop_text(text : str, end_loop_token : str = ' ') -> str: 16 | ''' 17 | ''' 18 | return (text.split(end_loop_token)[0] + end_loop_token).strip() 19 | 20 | 21 | def get_predicted_omp(text : str, end_loop_token : str = ' ', end_pragma_token = '') -> str: 22 | ''' 23 | ''' 24 | pragma = text.split(end_pragma_token)[0] 25 | return pragma 26 | 27 | 28 | def chunks(lst, n): 29 | '''Yield successive n-sized chunks from lst.''' 30 | for i in range(0, len(lst), n): 31 | yield lst[i:i + n] 32 | 33 | 34 | def is_correct_pragma( 35 | generated_text : str, 36 | real_pragma : str 37 | ) -> bool: 38 | ''' 39 | ''' 40 | if '' not in generated_text and generated_text.startswith('#pragma omp parallel for'): 41 | generated_text = generated_text.split('\n')[0].strip() 42 | elif ' #pragma omp' in generated_text: 43 | generated_text = generated_text.split('')[1].strip() 44 | else: 45 | generated_text = generated_text.split('')[0].strip() 46 | 47 | print(f'Predicted: \'{generated_text}\'') 48 | 49 | return generated_text == real_pragma 50 | 51 | 52 | def test( 53 | generator, 54 | data, 55 | true_results, 56 | max_len : int = 200, 57 | top_k : int = 50, 58 | top_p : float = 0.95, 59 | num_samples : int = 10, 60 | temperature : float = 0.2 61 | ) -> float: 62 | ''' 63 | ''' 64 | results = [] 65 | try: 66 | for idx, d in enumerate( data ): 67 | #print(f'{idx}: \'{d}\'', flush=True) 68 | tmp_results = generator( 69 | d, 70 | return_full_text=False, 71 | do_sample=True, 72 | max_new_tokens=max_len, 73 | top_k=top_k, 74 | top_p=top_p, 75 | num_return_sequences=num_samples, 76 | temperature=temperature 77 | ) 78 | results.append( tmp_results ) 79 | except Exception as err: 80 | print(f'Error during inference: \'{err}\'', flush=True) 81 | print(results) 82 | print(d, flush=True) 83 | return 0, 0 84 | 85 | num_correct, num_incorrect = 0, 0 86 | for idx, (result, 
true_result) in enumerate( zip(results, true_results) ): 87 | print(f'Sample {idx}:') 88 | print(f'Real: \'{true_result}\'') 89 | 90 | is_correct = any( is_correct_pragma(s['generated_text'], true_result.strip()) for s in result ) 91 | 92 | if is_correct: 93 | print('CORRECT') 94 | num_correct += 1 95 | else: 96 | print('INCORRECT') 97 | num_incorrect += 1 98 | 99 | print() 100 | 101 | return num_correct / (num_correct + num_incorrect), (num_correct + num_incorrect) 102 | 103 | 104 | def main(): 105 | parser = ArgumentParser(description='Test a model\'s OpenMP pragma prediction.') 106 | parser.add_argument('-m', '--model', type=str, required=True, help='path to model or HF hub model name') 107 | parser.add_argument('--tokenizer', type=str, required=True, help='text tokenizer') 108 | parser.add_argument('--cache-dir', type=str, default='~/.cache/huggingface', help='path to HF cache') 109 | parser.add_argument('-k', '--num-samples', type=int, default=1, help='how many samples to generate') 110 | parser.add_argument('--min-len', type=int, default=50, help='Minimum length to generate.') 111 | parser.add_argument('--max-len', type=int, default=150, help='Maximum length to generate.') 112 | parser.add_argument('--top-k', type=int, default=50, help='Number of samples to use in top-k sampling.') 113 | parser.add_argument('--top-p', type=float, default=0.95, help='Fraction to use in nucleus sampling.') 114 | parser.add_argument('--temperature', type=float, default=0.2, help='Sampling temperature.') 115 | parser.add_argument('--batch-size', type=int, default=16, help='Batch size to feed to inference.') 116 | parser.add_argument('--device', type=int, default=-1, help='Where to run the model.') 117 | args = parser.parse_args() 118 | 119 | # environment setup 120 | environ['TOKENIZERS_PARALLELISM'] = '0' 121 | environ['OMP_NUM_THREADS'] = '64' 122 | set_seed(42) 123 | 124 | val_dataset = load_dataset( 125 | 'hpcgroup/omp-for-loops', 126 | split='train[:5%]', 127 | cache_dir=args.cache_dir 128 | ) 129 | 130 | # create pipeline 131 | tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) 132 | generator = pipeline('text-generation', model=args.model, tokenizer=tokenizer, framework='pt', device=args.device) 133 | 134 | inference_batches, true_outputs = [], [] 135 | for sample in val_dataset: 136 | omp_pragma = sample['omp_pragma_line'] 137 | loop = get_loop_text(sample['text']) 138 | 139 | toks = tokenizer(loop)['input_ids'] 140 | if len(toks) >= 1024: 141 | continue 142 | 143 | inference_batches.append( loop ) 144 | true_outputs.append( omp_pragma ) 145 | 146 | accuracy, total = test(generator, inference_batches, true_outputs, max_len=args.max_len, top_k=args.top_k, 147 | top_p=args.top_p, num_samples=args.num_samples, temperature=args.temperature) 148 | 149 | print('Accuracy: {}% ({} tested)'.format(accuracy * 100.0, total)) 150 | 151 | 152 | if __name__ == '__main__': 153 | main() 154 | -------------------------------------------------------------------------------- /v1/analysis/parse_losses.py: -------------------------------------------------------------------------------- 1 | ''' Parse the output of the training script for losses and output a csv.
2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from os import PathLike 8 | from typing import Optional, Iterable 9 | from collections import deque 10 | from csv import QUOTE_NONNUMERIC 11 | import json 12 | import math 13 | from os.path import join as path_join 14 | 15 | # tpl imports 16 | import pandas as pd 17 | 18 | 19 | def parse_output( 20 | results_txt_files: Iterable[PathLike], 21 | add_perplexity: bool = True, 22 | samples_per_step: int = 2, 23 | model_name : Optional[str] = None 24 | ) -> pd.DataFrame: 25 | ''' Parse the output of a training run. 26 | 27 | Args: 28 | results_txt_file: paths to the text outputs of training runs 29 | add_perplexity: calculate perplexity for result if it's not already there 30 | samples_per_step: how many samples per step were computed 31 | model_name: include the name of the model in the dataframe 32 | 33 | Returns: 34 | Two dataframes t,v -- the first is the training results and the second eval results 35 | ''' 36 | LINE_START_KEY = '{\'loss\':' 37 | EVAL_START_KEY = '{\'eval_loss\':' 38 | BAD_ESC_STR = ''.join(chr(o) for o in [27, 91, 65]) 39 | 40 | results, eval_results = [], [] 41 | for text_file in results_txt_files: 42 | with open(text_file, 'r', encoding='ascii', errors='ignore') as fp: 43 | prev_lines = deque(3*[None], maxlen=3) 44 | last_steps = 0 45 | for line in fp: 46 | line = line.strip().replace(BAD_ESC_STR, '') 47 | if line.startswith(LINE_START_KEY) or LINE_START_KEY in line: 48 | obj = json.loads(line.replace('\'', '"')) 49 | try: 50 | steps = int( prev_lines[-1].split()[2].split('/')[0] ) 51 | except: 52 | steps = int( prev_lines[-1].split()[1].split('/')[0] ) 53 | last_steps = steps 54 | 55 | if add_perplexity and 'perplexity' not in obj and 'loss' in obj: 56 | obj['perplexity'] = math.exp(obj['loss']) 57 | 58 | if 'steps' not in obj: 59 | obj['steps'] = steps 60 | 61 | if 'samples' not in obj: 62 | obj['samples'] = obj['steps'] * samples_per_step 63 | 64 | if model_name is not None: 65 | obj['model'] = model_name 66 | 67 | results.append( obj ) 68 | 69 | elif line.startswith(EVAL_START_KEY) or EVAL_START_KEY in line: 70 | obj = json.loads(line.replace('\'', '"')) 71 | #steps = int( prev_lines[-1].split()[1].split('/')[0] ) 72 | steps = last_steps 73 | 74 | if add_perplexity and 'perplexity' not in obj and 'eval_loss' in obj: 75 | obj['perplexity'] = math.exp(obj['eval_loss']) 76 | 77 | if 'steps' not in obj: 78 | obj['steps'] = steps 79 | 80 | if 'samples' not in obj: 81 | obj['samples'] = obj['steps'] * samples_per_step 82 | 83 | if model_name is not None: 84 | obj['model'] = model_name 85 | 86 | eval_results.append( obj ) 87 | 88 | 89 | if line.strip() != '': 90 | prev_lines.append(line) 91 | 92 | results.sort(key=lambda x: x['samples']) 93 | eval_results.sort(key=lambda x: x['samples']) 94 | 95 | return pd.DataFrame( results ), pd.DataFrame( eval_results ) 96 | 97 | 98 | def main(): 99 | parser = ArgumentParser(description='Scrape loss and accuracy from training results.') 100 | parser.add_argument('-i', '--input', type=str, nargs='+', required=True, help='training output files') 101 | parser.add_argument('-o', '--output', type=str, required=True, help='where to write output csv') 102 | parser.add_argument('--eval-output', type=str, required=True, help='where to write eval output csv') 103 | parser.add_argument('--samples-per-step', type=int, default=8, help='how many samples are computed per step') 104 | parser.add_argument('--model-name', type=str, 
help='Name of model being trained to include in data.') 105 | args = parser.parse_args() 106 | 107 | results, eval_results = parse_output(args.input, samples_per_step=args.samples_per_step, model_name=args.model_name) 108 | results.to_csv(args.output, index=False, quoting=QUOTE_NONNUMERIC) 109 | 110 | eval_results.columns = eval_results.columns.str.removeprefix('eval_') 111 | eval_results.to_csv(args.eval_output, index=False, quoting=QUOTE_NONNUMERIC) 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /v1/analysis/plot_training_results.py: -------------------------------------------------------------------------------- 1 | ''' Plot training results using data from csv files. 2 | author: Daniel Nichols 3 | ''' 4 | # std imports 5 | from argparse import ArgumentParser 6 | from functools import reduce 7 | from os import PathLike 8 | from typing import Optional 9 | from os.path import join as path_join 10 | 11 | # tpl imports 12 | import pandas as pd 13 | import matplotlib.pyplot as plt 14 | import seaborn as sns 15 | 16 | 17 | def plot( 18 | train_data : pd.DataFrame, 19 | val_data : pd.DataFrame, 20 | output_path : PathLike, 21 | xcolumn : str = 'samples', 22 | ycolumn : str = 'perplexity', 23 | seriescolumn : str = 'model', 24 | xscale : Optional[int] = None, 25 | title : Optional[str] = None 26 | ): 27 | ''' plot the training loss/perplexity curves 28 | 29 | Args: 30 | train_data: training dataset 31 | val_data: validation dataset 32 | output_path: where to save file 33 | xcolumn: what column to use for x axis 34 | ycolumn: what column to use for y axis 35 | seriescolumn: how to distinguish series on line plot 36 | xscale: scale value for x-axis 37 | title: set title of figure if not None 38 | ''' 39 | assert xcolumn in ['samples', 'steps'] 40 | assert ycolumn in ['loss', 'perplexity', 'accuracy'] 41 | 42 | train_data = train_data.copy(deep=True) 43 | val_data = val_data.copy(deep=True) 44 | 45 | xlabel_prefix = f'{xscale}x ' if xscale else '' 46 | if xscale: 47 | train_data[xcolumn] /= xscale 48 | val_data[xcolumn] /= xscale 49 | 50 | plt.clf() 51 | fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16,6), sharey=True) 52 | 53 | ax1 = sns.lineplot(data=train_data, x=xcolumn, y=ycolumn, hue=seriescolumn, ax=ax1) 54 | ax2 = sns.lineplot(data=val_data, x=xcolumn, y=ycolumn, hue=seriescolumn, ax=ax2) 55 | 56 | #ax1.set_ylim((1, None)) 57 | #ax2.set_ylim((1, None)) 58 | 59 | for ax, ds in zip([ax1, ax2], [train_data, val_data]): 60 | 61 | series_names = ds[seriescolumn].unique() 62 | for series_idx, series_name in enumerate(series_names): 63 | series_ds = ds[ds[seriescolumn] == series_name] 64 | 65 | xpos = (series_ds[xcolumn].values[0], series_ds[xcolumn].values[-1]) 66 | ypos = (series_ds[ycolumn].values[0], series_ds[ycolumn].values[-1]) 67 | color = sns.color_palette()[series_idx] 68 | for x, y in zip(xpos, ypos): 69 | ax.text(x, y, f'{y:.2f}', color=color) 70 | 71 | ax1.set_ylabel(ycolumn.capitalize()) 72 | ax1.set_xlabel(xlabel_prefix + xcolumn.capitalize()) 73 | ax1.get_legend().set_title(seriescolumn.capitalize()) 74 | 75 | ax2.set_xlabel(xlabel_prefix + xcolumn.capitalize()) 76 | ax2.get_legend().set_title(seriescolumn.capitalize()) 77 | 78 | ax1.set_title('Training') 79 | ax2.set_title('Validation') 80 | 81 | if title: 82 | fig.suptitle(title) 83 | 84 | plt.savefig(output_path, bbox_inches='tight') 85 | 86 | 87 | def main(): 88 | parser = ArgumentParser(description='Plot training results.') 89 | 
parser.add_argument('-t', '--training-results', type=str, nargs='+', required=True, help='csv of training results') 90 | parser.add_argument('-v', '--validation-results', type=str, nargs='+', required=True, 91 | help='csv of validation results') 92 | parser.add_argument('--output-root', type=str, default='figs', help='root of figs directory') 93 | args = parser.parse_args() 94 | 95 | train_df = pd.concat([pd.read_csv(fpath) for fpath in args.training_results], ignore_index=True) 96 | val_df = pd.concat([pd.read_csv(fpath) for fpath in args.validation_results], ignore_index=True) 97 | 98 | sns.set(font_scale=1.5, font='DejaVu Sans') 99 | plot(train_df, val_df, path_join(args.output_root, 'perplexity.png'), ycolumn='perplexity', 100 | title='Perplexity During Training', xscale=1000) 101 | 102 | plot(train_df, val_df, path_join(args.output_root, 'loss.png'), ycolumn='loss', title='Loss During Training', 103 | xscale=1000) 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /v1/analysis/prepare-data.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Initial setup for training requires some work to be done on login node. Namely the dataset has to be copied 3 | # to scratch space for the job to access it. 4 | # THIS SCRIPT IS SPECIFIC TO ZARATAN. 5 | # author: Daniel Nichols 6 | # date: October 2022 7 | 8 | SRC="/afs/shell.umd.edu/project/bhatele-lab/shared/hpc-llms/data/dataset.jsonl" 9 | DST="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset.json" 10 | 11 | # copy dataset 12 | echo "Copying dataset..." 13 | cp ${SRC} ${DST} 14 | if [ $? -ne 0 ]; then 15 | echo "Error copying dataset!" 16 | fi 17 | 18 | # create huggingface cache 19 | echo "Copying huggingface cache..." 20 | cp -r /afs/shell.umd.edu/project/bhatele-lab/user/dnicho/.cache/huggingface \ 21 | /scratch/zt1/project/bhatele-lab/user/dnicho/.cache/ 22 | if [ $? -ne 0 ]; then 23 | echo "Error copying huggingface cache!" 24 | fi 25 | 26 | # move save dirs 27 | echo "Copying saved models..." 28 | cp -r /afs/shell.umd.edu/project/bhatele-lab/user/dnicho/code-ml/models \ 29 | /scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/ 30 | if [ $? -ne 0 ]; then 31 | echo "Error copying saved models!" 
32 | fi 33 | -------------------------------------------------------------------------------- /v1/analysis/run_clm-evaluate-zaratan.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH -t 01:00:00 4 | #SBATCH -J evaluate-causal 5 | #SBATCH -A bhatele-lab-cmsc 6 | #SBATCH -p gpu 7 | #SBATCH --gres=gpu:a100:1 8 | #SBATCH --mem=131072 9 | 10 | #DATASET="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset.json" 11 | DATASET="daniellnichols/hpc-source" 12 | MODEL="daniellnichols/gpt-neo-hpc-source" 13 | CACHE_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface" 14 | OUTPUT_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/gpt-neo-hpc-ckpt" 15 | LOG_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/logging" 16 | DSCONFIG="./ds_config_zero3.json" 17 | 18 | MAX_STEPS="10000" 19 | 20 | module load python/3.8.12/zen2 git-lfs/zen2/3.1.2 openmpi/4.1.1/gcc/9.4.0/zen2 cuda/11.6.2/gcc 21 | source .env/bin/activate 22 | 23 | export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64" 24 | 25 | echo "device(s): $CUDA_VISIBLE_DEVICES" 26 | 27 | deepspeed run_clm.py \ 28 | --model_name_or_path ${OUTPUT_DIR}/checkpoint-3500 \ 29 | --tokenizer_name ./hpc-tok \ 30 | --output_dir $OUTPUT_DIR \ 31 | --dataset_name $DATASET \ 32 | --validation_split_percentage 5 \ 33 | --cache_dir $CACHE_DIR \ 34 | --optim adamw_torch \ 35 | --fp16 \ 36 | --per_device_train_batch_size 1 \ 37 | --per_device_eval_batch_size 1 \ 38 | --seed 42 \ 39 | --do_eval \ 40 | --deepspeed $DSCONFIG \ 41 | --max_eval_samples 250 \ 42 | --logging_steps 50 \ 43 | --log_level passive 44 | -------------------------------------------------------------------------------- /v1/analysis/run_clm-zaratan.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH -n 8 4 | #SBATCH -t 04:00:00 5 | #SBATCH -J train-causal 6 | #SBATCH -A bhatele-lab-cmsc 7 | #SBATCH -p gpu 8 | #SBATCH --gres=gpu:a100:4 9 | #SBATCH --mem=196608 10 | #SBATCH --mail-type=FAIL 11 | 12 | #DATASET="hpcgroup/hpc-source" 13 | DATASET="hpcgroup/omp-for-loops" 14 | #MODEL="hpcgroup/polycoder-hpc-source" 15 | CACHE_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface" 16 | OUTPUT_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/gpt-neo-omp-for-loops-ckpt" 17 | DSCONFIG="./ds_config_zero2.json" 18 | HUB_TOKEN="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" 19 | 20 | MAX_STEPS="50000" 21 | MAX_CHECKPOINTS="5" 22 | 23 | module load python/3.8.12/zen2 git-lfs/zen2/3.1.2 openmpi/4.1.1/gcc/9.4.0/zen2 cuda/11.6.2/gcc 24 | source .env/bin/activate 25 | 26 | export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64" 27 | #export TOKENIZERS_PARALLELISM="false" 28 | export HF_HOME="${CACHE_DIR}" 29 | 30 | echo "device(s): $CUDA_VISIBLE_DEVICES" 31 | 32 | deepspeed run_clm.py \ 33 | --model_name_or_path hpcgroup/gpt-neo-hpc-source \ 34 | --tokenizer_name ./hpc-tok \ 35 | --output_dir $OUTPUT_DIR \ 36 | --dataset_name $DATASET \ 37 | --validation_split_percentage 5 \ 38 | --cache_dir $CACHE_DIR \ 39 | --optim adamw_torch \ 40 | --fp16 \ 41 | --per_device_train_batch_size 2 \ 42 | --per_device_eval_batch_size 2 \ 43 | --preprocessing_num_workers 8 \ 44 | --dataloader_num_workers 8 \ 45 | --seed 42 \ 46 | --do_eval \ 47 | --do_train \ 48 | --deepspeed $DSCONFIG \ 49 | --num_train_epochs 3 \ 50 | --save_steps 500 \ 51 | --save_total_limit $MAX_CHECKPOINTS \ 52 | 
--evaluation_strategy steps \ 53 | --eval_steps 500 \ 54 | --max_eval_samples 1000 \ 55 | --logging_steps 50 \ 56 | --log_level passive 57 | 58 | 59 | #--use_auth_token \ 60 | #--push_to_hub \ 61 | #--hub_model_id $MODEL \ 62 | #--hub_private_repo \ 63 | #--hub_token $HUB_TOKEN 64 | -------------------------------------------------------------------------------- /v1/analysis/run_clm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' [NOTE] This script is taken from: 3 | https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/examples/pytorch/language-modeling/run_clm.py 4 | ''' 5 | # coding=utf-8 6 | # Copyright 2020 The HuggingFace Inc. team. All rights reserved. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | """ 20 | Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. 21 | 22 | Here is the full list of checkpoints on the hub that can be fine-tuned by this script: 23 | https://huggingface.co/models?filter=text-generation 24 | """ 25 | # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. 26 | 27 | import logging 28 | import math 29 | import os 30 | import sys 31 | from dataclasses import dataclass, field 32 | from itertools import chain 33 | from typing import Optional 34 | 35 | import datasets 36 | from datasets import load_dataset 37 | 38 | import evaluate 39 | import transformers 40 | from transformers import ( 41 | CONFIG_MAPPING, 42 | MODEL_FOR_CAUSAL_LM_MAPPING, 43 | AutoConfig, 44 | AutoModelForCausalLM, 45 | AutoTokenizer, 46 | HfArgumentParser, 47 | Trainer, 48 | TrainingArguments, 49 | default_data_collator, 50 | is_torch_tpu_available, 51 | set_seed, 52 | ) 53 | from transformers.testing_utils import CaptureLogger 54 | from transformers.trainer_utils import get_last_checkpoint 55 | from transformers.utils import check_min_version, send_example_telemetry 56 | from transformers.utils.versions import require_version 57 | 58 | 59 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 60 | check_min_version("4.23.0") 61 | 62 | require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") 63 | 64 | logger = logging.getLogger(__name__) 65 | 66 | 67 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 68 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 69 | 70 | 71 | @dataclass 72 | class ModelArguments: 73 | """ 74 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 75 | """ 76 | 77 | model_name_or_path: Optional[str] = field( 78 | default=None, 79 | metadata={ 80 | "help": ( 81 | "The model checkpoint for weights initialization.Don't set if you want to train a model from scratch." 
82 | ) 83 | }, 84 | ) 85 | model_type: Optional[str] = field( 86 | default=None, 87 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 88 | ) 89 | config_overrides: Optional[str] = field( 90 | default=None, 91 | metadata={ 92 | "help": ( 93 | "Override some existing default config settings when a model is trained from scratch. Example: " 94 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 95 | ) 96 | }, 97 | ) 98 | config_name: Optional[str] = field( 99 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 100 | ) 101 | tokenizer_name: Optional[str] = field( 102 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 103 | ) 104 | cache_dir: Optional[str] = field( 105 | default=None, 106 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 107 | ) 108 | use_fast_tokenizer: bool = field( 109 | default=True, 110 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 111 | ) 112 | model_revision: str = field( 113 | default="main", 114 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 115 | ) 116 | use_auth_token: bool = field( 117 | default=False, 118 | metadata={ 119 | "help": ( 120 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 121 | "with private models)." 122 | ) 123 | }, 124 | ) 125 | 126 | def __post_init__(self): 127 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 128 | raise ValueError( 129 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 130 | ) 131 | 132 | 133 | @dataclass 134 | class DataTrainingArguments: 135 | """ 136 | Arguments pertaining to what data we are going to input our model for training and eval. 137 | """ 138 | 139 | dataset_name: Optional[str] = field( 140 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 141 | ) 142 | dataset_config_name: Optional[str] = field( 143 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 144 | ) 145 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) 146 | validation_file: Optional[str] = field( 147 | default=None, 148 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 149 | ) 150 | max_train_samples: Optional[int] = field( 151 | default=None, 152 | metadata={ 153 | "help": ( 154 | "For debugging purposes or quicker training, truncate the number of training examples to this " 155 | "value if set." 156 | ) 157 | }, 158 | ) 159 | max_eval_samples: Optional[int] = field( 160 | default=None, 161 | metadata={ 162 | "help": ( 163 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 164 | "value if set." 165 | ) 166 | }, 167 | ) 168 | 169 | block_size: Optional[int] = field( 170 | default=None, 171 | metadata={ 172 | "help": ( 173 | "Optional input sequence length after tokenization. " 174 | "The training dataset will be truncated in block of this size for training. " 175 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
176 | ) 177 | }, 178 | ) 179 | overwrite_cache: bool = field( 180 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 181 | ) 182 | validation_split_percentage: Optional[int] = field( 183 | default=5, 184 | metadata={ 185 | "help": "The percentage of the train set used as validation set in case there's no validation split" 186 | }, 187 | ) 188 | preprocessing_num_workers: Optional[int] = field( 189 | default=None, 190 | metadata={"help": "The number of processes to use for the preprocessing."}, 191 | ) 192 | keep_linebreaks: bool = field( 193 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 194 | ) 195 | 196 | def __post_init__(self): 197 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 198 | raise ValueError("Need either a dataset name or a training/validation file.") 199 | else: 200 | if self.train_file is not None: 201 | extension = self.train_file.split(".")[-1] 202 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 203 | if self.validation_file is not None: 204 | extension = self.validation_file.split(".")[-1] 205 | assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." 206 | 207 | 208 | def main(): 209 | # See all possible arguments in src/transformers/training_args.py 210 | # or by passing the --help flag to this script. 211 | # We now keep distinct sets of args, for a cleaner separation of concerns. 212 | 213 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 214 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 215 | # If we pass only one argument to the script and it's the path to a json file, 216 | # let's parse it to get our arguments. 217 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 218 | else: 219 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 220 | 221 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 222 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 223 | send_example_telemetry("run_clm", model_args, data_args) 224 | 225 | # Setup logging 226 | logging.basicConfig( 227 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 228 | datefmt="%m/%d/%Y %H:%M:%S", 229 | handlers=[logging.StreamHandler(sys.stdout)], 230 | ) 231 | 232 | log_level = training_args.get_process_log_level() 233 | logger.setLevel(log_level) 234 | datasets.utils.logging.set_verbosity(log_level) 235 | transformers.utils.logging.set_verbosity(log_level) 236 | transformers.utils.logging.enable_default_handler() 237 | transformers.utils.logging.enable_explicit_format() 238 | 239 | # Log on each process the small summary: 240 | logger.warning( 241 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 242 | + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 243 | ) 244 | logger.info(f"Training/evaluation parameters {training_args}") 245 | 246 | # Detecting last checkpoint. 
247 | last_checkpoint = None 248 | if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: 249 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 250 | if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: 251 | raise ValueError( 252 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 253 | "Use --overwrite_output_dir to overcome." 254 | ) 255 | elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: 256 | logger.info( 257 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 258 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 259 | ) 260 | 261 | # Set seed before initializing model. 262 | set_seed(training_args.seed) 263 | 264 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 265 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 266 | # (the dataset will be downloaded automatically from the datasets Hub). 267 | # 268 | # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called 269 | # 'text' is found. You can easily tweak this behavior (see below). 270 | # 271 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 272 | # download the dataset. 273 | if data_args.dataset_name is not None: 274 | # Downloading and loading a dataset from the hub. 275 | raw_datasets = load_dataset( 276 | data_args.dataset_name, 277 | data_args.dataset_config_name, 278 | cache_dir=model_args.cache_dir, 279 | use_auth_token=True if model_args.use_auth_token else None, 280 | ) 281 | if "validation" not in raw_datasets.keys(): 282 | logger.info(f'Creating custom validation split with {data_args.validation_split_percentage}% of data.') 283 | raw_datasets["validation"] = load_dataset( 284 | data_args.dataset_name, 285 | data_args.dataset_config_name, 286 | split=f"train[:{data_args.validation_split_percentage}%]", 287 | cache_dir=model_args.cache_dir, 288 | use_auth_token=True if model_args.use_auth_token else None, 289 | ) 290 | raw_datasets["train"] = load_dataset( 291 | data_args.dataset_name, 292 | data_args.dataset_config_name, 293 | split=f"train[{data_args.validation_split_percentage}%:]", 294 | cache_dir=model_args.cache_dir, 295 | use_auth_token=True if model_args.use_auth_token else None, 296 | ) 297 | else: 298 | data_files = {} 299 | dataset_args = {} 300 | if data_args.train_file is not None: 301 | data_files["train"] = data_args.train_file 302 | if data_args.validation_file is not None: 303 | data_files["validation"] = data_args.validation_file 304 | extension = ( 305 | data_args.train_file.split(".")[-1] 306 | if data_args.train_file is not None 307 | else data_args.validation_file.split(".")[-1] 308 | ) 309 | if extension == "txt": 310 | extension = "text" 311 | dataset_args["keep_linebreaks"] = data_args.keep_linebreaks 312 | raw_datasets = load_dataset( 313 | extension, 314 | data_files=data_files, 315 | cache_dir=model_args.cache_dir, 316 | use_auth_token=True if model_args.use_auth_token else None, 317 | **dataset_args, 318 | ) 319 | # If no validation data is there, validation_split_percentage will be used to divide the dataset. 
320 | if "validation" not in raw_datasets.keys(): 321 | raw_datasets["validation"] = load_dataset( 322 | extension, 323 | data_files=data_files, 324 | split=f"train[:{data_args.validation_split_percentage}%]", 325 | cache_dir=model_args.cache_dir, 326 | use_auth_token=True if model_args.use_auth_token else None, 327 | **dataset_args, 328 | ) 329 | raw_datasets["train"] = load_dataset( 330 | extension, 331 | data_files=data_files, 332 | split=f"train[{data_args.validation_split_percentage}%:]", 333 | cache_dir=model_args.cache_dir, 334 | use_auth_token=True if model_args.use_auth_token else None, 335 | **dataset_args, 336 | ) 337 | 338 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 339 | # https://huggingface.co/docs/datasets/loading_datasets.html. 340 | 341 | # Load pretrained model and tokenizer 342 | # 343 | # Distributed training: 344 | # The .from_pretrained methods guarantee that only one local process can concurrently 345 | # download model & vocab. 346 | 347 | config_kwargs = { 348 | "cache_dir": model_args.cache_dir, 349 | "revision": model_args.model_revision, 350 | "use_auth_token": True if model_args.use_auth_token else None, 351 | } 352 | if model_args.config_name: 353 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 354 | elif model_args.model_name_or_path: 355 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 356 | else: 357 | config = CONFIG_MAPPING[model_args.model_type]() 358 | logger.warning("You are instantiating a new config instance from scratch.") 359 | if model_args.config_overrides is not None: 360 | logger.info(f"Overriding config: {model_args.config_overrides}") 361 | config.update_from_string(model_args.config_overrides) 362 | logger.info(f"New config: {config}") 363 | 364 | tokenizer_kwargs = { 365 | "cache_dir": model_args.cache_dir, 366 | "use_fast": model_args.use_fast_tokenizer, 367 | "revision": model_args.model_revision, 368 | "use_auth_token": True if model_args.use_auth_token else None, 369 | } 370 | if model_args.tokenizer_name: 371 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 372 | elif model_args.model_name_or_path: 373 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 374 | else: 375 | raise ValueError( 376 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 377 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 378 | ) 379 | 380 | if model_args.model_name_or_path: 381 | model = AutoModelForCausalLM.from_pretrained( 382 | model_args.model_name_or_path, 383 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 384 | config=config, 385 | cache_dir=model_args.cache_dir, 386 | revision=model_args.model_revision, 387 | use_auth_token=True if model_args.use_auth_token else None, 388 | ) 389 | else: 390 | model = AutoModelForCausalLM.from_config(config) 391 | n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) 392 | logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") 393 | 394 | model.resize_token_embeddings(len(tokenizer)) 395 | 396 | # Preprocessing the datasets. 397 | # First we tokenize all the texts. 
398 | if training_args.do_train: 399 | column_names = raw_datasets["train"].column_names 400 | else: 401 | column_names = raw_datasets["validation"].column_names 402 | text_column_name = "text" if "text" in column_names else column_names[0] 403 | 404 | # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function 405 | tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") 406 | 407 | def tokenize_function(examples): 408 | with CaptureLogger(tok_logger) as cl: 409 | output = tokenizer(examples[text_column_name]) 410 | # clm input could be much much longer than block_size 411 | if "Token indices sequence length is longer than the" in cl.out: 412 | tok_logger.warning( 413 | "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" 414 | " before being passed to the model." 415 | ) 416 | return output 417 | 418 | with training_args.main_process_first(desc="dataset map tokenization"): 419 | tokenized_datasets = raw_datasets.map( 420 | tokenize_function, 421 | batched=True, 422 | num_proc=data_args.preprocessing_num_workers, 423 | remove_columns=column_names, 424 | load_from_cache_file=not data_args.overwrite_cache, 425 | desc="Running tokenizer on dataset", 426 | ) 427 | 428 | if data_args.block_size is None: 429 | block_size = tokenizer.model_max_length 430 | if block_size > 1024: 431 | logger.warning( 432 | f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " 433 | "Picking 1024 instead. You can change that default value by passing --block_size xxx." 434 | ) 435 | block_size = 1024 436 | else: 437 | if data_args.block_size > tokenizer.model_max_length: 438 | logger.warning( 439 | f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" 440 | f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 441 | ) 442 | block_size = min(data_args.block_size, tokenizer.model_max_length) 443 | 444 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 445 | def group_texts(examples): 446 | # Concatenate all texts. 447 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 448 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 449 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 450 | # customize this part to your needs. 451 | if total_length >= block_size: 452 | total_length = (total_length // block_size) * block_size 453 | # Split by chunks of max_len. 454 | result = { 455 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 456 | for k, t in concatenated_examples.items() 457 | } 458 | result["labels"] = result["input_ids"].copy() 459 | return result 460 | 461 | # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder 462 | # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower 463 | # to preprocess. 464 | # 465 | # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: 466 | # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map 467 | 468 | with training_args.main_process_first(desc="grouping texts together"): 469 | lm_datasets = tokenized_datasets.map( 470 | group_texts, 471 | batched=True, 472 | num_proc=data_args.preprocessing_num_workers, 473 | load_from_cache_file=not data_args.overwrite_cache, 474 | desc=f"Grouping texts in chunks of {block_size}", 475 | ) 476 | 477 | if training_args.do_train: 478 | if "train" not in tokenized_datasets: 479 | raise ValueError("--do_train requires a train dataset") 480 | train_dataset = lm_datasets["train"] 481 | if data_args.max_train_samples is not None: 482 | max_train_samples = min(len(train_dataset), data_args.max_train_samples) 483 | train_dataset = train_dataset.select(range(max_train_samples)) 484 | 485 | if training_args.do_eval: 486 | if "validation" not in tokenized_datasets: 487 | raise ValueError("--do_eval requires a validation dataset") 488 | eval_dataset = lm_datasets["validation"] 489 | if data_args.max_eval_samples is not None: 490 | max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) 491 | eval_dataset = eval_dataset.select(range(max_eval_samples)) 492 | 493 | def preprocess_logits_for_metrics(logits, labels): 494 | if isinstance(logits, tuple): 495 | # Depending on the model and config, logits may contain extra tensors, 496 | # like past_key_values, but logits always come first 497 | logits = logits[0] 498 | return logits.argmax(dim=-1) 499 | 500 | metric = evaluate.load("accuracy") 501 | 502 | def compute_metrics(eval_preds): 503 | preds, labels = eval_preds 504 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 505 | # by preprocess_logits_for_metrics but we need to shift the labels 506 | labels = labels[:, 1:].reshape(-1) 507 | preds = preds[:, :-1].reshape(-1) 508 | return metric.compute(predictions=preds, references=labels) 509 | 510 | # Initialize our Trainer 511 | trainer = Trainer( 512 | model=model, 513 | args=training_args, 514 | train_dataset=train_dataset if training_args.do_train else None, 515 | eval_dataset=eval_dataset if training_args.do_eval else None, 516 | tokenizer=tokenizer, 517 | # Data collator will default to DataCollatorWithPadding, so we change it. 
518 | data_collator=default_data_collator, 519 | compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None, 520 | preprocess_logits_for_metrics=preprocess_logits_for_metrics 521 | if training_args.do_eval and not is_torch_tpu_available() 522 | else None, 523 | ) 524 | 525 | # Training 526 | if training_args.do_train: 527 | logger.info(f'DS Sizes: |train|={len(train_dataset)} |eval|={len(eval_dataset)}') 528 | 529 | checkpoint = None 530 | if training_args.resume_from_checkpoint is not None: 531 | checkpoint = training_args.resume_from_checkpoint 532 | elif last_checkpoint is not None: 533 | checkpoint = last_checkpoint 534 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 535 | trainer.save_model() # Saves the tokenizer too for easy upload 536 | 537 | metrics = train_result.metrics 538 | 539 | max_train_samples = ( 540 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 541 | ) 542 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 543 | 544 | trainer.log_metrics("train", metrics) 545 | trainer.save_metrics("train", metrics) 546 | trainer.save_state() 547 | 548 | # Evaluation 549 | if training_args.do_eval: 550 | logger.info("*** Evaluate ***") 551 | 552 | metrics = trainer.evaluate() 553 | 554 | max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) 555 | metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) 556 | try: 557 | perplexity = math.exp(metrics["eval_loss"]) 558 | except OverflowError: 559 | perplexity = float("inf") 560 | metrics["perplexity"] = perplexity 561 | 562 | trainer.log_metrics("eval", metrics) 563 | trainer.save_metrics("eval", metrics) 564 | 565 | kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} 566 | if data_args.dataset_name is not None: 567 | kwargs["dataset_tags"] = data_args.dataset_name 568 | if data_args.dataset_config_name is not None: 569 | kwargs["dataset_args"] = data_args.dataset_config_name 570 | kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" 571 | else: 572 | kwargs["dataset"] = data_args.dataset_name 573 | 574 | if training_args.push_to_hub: 575 | trainer.push_to_hub(**kwargs) 576 | else: 577 | trainer.create_model_card(**kwargs) 578 | 579 | 580 | def _mp_fn(index): 581 | # For xla_spawn (TPUs) 582 | main() 583 | 584 | 585 | if __name__ == "__main__": 586 | main() -------------------------------------------------------------------------------- /v1/analysis/train-tokenizer.py: -------------------------------------------------------------------------------- 1 | ''' This script is taken from 2 | https://github.com/huggingface/transformers/blob/main/examples/research_projects/codeparrot/scripts/bpe_training.py 3 | with some slight modifications. It will train a new tokenizer on our dataset. 
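Example invocation (flags as defined by the argument parser below; the dataset name is a placeholder):
    python train-tokenizer.py --dataset <hf-dataset-name> --base-tokenizer gpt2 --vocab-size 1024 --tokenizer-name hpc-tok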
4 | ''' 5 | from argparse import ArgumentParser 6 | from os import environ 7 | 8 | from datasets import load_dataset 9 | from tqdm import tqdm 10 | 11 | from transformers import AutoTokenizer, HfArgumentParser 12 | from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode 13 | 14 | 15 | parser = ArgumentParser(description='script to train new tokenizer') 16 | parser.add_argument('--n-examples', type=int, default=10000, help='number of examples to train on') 17 | parser.add_argument('--text-column', type=str, default='text', help='text feature name') 18 | parser.add_argument('--base-tokenizer', type=str, default='gpt2', help='name of base tokenizer') 19 | parser.add_argument('--dataset', type=str, required=True, help='what dataset to train on') 20 | parser.add_argument('--vocab-size', type=int, default=1024, help='number of tokens in vocab') 21 | parser.add_argument('--tokenizer-name', type=str, default='hpc-tok', help='output tokenizer name') 22 | args = parser.parse_args() 23 | 24 | #environ['TOKENIZERS_PARALLELISM'] = '0' 25 | environ['OMP_NUM_THREADS'] = '8' 26 | 27 | 28 | # Iterator for Training 29 | def batch_iterator(batch_size=10): 30 | for _ in tqdm(range(0, args.n_examples, batch_size)): 31 | yield [next(iter_dataset)[args.text_column] for _ in range(batch_size)] 32 | 33 | # Base tokenizer 34 | tokenizer = AutoTokenizer.from_pretrained(args.base_tokenizer) 35 | base_vocab = list(bytes_to_unicode().values()) 36 | tokenizer.pad_token = tokenizer.eos_token 37 | 38 | # Load dataset 39 | #dataset = load_dataset('json', data_files=args.dataset, split="train", streaming=True) 40 | dataset = load_dataset(args.dataset, split='train', streaming=True) 41 | iter_dataset = iter(dataset) 42 | 43 | 44 | # Training and saving 45 | new_tokenizer = tokenizer.train_new_from_iterator( 46 | batch_iterator(), vocab_size=args.vocab_size, initial_alphabet=base_vocab 47 | ) 48 | new_tokenizer.save_pretrained(args.tokenizer_name)#, push_to_hub=args.push_to_hub) -------------------------------------------------------------------------------- /v1/analysis/train.py: -------------------------------------------------------------------------------- 1 | ''' Train LLM on source code data. 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from typing import Iterable, Optional, Union 8 | import logging 9 | from os import PathLike, environ 10 | 11 | # tpl imports 12 | import torch 13 | from torch.utils.data import DataLoader 14 | from torch.optim import AdamW 15 | from datasets import load_dataset, DatasetDict, load_from_disk 16 | from tokenizers import Tokenizer 17 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM, get_scheduler 18 | from accelerate import Accelerator 19 | import tqdm 20 | from alive_progress import alive_bar 21 | 22 | 23 | def get_args(): 24 | ''' Parse the command line arguments and return the object with them as properties. 
25 | ''' 26 | parser = ArgumentParser(description='Train a LLM on source code data') 27 | parser.add_argument('--log', choices=['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL'], 28 | default='INFO', type=str.upper, help='logging level') 29 | parser.add_argument('--input', type=str, required=True, help='root of textual source data or path to pkl of ' + 30 | 'filenames list') 31 | parser.add_argument('--save-tokens', type=str, help='path to store token data') 32 | parser.add_argument('--load-tokens', type=str, help='retrieve tokens rather than retokenize') 33 | parser.add_argument('--model', type=str, default='gpt2', help='what model to train') 34 | parser.add_argument('--gradient-checkpointing', action='store_true', help='checkpoint gradients') 35 | parser.add_argument('--lm-task', default='causal', choices=['causal', 'masked'], help='LM training objective') 36 | parser.add_argument('--tokenizer', type=str, default='gpt2', help='what text tokenizer to use') 37 | parser.add_argument('--max-seq-length', type=int, default=1024, help='maximum sequence length') 38 | return parser.parse_args() 39 | 40 | 41 | def get_dataset(dataset_path: PathLike, name: str = 'HPC-Source-Dataset', type: str = 'json') -> DatasetDict: 42 | ''' Fetch the dataset from dataset_path and return a huggingface DatasetDict object. Currently this is just 43 | a light wrapper around `load_dataset`. 44 | 45 | Args: 46 | dataset_path: path to dataset 47 | ''' 48 | return load_dataset(type, name=name, data_files=dataset_path) 49 | 50 | 51 | def get_model(model_name: Union[str, PathLike], training_task: str = 'causal'): 52 | ''' Return the pretrained model from file or huggingface. 53 | 54 | Args: 55 | model_name: name of huggingface model or path to model 56 | training_task: causal or masked 57 | ''' 58 | assert training_task in ['causal', 'masked'] 59 | 60 | model = None 61 | if training_task == 'causal': 62 | model = AutoModelForCausalLM.from_pretrained(model_name) 63 | elif training_task == 'masked': 64 | model = AutoModelForMaskedLM.from_pretrained(model_name) 65 | 66 | return model 67 | 68 | 69 | def train(dataset, model, batch_size=8, num_epochs=5): 70 | ''' Train model on dataset. 
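Uses AdamW with a linear learning-rate schedule; `batch_size` and `num_epochs` control the DataLoader and the number of passes over the data.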
71 | 
72 |     Args:
73 |         dataset: HuggingFace text dataset
74 |         model: LLM
75 |     '''
76 |     logging.debug('Creating accelerator...')
77 |     accelerator = Accelerator()
78 | 
79 |     logging.debug('Creating torch dataset...')
80 |     dataset = dataset.remove_columns(['text', 'filename'])
81 |     dataset.set_format('torch')
82 |     train_dl = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size)
83 | 
84 |     logging.debug('Creating optimizer...')
85 |     optimizer = AdamW(model.parameters(), lr=1e-5)
86 | 
87 |     logging.debug('Creating learning rate scheduler...')
88 |     total_training_steps = num_epochs * len(train_dl)
89 |     lr_schedule = get_scheduler(name='linear', optimizer=optimizer, num_warmup_steps=0,
90 |                                 num_training_steps=total_training_steps)
91 | 
92 |     logging.debug('Preparing training components with accelerator...')
93 |     train_dl, model, optimizer = accelerator.prepare(train_dl, model, optimizer)
94 | 
95 |     #model.train()
96 |     #completed_steps = 0
97 |     for epoch in range(1, num_epochs+1):
98 | 
99 |         model.train()
100 |         with alive_bar(len(train_dl), title='Training') as bar:
101 |             for step, batch in enumerate(train_dl, start=1):
102 |                 loss = model(**batch, labels=batch['input_ids']).loss  # labels needed for the loss; for causal LM they are the input ids (shifted internally)
103 |                 accelerator.backward(loss)
104 |                 optimizer.step()
105 |                 lr_schedule.step()
106 |                 optimizer.zero_grad()
107 |                 bar()
108 | 
109 | def main():
110 |     args = get_args()
111 | 
112 |     # setup logging
113 |     numeric_level = getattr(logging, args.log.upper(), None)
114 |     if not isinstance(numeric_level, int):
115 |         raise ValueError('Invalid log level: {}'.format(args.log))
116 |     logging.basicConfig(format='%(asctime)s [%(levelname)s] -- %(message)s',
117 |                         level=numeric_level) #filename='log.txt', filemode='w')
118 | 
119 |     # environment setup
120 |     logging.info('Setting up environment...')
121 |     device = 'cuda' if torch.cuda.is_available() else 'cpu'
122 |     environ['TOKENIZERS_PARALLELISM'] = '0'
123 |     environ['OMP_NUM_THREADS'] = '32'
124 |     #tqdm.tqdm.monitor_interval = 0 # fixes bug where tqdm calls in HF error due to monitor threading
125 |     logging.info('Using device: {}'.format(device))
126 | 
127 |     # gather and initialize dataset
128 |     logging.info('Creating dataset...')
129 |     dataset = get_dataset(args.input)
130 |     print(dataset)
131 | 
132 |     # tokenize dataset
133 |     if args.load_tokens:
134 |         logging.info('Loading tokenized dataset...')
135 |         tokenized_dataset = load_from_disk(args.load_tokens)
136 |     else:
137 |         logging.info('Tokenizing dataset with \'{}\' tokenizer...'.format(args.tokenizer))
138 |         tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
139 |         def tokenize_func(x):
140 |             return tokenizer(x['text'], truncation=True, max_length=args.max_seq_length, padding='max_length')
141 | 
142 |         tokenized_dataset = dataset.map(tokenize_func, batched=True)
143 |         if args.save_tokens:
144 |             tokenized_dataset.save_to_disk(args.save_tokens)
145 |             logging.info('Saved tokenized dataset to \'{}\'.'.format(args.save_tokens))
146 |     print(tokenized_dataset)
147 | 
148 |     # initialize model
149 |     logging.info('Creating model...')
150 |     model = get_model(args.model, training_task = args.lm_task)
151 |     if args.gradient_checkpointing:
152 |         model.gradient_checkpointing_enable()
153 |     print(model)
154 | 
155 |     # train
156 |     logging.info('Training...')
157 |     train(tokenized_dataset, model)
158 | 
159 | 
160 | 
161 | if __name__ == '__main__':
162 |     main()
163 | 
164 | 
-------------------------------------------------------------------------------- /v1/analysis/train.sbatch: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH -N 1
3 | #SBATCH -t 01:00:00
4 | #SBATCH -J hpc-llm-train 5 | #SBATCH -A bhatele-lab-aac 6 | 7 | #####STATCH -p gpu 8 | #####STATCH --gres=gpu:a100 9 | 10 | # config params 11 | DATASET="/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset.jsonl" 12 | TOKENIZER="./hpc-tok" 13 | MODEL="gpt2" 14 | SEQ_LENGTH="1024" 15 | LM_TASK="causal" 16 | 17 | if [ -d "/scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset-tokens" ]; then 18 | TOKENS="--load-tokens /scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset-tokens" 19 | else 20 | TOKENS="--save-tokens /scratch/zt1/project/bhatele-lab/user/dnicho/code-ml/data/dataset-tokens" 21 | fi 22 | 23 | # setup environment 24 | module load python/3.8.12/zen2 25 | source .env/bin/activate 26 | export HF_DATASETS_CACHE="/scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface/datasets" 27 | 28 | # run job 29 | python train.py --input $DATASET --log debug \ 30 | --tokenizer $TOKENIZER \ 31 | ${TOKENS} \ 32 | --model $MODEL \ 33 | --lm-task $LM_TASK \ 34 | --max-seq-length $SEQ_LENGTH 35 | -------------------------------------------------------------------------------- /v1/analysis/training-results/gpt-neo-eval-results.csv: -------------------------------------------------------------------------------- 1 | "loss","accuracy","runtime","samples_per_second","steps_per_second","epoch","perplexity","steps","samples","model" 2 | 2.400390625,0.6147214076246335,97.6214,10.244,1.28,0.01,11.027483150026422,500,4000,"GPT-Neo" 3 | 2.552734375,0.5945298142717498,49.3208,10.138,1.277,0.01,12.842171128856814,500,4000,"GPT-Neo" 4 | 2.154296875,0.6396793743890518,96.8775,10.322,1.29,0.02,8.621825825935622,1000,8000,"GPT-Neo" 5 | 2.287109375,0.6213079178885631,48.9909,10.206,1.286,0.02,9.846434155787346,1000,8000,"GPT-Neo" 6 | 2.130859375,0.6368387096774194,48.9364,10.217,1.287,0.03,8.422101445792912,1500,12000,"GPT-Neo" 7 | 2.046875,0.646238514173998,49.1507,10.173,1.282,0.03,7.743664305075443,2000,16000,"GPT-Neo" 8 | 1.8740234375,0.669357771260997,95.9844,10.418,1.302,0.03,6.514454241264857,2000,16000,"GPT-Neo" 9 | 1.943359375,0.6557086999022483,45.1551,11.073,1.395,0.04,6.982167334831573,2500,20000,"GPT-Neo" 10 | 1.857421875,0.6639374389051809,44.6968,11.186,1.409,0.05,6.4071969051480036,3000,24000,"GPT-Neo" 11 | 1.7109375,0.6845620723362659,95.9613,10.421,1.303,0.05,5.534147309488141,3000,24000,"GPT-Neo" 12 | 1.80859375,0.669425219941349,48.77,10.252,1.292,0.06,6.101860654522169,3500,28000,"GPT-Neo" 13 | 1.5869140625,0.7031935483870968,95.9459,10.423,1.303,0.07,4.888639590844212,4000,32000,"GPT-Neo" 14 | 1.498046875,0.7116324535679375,95.7061,10.449,1.306,0.09,4.472944313953601,5000,40000,"GPT-Neo" 15 | 1.4462890625,0.7184281524926687,92.5363,10.807,1.351,0.1,4.247323681168077,6000,48000,"GPT-Neo" 16 | 1.400390625,0.7239941348973608,96.2514,10.389,1.299,0.12,4.056784338759217,7000,56000,"GPT-Neo" 17 | 1.3642578125,0.7291642228739003,95.5973,10.461,1.308,0.14,3.9128179294962275,8000,64000,"GPT-Neo" 18 | 1.3271484375,0.7331045943304008,95.8617,10.432,1.304,0.16,3.770276864036879,9000,72000,"GPT-Neo" 19 | 1.294921875,0.739147605083089,92.5299,10.807,1.351,0.17,3.6507107512224573,10000,80000,"GPT-Neo" 20 | 1.287109375,0.7387712609970675,96.5748,10.355,1.294,0.19,3.622300694762451,11000,88000,"GPT-Neo" 21 | 1.2529296875,0.7447038123167156,95.8118,10.437,1.305,0.21,3.5005835651605617,12000,96000,"GPT-Neo" 22 | 1.2353515625,0.7463235581622678,96.0774,10.408,1.301,0.22,3.439587538163958,13000,104000,"GPT-Neo" 23 | 
1.203125,0.7524623655913979,96.8653,10.324,1.29,0.24,3.3305085165287003,14000,112000,"GPT-Neo" 24 | 1.197265625,0.7519931573802542,96.4695,10.366,1.296,0.26,3.3110508786568342,15000,120000,"GPT-Neo" 25 | 1.173828125,0.7558709677419355,96.2093,10.394,1.299,0.28,3.2343504676636736,16000,128000,"GPT-Neo" 26 | 1.1591796875,0.7579648093841642,96.1619,10.399,1.3,0.29,3.187317606955351,17000,136000,"GPT-Neo" 27 | 1.1591796875,0.7579648093841642,96.3719,10.376,1.297,0.29,3.187317606955351,17000,136000,"GPT-Neo" 28 | 1.1376953125,0.7613743890518084,96.1712,10.398,1.3,0.31,3.1195704388909276,18000,144000,"GPT-Neo" 29 | 1.1181640625,0.7644652981427175,96.03,10.413,1.302,0.33,3.059232484670239,19000,152000,"GPT-Neo" 30 | 1.099609375,0.7669872922776149,96.065,10.41,1.301,0.35,3.0029927507631666,20000,160000,"GPT-Neo" 31 | 1.0849609375,0.7698690127077223,91.8086,10.892,1.362,0.36,2.959324217879416,21000,168000,"GPT-Neo" 32 | 1.0791015625,0.7702101661779082,96.13,10.403,1.3,0.38,2.942035128633382,22000,176000,"GPT-Neo" 33 | 1.0654296875,0.7730869990224829,91.6519,10.911,1.364,0.4,2.9020857061418623,23000,184000,"GPT-Neo" 34 | 1.0625,0.7726177908113392,92.1198,10.855,1.357,0.41,2.893595944171761,24000,192000,"GPT-Neo" 35 | 1.046875,0.7767507331378299,91.8452,10.888,1.361,0.43,2.848734897170399,25000,200000,"GPT-Neo" 36 | 1.0322265625,0.7788132942326491,96.3255,10.381,1.298,0.45,2.8073095312574914,26000,208000,"GPT-Neo" 37 | 1.0244140625,0.7800469208211144,96.7867,10.332,1.291,0.47,2.785462875162639,27000,216000,"GPT-Neo" 38 | 1.0087890625,0.7826617790811339,98.2115,10.182,1.273,0.48,2.742278276008459,28000,224000,"GPT-Neo" 39 | 0.99951171875,0.784781036168133,99.8289,10.017,1.252,0.5,2.7169548664017036,29000,232000,"GPT-Neo" 40 | 0.99462890625,0.7846500488758553,92.0881,10.859,1.357,0.52,2.703720821192463,30000,240000,"GPT-Neo" 41 | 0.98876953125,0.7862609970674487,99.8058,10.019,1.252,0.54,2.6879250289330217,31000,248000,"GPT-Neo" 42 | 0.97314453125,0.7892443792766374,99.3598,10.064,1.258,0.55,2.6462526139222193,32000,256000,"GPT-Neo" 43 | 0.9638671875,0.790930596285435,99.3593,10.064,1.258,0.57,2.6218159477196776,33000,264000,"GPT-Neo" 44 | 0.96142578125,0.790969696969697,93.0136,10.751,1.344,0.59,2.6154228371441155,34000,272000,"GPT-Neo" 45 | 0.9462890625,0.794236559139785,92.334,10.83,1.354,0.6,2.5761320343519643,35000,280000,"GPT-Neo" 46 | 0.94775390625,0.7946001955034213,99.4961,10.051,1.256,0.62,2.579908430501536,36000,288000,"GPT-Neo" 47 | 0.93359375,0.7960723362658847,92.161,10.851,1.356,0.64,2.5436339562409365,37000,296000,"GPT-Neo" 48 | 0.923828125,0.797355816226784,99.0029,10.101,1.263,0.66,2.5189146769438335,38000,304000,"GPT-Neo" 49 | 0.9072265625,0.8001202346041055,93.2118,10.728,1.341,0.67,2.477441965930949,39000,312000,"GPT-Neo" 50 | 0.900390625,0.8026520039100684,99.5184,10.048,1.256,0.69,2.460564081299507,40000,320000,"GPT-Neo" 51 | 0.90234375,0.8013382209188661,99.4968,10.051,1.256,0.71,2.4653745667312625,41000,328000,"GPT-Neo" 52 | 0.89697265625,0.802377321603128,92.6639,10.792,1.349,0.73,2.4521683065837,42000,336000,"GPT-Neo" 53 | 0.88525390625,0.8048523949169111,98.9276,10.108,1.264,0.74,2.423599680475755,43000,344000,"GPT-Neo" 54 | 0.88525390625,0.8041427174975562,92.09,10.859,1.357,0.76,2.423599680475755,44000,352000,"GPT-Neo" 55 | 0.8759765625,0.8062003910068426,100.0763,9.992,1.249,0.78,2.4012190898666863,45000,360000,"GPT-Neo" 56 | 0.87060546875,0.8071417399804497,99.6152,10.039,1.255,0.79,2.3883564910551325,46000,368000,"GPT-Neo" 57 | 
0.8701171875,0.8071876832844574,99.3153,10.069,1.259,0.81,2.3871905860301803,47000,376000,"GPT-Neo" 58 | 0.86181640625,0.8090478983382209,99.8315,10.017,1.252,0.83,2.3674570543464966,48000,384000,"GPT-Neo" 59 | 0.86474609375,0.8083206256109482,92.0641,10.862,1.358,0.85,2.374403133638017,49000,392000,"GPT-Neo" 60 | 0.85888671875,0.8097282502443792,99.7379,10.026,1.253,0.86,2.3605312951164015,50000,400000,"GPT-Neo" 61 | 0.8564453125,0.8103929618768329,92.1481,10.852,1.357,0.88,2.3547753084691005,51000,408000,"GPT-Neo" 62 | 0.84814453125,0.811496578690127,99.4614,10.054,1.257,0.9,2.3353097352427477,52000,416000,"GPT-Neo" 63 | 0.8447265625,0.812128054740958,98.5074,10.152,1.269,0.92,2.3273413451578313,53000,424000,"GPT-Neo" 64 | 0.84130859375,0.8133040078201369,99.0746,10.093,1.262,0.93,2.319400144288798,54000,432000,"GPT-Neo" 65 | 0.82958984375,0.815377321603128,99.6086,10.039,1.255,0.95,2.292378314123635,55000,440000,"GPT-Neo" 66 | 0.8271484375,0.8158387096774193,98.6696,10.135,1.267,0.97,2.286788513643567,56000,448000,"GPT-Neo" 67 | 0.822265625,0.8169257086999022,98.9728,10.104,1.263,0.98,2.2756497704322705,57000,456000,"GPT-Neo" 68 | -------------------------------------------------------------------------------- /v1/analysis/training-results/gpt2-medium-eval-results.csv: -------------------------------------------------------------------------------- 1 | "loss","accuracy","runtime","samples_per_second","steps_per_second","epoch","perplexity","steps","samples","model" 2 | 2.435546875,0.6108318670576736,10.8904,91.824,2.938,0.07,11.422063446614425,1000,32000,"GPT2-Medium" 3 | 2.197265625,0.6316881720430108,10.0309,99.692,3.19,0.14,9.000369436556227,2000,64000,"GPT2-Medium" 4 | 2.03125,0.6476578690127077,9.956,100.442,3.214,0.21,7.623609917712736,3000,96000,"GPT2-Medium" 5 | 1.939453125,0.6558787878787878,10.9771,91.099,2.915,0.28,6.95494644409525,4000,128000,"GPT2-Medium" 6 | 1.84765625,0.6650830889540567,10.915,91.617,2.932,0.35,6.344931149723057,5000,160000,"GPT2-Medium" 7 | 1.7880859375,0.6712160312805474,10.9847,91.036,2.913,0.41,5.97799924460953,6000,192000,"GPT2-Medium" 8 | 1.740234375,0.6770185728250244,10.9687,91.169,2.917,0.48,5.698678894031116,7000,224000,"GPT2-Medium" 9 | 1.6845703125,0.6838435972629521,9.9051,100.958,3.231,0.55,5.390134361325962,8000,256000,"GPT2-Medium" 10 | 1.65234375,0.6885200391006843,10.9442,91.372,2.924,0.62,5.219197998726548,9000,288000,"GPT2-Medium" 11 | 1.6220703125,0.6915425219941349,11.0058,90.861,2.908,0.69,5.06356262934938,10000,320000,"GPT2-Medium" 12 | 1.587890625,0.6952649071358749,9.9235,100.771,3.225,0.76,4.893415984788658,11000,352000,"GPT2-Medium" 13 | 1.5556640625,0.70027761485826,11.0159,90.778,2.905,0.83,4.738231962823047,12000,384000,"GPT2-Medium" 14 | 1.525390625,0.7029081133919843,9.8287,101.743,3.256,0.9,4.596938897890734,13000,416000,"GPT2-Medium" 15 | 1.4970703125,0.7069266862170088,10.9471,91.348,2.923,0.97,4.4685783364439375,14000,448000,"GPT2-Medium" 16 | -------------------------------------------------------------------------------- /v1/analysis/training-results/gpt2-medium-training-results.csv: -------------------------------------------------------------------------------- 1 | "loss","learning_rate","epoch","perplexity","steps","samples","model" 2 | 4.5758,5e-05,0.0,97.1056926254968,50,1600,"GPT2-Medium" 3 | 3.7833,5e-05,0.01,43.960873514301454,100,3200,"GPT2-Medium" 4 | 3.4432,5e-05,0.01,31.286916282132594,150,4800,"GPT2-Medium" 5 | 3.2667,5e-05,0.01,26.224655031214002,200,6400,"GPT2-Medium" 6 | 
3.0971,5e-05,0.02,22.133670474945003,250,8000,"GPT2-Medium" 7 | 2.9612,5e-05,0.02,19.321143221708507,300,9600,"GPT2-Medium" 8 | 2.9241,5e-05,0.02,18.6174627910521,350,11200,"GPT2-Medium" 9 | 2.8459,5e-05,0.03,17.21704704044919,400,12800,"GPT2-Medium" 10 | 2.7945,5e-05,0.03,16.354449483768388,450,14400,"GPT2-Medium" 11 | 2.7576,5e-05,0.03,15.761968792351258,500,16000,"GPT2-Medium" 12 | 2.6609,5e-05,0.04,14.309161550600884,550,17600,"GPT2-Medium" 13 | 2.6756,5e-05,0.04,14.521059862240167,600,19200,"GPT2-Medium" 14 | 2.6485,5e-05,0.04,14.132823502742552,650,20800,"GPT2-Medium" 15 | 2.5695,5e-05,0.05,13.059293161669977,700,22400,"GPT2-Medium" 16 | 2.5769,5e-05,0.05,13.156290378137037,750,24000,"GPT2-Medium" 17 | 2.5195,5e-05,0.06,12.422383918561438,800,25600,"GPT2-Medium" 18 | 2.5192,5e-05,0.06,12.418657762337254,850,27200,"GPT2-Medium" 19 | 2.4624,5e-05,0.06,11.73293682451812,900,28800,"GPT2-Medium" 20 | 2.4351,5e-05,0.07,11.416960352319272,950,30400,"GPT2-Medium" 21 | 2.4442,5e-05,0.07,11.521328847951906,1000,32000,"GPT2-Medium" 22 | 2.4026,5e-05,0.07,11.051873929878996,1050,33600,"GPT2-Medium" 23 | 2.4352,5e-05,0.08,11.41810210544121,1100,35200,"GPT2-Medium" 24 | 2.3622,5e-05,0.08,10.6142771945374,1150,36800,"GPT2-Medium" 25 | 2.3558,5e-05,0.08,10.5465627378855,1200,38400,"GPT2-Medium" 26 | 2.3598,5e-05,0.09,10.5888334739482,1250,40000,"GPT2-Medium" 27 | 2.321,5e-05,0.09,10.185855069912932,1300,41600,"GPT2-Medium" 28 | 2.293,5e-05,0.09,9.904606975906837,1350,43200,"GPT2-Medium" 29 | 2.3092,5e-05,0.1,10.066368338245955,1400,44800,"GPT2-Medium" 30 | 2.3073,5e-05,0.1,10.047260396696064,1450,46400,"GPT2-Medium" 31 | 2.2603,5e-05,0.1,9.585964524796546,1500,48000,"GPT2-Medium" 32 | 2.25,5e-05,0.11,9.487735836358526,1550,49600,"GPT2-Medium" 33 | 2.2617,5e-05,0.11,9.59939427376201,1600,51200,"GPT2-Medium" 34 | 2.2326,5e-05,0.11,9.324077192097716,1650,52800,"GPT2-Medium" 35 | 2.2299,5e-05,0.12,9.298936139373419,1700,54400,"GPT2-Medium" 36 | 2.2521,5e-05,0.12,9.50768101672441,1750,56000,"GPT2-Medium" 37 | 2.1712,5e-05,0.12,8.768800290077374,1800,57600,"GPT2-Medium" 38 | 2.2042,5e-05,0.13,9.062998268308789,1850,59200,"GPT2-Medium" 39 | 2.2123,5e-05,0.13,9.136706670309945,1900,60800,"GPT2-Medium" 40 | 2.1991,5e-05,0.13,9.016894641318805,1950,62400,"GPT2-Medium" 41 | 2.1773,5e-05,0.14,8.822453447607918,2000,64000,"GPT2-Medium" 42 | 2.1829,5e-05,0.14,8.871997781573837,2050,65600,"GPT2-Medium" 43 | 2.1467,5e-05,0.15,8.556575057644563,2100,67200,"GPT2-Medium" 44 | 2.1838,5e-05,0.15,8.879986173814547,2150,68800,"GPT2-Medium" 45 | 2.1667,5e-05,0.15,8.7294293398485,2200,70400,"GPT2-Medium" 46 | 2.1387,5e-05,0.16,8.488395538882168,2250,72000,"GPT2-Medium" 47 | 2.1184,5e-05,0.16,8.317818325878259,2300,73600,"GPT2-Medium" 48 | 2.1473,5e-05,0.16,8.561710543170742,2350,75200,"GPT2-Medium" 49 | 2.0798,5e-05,0.17,8.0028681805922,2400,76800,"GPT2-Medium" 50 | 2.1215,5e-05,0.17,8.343643571136921,2450,78400,"GPT2-Medium" 51 | 2.1146,5e-05,0.17,8.286270594891224,2500,80000,"GPT2-Medium" 52 | 2.0918,5e-05,0.18,8.099481117025451,2550,81600,"GPT2-Medium" 53 | 2.0976,5e-05,0.18,8.146594604543264,2600,83200,"GPT2-Medium" 54 | 2.0924,5e-05,0.18,8.104342263893892,2650,84800,"GPT2-Medium" 55 | 2.0552,5e-05,0.19,7.808399396357594,2700,86400,"GPT2-Medium" 56 | 2.0925,5e-05,0.19,8.105152738643342,2750,88000,"GPT2-Medium" 57 | 2.07,5e-05,0.19,7.924823117849487,2800,89600,"GPT2-Medium" 58 | 2.041,5e-05,0.2,7.6983036546645645,2850,91200,"GPT2-Medium" 59 | 2.048,5e-05,0.2,7.752380829544347,2900,92800,"GPT2-Medium" 60 | 
2.0313,5e-05,0.2,7.623991107738291,2950,94400,"GPT2-Medium" 61 | 2.0113,5e-05,0.21,7.4730259691075736,3000,96000,"GPT2-Medium" 62 | 2.0361,5e-05,0.21,7.6606742341271925,3050,97600,"GPT2-Medium" 63 | 1.997,5e-05,0.21,7.366922148160475,3100,99200,"GPT2-Medium" 64 | 1.9945,5e-05,0.22,7.348527845249077,3150,100800,"GPT2-Medium" 65 | 1.9995,5e-05,0.22,7.385362494359279,3200,102400,"GPT2-Medium" 66 | 1.9876,5e-05,0.22,7.297997533172378,3250,104000,"GPT2-Medium" 67 | 2.0416,5e-05,0.23,7.702924022829201,3300,105600,"GPT2-Medium" 68 | 1.9788,5e-05,0.23,7.234056906268483,3350,107200,"GPT2-Medium" 69 | 2.0016,5e-05,0.24,7.4008880517270255,3400,108800,"GPT2-Medium" 70 | 1.9915,5e-05,0.24,7.326515297045045,3450,110400,"GPT2-Medium" 71 | 1.9848,5e-05,0.24,7.277591721547564,3500,112000,"GPT2-Medium" 72 | 1.9847,5e-05,0.25,7.276863998762155,3550,113600,"GPT2-Medium" 73 | 1.9825,5e-05,0.25,7.260872495068847,3600,115200,"GPT2-Medium" 74 | 1.9813,5e-05,0.25,7.252164673812458,3650,116800,"GPT2-Medium" 75 | 1.9312,5e-05,0.26,6.897782616303339,3700,118400,"GPT2-Medium" 76 | 1.9324,5e-05,0.26,6.906064923833544,3750,120000,"GPT2-Medium" 77 | 1.9226,5e-05,0.26,6.838716036144388,3800,121600,"GPT2-Medium" 78 | 1.9048,5e-05,0.27,6.718063880488615,3850,123200,"GPT2-Medium" 79 | 1.9559,5e-05,0.27,7.070279412568206,3900,124800,"GPT2-Medium" 80 | 1.9389,5e-05,0.27,6.95110055307065,3950,126400,"GPT2-Medium" 81 | 1.8913,5e-05,0.28,6.627979456119549,4000,128000,"GPT2-Medium" 82 | 1.9234,5e-05,0.28,6.844189197946122,4050,129600,"GPT2-Medium" 83 | 1.9094,5e-05,0.28,6.749038160565076,4100,131200,"GPT2-Medium" 84 | 1.8721,5e-05,0.29,6.50193613843872,4150,132800,"GPT2-Medium" 85 | 1.9079,5e-05,0.29,6.738922192197249,4200,134400,"GPT2-Medium" 86 | 1.9603,5e-05,0.29,7.101457182777846,4250,136000,"GPT2-Medium" 87 | 1.8805,5e-05,0.3,6.556782433946901,4300,137600,"GPT2-Medium" 88 | 1.8865,5e-05,0.3,6.596241387033053,4350,139200,"GPT2-Medium" 89 | 1.8647,5e-05,0.3,6.453999395713493,4400,140800,"GPT2-Medium" 90 | 1.845,5e-05,0.31,6.32809979040207,4450,142400,"GPT2-Medium" 91 | 1.868,5e-05,0.31,6.475332774434199,4500,144000,"GPT2-Medium" 92 | 1.9019,5e-05,0.31,6.698609717405791,4550,145600,"GPT2-Medium" 93 | 1.8884,5e-05,0.32,6.6087861594283055,4600,147200,"GPT2-Medium" 94 | 1.8584,5e-05,0.32,6.413467010594398,4650,148800,"GPT2-Medium" 95 | 1.8562,5e-05,0.32,6.399372892385747,4700,150400,"GPT2-Medium" 96 | 1.8538,5e-05,0.33,6.384032812902637,4750,152000,"GPT2-Medium" 97 | 1.8672,5e-05,0.33,6.470154579768687,4800,153600,"GPT2-Medium" 98 | 1.8446,5e-05,0.34,6.325569056666399,4850,155200,"GPT2-Medium" 99 | 1.8796,5e-05,0.34,6.550883984456764,4900,156800,"GPT2-Medium" 100 | 1.8187,5e-05,0.34,6.1638402468599,4950,158400,"GPT2-Medium" 101 | 1.839,5e-05,0.35,6.2902448699855995,5000,160000,"GPT2-Medium" 102 | 1.8136,5e-05,0.35,6.132484686243635,5050,161600,"GPT2-Medium" 103 | 1.8249,5e-05,0.35,6.202174770615896,5100,163200,"GPT2-Medium" 104 | 1.8129,5e-05,0.36,6.1281934490714995,5150,164800,"GPT2-Medium" 105 | 1.8171,5e-05,0.36,6.153985987974274,5200,166400,"GPT2-Medium" 106 | 1.8171,5e-05,0.36,6.153985987974274,5250,168000,"GPT2-Medium" 107 | 1.8091,5e-05,0.37,6.104950503530559,5300,169600,"GPT2-Medium" 108 | 1.8211,5e-05,0.37,6.178651229522288,5350,171200,"GPT2-Medium" 109 | 1.8051,5e-05,0.37,6.0805794760660605,5400,172800,"GPT2-Medium" 110 | 1.8317,5e-05,0.38,6.2444932789171155,5450,174400,"GPT2-Medium" 111 | 1.8075,5e-05,0.38,6.095190392895576,5500,176000,"GPT2-Medium" 112 | 1.7971,5e-05,0.38,6.032128900960745,5550,177600,"GPT2-Medium" 113 
| 1.8065,5e-05,0.39,6.089098249082266,5600,179200,"GPT2-Medium" 114 | 1.765,5e-05,0.39,5.841572358085994,5650,180800,"GPT2-Medium" 115 | 1.7926,5e-05,0.39,6.005045304701559,5700,182400,"GPT2-Medium" 116 | 1.7969,5e-05,0.4,6.030922595815089,5750,184000,"GPT2-Medium" 117 | 1.7918,5e-05,0.4,6.000243189559967,5800,185600,"GPT2-Medium" 118 | 1.7956,5e-05,0.4,6.023087490362518,5850,187200,"GPT2-Medium" 119 | 1.8021,5e-05,0.41,6.0623650729034075,5900,188800,"GPT2-Medium" 120 | 1.7867,5e-05,0.41,5.969719849968479,5950,190400,"GPT2-Medium" 121 | 1.7836,5e-05,0.41,5.95124237331976,6000,192000,"GPT2-Medium" 122 | 1.7422,5e-05,0.42,5.709891375889401,6050,193600,"GPT2-Medium" 123 | 1.7634,5e-05,0.42,5.8322333155394235,6100,195200,"GPT2-Medium" 124 | 1.7575,5e-05,0.43,5.79792444965634,6150,196800,"GPT2-Medium" 125 | 1.7661,5e-05,0.43,5.848001623127377,6200,198400,"GPT2-Medium" 126 | 1.7803,5e-05,0.43,5.931635642386948,6250,200000,"GPT2-Medium" 127 | 1.7164,5e-05,0.44,5.56446030616287,6300,201600,"GPT2-Medium" 128 | 1.732,5e-05,0.44,5.651946505077628,6350,203200,"GPT2-Medium" 129 | 1.7252,5e-05,0.44,5.613643646157,6400,204800,"GPT2-Medium" 130 | 1.7193,5e-05,0.45,5.5806206622413415,6450,206400,"GPT2-Medium" 131 | 1.7575,5e-05,0.45,5.79792444965634,6500,208000,"GPT2-Medium" 132 | 1.7554,5e-05,0.45,5.785761583791073,6550,209600,"GPT2-Medium" 133 | 1.7904,5e-05,0.46,5.991848726589757,6600,211200,"GPT2-Medium" 134 | 1.717,5e-05,0.46,5.567799984149774,6650,212800,"GPT2-Medium" 135 | 1.7334,5e-05,0.46,5.65986477167804,6700,214400,"GPT2-Medium" 136 | 1.7184,5e-05,0.47,5.5756003631188,6750,216000,"GPT2-Medium" 137 | 1.7336,5e-05,0.47,5.660996857837218,6800,217600,"GPT2-Medium" 138 | 1.7083,5e-05,0.47,5.519570227936003,6850,219200,"GPT2-Medium" 139 | 1.7556,5e-05,0.48,5.7869188518307775,6900,220800,"GPT2-Medium" 140 | 1.7183,5e-05,0.48,5.57504283095956,6950,222400,"GPT2-Medium" 141 | 1.7492,5e-05,0.48,5.750000834846821,7000,224000,"GPT2-Medium" 142 | 1.708,5e-05,0.49,5.517914605223447,7050,225600,"GPT2-Medium" 143 | 1.7623,5e-05,0.49,5.825821386100057,7100,227200,"GPT2-Medium" 144 | 1.7312,5e-05,0.49,5.647426756014245,7150,228800,"GPT2-Medium" 145 | 1.7411,5e-05,0.5,5.70361394859391,7200,230400,"GPT2-Medium" 146 | 1.6966,5e-05,0.5,5.455367574183706,7250,232000,"GPT2-Medium" 147 | 1.6957,5e-05,0.5,5.45045995212813,7300,233600,"GPT2-Medium" 148 | 1.7234,5e-05,0.51,5.603548176242617,7350,235200,"GPT2-Medium" 149 | 1.7159,5e-05,0.51,5.561678771451415,7400,236800,"GPT2-Medium" 150 | 1.663,5e-05,0.51,5.275112467563699,7450,238400,"GPT2-Medium" 151 | 1.6855,5e-05,0.52,5.3951478319844055,7500,240000,"GPT2-Medium" 152 | 1.7342,5e-05,0.52,5.664394475135182,7550,241600,"GPT2-Medium" 153 | 1.6795,5e-05,0.53,5.362873863719142,7600,243200,"GPT2-Medium" 154 | 1.7162,5e-05,0.53,5.563347525383424,7650,244800,"GPT2-Medium" 155 | 1.6865,5e-05,0.53,5.400545678289723,7700,246400,"GPT2-Medium" 156 | 1.683,5e-05,0.54,5.381676808200332,7750,248000,"GPT2-Medium" 157 | 1.7002,5e-05,0.54,5.475042290691792,7800,249600,"GPT2-Medium" 158 | 1.6642,5e-05,0.54,5.2814464021254395,7850,251200,"GPT2-Medium" 159 | 1.7005,5e-05,0.55,5.476685049780541,7900,252800,"GPT2-Medium" 160 | 1.6757,5e-05,0.55,5.342533613987594,7950,254400,"GPT2-Medium" 161 | 1.6907,5e-05,0.55,5.423275669707439,8000,256000,"GPT2-Medium" 162 | 1.6953,5e-05,0.56,5.448280204125942,8050,257600,"GPT2-Medium" 163 | 1.6722,5e-05,0.56,5.323867431213548,8100,259200,"GPT2-Medium" 164 | 1.7037,5e-05,0.56,5.494238512501241,8150,260800,"GPT2-Medium" 165 | 
1.688,5e-05,0.57,5.408652575459991,8200,262400,"GPT2-Medium" 166 | 1.6959,5e-05,0.57,5.451550153135021,8250,264000,"GPT2-Medium" 167 | 1.644,5e-05,0.57,5.175831486683224,8300,265600,"GPT2-Medium" 168 | 1.6894,5e-05,0.58,5.416229992019582,8350,267200,"GPT2-Medium" 169 | 1.6866,5e-05,0.58,5.40108575986118,8400,268800,"GPT2-Medium" 170 | 1.6348,5e-05,0.58,5.128432208010077,8450,270400,"GPT2-Medium" 171 | 1.6497,5e-05,0.59,5.205417967522357,8500,272000,"GPT2-Medium" 172 | 1.6605,5e-05,0.59,5.2619411573925605,8550,273600,"GPT2-Medium" 173 | 1.6434,5e-05,0.59,5.172726919254581,8600,275200,"GPT2-Medium" 174 | 1.651,5e-05,0.6,5.21218941136499,8650,276800,"GPT2-Medium" 175 | 1.6815,5e-05,0.6,5.373610344348383,8700,278400,"GPT2-Medium" 176 | 1.6765,5e-05,0.6,5.346809350945528,8750,280000,"GPT2-Medium" 177 | 1.6177,5e-05,0.61,5.041481564018722,8800,281600,"GPT2-Medium" 178 | 1.6018,5e-05,0.61,4.961955911488068,8850,283200,"GPT2-Medium" 179 | 1.6331,5e-05,0.62,5.119721279643453,8900,284800,"GPT2-Medium" 180 | 1.6542,5e-05,0.62,5.228895132379433,8950,286400,"GPT2-Medium" 181 | 1.6284,5e-05,0.62,5.095715048463855,9000,288000,"GPT2-Medium" 182 | 1.6222,5e-05,0.63,5.064219352711359,9050,289600,"GPT2-Medium" 183 | 1.6607,5e-05,0.63,5.262993650869879,9100,291200,"GPT2-Medium" 184 | 1.654,5e-05,0.63,5.227849457923889,9150,292800,"GPT2-Medium" 185 | 1.6186,5e-05,0.64,5.0460209398390505,9200,294400,"GPT2-Medium" 186 | 1.6325,5e-05,0.64,5.116650368241215,9250,296000,"GPT2-Medium" 187 | 1.6347,5e-05,0.64,5.1279193904305815,9300,297600,"GPT2-Medium" 188 | 1.6232,5e-05,0.65,5.069286105017994,9350,299200,"GPT2-Medium" 189 | 1.5854,5e-05,0.65,4.881243485459497,9400,300800,"GPT2-Medium" 190 | 1.6176,5e-05,0.65,5.040977441068887,9450,302400,"GPT2-Medium" 191 | 1.6191,5e-05,0.66,5.048544581166725,9500,304000,"GPT2-Medium" 192 | 1.596,5e-05,0.66,4.933259866177374,9550,305600,"GPT2-Medium" 193 | 1.6103,5e-05,0.66,5.004312296350958,9600,307200,"GPT2-Medium" 194 | 1.6132,5e-05,0.67,5.018845865500033,9650,308800,"GPT2-Medium" 195 | 1.5995,5e-05,0.67,4.950556527208795,9700,310400,"GPT2-Medium" 196 | 1.6409,5e-05,0.67,5.159811253265839,9750,312000,"GPT2-Medium" 197 | 1.6211,5e-05,0.68,5.058651774152981,9800,313600,"GPT2-Medium" 198 | 1.6119,5e-05,0.68,5.012325604962503,9850,315200,"GPT2-Medium" 199 | 1.6153,5e-05,0.68,5.029396516123374,9900,316800,"GPT2-Medium" 200 | 1.5853,5e-05,0.69,4.8807553855163555,9950,318400,"GPT2-Medium" 201 | 1.5935,5e-05,0.69,4.9209421201100065,10000,320000,"GPT2-Medium" 202 | 1.6104,5e-05,0.69,5.004812752602988,10050,321600,"GPT2-Medium" 203 | 1.6116,5e-05,0.7,5.010822132813112,10100,323200,"GPT2-Medium" 204 | 1.6289,5e-05,0.7,5.098263543058642,10150,324800,"GPT2-Medium" 205 | 1.5997,5e-05,0.71,4.9515467375319675,10200,326400,"GPT2-Medium" 206 | 1.5694,5e-05,0.71,4.803765069882574,10250,328000,"GPT2-Medium" 207 | 1.5822,5e-05,0.71,4.865648471635888,10300,329600,"GPT2-Medium" 208 | 1.5661,5e-05,0.72,4.787938772904338,10350,331200,"GPT2-Medium" 209 | 1.5938,5e-05,0.72,4.922418624210581,10400,332800,"GPT2-Medium" 210 | 1.5854,5e-05,0.72,4.881243485459497,10450,334400,"GPT2-Medium" 211 | 1.6054,5e-05,0.73,4.979851144862652,10500,336000,"GPT2-Medium" 212 | 1.5999,5e-05,0.73,4.952537145917012,10550,337600,"GPT2-Medium" 213 | 1.611,5e-05,0.73,5.007816541301046,10600,339200,"GPT2-Medium" 214 | 1.5467,5e-05,0.74,4.695947956747251,10650,340800,"GPT2-Medium" 215 | 1.5892,5e-05,0.74,4.899827497965248,10700,342400,"GPT2-Medium" 216 | 1.5693,5e-05,0.74,4.803284717393611,10750,344000,"GPT2-Medium" 217 | 
1.5892,5e-05,0.75,4.899827497965248,10800,345600,"GPT2-Medium" 218 | 1.6035,5e-05,0.75,4.970398410628632,10850,347200,"GPT2-Medium" 219 | 1.5689,5e-05,0.75,4.801363787718201,10900,348800,"GPT2-Medium" 220 | 1.5298,5e-05,0.76,4.6172532792927,10950,350400,"GPT2-Medium" 221 | 1.5772,5e-05,0.76,4.84138094864251,11000,352000,"GPT2-Medium" 222 | 1.5524,5e-05,0.76,4.722791290924828,11050,353600,"GPT2-Medium" 223 | 1.541,5e-05,0.77,4.669257194331875,11100,355200,"GPT2-Medium" 224 | 1.5454,5e-05,0.77,4.689847190760562,11150,356800,"GPT2-Medium" 225 | 1.5762,5e-05,0.77,4.836541987577648,11200,358400,"GPT2-Medium" 226 | 1.5453,5e-05,0.78,4.6893782294899395,11250,360000,"GPT2-Medium" 227 | 1.5547,5e-05,0.78,4.733666212259463,11300,361600,"GPT2-Medium" 228 | 1.575,5e-05,0.78,4.830741618110278,11350,363200,"GPT2-Medium" 229 | 1.5779,5e-05,0.79,4.844771101721707,11400,364800,"GPT2-Medium" 230 | 1.5781,5e-05,0.79,4.845740152843933,11450,366400,"GPT2-Medium" 231 | 1.5583,5e-05,0.79,4.750738121622793,11500,368000,"GPT2-Medium" 232 | 1.537,5e-05,0.8,4.650617469856458,11550,369600,"GPT2-Medium" 233 | 1.5547,5e-05,0.8,4.733666212259463,11600,371200,"GPT2-Medium" 234 | 1.5368,5e-05,0.81,4.649687439368636,11650,372800,"GPT2-Medium" 235 | 1.5115,5e-05,0.81,4.533525985625109,11700,374400,"GPT2-Medium" 236 | 1.5416,5e-05,0.81,4.672059589282888,11750,376000,"GPT2-Medium" 237 | 1.5673,5e-05,0.82,4.79368774812708,11800,377600,"GPT2-Medium" 238 | 1.5488,5e-05,0.82,4.705819809273668,11850,379200,"GPT2-Medium" 239 | 1.559,5e-05,0.82,4.7540648025104,11900,380800,"GPT2-Medium" 240 | 1.5494,5e-05,0.83,4.708644148376233,11950,382400,"GPT2-Medium" 241 | 1.5326,5e-05,0.83,4.630199705012396,12000,384000,"GPT2-Medium" 242 | 1.5647,5e-05,0.83,4.781240348613352,12050,385600,"GPT2-Medium" 243 | 1.5164,5e-05,0.84,4.5557947769372635,12100,387200,"GPT2-Medium" 244 | 1.5716,5e-05,0.84,4.8143449866775585,12150,388800,"GPT2-Medium" 245 | 1.5392,5e-05,0.84,4.660860091042256,12200,390400,"GPT2-Medium" 246 | 1.5602,5e-05,0.85,4.7597731045696525,12250,392000,"GPT2-Medium" 247 | 1.5578,5e-05,0.85,4.748363346305286,12300,393600,"GPT2-Medium" 248 | 1.5208,5e-05,0.85,4.575884438800577,12350,395200,"GPT2-Medium" 249 | 1.5164,5e-05,0.86,4.5557947769372635,12400,396800,"GPT2-Medium" 250 | 1.5217,5e-05,0.86,4.58000458858479,12450,398400,"GPT2-Medium" 251 | 1.5415,5e-05,0.86,4.671592406683479,12500,400000,"GPT2-Medium" 252 | 1.5268,5e-05,0.87,4.6034222763325126,12550,401600,"GPT2-Medium" 253 | 1.5475,5e-05,0.87,4.699706218216797,12600,403200,"GPT2-Medium" 254 | 1.5163,5e-05,0.87,4.5553392202377845,12650,404800,"GPT2-Medium" 255 | 1.5282,5e-05,0.88,4.609871580979244,12700,406400,"GPT2-Medium" 256 | 1.5533,5e-05,0.88,4.727043716391082,12750,408000,"GPT2-Medium" 257 | 1.5323,5e-05,0.88,4.628810853439045,12800,409600,"GPT2-Medium" 258 | 1.5343,5e-05,0.89,4.6380777389424654,12850,411200,"GPT2-Medium" 259 | 1.5106,5e-05,0.89,4.5294476477653705,12900,412800,"GPT2-Medium" 260 | 1.4983,5e-05,0.9,4.474076671290999,12950,414400,"GPT2-Medium" 261 | 1.5189,5e-05,0.9,4.567198512609752,13000,416000,"GPT2-Medium" 262 | 1.5525,5e-05,0.9,4.723263593668665,13050,417600,"GPT2-Medium" 263 | 1.5112,5e-05,0.91,4.532166131817691,13100,419200,"GPT2-Medium" 264 | 1.5215,5e-05,0.91,4.579088679261059,13150,420800,"GPT2-Medium" 265 | 1.5004,5e-05,0.91,4.483482104549135,13200,422400,"GPT2-Medium" 266 | 1.505,5e-05,0.92,4.504153630288483,13250,424000,"GPT2-Medium" 267 | 1.5248,5e-05,0.92,4.5942246324895715,13300,425600,"GPT2-Medium" 268 | 
1.4812,5e-05,0.92,4.398220379922297,13350,427200,"GPT2-Medium" 269 | 1.5537,5e-05,0.93,4.728934912091563,13400,428800,"GPT2-Medium" 270 | 1.5208,5e-05,0.93,4.575884438800577,13450,430400,"GPT2-Medium" 271 | 1.4901,5e-05,0.93,4.437539250741782,13500,432000,"GPT2-Medium" 272 | 1.5365,5e-05,0.94,4.648292742351838,13550,433600,"GPT2-Medium" 273 | 1.4834,5e-05,0.94,4.407907116261116,13600,435200,"GPT2-Medium" 274 | 1.5178,5e-05,0.94,4.562177356388104,13650,436800,"GPT2-Medium" 275 | 1.4796,5e-05,0.95,4.391188854035191,13700,438400,"GPT2-Medium" 276 | 1.4938,5e-05,0.95,4.4539885584228225,13750,440000,"GPT2-Medium" 277 | 1.5057,5e-05,0.95,4.507307641604858,13800,441600,"GPT2-Medium" 278 | 1.4957,5e-05,0.96,4.462459181227245,13850,443200,"GPT2-Medium" 279 | 1.5431,5e-05,0.96,4.67907293736287,13900,444800,"GPT2-Medium" 280 | 1.5106,5e-05,0.96,4.5294476477653705,13950,446400,"GPT2-Medium" 281 | 1.4987,5e-05,0.97,4.475866659933377,14000,448000,"GPT2-Medium" 282 | 1.4957,5e-05,0.97,4.462459181227245,14050,449600,"GPT2-Medium" 283 | 1.4793,5e-05,0.97,4.38987169496272,14100,451200,"GPT2-Medium" 284 | 1.5054,5e-05,0.98,4.50595565212094,14150,452800,"GPT2-Medium" 285 | 1.4967,5e-05,0.98,4.466923872381991,14200,454400,"GPT2-Medium" 286 | 1.4846,5e-05,0.98,4.413199779763611,14250,456000,"GPT2-Medium" 287 | 1.5123,5e-05,0.99,4.537154257528862,14300,457600,"GPT2-Medium" 288 | 1.5036,5e-05,0.99,4.497852227217459,14350,459200,"GPT2-Medium" 289 | 1.4745,5e-05,1.0,4.368850801331712,14400,460800,"GPT2-Medium" 290 | 1.4908,5e-05,1.0,4.4406466156681415,14450,462400,"GPT2-Medium" 291 | -------------------------------------------------------------------------------- /v1/analysis/training-results/polycoder-eval-results.csv: -------------------------------------------------------------------------------- 1 | "loss","accuracy","runtime","samples_per_second","steps_per_second","epoch","perplexity","steps","samples","model" 2 | 2.3203125,0.6247663734115347,37.5165,26.655,3.332,0.02,10.17885470120484,1000,8000,"PolyCoder" 3 | 2.001953125,0.6563352883675464,37.5282,26.647,3.331,0.03,7.403501951809864,2000,16000,"PolyCoder" 4 | 1.814453125,0.6796471163245357,37.5235,26.65,3.331,0.05,6.137718694555804,3000,24000,"PolyCoder" 5 | 1.6875,0.6937790811339198,37.4828,26.679,3.335,0.07,5.405948925141167,4000,32000,"PolyCoder" 6 | 1.6181640625,0.7021691104594331,37.5268,26.648,3.331,0.09,5.043821669492627,5000,40000,"PolyCoder" 7 | 1.5498046875,0.7108142717497556,37.5449,26.635,3.329,0.1,4.710550063429016,6000,48000,"PolyCoder" 8 | 1.48046875,0.7193714565004887,37.5338,26.643,3.33,0.12,4.395005356905533,7000,56000,"PolyCoder" 9 | 1.4326171875,0.7267448680351906,36.4286,27.451,3.431,0.14,4.189649955064052,8000,64000,"PolyCoder" 10 | 1.39453125,0.7316676441837732,37.4928,26.672,3.334,0.16,4.033083621527571,9000,72000,"PolyCoder" 11 | 1.35546875,0.7361603128054741,37.4779,26.682,3.335,0.17,3.878578614311837,10000,80000,"PolyCoder" 12 | 1.3076171875,0.7432629521016618,37.4606,26.695,3.337,0.19,3.697353108257716,11000,88000,"PolyCoder" 13 | 1.279296875,0.747058651026393,37.4874,26.676,3.334,0.21,3.5941117271158474,12000,96000,"PolyCoder" 14 | 1.25390625,0.7509364613880742,37.526,26.648,3.331,0.22,3.504003773550228,13000,104000,"PolyCoder" 15 | 1.2216796875,0.7560752688172043,36.4571,27.43,3.429,0.24,3.392881931415303,14000,112000,"PolyCoder" 16 | 1.201171875,0.7592453567937439,37.5808,26.609,3.326,0.26,3.3240099653895356,15000,120000,"PolyCoder" 17 | 
1.1787109375,0.76244477028348,37.5529,26.629,3.329,0.28,3.2501818138500416,16000,128000,"PolyCoder" 18 | 1.16015625,0.7661984359726295,37.4715,26.687,3.336,0.29,3.1904317421322212,17000,136000,"PolyCoder" 19 | 1.140625,0.7689169110459433,37.5405,26.638,3.33,0.31,3.128723206238592,18000,144000,"PolyCoder" 20 | 1.1259765625,0.7705650048875855,36.4354,27.446,3.431,0.33,3.083226342424631,19000,152000,"PolyCoder" 21 | 1.1123046875,0.771998044965787,37.4909,26.673,3.334,0.35,3.0413597071177576,20000,160000,"PolyCoder" 22 | 1.095703125,0.7747399804496579,37.5248,26.649,3.331,0.36,2.9912851915440255,21000,168000,"PolyCoder" 23 | 1.083984375,0.7761935483870968,36.4163,27.46,3.433,0.38,2.956435663479408,22000,176000,"PolyCoder" 24 | 1.0712890625,0.778830889540567,37.598,26.597,3.325,0.4,2.9191400296218246,23000,184000,"PolyCoder" 25 | 1.0458984375,0.7831564027370479,37.6454,26.564,3.32,0.41,2.8459542874379604,24000,192000,"PolyCoder" 26 | 1.03515625,0.7846637341153471,36.4964,27.4,3.425,0.43,2.8155461303392597,25000,200000,"PolyCoder" 27 | 1.0244140625,0.7864535679374389,37.5186,26.653,3.332,0.45,2.785462875162639,26000,208000,"PolyCoder" 28 | 1.015625,0.7873470185728251,37.5455,26.634,3.329,0.47,2.7610885385501014,27000,216000,"PolyCoder" 29 | 1.0009765625,0.7907565982404692,36.5745,27.341,3.418,0.48,2.720937697156968,28000,224000,"PolyCoder" 30 | 0.99169921875,0.7921241446725318,37.4805,26.681,3.335,0.5,2.6958113558872334,29000,232000,"PolyCoder" 31 | 0.99267578125,0.7913734115347019,36.4382,27.444,3.43,0.52,2.69844527004604,30000,240000,"PolyCoder" 32 | 0.9833984375,0.7943421309872922,37.5699,26.617,3.327,0.54,2.6735266337892374,31000,248000,"PolyCoder" 33 | 0.96923828125,0.79555522971652,36.4755,27.416,3.427,0.55,2.6359358526918006,32000,256000,"PolyCoder" 34 | 0.95263671875,0.7984623655913978,37.5014,26.666,3.333,0.57,2.592536444657931,33000,264000,"PolyCoder" 35 | 0.9501953125,0.7986275659824047,37.5472,26.633,3.329,0.59,2.5862147300553904,34000,272000,"PolyCoder" 36 | 0.951171875,0.798186705767351,37.4851,26.677,3.335,0.6,2.5887415639825235,35000,280000,"PolyCoder" 37 | 0.9296875,0.801869990224829,37.5581,26.625,3.328,0.62,2.53371726724229,36000,288000,"PolyCoder" 38 | 0.927734375,0.8021876832844574,37.5294,26.646,3.331,0.64,2.528773430242131,37000,296000,"PolyCoder" 39 | 0.9208984375,0.8040645161290323,37.5926,26.601,3.325,0.66,2.511545843561349,38000,304000,"PolyCoder" 40 | 0.91259765625,0.8059970674486804,37.455,26.699,3.337,0.67,2.490784338481287,39000,312000,"PolyCoder" 41 | 0.9072265625,0.8062062561094819,37.5311,26.645,3.331,0.69,2.477441965930949,40000,320000,"PolyCoder" 42 | 0.90087890625,0.8071603128054741,37.5701,26.617,3.327,0.71,2.461765821974672,41000,328000,"PolyCoder" 43 | 0.88427734375,0.810603128054741,37.5317,26.644,3.331,0.73,2.4212340391990743,42000,336000,"PolyCoder" 44 | 0.86962890625,0.8138357771260997,37.5243,26.649,3.331,0.74,2.3860252501558272,43000,344000,"PolyCoder" 45 | 0.8671875,0.8145161290322581,37.4535,26.7,3.337,0.76,2.3802070983234693,44000,352000,"PolyCoder" 46 | 0.86181640625,0.8154985337243402,37.5464,26.634,3.329,0.78,2.3674570543464966,45000,360000,"PolyCoder" 47 | 0.86279296875,0.8153069403714565,37.5127,26.658,3.332,0.79,2.3697701533851934,46000,368000,"PolyCoder" 48 | 0.85791015625,0.8156823069403715,37.467,26.69,3.336,0.81,2.3582272139962357,47000,376000,"PolyCoder" 49 | 0.85498046875,0.8162492668621701,37.5307,26.645,3.331,0.83,2.3513284557425176,48000,384000,"PolyCoder" 50 | 
0.8486328125,0.8178230694037145,36.4263,27.453,3.432,0.85,2.33645030163534,49000,392000,"PolyCoder" 51 | 0.83984375,0.819336265884653,37.5346,26.642,3.33,0.86,2.3160050727154613,50000,400000,"PolyCoder" 52 | 0.8427734375,0.817871945259042,37.4481,26.704,3.338,0.88,2.322800192755806,51000,408000,"PolyCoder" 53 | 0.83349609375,0.8201593352883676,36.4643,27.424,3.428,0.9,2.3013504291667424,52000,416000,"PolyCoder" 54 | 0.822265625,0.8214828934506354,37.4781,26.682,3.335,0.92,2.2756497704322705,53000,424000,"PolyCoder" 55 | 0.82080078125,0.8227810361681329,37.5113,26.659,3.332,0.93,2.2723187394044895,54000,432000,"PolyCoder" 56 | 0.81640625,0.8228778103616813,37.4805,26.681,3.335,0.95,2.262354872990478,55000,440000,"PolyCoder" 57 | 0.81494140625,0.823683284457478,37.3988,26.739,3.342,0.97,2.259043302653206,56000,448000,"PolyCoder" 58 | 0.8056640625,0.8256666666666667,37.5503,26.631,3.329,0.98,2.2381822983660147,57000,456000,"PolyCoder" 59 | -------------------------------------------------------------------------------- /v1/code-gen-tests/codegen_tests-zaratan.sbatch: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -N 1 3 | #SBATCH -n 2 4 | #SBATCH -t 04:00:00 5 | #SBATCH -J train-causal 6 | #SBATCH -A bhatele-lab-cmsc 7 | #SBATCH -p gpu 8 | #SBATCH --gres=gpu:a100 9 | #SBATCH --mem=16384 10 | #SBATCH --mail-type=FAIL 11 | 12 | # run params 13 | MODEL="hpcgroup/gpt-neo-hpc-source" 14 | TOKENIZER="hpcgroup/gpt-neo-hpc-source" 15 | PROMPTS="codegen_tests.json" 16 | OUTPUT="gpt-neo-results.jsonl" 17 | NUM_SAMPLES="50" 18 | MIN_LEN="25" 19 | MAX_LEN="512" 20 | TOP_K="50" 21 | TOP_P="0.93" 22 | TEMPERATURES="0.2 0.4 0.6 0.8" 23 | MAX_SEQ_LENGTH="1024" 24 | BATCH_SIZE="1" 25 | 26 | # environment setup 27 | DEVICE="$CUDA_VISIBLE_DEVICES" 28 | CACHE_DIR="/scratch/zt1/project/bhatele-lab/user/dnicho/.cache/huggingface" 29 | module load python/3.8.12/zen2 git-lfs/zen2/3.1.2 openmpi/4.1.1/gcc/9.4.0/zen2 cuda/11.6.2/gcc 30 | source /home/dnicho/code-ml/analysis/.env/bin/activate 31 | export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64" 32 | export HF_HOME="${CACHE_DIR}" 33 | 34 | export OMP_NUM_THREADS="2" 35 | export TOKENIZERS_PARALLELISM="true" 36 | 37 | echo "device(s): $CUDA_VISIBLE_DEVICES" 38 | 39 | if [ -f $OUTPUT ]; then 40 | rm $OUTPUT 41 | fi 42 | 43 | # run script 44 | python codegen_tests.py \ 45 | --model $MODEL \ 46 | --tokenizer $TOKENIZER \ 47 | --input $PROMPTS \ 48 | --output $OUTPUT \ 49 | --cache-dir $CACHE_DIR \ 50 | --num-samples $NUM_SAMPLES \ 51 | --min-len $MIN_LEN \ 52 | --max-len $MAX_LEN \ 53 | --top-k $TOP_K \ 54 | --top-p $TOP_P \ 55 | --temperatures $TEMPERATURES \ 56 | --max-sequence-length $MAX_SEQ_LENGTH \ 57 | --batch-size $BATCH_SIZE \ 58 | --device $DEVICE -------------------------------------------------------------------------------- /v1/code-gen-tests/codegen_tests.py: -------------------------------------------------------------------------------- 1 | ''' Generate a bunch of results for codegeneration. 
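The input JSON is expected to be a list of objects with 'name' and 'prompt' fields (see main() below); generations are appended to the output file as JSON lines.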
2 | author: Daniel Nichols 3 | date: November 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from itertools import product 8 | import json 9 | from typing import Iterable, Optional 10 | 11 | # tpl imports 12 | from alive_progress import alive_it 13 | import torch 14 | from torch.utils.data import Dataset 15 | from transformers import AutoTokenizer, pipeline, StoppingCriteria, set_seed 16 | 17 | 18 | class PromptDataset(Dataset): 19 | ''' PyTorch dataset that simply wraps a list of strings. They do not have to have the same length. 20 | ''' 21 | 22 | def __init__(self, prompts): 23 | super().__init__() 24 | self.prompts_ = prompts 25 | 26 | def __len__(self): 27 | return len(self.prompts_) 28 | 29 | def __getitem__(self, idx): 30 | return self.prompts_[idx] 31 | 32 | 33 | def has_balanced_brackets(text : str, left_bracket : str = '{', right_bracket : str = '}') -> bool: 34 | ''' Check if string has balanced brackets. 35 | taken from: https://stackoverflow.com/a/38834249/3769237 36 | 37 | Arguments: 38 | text: string to check for balanced brackets in. 39 | left_bracket: left bracket to balance 40 | right_bracket: right bracket to balance 41 | 42 | Returns: 43 | true if left_bracket and right_bracket are balanced 44 | ''' 45 | stack = [] 46 | balanced = True 47 | index = 0 48 | while index < len(text) and balanced: 49 | token = text[index] 50 | if token == left_bracket: 51 | stack.append(token) 52 | elif token == right_bracket: 53 | if len(stack) == 0: 54 | balanced = False 55 | else: 56 | stack.pop() 57 | 58 | index += 1 59 | 60 | return balanced and len(stack) == 0 61 | 62 | 63 | class BalancedBracketsCriteria(StoppingCriteria): 64 | ''' extension of transformers' text-generation stopping criteria. 65 | Stops either when function is complete (i.e. { and } are balanced) or when max_length is surpassed, whichever 66 | happens first. 67 | 68 | _Note:_ This is a slow stopping criteria, but it's much faster than continually running model inference when 69 | it does not need to be run anymore. 70 | ''' 71 | 72 | def __init__(self, max_length : int, tokenizer, left_bracket : str = '{', right_bracket : str = '}'): 73 | self.max_length = max_length 74 | self.tokenizer = tokenizer 75 | self.left_bracket = left_bracket 76 | self.right_bracket = right_bracket 77 | 78 | def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool: 79 | if input_ids.shape[-1] > self.max_length: 80 | # already to long, early stop 81 | return True 82 | 83 | # return true if {} are balanced i.e. the function is complete 84 | return all( 85 | has_balanced_brackets( 86 | self.tokenizer.decode(t), 87 | left_bracket=self.left_bracket, 88 | right_bracket=self.right_bracket 89 | ) for t in input_ids) 90 | 91 | 92 | def get_predictions( 93 | prompts : Iterable[str], 94 | generator, 95 | num_samples : int = 100, 96 | top_p : float = 0.95, 97 | top_k : int = 50, 98 | temperature : float = 0.2, 99 | min_len : int = 50, 100 | max_len : int = 500, 101 | batch_size : int = 1, 102 | tokenizer = None 103 | ) -> Iterable[dict]: 104 | ''' Get prediction outputs from model. 
105 | 
106 |     Arguments:
107 |         prompts: list of text prompts to generate text for
108 |         generator: transformers pipeline object
109 |         num_samples: how many samples to generate for each prompt
110 |         top_p: probability for nucleus sampling
111 |         top_k: k for top-k sampling
112 |         temperature: inference temperature
113 |         min_len: minimum generation length
114 |         max_len: maximum generation length
115 |         batch_size: [deprecated] how many samples to process at once
116 |         tokenizer: HF tokenizer to be passed to stopping criteria
117 | 
118 |     Returns:
119 |         a list of result objects that store the results as well as meta-data
120 |     '''
121 | 
122 |     prompts = [ p for p in prompts for _ in range(num_samples) ]
123 |     ds = PromptDataset(prompts)
124 | 
125 |     stopping_criteria = BalancedBracketsCriteria(max_len, tokenizer)
126 | 
127 |     gen_output = generator(
128 |         ds,
129 |         return_full_text=True,
130 |         do_sample=True,
131 |         max_new_tokens=max_len,
132 |         top_k=top_k,
133 |         top_p=top_p,
134 |         temperature=temperature,
135 |         pad_token_id=50256, # suppress error
136 |         stopping_criteria=[stopping_criteria]
137 |     )
138 | 
139 |     generated_text = []
140 |     bar = alive_it(gen_output, total=len(gen_output)*batch_size, title=f'Temperature {temperature}')
141 |     for g in bar:
142 |         texts = list(map(lambda x: x['generated_text'], g))
143 |         generated_text.extend( texts )
144 | 
145 |     results = []
146 |     for prompt, text in zip(prompts, generated_text):
147 |         result = {
148 |             'prompt': prompt,
149 |             'generated_text': text,
150 |             'num_samples': num_samples,
151 |             'min_len': min_len,
152 |             'max_len': max_len,
153 |             'top_p': top_p,
154 |             'top_k': top_k,
155 |             'temperature': temperature,
156 |         }
157 |         results.append( result )
158 | 
159 |     return results
160 | 
161 | 
162 | def main():
163 |     parser = ArgumentParser(description='Generate code samples for a set of test problems.')
164 |     parser.add_argument('-m', '--model', type=str, required=True, help='path to model or HF hub model name')
165 |     parser.add_argument('-t', '--tokenizer', type=str, required=True, help='path to tokenizer or HF hub name')
166 |     parser.add_argument('-i', '--input', type=str, required=True, help='json file with all the test prompts')
167 |     parser.add_argument('-o', '--output', type=str, required=True, default='results.json', help='output path')
168 |     parser.add_argument('--cache-dir', type=str, default='~/.cache/huggingface', help='path to HF cache')
169 |     parser.add_argument('-k', '--num-samples', type=int, default=100, help='how many samples to generate')
170 |     parser.add_argument('--min-len', type=int, default=50, help='Minimum length to generate.')
171 |     parser.add_argument('--max-len', type=int, default=350, help='Maximum length to generate.')
172 |     parser.add_argument('--top-k', type=int, default=50, help='Number of samples to use in top-k sampling.')
173 |     parser.add_argument('--top-p', type=float, default=0.95, help='Fraction to use in nucleus sampling.')
174 |     parser.add_argument('--temperatures', type=float, nargs='+', default=[0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0],
175 |         help='Sampling temperatures to try.')
176 |     parser.add_argument('--batch-size', type=int, default=8, help='Batch size to feed to inference.')
177 |     parser.add_argument('--device', type=int, default=-1, help='Where to run the model (GPU index, or -1 for CPU).')
178 |     parser.add_argument('--max-sequence-length', type=int, default=1024, help='maximum sequence length of model')
179 |     args = parser.parse_args()
180 | 
181 |     set_seed(42)
182 | 
183 |     # read input prompts
184 |     with open(args.input, 'r') as fp:
185 |         prompts = 
json.load(fp)
186 | 
187 |     print(f'Running inference on {len(prompts)} total prompts.')
188 | 
189 |     # get hf models
190 |     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
191 |     generator = pipeline('text-generation', model=args.model, tokenizer=tokenizer, framework='pt', device=args.device,
192 |         batch_size=args.batch_size)
193 | 
194 |     # filter prompts
195 |     filtered_prompts = []
196 |     bar = alive_it(prompts, title='Removing Long Prompts')
197 |     for prompt in bar:
198 |         tokens = tokenizer(prompt['prompt'])['input_ids']
199 |         if len(tokens) > args.max_sequence_length:
200 |             print('Skipping prompt \'{}\' as its number of tokens exceeds max_sequence_length ({} > {}).'.format(
201 |                 prompt['name'],
202 |                 len(tokens),
203 |                 args.max_sequence_length
204 |             ))
205 |         else:
206 |             filtered_prompts.append( prompt )
207 | 
208 |     prompts = filtered_prompts
209 |     name_by_prompt = { p['prompt']: p['name'] for p in prompts }  # map each prompt's text back to its test name
210 |     # run tests for all temperatures
211 |     for temperature in args.temperatures:
212 | 
213 |         prompt_results = get_predictions(
214 |             [p['prompt'] for p in prompts],
215 |             generator,
216 |             num_samples=args.num_samples,
217 |             top_p=args.top_p,
218 |             top_k=args.top_k,
219 |             temperature=temperature,
220 |             min_len=args.min_len,
221 |             max_len=args.max_len,
222 |             batch_size=args.batch_size,
223 |             tokenizer=tokenizer
224 |         )
225 | 
226 |         with open(args.output, 'a') as fp:
227 |             for r in prompt_results:
228 |                 # extra meta-data
229 |                 r['model'] = args.model
230 |                 r['tokenizer'] = args.tokenizer
231 |                 r['name'] = name_by_prompt[r['prompt']]
232 | 
233 |                 # write out
234 |                 json.dump(r, fp, ensure_ascii=True)
235 |                 fp.write('\n')
236 | 
237 | 
238 | if __name__ == '__main__':
239 |     main()
240 | 
-------------------------------------------------------------------------------- /v1/data/README.md: --------------------------------------------------------------------------------
1 | # Data
2 | 
3 | Scripts for collecting and processing the dataset.
4 | 
5 | ## collect-repo-metadata.py and collect-repo-metadata.bash
6 | The Python script queries GitHub for repositories that match the given query parameters.
7 | It may take a while to run as it has to query each tag/language combination separately and sleep between queries
8 | to avoid rate-limiting from the GitHub API.
9 | Upon running it will request your GitHub personal access token.
10 | The resulting repo info is dumped into a CSV file specified by `--output`.
11 | 
12 | ```
13 | usage: collect-repo-metadata.py [-h] --tags TAGS [TAGS ...] [--languages LANGUAGES [LANGUAGES ...]] [--min-stars MIN_STARS] -o OUTPUT
14 | 
15 | Collects repos that meet tag, language, and star requirements.
16 | 
17 | optional arguments:
18 |   -h, --help            show this help message and exit
19 |   --tags TAGS [TAGS ...]
20 |                         what tags to look for
21 |   --languages LANGUAGES [LANGUAGES ...]
22 |                         what languages to select repos for
23 |   --min-stars MIN_STARS
24 |                         minimum number of stars to consider
25 |   -o OUTPUT, --output OUTPUT
26 |                         where to output results
27 | ```
28 | 
29 | The bash script is just a wrapper around the Python script that makes it easy to keep
30 | the configuration consistent.
31 | 
32 | ## edit-metadata.py
33 | The helper script `edit-metadata.py` is there to help add repos to the metadata CSV.
34 | You can manually append repos as shown below.
35 | 36 | ```bash 37 | python edit-metadata.py -d <dataset.csv> -a <org>/<repo> [<org>/<repo> ...] 38 | 39 | # for example 40 | python edit-metadata.py -d repos-gt3.csv -a ceed/mfem ceed/laghos lanl/elements 41 | ``` 42 | 43 | ## clone-repos.py 44 | This script will take a set of repos specified in the csv output by `collect-repo-metadata.py` and 45 | clone them into some directory. 46 | This also takes a while to run. 47 | It will likely take up several GB of space and create a large number of inodes, so 48 | set the root of the output to a filesystem that can handle this. 49 | 50 | ``` 51 | usage: clone-repos.py [-h] -d DATASET --root ROOT 52 | 53 | Clone the repositories in a dataframe into a directory root. 54 | 55 | optional arguments: 56 | -h, --help show this help message and exit 57 | -d DATASET, --dataset DATASET 58 | Where to find dataset of repos 59 | --root ROOT Root directory to clone into 60 | ``` 61 | 62 | ## collect-dataset.py 63 | Collects the contents of all the source files and outputs them as a json lines file. 64 | 65 | ``` 66 | usage: collect-dataset.py [-h] --root ROOT -o OUTPUT 67 | 68 | Create compact dataset representation 69 | 70 | optional arguments: 71 | -h, --help show this help message and exit 72 | --root ROOT root to start searching for source files 73 | -o OUTPUT, --output OUTPUT 74 | output path 75 | ``` 76 | 77 | ## repo-plots.py and generate-all-repo-plots.bash 78 | Create info plots about the source files (e.g., file extension counts, LOC, etc.). 79 | The bash script is a helper to create all combinations of the plots. 80 | -------------------------------------------------------------------------------- /v1/data/clone-repos.py: -------------------------------------------------------------------------------- 1 | ''' Clone a list of repos into a specified location. 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from os.path import join as path_join 8 | from pathlib import Path 9 | from typing import Optional 10 | from os import PathLike 11 | 12 | # tpl imports 13 | import pandas as pd 14 | from alive_progress import alive_it 15 | from git import Repo 16 | 17 | 18 | def clone(url: str, root: PathLike, dirname: Optional[PathLike] = None): 19 | ''' Clone the specified git url. 
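If the destination directory already exists, the repository is assumed to have been cloned already and is skipped.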
20 | 21 | Args: 22 | url: github url to clone 23 | root: where to store cloned repos 24 | dirname: subdirectory of root to clone into 25 | ''' 26 | if dirname is None: 27 | raise NotImplementedError('\'clone\' expects a dirname.') 28 | 29 | dest_path = Path(path_join(root, dirname)) 30 | if dest_path.is_dir(): 31 | return 32 | 33 | # make the directory 34 | dest_path.mkdir(parents=True, exist_ok=False) 35 | 36 | # clone 37 | Repo.clone_from(url, dest_path) 38 | 39 | 40 | def main(): 41 | parser = ArgumentParser(description='Clone the repositories in a dataframe into a directory root.') 42 | parser.add_argument('-d', '--dataset', required=True, type=str, help='Where to find dataset of repos') 43 | parser.add_argument('--root', type=str, required=True, help='Root directory to clone into') 44 | args = parser.parse_args() 45 | 46 | df = pd.read_csv(args.dataset) 47 | 48 | pairs = list(zip(df['full_name'], df['clone_url'])) 49 | bar = alive_it(pairs, title='Cloning Repos') 50 | for full_name, clone_url in bar: 51 | clone(clone_url, args.root, dirname=full_name) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /v1/data/collect-dataset.py: -------------------------------------------------------------------------------- 1 | ''' Create a jsonl text dataset for the list of source files. 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from __future__ import annotations 7 | from argparse import ArgumentParser 8 | import json 9 | 10 | # local imports 11 | from dataset_utils import get_source_filenames, filter_bad_encoding, filter_by_size, filter_duplicates, \ 12 | print_source_file_stats 13 | 14 | 15 | def get_args(): 16 | parser = ArgumentParser(description='Create compact dataset representation') 17 | parser.add_argument('--root', type=str, required=True, help='root to start searching for source files') 18 | parser.add_argument('-o', '--output', type=str, required=True, help='output path') 19 | return parser.parse_args() 20 | 21 | 22 | def main(): 23 | args = get_args() 24 | 25 | # retrieve dataset 26 | fnames = get_source_filenames(args.root) 27 | fnames = filter_bad_encoding(fnames) 28 | fnames = filter_by_size(fnames, min_tokens=15) 29 | fnames = filter_duplicates(fnames) 30 | print_source_file_stats(fnames) 31 | 32 | # write out json lines file 33 | with open(args.output, 'w') as fp: 34 | for fname in fnames: 35 | result = {'filename': fname} 36 | with open(fname, 'r') as tmp_fp: 37 | result['text'] = tmp_fp.read() 38 | 39 | json.dump(result, fp, ensure_ascii=False) 40 | fp.write('\n') 41 | 42 | 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /v1/data/collect-repo-metadata.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Helper script for calling collect-repo-metadata.py 3 | # author: Daniel Nichols 4 | # date: October 2022 5 | 6 | MIN_STARS=3 7 | TAGS="hpc mpi openmp proxy-application miniapp mini-app parallel-computing scientific-computing high-performance-computing computational-science" 8 | LANGUAGES="c c++" 9 | 10 | python3 collect-repo-metadata.py --min-stars $MIN_STARS --tags $TAGS \ 11 | --languages $LANGUAGES --output repos.csv 12 | -------------------------------------------------------------------------------- /v1/data/collect-repo-metadata.py: -------------------------------------------------------------------------------- 1 
| ''' Script to collect github repository metadata. 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from getpass import getpass 8 | import requests 9 | from csv import QUOTE_NONNUMERIC 10 | from itertools import product 11 | from time import sleep 12 | from typing import Iterable 13 | 14 | # tpl imports 15 | import pandas as pd 16 | from alive_progress import alive_it 17 | 18 | 19 | def query(q: str, access_token: str) -> list: 20 | ''' Query github API. 21 | 22 | Args: 23 | q: query string to give to github 24 | access_token: GitHub API personal access token 25 | 26 | Returns: 27 | A list of results describing the repos' metadata 28 | ''' 29 | BASE_URL = 'https://api.github.com' 30 | DESIRED_ENTRIES = ['name', 'full_name', 'clone_url', 'html_url', 'created_at', 31 | 'updated_at', 'language', 'size', 'stargazers_count', 'watchers_count', 32 | 'forks_count', 'topics', 'visibility', 'forks', 'open_issues', 'watchers'] 33 | search_url = '{}/search/repositories?q={}'.format(BASE_URL, q) 34 | 35 | fetched = [] 36 | for page in range(1, 35): 37 | search_url = '{}/search/repositories?q={}&page={}'.format(BASE_URL, q, page) 38 | 39 | try: 40 | header = {'Authorization': 'token {}'.format(access_token)} 41 | response = requests.get(search_url, headers=header).json() 42 | except: 43 | print('Error fetching from github') 44 | 45 | # first try to wait out rate-limiting 46 | if 'items' not in response: 47 | sleep(30) 48 | response = requests.get(search_url, headers=header).json() 49 | 50 | if 'items' not in response: 51 | print(response) 52 | break 53 | elif len(response['items']) == 0: 54 | break 55 | 56 | for item in response['items']: 57 | vals = { k: item[k] for k in DESIRED_ENTRIES } 58 | fetched.append(vals) 59 | 60 | return fetched 61 | 62 | 63 | 64 | def collect(tags: Iterable[str], languages: Iterable[str], min_stars: int): 65 | ''' Collect all repo meta-data that match description. Will prompt user on 66 | command line to get GitHub API access token. 67 | 68 | Args: 69 | tags: GitHub tags to include in search query 70 | languages: languages to filter in query 71 | min_stars: only include repos with this many stars or greater 72 | 73 | Returns: 74 | A pandas DataFrame containing all the repo metadata 75 | ''' 76 | api_token = getpass('GitHub API Token: ') 77 | 78 | results = [] 79 | 80 | bar = alive_it(list(product(tags, languages))) 81 | for tag, language in bar: 82 | query_str = 'topic:{} language:{} stars:>={}'.format(tag, language, min_stars) 83 | query_results = query(query_str, api_token) 84 | results.extend(query_results) 85 | 86 | # prevent rate-limiting by GitHub API 87 | #sleep(10) 88 | 89 | return pd.DataFrame(results).drop_duplicates(subset='full_name', ignore_index=True) 90 | 91 | 92 | def main(): 93 | parser = ArgumentParser(description='Collects repos that meet tag, language, and star requirements.') 94 | parser.add_argument('--tags', type=str, nargs='+', required=True, help='what tags to look for') 95 | parser.add_argument('--languages', type=str, nargs='+', default=['c', 'c++'], help='what languages to select repos for') 96 | parser.add_argument('--min-stars', type=int, default=5, help='minimum number of stars to consider') 97 | parser.add_argument('-o', '--output', type=str, required=True, help='where to output results') 98 | args = parser.parse_args() 99 | 100 | df = collect(args.tags, args.languages, args.min_stars) 101 | 102 | print('Collected {} repos. 
Writing to {}.'.format(df.shape[0], args.output)) 103 | df.to_csv(args.output, index=False, quoting=QUOTE_NONNUMERIC) 104 | 105 | 106 | if __name__ == '__main__': 107 | main() 108 | -------------------------------------------------------------------------------- /v1/data/copy-repos.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copy repos recursively from one directory to another 3 | # author: Daniel Nichols 4 | # date: October 2022 5 | 6 | if [ $# -ne 2 ]; then 7 | printf "usage: %s <src-root> <dst-root>\n" "$0" 8 | exit 1 9 | fi 10 | 11 | SRCROOT="$1" 12 | DSTROOT="$2" 13 | 14 | echo "cp -r $SRCROOT $DSTROOT" 15 | cp -r "$SRCROOT" "$DSTROOT" 16 | -------------------------------------------------------------------------------- /v1/data/create-omp-dataset.py: -------------------------------------------------------------------------------- 1 | ''' Create a dataset for the downstream task of auto-completing openmp pragmas. 2 | author: Daniel Nichols 3 | date: November 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | import json 8 | from os import PathLike 9 | import re 10 | from typing import Iterable, Optional 11 | 12 | # tpl imports 13 | from alive_progress import alive_bar 14 | 15 | 16 | PATTERN = r'\#pragma omp parallel for.*' 17 | REG = re.compile(PATTERN, flags=re.MULTILINE) 18 | 19 | 20 | def strip_comments(text: str) -> str: 21 | ''' Removes C/C++ style comments from the string. 22 | Code taken from https://stackoverflow.com/a/241506/3769237 23 | 24 | Args: 25 | text: input string 26 | 27 | Returns: 28 | The string with C/C++ style comments removed i.e. // /* */ 29 | ''' 30 | def replacer(match): 31 | s = match.group(0) 32 | if s.startswith('/'): 33 | return " " # note: a space and not an empty string 34 | else: 35 | return s 36 | pattern = re.compile( 37 | r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', 38 | re.DOTALL | re.MULTILINE 39 | ) 40 | return re.sub(pattern, replacer, text) 41 | 42 | 43 | def get_omp_samples( 44 | data_sample : dict, 45 | pre_loop_token : Optional[str] = '', 46 | post_loop_token : Optional[str] = '', 47 | pre_pragma_token : Optional[str] = '', 48 | post_pragma_token : Optional[str] = '', 49 | lines_before : Optional[int] = None, 50 | chars_before : Optional[int] = None 51 | ) -> Iterable[dict]: 52 | ''' Build a new dataset where each sample is an omp parallel for. Each sample will be a for-loop preceded by 53 | `lines_before` or `chars_before` and followed by the openmp pragma. 54 | 55 | Arguments: 56 | data_sample: a sample object from the dataset. should contain a 'text' column. 57 | pre_loop_token: token to prepend before loop code. 58 | post_loop_token: token to append after loop code. pre_pragma_token: token to prepend before the omp pragma. post_pragma_token: token to append after the omp pragma. 59 | lines_before: how many lines of context to include before loop. 60 | chars_before: how many chars of context to include before loop. 61 | 62 | Returns: 63 | A list of samples with openmp formatted code. 64 | ''' 65 | assert 'text' in data_sample, 'data_sample must contain the column \'text\'.' 66 | assert not ((lines_before is not None) and (chars_before is not None)), 'Only one of lines_before and' + \ 67 | ' chars_before can be defined.'
68 | 69 | text = data_sample['text'] 70 | 71 | new_samples = [] 72 | for match in REG.finditer(text): 73 | omp_text = strip_comments( match.group() ) 74 | if '{' in omp_text or '}' in omp_text: 75 | continue 76 | 77 | search_start = match.span()[-1] 78 | cur_idx = text.find('{', search_start) + 1 79 | 80 | bracket_stack = 1 81 | failed = False 82 | while bracket_stack != 0: 83 | if text[cur_idx] == '{': 84 | bracket_stack += 1 85 | elif text[cur_idx] == '}': 86 | bracket_stack -= 1 87 | cur_idx += 1 88 | 89 | if cur_idx >= len(text): 90 | failed = True 91 | break 92 | 93 | if failed: 94 | # currently cannot handle single statement for loops 95 | # todo: fix this 96 | continue 97 | 98 | loop_text = text[search_start : cur_idx].replace('#endif', '').lstrip() 99 | loop_text = pre_loop_token + loop_text + post_loop_token 100 | 101 | context = '' 102 | if chars_before is not None: 103 | pragma_start_idx = match.span()[0] 104 | offset_idx = max(pragma_start_idx-chars_before, 0) 105 | context = text[offset_idx : pragma_start_idx] 106 | elif lines_before is not None: 107 | raise NotImplementedError('Context by lines not yet supported.') 108 | 109 | new_sample = { k: v for k, v in data_sample.items() if k != 'text' } 110 | new_sample['omp_pragma_line'] = omp_text 111 | new_sample['context_chars'] = chars_before 112 | new_sample['text'] = context + loop_text + ' ' + pre_pragma_token + omp_text + post_pragma_token 113 | new_samples.append( new_sample ) 114 | 115 | return new_samples 116 | 117 | 118 | def count_lines(fpath : PathLike) -> int: 119 | ''' Count the number of lines in a file. 120 | 121 | Args: 122 | fpath: path to input file 123 | 124 | Returns: 125 | number of lines in fpath 126 | ''' 127 | lines = 0 128 | with open(fpath, 'r') as fp: 129 | for _ in fp: 130 | lines += 1 131 | return lines 132 | 133 | 134 | def main(): 135 | parser = ArgumentParser(description='Script that creates the openmp auto-complete dataset.') 136 | parser.add_argument('-i', '--input', type=str, required=True, help='path to jsonl dataset') 137 | parser.add_argument('-o', '--output', type=str, required=True, help='path to output dataset from script') 138 | parser.add_argument('--pre-loop-token', type=str, default='', help='Token to add before loop.') 139 | parser.add_argument('--post-loop-token', type=str, default='', help='Token to add after loop.') 140 | parser.add_argument('--pre-pragma-token', type=str, default='', help='Token to add before omp pragma.') 141 | parser.add_argument('--post-pragma-token', type=str, default='', help='Token to add after omp pragma.') 142 | context_group = parser.add_mutually_exclusive_group(required=True) 143 | context_group.add_argument('--num-chars-context', type=int, help='how many chars before loop to include') 144 | context_group.add_argument('--num-lines-context', type=int, help='how many lines before loop to include') 145 | args = parser.parse_args() 146 | 147 | # process online online, since dataset may be large 148 | count = 0 149 | with open(args.input, 'r') as fp_in, open(args.output, 'w') as fp_out, \ 150 | alive_bar(count_lines(args.input), title='Finding OpenMP For Loops') as bar: 151 | for line in fp_in: 152 | sample = json.loads(line) 153 | new_samples = get_omp_samples(sample, 154 | pre_loop_token=args.pre_loop_token, post_loop_token=args.post_loop_token, 155 | pre_pragma_token=args.pre_pragma_token, post_pragma_token=args.post_pragma_token, 156 | lines_before=args.num_lines_context, chars_before=args.num_chars_context) 157 | 158 | count += len(new_samples) 159 | for 
sample in new_samples: 160 | json.dump(sample, fp_out, ensure_ascii=False) 161 | fp_out.write('\n') 162 | 163 | bar() 164 | 165 | 166 | print(f'Found {count} total omp decorated for loops.') 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /v1/data/dataset_utils.py: -------------------------------------------------------------------------------- 1 | ''' Tools for loading the dataset 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from __future__ import annotations 7 | from typing import Iterable 8 | from os import PathLike 9 | from os.path import splitext 10 | import hashlib 11 | 12 | # tpl imports 13 | from alive_progress import alive_it 14 | 15 | 16 | # C/C++ related extensions to include in dataset 17 | C_CPP_EXTENSIONS = ['C', 'cc', 'cxx', 'cpp', 'c', 'h', 'hh', 'hpp', 'H', 'hxx', 'Hxx', 'HXX'] 18 | 19 | 20 | def get_source_filenames(root: PathLike, extensions: Iterable[str] = C_CPP_EXTENSIONS, show_progress: bool = True 21 | ) -> list[PathLike]: 22 | ''' return a list of all the filenames of source files with the given extensions in root. 23 | 24 | Args: 25 | root: where to start searching for files. Is searched recursively. 26 | extensions: what extensions define the source files. C/C++ extensions by default. 27 | show_progress: If true, then display a progress bar. 28 | 29 | Returns: 30 | A list of paths to all the source files. 31 | ''' 32 | from os.path import join as path_join, isdir, exists 33 | from os import walk 34 | 35 | get_extension = lambda x: splitext(x)[-1][1:] 36 | 37 | def is_valid_source_file(fname: PathLike) -> bool: 38 | return (get_extension(fname) in extensions) and (not isdir(fname)) and (exists(fname)) and \ 39 | (all(c not in fname for c in ['[', ']'])) 40 | 41 | # I've found os.walk to be ~2-3x faster at this task than glob.glob 42 | all_files = [] 43 | vals = alive_it(walk(root), title='Searching for source files'.rjust(26)) if show_progress else walk(root) 44 | for rt, _, files in vals: 45 | all_files.extend( [path_join(rt, f) for f in files if is_valid_source_file(path_join(rt, f))] ) 46 | 47 | return all_files 48 | 49 | 50 | def filter_bad_encoding(fnames: Iterable[PathLike], show_progress: bool = True) -> list[PathLike]: 51 | ''' Remove files with non utf-8 encodings. 52 | 53 | Args: 54 | fnames: a list of filenames to filter. 55 | show_progress: If true, then display a progress bar. 56 | 57 | Returns: 58 | A copy of fnames with files that contained non-utf-8 characters filtered out. 59 | ''' 60 | results = [] 61 | vals = alive_it(fnames, title='Removing non-utf-8'.rjust(26)) if show_progress else fnames 62 | for f in vals: 63 | try: 64 | for _ in open(f, 'r', encoding='utf-8'): 65 | pass 66 | results.append(f) 67 | except UnicodeDecodeError: 68 | pass 69 | return results 70 | 71 | 72 | def filter_by_size(fnames: Iterable[PathLike], min_mb: int = 0, max_mb: int = 1, min_tokens: int = 50, 73 | show_progress: bool = True 74 | ) -> list[PathLike]: 75 | ''' Remove files based on size of file and number of tokens. 
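Files outside the [min_mb, max_mb] size range, or with fewer than min_tokens whitespace-separated tokens, are dropped.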
76 | Args: 77 | fnames: List of filenames to filter 78 | min_mb: minimum number of MB to allow 79 | max_mb: maximum number of MB to allow 80 | min_tokens: exclude files with less tokens (split by whitespace) 81 | ''' 82 | from os.path import getsize 83 | result = [] 84 | vals = alive_it(fnames, title='Filtering by size'.rjust(26)) if show_progress else fnames 85 | 86 | for fname in vals: 87 | mb = getsize(fname) / (1024 ** 2) 88 | if mb < min_mb or mb > max_mb: 89 | continue 90 | 91 | num_tokens = 0 92 | with open(fname, 'r') as fp: 93 | for line in fp: 94 | num_tokens += len( line.split() ) 95 | if num_tokens >= min_tokens: 96 | break 97 | 98 | if num_tokens < min_tokens: 99 | continue 100 | 101 | result.append( fname ) 102 | 103 | return result 104 | 105 | 106 | def _file_hash(fname: PathLike) -> str: 107 | ''' Compute hash of contents of fname. Method body from https://stackoverflow.com/a/44873382/3769237. 108 | 109 | Args: 110 | fname: path to file 111 | 112 | Returns: 113 | sha256 hash of binary contents of fname 114 | ''' 115 | h = hashlib.sha256() 116 | b = bytearray(128*1024) 117 | mv = memoryview(b) 118 | with open(fname, 'rb', buffering=0) as f: 119 | for n in iter(lambda : f.readinto(mv), 0): 120 | h.update(mv[:n]) 121 | return h.hexdigest() 122 | 123 | 124 | def filter_duplicates(fnames: Iterable[PathLike], show_progress: bool = True) -> list[PathLike]: 125 | ''' Perform deduplication. 126 | 127 | Args: 128 | fnames: names of files to deduplicate 129 | show_progress: If True, then display a progress bar. 130 | 131 | Returns: 132 | fnames with the duplicates filtered out 133 | ''' 134 | 135 | hashes = set() 136 | unique_fnames = [] 137 | bar = alive_it(fnames, title='Deduplicating'.rjust(26)) if show_progress else fnames 138 | for fname in bar: 139 | fhash = _file_hash(fname) 140 | if fhash not in hashes: 141 | hashes.add( fhash ) 142 | unique_fnames.append( fname ) 143 | 144 | return unique_fnames 145 | 146 | 147 | def get_loc(flist: Iterable[PathLike], show_progress: bool = True) -> int: 148 | ''' Returns the total number of lines in all the files in flist. 149 | 150 | Args: 151 | flist: a list of filenames to count LOC in. 152 | show_progress: If true, then display a progress bar. 153 | 154 | Returns: 155 | The total LOC summed over all the files. 156 | ''' 157 | #import subprocess 158 | LOC = 0 159 | vals = alive_it(flist, title='Counting LOC'.rjust(26)) if show_progress else flist 160 | for fname in vals: 161 | #LOC += int(subprocess.check_output(['wc', '-l', fname]).split()[0]) 162 | LOC += sum(1 for _ in open(fname, 'r', errors='ignore')) 163 | return LOC 164 | 165 | 166 | def get_loc_per_extension(flist: Iterable[PathLike], show_progress: bool = True) -> int: 167 | ''' Returns the total number of lines in all the files in flist per extension. 168 | 169 | Args: 170 | flist: a list of filenames to count LOC in. 171 | show_progress: If true, then display a progress bar. 172 | 173 | Returns: 174 | The total LOC summed over all the files stored in buckets in a dict. 
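Keys are file extensions and values are the summed line counts.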
175 | ''' 176 | get_extension = lambda x: splitext(x)[-1] 177 | 178 | LOC = {} 179 | vals = alive_it(flist, title='Counting LOC'.rjust(26)) if show_progress else flist 180 | for fname in vals: 181 | ext = get_extension(fname) 182 | if ext not in LOC: 183 | LOC[ext] = 0 184 | 185 | LOC[ext] += sum(1 for _ in open(fname, 'r', errors='ignore')) 186 | 187 | return LOC 188 | 189 | 190 | def get_source_file_size(flist: Iterable[PathLike], show_progress: bool = True) -> int: 191 | ''' Return the data set size based on a list of fnames in bytes. 192 | 193 | Args: 194 | flist: a list of filenames to sum the size over. 195 | show_progress: If true, then display a progress bar. 196 | 197 | Returns: 198 | The total number of bytes that flist files takes up. 199 | ''' 200 | from os.path import getsize 201 | 202 | num_bytes = 0 203 | vals = alive_it(flist, title='Calculating dataset size'.rjust(26)) if show_progress else flist 204 | for fname in vals: 205 | num_bytes += getsize(fname) 206 | return num_bytes 207 | 208 | 209 | def print_source_file_stats(fnames: Iterable[PathLike]): 210 | ''' Print meta-data about source files such as # files, LOC, and memory size. 211 | 212 | Args: 213 | fnames: File names to compute statistics over 214 | ''' 215 | loc = get_loc(fnames) 216 | size = get_source_file_size(fnames) 217 | 218 | print('# source files: {:,}'.format(len(fnames))) 219 | print('LOC: {:,}'.format(loc)) 220 | print('Dataset size: {:.3g} GB'.format(size / (1<<30))) 221 | -------------------------------------------------------------------------------- /v1/data/edit-metadata.py: -------------------------------------------------------------------------------- 1 | ''' Make manual adjustments to repo data set. 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from csv import QUOTE_NONNUMERIC 8 | from getpass import getpass 9 | import requests 10 | from typing import Iterable 11 | 12 | # tpl imports 13 | import pandas as pd 14 | 15 | 16 | def get_repo_info(org: str, name: str, access_key: str) -> dict: 17 | ''' Query GitHub API for metadata about a repo. 18 | 19 | Args: 20 | org: organization name 21 | name: repo name within organization 22 | access_key: GitHub API personal access key 23 | 24 | Returns: 25 | A dict containing the metadata from the corresponding API request. 26 | ''' 27 | BASE_URL = 'https://api.github.com/repos' 28 | DESIRED_ENTRIES = ['name', 'full_name', 'clone_url', 'html_url', 'created_at', 29 | 'updated_at', 'language', 'size', 'stargazers_count', 'watchers_count', 30 | 'forks_count', 'topics', 'visibility', 'forks', 'open_issues', 'watchers'] 31 | query_url = '{}/{}/{}'.format(BASE_URL, org, name) 32 | 33 | try: 34 | header = {'Authorization': 'token {}'.format(access_key)} 35 | response = requests.get(query_url, headers=header).json() 36 | except: 37 | print('Error fetching GitHub info!') 38 | return None 39 | 40 | result = { k: response[k] for k in DESIRED_ENTRIES } 41 | return result 42 | 43 | 44 | def append(ds: pd.DataFrame, repos: Iterable[str]) -> pd.DataFrame: 45 | ''' Fetches metadata for the requested repos and appends them to the data set. 
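Prompts for a GitHub API token and skips entries that are malformed or already present in the dataset.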
46 | 47 | Args: 48 | ds: dataset to append to 49 | repos: repositories to get information about 50 | 51 | Returns: 52 | A new DataFrame with the appended repo information 53 | ''' 54 | api_token = getpass('GitHub API Token: ') 55 | 56 | new_data = [] 57 | for repo in repos: 58 | parts = repo.split('/') 59 | if '/' not in repo or len(parts) != 2: 60 | print('Invalid repo description \'{}\'. Expected <org>/<name>.'.format(repo)) 61 | continue 62 | 63 | if repo.lower() in ds['full_name'].str.lower().values: 64 | print('Dataset already contains \'{}\'.'.format(repo)) 65 | continue 66 | 67 | print('Fetching info for \'{}\' repository...'.format(repo)) 68 | org, name = parts[0], parts[1] 69 | 70 | metadata = get_repo_info(org, name, api_token) 71 | new_data.append(metadata) 72 | 73 | new_rows = pd.DataFrame(new_data) 74 | return pd.concat([ds, new_rows], ignore_index=True) 75 | 76 | 77 | def main(): 78 | parser = ArgumentParser(description='helper script for editing repo metadata dataset') 79 | parser.add_argument('-d', '--dataset', type=str, required=True, help='repo csv file') 80 | parser.add_argument('-a', '--append', type=str, nargs='+', help='append repos to dataset. provide <org>/<name>') 81 | args = parser.parse_args() 82 | 83 | df = pd.read_csv(args.dataset) 84 | 85 | if args.append: 86 | nrows = df.shape[0] 87 | df = append(df, args.append) 88 | if df.shape[0] - nrows > 0: 89 | print('Writing dataset to \'{}\' with {} new row(s).'.format(args.dataset, df.shape[0] - nrows)) 90 | df.to_csv(args.dataset, index=False, quoting=QUOTE_NONNUMERIC) 91 | 92 | 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /v1/data/generate-all-repo-plots.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Wrapper script for repo-plots.py. This will generate all of the plots describing the repos in the dataset. 3 | # author: Daniel Nichols 4 | # date: October 2022 5 | 6 | FIGS=figs 7 | if [ $# -gt 1 ]; then 8 | DATASET_PATH="$2" 9 | else 10 | DATASET_PATH="repos-gt3.csv" 11 | fi 12 | 13 | mkdir -p $FIGS 14 | 15 | python repo-plots.py -d $DATASET_PATH \ 16 | --languages $FIGS/languages-hist.png \ 17 | --sizes $FIGS/sizes-hist.png \ 18 | --stars $FIGS/stars-hist.png \ 19 | --watchers $FIGS/watchers-hist.png \ 20 | --forks $FIGS/forks-hist.png \ 21 | --tags $FIGS/tags-wordcloud.png \ 22 | --extensions $FIGS/extensions-hist.png \ 23 | --loc $FIGS/loc-hist.png 24 | -------------------------------------------------------------------------------- /v1/data/repo-plots.py: -------------------------------------------------------------------------------- 1 | ''' Some summary plots of the repository data. 2 | author: Daniel Nichols 3 | date: October 2022 4 | ''' 5 | # std imports 6 | from argparse import ArgumentParser 7 | from os import PathLike 8 | from typing import Tuple, Optional, Union 9 | 10 | # tpl imports 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import seaborn as sns 14 | 15 | 16 | def extensions_histogram(ds: pd.DataFrame, fname: PathLike): 17 | ''' Plot a histogram of the extensions over the repos. 
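Note: extensions are counted by re-scanning the cloned source files under a hard-coded ROOT path rather than from `ds`.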
18 | 19 | Args: 20 | ds: repo metadata dataset 21 | ''' 22 | from os.path import splitext 23 | from dataset_utils import get_source_filenames, filter_bad_encoding, filter_duplicates, filter_by_size 24 | 25 | ROOT = '/afs/shell.umd.edu/project/bhatele-lab/user/dnicho/code-ml/data/repos' 26 | fnames = get_source_filenames(ROOT) 27 | fnames = filter_duplicates( filter_by_size( filter_bad_encoding(fnames), min_tokens=15 ) ) 28 | 29 | exclude_ext = ['.cxx', '.hh', '.H', '.hxx'] 30 | 31 | get_extension = lambda x: splitext(x)[-1] 32 | extensions = list( filter(lambda x: x not in exclude_ext, map(get_extension, fnames)) ) 33 | 34 | plt.clf() 35 | sns.set() 36 | hist_fig = sns.histplot(x=extensions) 37 | hist_fig.set_title('File Type Distribution') 38 | hist_fig.set_xlabel('File Extension') 39 | fig = hist_fig.get_figure() 40 | fig.savefig(fname, bbox_inches='tight') 41 | 42 | 43 | def loc_histogram(ds: pd.DataFrame, fname: PathLike): 44 | ''' Plot a histogram of the extensions over the repos. 45 | 46 | Args: 47 | ds: repo metadata dataset 48 | ''' 49 | from dataset_utils import get_source_filenames, get_loc_per_extension, filter_bad_encoding, filter_duplicates, \ 50 | filter_by_size 51 | 52 | ROOT = '/afs/shell.umd.edu/project/bhatele-lab/user/dnicho/code-ml/data/repos' 53 | fnames = filter_duplicates( filter_by_size( filter_bad_encoding( get_source_filenames(ROOT) ), min_tokens=15 ) ) 54 | 55 | loc = get_loc_per_extension(fnames) 56 | 57 | exclude_ext = ['.cxx', '.hh', '.H', '.hxx'] 58 | for ext in exclude_ext: 59 | if ext in loc: 60 | del loc[ext] 61 | 62 | plt.clf() 63 | sns.set() 64 | hist_fig = sns.barplot(x=list(loc.keys()), y=list(loc.values()), color='b') 65 | hist_fig.set_title('Distribution of LOC by File Type') 66 | hist_fig.set_xlabel('File Extension') 67 | hist_fig.set_ylabel('LOC') 68 | fig = hist_fig.get_figure() 69 | fig.savefig(fname, bbox_inches='tight') 70 | 71 | 72 | def plot_histogram(data: pd.DataFrame, column: str, fname: PathLike, nbins: Union[int,str] = 'auto', 73 | title: Optional[str] = None, xlabel: Optional[str] = None, ylabel: Optional[str] = None, 74 | log_scale: Union[int, bool, Tuple[Union[int,bool], Union[int,bool]]] = False): 75 | ''' Plot a histogram of 'column' in data. Writes out the histogram to 'fname'. 76 | 77 | Args: 78 | data: DataFrame to read data from 79 | column: what column of 'data' to use for histogram 80 | fname: where to write output file 81 | nbins: number of histogram bins 82 | title: title of plot 83 | xlabel: xlabel of plot 84 | ylabel: ylabel of plot 85 | log_scale: how to log scale the axes. Either False, an integer, or tuple of integers (or mix bool/int). 86 | ''' 87 | plt.clf() 88 | sns.set() 89 | hist_fig = sns.histplot(data=data, x=column, bins=nbins, log_scale=log_scale) 90 | if title: 91 | hist_fig.set_title(title) 92 | if xlabel: 93 | hist_fig.set_xlabel(xlabel) 94 | if ylabel: 95 | hist_fig.set_ylabel(ylabel) 96 | hist_fig.get_figure().savefig(fname) 97 | 98 | 99 | def tags_wordcloud(ds: pd.DataFrame, fname: PathLike): 100 | ''' Plot a wordcloud of the tags of the repos. 
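Tags are parsed from the stringified `topics` lists in the metadata CSV.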
101 | 102 | Args: 103 | ds: repo metadata dataset 104 | ''' 105 | from wordcloud import WordCloud 106 | def to_list(x): 107 | return [str(s.strip('\'')) for s in x.strip('][').split(', ') if s != ''] 108 | 109 | plt.clf() 110 | series_as_list = ds['topics'].values.tolist() 111 | series_as_list = [to_list(x) for x in series_as_list] 112 | all_tags = [item for sublist in series_as_list for item in sublist] 113 | 114 | wc = WordCloud(background_color='white', width=800, height=400).generate(' '.join(all_tags)) 115 | plt.figure(figsize=(20,10)) 116 | plt.imshow(wc, interpolation='bilinear') 117 | plt.axis('off') 118 | plt.savefig(fname, bbox_inches='tight') 119 | 120 | 121 | def main(): 122 | parser = ArgumentParser(description='script to plot dataset info for repo metadata dataset') 123 | parser.add_argument('-d', '--dataset', type=str, required=True, help='dataset to use data from') 124 | parser.add_argument('--languages', type=str, help='create a histogram of languages') 125 | parser.add_argument('--extensions', type=str, help='create a histogram of language extensions') 126 | parser.add_argument('--loc', type=str, help='create a histogram of loc per extension') 127 | parser.add_argument('--sizes', type=str, help='create a histogram of sizes') 128 | parser.add_argument('--stars', type=str, help='create a histogram of stars') 129 | parser.add_argument('--watchers', type=str, help='create a histogram of watchers') 130 | parser.add_argument('--forks', type=str, help='create a histogram of forks') 131 | parser.add_argument('--tags', type=str, help='create a wordcloud of tags') 132 | args = parser.parse_args() 133 | 134 | ds = pd.read_csv(args.dataset) 135 | 136 | sns.set_theme() 137 | if args.languages: 138 | plot_histogram(ds, 'language', args.languages, title='Repository Main Language Distribution') 139 | if args.sizes: 140 | plot_histogram(ds, 'size', args.sizes, title='Repository Size Distribution', xlabel='Repo Size (KB)', 141 | nbins=15, log_scale=(2,False)) 142 | if args.stars: 143 | plot_histogram(ds, 'stargazers_count', args.stars, title='Repository Stars Distribution', xlabel='# Stars', 144 | nbins=15, log_scale=False) 145 | if args.watchers: 146 | plot_histogram(ds, 'watchers_count', args.watchers, title='Repository Watchers Distribution', 147 | xlabel='# Watchers', nbins=15, log_scale=False) 148 | if args.forks: 149 | plot_histogram(ds, 'forks_count', args.forks, title='Repository Forks Distribution', xlabel='# Forks', nbins=15) 150 | if args.tags: 151 | tags_wordcloud(ds, args.tags) 152 | if args.extensions: 153 | extensions_histogram(ds, args.extensions) 154 | if args.loc: 155 | loc_histogram(ds, args.loc) 156 | 157 | 158 | 159 | if __name__ == '__main__': 160 | main() 161 | --------------------------------------------------------------------------------