├── assets
├── architecture.pdf
└── architecture.png
├── inference
├── instruction_filter.sh
├── instruction_filter.py
├── utils.py
├── train.py
└── test.json
├── train
├── run.sh
├── utils.py
├── train.py
└── json4train
│ └── example.json
├── diverse-data-selection
├── sampling_def.py
├── run.py
└── kcenter_greedy.py
├── necessity-evaluation
├── merge.py
├── necessity-instruction-extraction.py
└── necessity-evaluation.py
├── quality-evaluation
├── quality-evaluation.py
└── high-quality-data-extraction.py
└── README.md
/assets/architecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CASIA-LM/MoDS/HEAD/assets/architecture.pdf
--------------------------------------------------------------------------------
/assets/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CASIA-LM/MoDS/HEAD/assets/architecture.png
--------------------------------------------------------------------------------
/inference/instruction_filter.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=5 \
2 | python instruction_filter.py --model_name_or_path ../output/new-model --instruct_data ./test.json --instruct_filtered ./result.json
3 |
--------------------------------------------------------------------------------
/train/run.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=1,2,3,4 \
2 | torchrun --nproc_per_node=4 --master_port=4568 train.py \
3 | --model_name_or_path ../models/llama2-7b-hf/ \
4 | --data_path ./json4train/example.json \
5 | --bf16 True \
6 | --output_dir ../output/new-model/ \
7 | --num_train_epochs 3 \
8 | --per_device_train_batch_size 8 \
9 | --per_device_eval_batch_size 8 \
10 | --gradient_accumulation_steps 16 \
11 | --evaluation_strategy "no" \
12 | --save_strategy "steps" \
13 | --save_steps 2000 \
14 | --save_total_limit 1 \
15 | --learning_rate 2e-5 \
16 | --weight_decay 0. \
17 | --warmup_ratio 0.03 \
18 | --lr_scheduler_type "cosine" \
19 | --logging_steps 1 \
20 | --fsdp "full_shard auto_wrap offload" \
21 | --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
22 | --tf32 True \
23 | --gradient_checkpointing True
24 |
--------------------------------------------------------------------------------
/diverse-data-selection/sampling_def.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Abstract class for sampling methods.
16 |
17 | Provides interface to sampling methods that allow same signature
18 | for select_batch. Each subclass implements select_batch_ with the desired
19 | signature for readability.
20 | """
21 |
22 | from __future__ import absolute_import
23 | from __future__ import division
24 | from __future__ import print_function
25 |
26 | import abc
27 | import numpy as np
28 |
class SamplingMethod(object):
    """Base class giving every sampler the same `select_batch` entry point.

    Subclasses store their data in `self.X` and implement `select_batch_`
    with whatever keyword signature they need; callers go through
    `select_batch`, which forwards its keyword arguments unchanged.
    """

    # Python 2 style metaclass declaration, kept for compatibility with the
    # original module. Under Python 3 this attribute has no effect, so the
    # abstract methods below are not enforced at instantiation time.
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, X, **kwargs):
        """Store the data matrix `X`; extra keyword arguments are ignored."""
        self.X = X

    def flatten_X(self):
        """Return `self.X` reshaped to 2-D as (n_samples, everything_else).

        Arrays with at most two dimensions are returned unchanged.
        """
        shape = self.X.shape
        flat_X = self.X
        if len(shape) > 2:
            # np.prod replaces the np.product alias removed in NumPy 2.0.
            flat_X = np.reshape(self.X, (shape[0], np.prod(shape[1:])))
        return flat_X

    @abc.abstractmethod
    def select_batch_(self):
        """Sampler-specific selection logic; overridden by subclasses."""
        return

    def select_batch(self, **kwargs):
        """Forward all keyword arguments to `select_batch_` and return its result."""
        return self.select_batch_(**kwargs)

    def to_dict(self):
        """Return None; the base class has no serialisable state."""
        return None
53 |
--------------------------------------------------------------------------------
/necessity-evaluation/merge.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | import random
10 |
def _make_w_io_base(f, mode: str):
    """Return a writable file object for `f`.

    An `io.IOBase` instance is returned unchanged. Anything else is treated
    as a path: its parent directory is created if missing and the file is
    opened with `mode`.
    """
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for `f`, opening it with `mode` if it is a path."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file; dicts and lists are written as JSON.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk (or an open file object).
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close even when serialisation fails so the handle is never leaked.
        f.close()


def jload(f, mode="r"):
    """Load a .json file (path or open file object) and return the parsed content."""
    f = _make_r_io_base(f, mode)
    try:
        jdict = json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()
    return jdict
50 |
# Usage: python merge.py <json_in_1> <json_in_2> <json_out>
# Concatenates the contents of the two input JSON files and writes the result.
file_in_1, file_in_2, file_out = sys.argv[1], sys.argv[2], sys.argv[3]

_1 = jload(file_in_1)
_2 = jload(file_in_2)

print('number of file 1', len(_1))
print('number of file 2', len(_2))

# `+` requires both inputs to hold the same sequence type (e.g. two JSON arrays).
data_list = _1 + _2
print('number of data', len(data_list))

jdump(data_list, file_out)
73 |
74 |
--------------------------------------------------------------------------------
/necessity-evaluation/necessity-instruction-extraction.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | import random
10 |
def _make_w_io_base(f, mode: str):
    """Return a writable file object for `f`.

    An `io.IOBase` instance is returned unchanged. Anything else is treated
    as a path: its parent directory is created if missing and the file is
    opened with `mode`.
    """
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for `f`, opening it with `mode` if it is a path."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file; dicts and lists are written as JSON.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk (or an open file object).
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close even when serialisation fails so the handle is never leaked.
        f.close()


def jload(f, mode="r"):
    """Load a .json file (path or open file object) and return the parsed content."""
    f = _make_r_io_base(f, mode)
    try:
        jdict = json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()
    return jdict
50 |
51 |
# Usage: python necessity-instruction-extraction.py <reward_score_json> <result_json> [threshold]
# Keeps every record whose reward score is strictly below the threshold and
# prints how the scores are distributed over unit-wide intervals.
reward_score_file = sys.argv[1]

result_file = sys.argv[2]

# Command-line values are strings; convert to float so the comparison with
# the numeric reward scores below does not raise TypeError. Falls back to 0.0
# when the third argument is omitted.
threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.0

reward_score_list = jload(reward_score_file)

all_num = len(reward_score_list)
print('all number of instructions', len(reward_score_list))

# Maps (floor(score), ceil(score)) -> number of records in that interval.
num_dict = {}

result_json = []

for item in reward_score_list:
    upper_num = math.ceil(item['reward_score'])
    lower_num = math.floor(item['reward_score'])
    num_dict[(lower_num, upper_num)] = num_dict.get((lower_num, upper_num), 0) + 1
    if float(item['reward_score']) < threshold:
        result_json.append(item)

print('The percent of each score:')
for k, v in num_dict.items():
    print(str(k)+' : '+str(v)+' '+str(float(v)/all_num))

print('num of bad case : ', len(result_json))

jdump(result_json, result_file)
84 |
85 |
--------------------------------------------------------------------------------
/diverse-data-selection/run.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['CUDA_VISIBLE_DEVICES'] = '6'
3 |
4 | import json
5 | import sys
6 | import numpy as np
7 | from transformers import BertTokenizer, BertModel,AutoModel
8 | import torch
9 | from kcenter_greedy import *
10 |
11 |
@torch.no_grad()
def bert_embedding(texts, batch=100):
    """Encode `texts` with BERT and return the first-token hidden states.

    The tokenizer and model are loaded from '../models/bert-base-uncased' and
    run on CUDA. The resulting matrix is also saved to "bert_embedding.npy".

    Args:
        texts: list of strings to embed.
        batch: number of texts per forward pass.

    Returns:
        A 2-D numpy array with one row per text.
    """
    tokenizer = BertTokenizer.from_pretrained('../models/bert-base-uncased')
    model = AutoModel.from_pretrained('../models/bert-base-uncased').cuda()
    # Convert the texts into token sequences the BERT model understands.
    encoded_texts = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=96)
    encoded_texts = encoded_texts.to("cuda")
    cls_hid_li = []
    # Encode each slice of texts and keep the hidden state at position 0.
    i = 0
    while i < len(texts):
        last_hids = model(input_ids=encoded_texts["input_ids"][i:i+batch],
                          attention_mask=encoded_texts["attention_mask"][i:i+batch])['last_hidden_state']
        # Indexing position 0 already yields (batch, hidden). The former
        # .squeeze() turned a final batch of one text into a 1-D tensor,
        # which cannot be concatenated with the 2-D batches.
        cls_hids = last_hids[:, 0, :]
        cls_hid_li.append(cls_hids)
        i += batch
        print(i)
    # Stack the per-batch embeddings into one feature matrix.
    cls_hids_tensor = torch.concat(cls_hid_li, dim=0)
    embeddings = cls_hids_tensor.cpu().numpy()
    np.save("bert_embedding.npy", embeddings)
    return embeddings
33 |
# Data sampling
def sample_func(text_list, K):
    """Pick `K` diverse items from `text_list` with k-center greedy selection.

    Embeddings are read from "bert_embedding.npy" when that file exists and
    has one row per text; otherwise they are computed with `bert_embedding`
    and cached to that file.

    Args:
        text_list: list of strings to sample from.
        K: number of items to select.

    Returns:
        The indices returned by `kCenterGreedy.select_batch_`.
    """
    cache_path = "bert_embedding.npy"
    text_embedding = None
    if os.path.exists(cache_path):
        text_embedding = np.load(cache_path)
        if text_embedding.shape[0] != len(text_list):
            # A cache left over from a different input would yield indices
            # that do not belong to this dataset, so recompute instead.
            print('Cached embeddings do not match the input size; recomputing.')
            text_embedding = None
    if text_embedding is None:
        text_embedding = bert_embedding(text_list)
        np.save(cache_path, text_embedding)

    k_center = kCenterGreedy(text_embedding)

    already_selected = None
    return k_center.select_batch_(text_embedding, already_selected, K)
53 |
54 |
def main(input_file, output_file, K):
    """Select `K` diverse records from `input_file` and write them to `output_file`.

    Args:
        input_file: path of a JSON file holding a list of records, each with
            an "instruction" key.
        output_file: path the selected records are written to as JSON.
        K: number of records to select.
    """
    # Explicit UTF-8 and context managers: the output is written with
    # ensure_ascii=False, which can fail under a non-UTF-8 default encoding,
    # and the handles were previously never closed.
    with open(input_file, "r", encoding="utf-8") as fp:
        data = json.load(fp)
    instruction_list = [d["instruction"] for d in data]
    res = sample_func(text_list=instruction_list, K=K)
    print('data length')
    print(len(data))

    print('sampling data:')
    print(len(res))
    print(res)
    data_li = [data[index] for index in res]
    with open(output_file, "w", encoding="utf-8") as fp:
        json.dump(obj=data_li, fp=fp, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    # Usage: python run.py <input_json> <output_json> <K>
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    K = int(sys.argv[3])
    main(input_file, output_file, K)
77 |
--------------------------------------------------------------------------------
/quality-evaluation/quality-evaluation.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | os.environ['CUDA_VISIBLE_DEVICES'] = '5'
6 |
7 | import io
8 | import sys
9 | import time
10 | import json
11 | import random
12 | from transformers import AutoModelForSequenceClassification, AutoTokenizer
13 |
def _make_w_io_base(f, mode: str):
    """Return a writable file object for `f`.

    An `io.IOBase` instance is returned unchanged. Anything else is treated
    as a path: its parent directory is created if missing and the file is
    opened with `mode`.
    """
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for `f`, opening it with `mode` if it is a path."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file; dicts and lists are written as JSON.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk (or an open file object).
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close even when serialisation fails so the handle is never leaked.
        f.close()


def jload(f, mode="r"):
    """Load a .json file (path or open file object) and return the parsed content."""
    f = _make_r_io_base(f, mode)
    try:
        jdict = json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()
    return jdict
53 |
# Usage: python quality-evaluation.py <input_json> <output_json>
# Scores every (instruction [+ input], output) pair with the reward model and
# writes the records, extended with 'reward_score', to the output file.
file_in = sys.argv[1]

file_out = sys.argv[2]

input_list = jload(file_in)

print('number of input file', len(input_list))

reward_name = "../models/reward-model-deberta-v3-large-v2"
rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained(reward_name).cuda(), AutoTokenizer.from_pretrained(reward_name)
# Score one fixed example first and print it before the real data is processed.
question, answer = "Explain nuclear fusion like I am five", "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. It is a very important process in the universe, as it is the source of energy for stars and galaxies. Nuclear fusion is also a key process in the production of energy for nuclear power plants."
inputs = tokenizer(question, answer, return_tensors='pt').to("cuda")
score = rank_model(**inputs).logits[0].detach()
print(float(score))

result_list = []
for element in input_list:
    instruction = element['instruction']
    # 'input' is optional; treat a missing key like an empty input.
    _input = element.get('input', '')
    _output = element['output']
    if _input == '':
        question = instruction
    else:
        question = instruction + '\n' + _input

    answer = _output

    try:
        inputs = tokenizer(question, answer, return_tensors='pt').to("cuda")
        score = rank_model(**inputs).logits[0].detach()
    except Exception as exc:
        # Skip records that cannot be scored, but say why. A bare `except:`
        # would also swallow KeyboardInterrupt and make the run unstoppable.
        print(instruction)
        print(_output)
        print('scoring failed:', repr(exc))
        continue
    final_result = {'instruction': instruction, 'input': _input, 'output': _output, 'reward_score': float(score)}
    result_list.append(final_result)

print('number of data', len(result_list))

jdump(result_list, file_out)
97 |
98 |
--------------------------------------------------------------------------------
/necessity-evaluation/necessity-evaluation.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | os.environ['CUDA_VISIBLE_DEVICES'] = '6'
6 |
7 | import io
8 | import sys
9 | import time
10 | import json
11 | import random
12 | from transformers import AutoModelForSequenceClassification, AutoTokenizer
13 |
def _make_w_io_base(f, mode: str):
    """Return a writable file object for `f`.

    An `io.IOBase` instance is returned unchanged. Anything else is treated
    as a path: its parent directory is created if missing and the file is
    opened with `mode`.
    """
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for `f`, opening it with `mode` if it is a path."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file; dicts and lists are written as JSON.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk (or an open file object).
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close even when serialisation fails so the handle is never leaked.
        f.close()


def jload(f, mode="r"):
    """Load a .json file (path or open file object) and return the parsed content."""
    f = _make_r_io_base(f, mode)
    try:
        jdict = json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()
    return jdict
53 |
# Usage: python necessity-evaluation.py <inference_result_json> <necessity_evaluation_json>
# Scores every generated response with the reward model and writes the
# records, extended with 'reward_score', to the output file.
inference_result_file = sys.argv[1]

necessity_evaluation_file = sys.argv[2]

inference_list = jload(inference_result_file)

print('number of input file', len(inference_list))

reward_name = "../models/reward-model-deberta-v3-large-v2"
rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained(reward_name).cuda(), AutoTokenizer.from_pretrained(reward_name)
# Score one fixed example first and print it before the real data is processed.
question, answer = "Explain nuclear fusion like I am five", "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. It is a very important process in the universe, as it is the source of energy for stars and galaxies. Nuclear fusion is also a key process in the production of energy for nuclear power plants."
inputs = tokenizer(question, answer, return_tensors='pt').to("cuda")
score = rank_model(**inputs).logits[0].detach()
print(float(score))

result_list = []
for element in inference_list:
    instruction = element['instruction']
    _input = element['input']
    _output = element['output']
    _generated = element['generated']
    # Drop the trailing end-of-sequence marker only when it is present; an
    # unconditional [:-4] cuts four real characters from generations that
    # stopped at the length limit without emitting it.
    # NOTE(review): assumes the marker is the 4-character '</s>' -- confirm
    # against DEFAULT_EOS_TOKEN in train.py.
    if _generated.endswith('</s>'):
        _generated = _generated[:-4]

    question = instruction+'\n'+_input

    answer = _generated

    try:
        inputs = tokenizer(question, answer, return_tensors='pt').to("cuda")
        score = rank_model(**inputs).logits[0].detach()
    except Exception as exc:
        # Skip records that cannot be scored, but say why. A bare `except:`
        # would also swallow KeyboardInterrupt and make the run unstoppable.
        print(instruction)
        print(_generated)
        print('scoring failed:', repr(exc))
        continue
    final_result = {'instruction': instruction, 'input': _input, 'output': _output, 'generated': _generated, 'reward_score': float(score)}
    result_list.append(final_result)

print('number of data', len(result_list))

jdump(result_list, necessity_evaluation_file)
93 |
94 |
--------------------------------------------------------------------------------
/quality-evaluation/high-quality-data-extraction.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | import random
10 |
def _make_w_io_base(f, mode: str):
    """Return a writable file object for `f`.

    An `io.IOBase` instance is returned unchanged. Anything else is treated
    as a path: its parent directory is created if missing and the file is
    opened with `mode`.
    """
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f


def _make_r_io_base(f, mode: str):
    """Return a readable file object for `f`, opening it with `mode` if it is a path."""
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dict or list to a file; dicts and lists are written as JSON.

    Args:
        obj: An object to be written.
        f: A string path to the location on disk (or an open file object).
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close even when serialisation fails so the handle is never leaked.
        f.close()


def jload(f, mode="r"):
    """Load a .json file (path or open file object) and return the parsed content."""
    f = _make_r_io_base(f, mode)
    try:
        jdict = json.load(f)
    finally:
        # Close even when the content is not valid JSON.
        f.close()
    return jdict
50 |
def instruct_dict(json_list):
    """Map every distinct 'instruction' in `json_list` to 0.

    Prints a notice each time an instruction is seen again.
    """
    seen = {}
    for record in json_list:
        key = record['instruction']
        if key not in seen:
            seen[key] = 0
            continue
        print('Exist the same instruction in this dataset!')
    return seen
59 |
def instruct_category(json_file):
    """Read './category.json' and build an instruction -> category lookup.

    The file is expected to map each category name to a list of instructions.

    NOTE(review): `json_file` is accepted but never used; the path is fixed.
    NOTE(review): the notice fires only when an instruction repeats inside
    the same category; one listed under two categories is silently assigned
    to the later one -- confirm this is intended.

    Returns:
        (category_set, instruct_category_map): the keys view of the loaded
        mapping and the instruction -> category dict.
    """
    json_data = jload('./category.json')
    category_set = json_data.keys()

    instruct_category_map = {}
    for category, members in json_data.items():
        for ins in members:
            if ins not in instruct_category_map or instruct_category_map.get(ins) != category:
                instruct_category_map[ins] = category
                continue
            print('Category error!')
            print('category', category)
            print('instruction', ins)

    return category_set, instruct_category_map
78 |
79 |
80 |
81 |
# Usage: python high-quality-data-extraction.py <quality_evaluation_json> <high_quality_json> [threshold]
# Keeps every record whose reward score is strictly above the threshold and
# prints how the scores are distributed over unit-wide intervals.
quality_evaluation_file = sys.argv[1]

high_quality_file = sys.argv[2]

# Command-line values are strings; convert to float so the comparison with
# the numeric reward scores below does not raise TypeError. Falls back to 0.0
# when the third argument is omitted.
threshold = float(sys.argv[3]) if len(sys.argv) > 3 else 0.0

quality_evaluation_list = jload(quality_evaluation_file)

all_num = len(quality_evaluation_list)
print('all number of instructions', len(quality_evaluation_list))

# Maps (floor(score), ceil(score)) -> number of records in that interval.
num_dict = {}

result_json = []

for item in quality_evaluation_list:
    upper_num = math.ceil(item['reward_score'])
    lower_num = math.floor(item['reward_score'])
    num_dict[(lower_num, upper_num)] = num_dict.get((lower_num, upper_num), 0) + 1
    if float(item['reward_score']) > threshold:
        result_json.append(item)

print('The percent of each score interval:')
for k, v in num_dict.items():
    print(str(k)+' : '+str(v)+' '+str(float(v)/all_num))

print('num of good case : ', len(result_json))

jdump(result_json, high_quality_file)
114 |
115 |
--------------------------------------------------------------------------------
/inference/instruction_filter.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 |
3 | import numpy as np
4 | import torch
5 | import transformers
6 | from transformers import GenerationConfig
7 |
8 | from train import ModelArguments, smart_tokenizer_and_embedding_resize, DEFAULT_PAD_TOKEN, DEFAULT_EOS_TOKEN, \
9 | DEFAULT_BOS_TOKEN, DEFAULT_UNK_TOKEN, PROMPT_DICT
10 | from utils import jload
11 | import json
12 |
@dataclass
class InferenceArguments:
    """Command-line options for the inference run, parsed by `HfArgumentParser`."""

    # Passed to the tokenizer as `model_max_length` and to `generate` as `max_new_tokens`.
    model_max_length: int = field(
        default=1024,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    # Path of the JSON file with the instructions to run (loaded with `jload`).
    instruct_data: str = field(
        default=None,
        metadata={"help": "The instruction data to be filter"},
    )
    # Path the generated results are written to as JSON.
    instruct_filtered: str = field(
        default=None,
        metadata={"help": "The filtered instructions"},
    )
    # Forwarded to `from_pretrained` as `load_in_8bit`.
    load_in_8bit: bool = field(
        default=False,
        metadata={"help": "Load the model in 8-bit mode."},
    )
    # Forwarded to `from_pretrained` as `torch_dtype`.
    inference_dtype: torch.dtype = field(
        default=torch.float32,
        metadata={"help": "The dtype to use for inference."},
    )
35 |
36 |
def generate_prompt(instruction, input=None):
    """Fill the matching `PROMPT_DICT` template for an instruction.

    Uses the "prompt_input" template when `input` is truthy and the
    "prompt_no_input" template otherwise.
    """
    if not input:
        return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)
    return PROMPT_DICT["prompt_input"].format(instruction=instruction, input=input)
42 |
43 |
def inference():
    """Generate a response for every instruction and save the results as JSON.

    Command-line arguments are parsed into `ModelArguments` and
    `InferenceArguments`. Each record of `--instruct_data` is turned into a
    prompt, a response is generated, and the record (plus the decoded text
    under 'generated') is collected. Records whose generation fails are
    skipped. The collected list is written to `--instruct_filtered`.

    Raises:
        ValueError: If `--instruct_data` or `--instruct_filtered` is missing.
    """
    parser = transformers.HfArgumentParser((ModelArguments, InferenceArguments))
    model_args, inference_args = parser.parse_args_into_dataclasses()

    # Fail before loading the model: a missing output path would otherwise
    # only surface after every instruction had been generated.
    if inference_args.instruct_data is None or inference_args.instruct_filtered is None:
        raise ValueError("--instruct_data and --instruct_filtered are both required")

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        load_in_8bit=inference_args.load_in_8bit,
        torch_dtype=inference_args.inference_dtype,
        device_map="auto",
    )
    # 8-bit models are placed on their devices while loading and reject a
    # later `.cuda()`, so only move the model in the non-quantised case.
    if not inference_args.load_in_8bit:
        model.cuda()
    model.eval()

    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        num_beams=4,
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        use_fast=False,
        model_max_length=inference_args.model_max_length,
    )

    if tokenizer.pad_token is None:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
    # NOTE(review): indentation was lost in the source this was recovered
    # from; this call is assumed to sit outside the `if` above -- confirm.
    tokenizer.add_special_tokens(
        {
            "eos_token": DEFAULT_EOS_TOKEN,
            "bos_token": DEFAULT_BOS_TOKEN,
            "unk_token": DEFAULT_UNK_TOKEN,
            "pad_token": DEFAULT_PAD_TOKEN,
        }
    )

    instructions = jload(inference_args.instruct_data)

    result_instructions = []
    index = 0
    for instruct_triplet in instructions:
        instruction = instruct_triplet['instruction']
        # 'input' is optional; a missing key is treated like an empty input.
        _input = instruct_triplet.get('input', '')
        _output = instruct_triplet['output']

        try:
            inputs = tokenizer(generate_prompt(instruction, _input), return_tensors="pt")
            outputs = model.generate(input_ids=inputs["input_ids"].cuda(),
                                     generation_config=generation_config,
                                     max_new_tokens=inference_args.model_max_length,
                                     return_dict_in_generate=True,
                                     output_scores=True)
            # Decoder-only models echo the prompt; skip it so only new tokens are decoded.
            input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
            generated_tokens = outputs.sequences[:, input_length:]
            res = tokenizer.decode(generated_tokens[0])
        except Exception as exc:
            # Best effort: skip this record but report the cause. A bare
            # `except:` would also swallow KeyboardInterrupt.
            print('inference error!')
            print(repr(exc))
            continue

        result_instruction = {'instruction': instruction, 'input': _input, 'output': _output, 'generated': res}
        result_instructions.append(result_instruction)
        index = index + 1
        print('index', index)

    with open(inference_args.instruct_filtered, 'w') as f:
        json.dump(result_instructions, f)


if __name__ == "__main__":
    inference()
132 |
--------------------------------------------------------------------------------
/diverse-data-selection/kcenter_greedy.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Google Inc.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """Returns points that minimizes the maximum distance of any point to a center.
16 |
17 | Implements the k-Center-Greedy method in
18 | Ozan Sener and Silvio Savarese. A Geometric Approach to Active Learning for
19 | Convolutional Neural Networks. https://arxiv.org/abs/1708.00489 2017
20 |
21 | Distance metric defaults to l2 distance. Features used to calculate distance
22 | are either raw features or if a model has transform method then uses the output
23 | of model.transform(X).
24 |
25 | Can be extended to a robust k centers algorithm that ignores a certain number of
26 | outlier datapoints. Resulting centers are solution to multiple integer program.
27 | """
28 |
29 | from __future__ import absolute_import
30 | from __future__ import division
31 | from __future__ import print_function
32 |
33 | import numpy as np
34 | from sklearn.metrics import pairwise_distances
35 | from sampling_def import SamplingMethod
36 |
37 |
class kCenterGreedy(SamplingMethod):
    """Greedy k-center selection.

    Repeatedly picks the point whose distance to its nearest already-chosen
    center is largest, tracking those distances in `self.min_distances`.
    """

    def __init__(self, X, metric='euclidean'):
        """Store the data and initialise the selection state.

        Args:
            X: data array; its first axis indexes the observations.
            metric: metric name passed to `pairwise_distances`.
        """
        self.X = X
        self.flat_X = self.flatten_X()
        self.name = 'kcenter'
        self.features = self.flat_X
        self.metric = metric
        self.min_distances = None
        self.n_obs = self.X.shape[0]
        self.already_selected = []
        print('shape of features')
        print(X.shape)

    def update_distances(self, cluster_centers, only_new=True, reset_dist=False):
        """Update min distances given cluster centers.

        Args:
            cluster_centers: indices of cluster centers; None is treated as
                "no centers".
            only_new: only calculate distance for newly selected points and update
                min_distances.
            reset_dist: whether to reset min_distances.
        """
        if reset_dist:
            self.min_distances = None
        if cluster_centers is None:
            # Iterating None below would raise TypeError.
            cluster_centers = []
        if only_new:
            cluster_centers = [d for d in cluster_centers
                               if d not in self.already_selected]
        if len(cluster_centers) > 0:
            # Update min_distances for all examples given new cluster center.
            x = self.features[cluster_centers]
            dist = pairwise_distances(self.features, x, metric=self.metric)
            # Reduce to one column first so min_distances keeps its (n, 1)
            # shape even when several centers are added in one call.
            nearest = np.min(dist, axis=1).reshape(-1, 1)

            if self.min_distances is None:
                self.min_distances = nearest
            else:
                self.min_distances = np.minimum(self.min_distances, nearest)

    def select_batch_(self, features, already_selected, N, **kwargs):
        """
        Diversity promoting active learning method that greedily forms a batch
        to minimize the maximum distance to a cluster center among all unlabeled
        datapoints.

        Args:
            features: feature matrix used for the distance computation
            already_selected: index of datapoints already selected (list or None)
            N: batch size

        Returns:
            the full list of selected indices: the given `already_selected`
            entries followed by the N newly chosen ones
        """
        try:
            print('Getting transformed features...')
            self.features = features
            print('Calculating distances...')
            self.update_distances(already_selected, only_new=False, reset_dist=True)
        except Exception:
            # NOTE(review): the message mentions flat_X, but self.features is
            # not switched back to it here -- behaviour kept as in the original.
            print('Using flat_X as features.')
            self.update_distances(already_selected, only_new=True, reset_dist=False)

        if already_selected is None:
            already_selected = []
        self.already_selected = already_selected
        print(self.already_selected)

        for _ in range(N):
            if not self.already_selected:
                # Initialize centers with a randomly selected datapoint
                ind = np.random.choice(np.arange(self.n_obs))
            else:
                ind = np.argmax(self.min_distances)
            # New examples should not be in already selected since those points
            # should have min_distance of zero to a cluster center.
            assert ind not in already_selected

            if self.min_distances is None:
                print('min distances is None')
            else:
                # np.max yields a scalar; the builtin max() returned a
                # 1-element array, which NumPy deprecates for %-formatting.
                print('Maximum distance from cluster centers is %0.2f'
                      % float(np.max(self.min_distances)))

            self.update_distances([ind], only_new=True, reset_dist=False)
            self.already_selected.append(ind)

        # min_distances is still None when nothing was ever selected (N == 0).
        if self.min_distances is not None:
            print('Maximum distance from cluster centers is %0.2f'
                  % float(np.max(self.min_distances)))

        return self.already_selected
141 |
--------------------------------------------------------------------------------
/train/utils.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | from typing import Optional, Sequence, Union
10 |
11 | import openai
12 | import tqdm
13 | from openai import openai_object
14 | import copy
15 |
# Type alias: either a plain string or an OpenAI response object.
StrOrOpenAIObject = Union[str, openai_object.OpenAIObject]

# If the OPENAI_ORG environment variable is set, use it as the OpenAI
# organization and log the switch.
openai_org = os.getenv("OPENAI_ORG")
if openai_org is not None:
    openai.organization = openai_org
    logging.warning(f"Switching to organization: {openai_org} for OAI API key.")
22 |
23 |
@dataclasses.dataclass
class OpenAIDecodingArguments(object):
    """Decoding options for a completion request.

    `openai_completion` deep-copies an instance for every batch and unpacks
    all of its fields as keyword arguments of `openai.Completion.create`.
    """

    # Completion budget; `openai_completion` multiplies it by 0.8 whenever the
    # API error text contains "Please reduce your prompt".
    max_tokens: int = 1800
    temperature: float = 0.2
    top_p: float = 1.0
    # When greater than 1, `openai_completion` regroups its flat result list
    # into consecutive chunks of `n` entries.
    n: int = 1
    stream: bool = False
    stop: Optional[Sequence[str]] = None
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    suffix: Optional[str] = None
    logprobs: Optional[int] = None
    echo: bool = False
37 |
38 |
def openai_completion(
    prompts: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
    decoding_args: OpenAIDecodingArguments,
    model_name="text-davinci-003",
    sleep_time=2,
    batch_size=1,
    max_instances=sys.maxsize,
    max_batches=sys.maxsize,
    return_text=False,
    **decoding_kwargs,
) -> Union[Union[StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]:
    """Decode with OpenAI API.

    Args:
        prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted
            as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model
            it can also be a dictionary (or list thereof) as explained here:
            https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
        decoding_args: Decoding arguments. The instance is deep-copied per batch and never mutated.
        model_name: Model name. Can be either in the format of "org/model" or just "model".
        sleep_time: Seconds to sleep before retrying after an API error other than "Please reduce your prompt".
        batch_size: Number of prompts to send in a single request. Only for non chat model.
        max_instances: Maximum number of prompts to decode.
        max_batches: Maximum number of batches to decode. This argument will be deprecated in the future.
        return_text: If True, return text instead of full completion object (which contains things like logprob).
        decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them.

    Returns:
        A completion or a list of completions.
        Depending on return_text and decoding_args.n, the completion type can be one of
            - a string (if return_text is True)
            - an openai_object.OpenAIObject object (if return_text is False)
            - a list of objects of the above types (if decoding_args.n > 1)

    Raises:
        openai.error.OpenAIError: If the API still asks for a shorter prompt once `max_tokens` cannot be
            reduced any further; retrying in that state could never succeed.
    """
    is_single_prompt = isinstance(prompts, (str, dict))
    if is_single_prompt:
        prompts = [prompts]

    if max_batches < sys.maxsize:
        logging.warning(
            "`max_batches` will be deprecated in the future, please use `max_instances` instead. "
            "Setting `max_instances` to `max_batches * batch_size` for now."
        )
        max_instances = max_batches * batch_size

    prompts = prompts[:max_instances]
    prompt_batches = [prompts[start : start + batch_size] for start in range(0, len(prompts), batch_size)]

    completions = []
    for prompt_batch in tqdm.tqdm(prompt_batches, desc="prompt_batches", total=len(prompt_batches)):
        # Per-batch copy so a reduced `max_tokens` does not leak into later batches or the caller's object.
        batch_decoding_args = copy.deepcopy(decoding_args)

        while True:
            try:
                shared_kwargs = dict(
                    model=model_name,
                    **batch_decoding_args.__dict__,
                    **decoding_kwargs,
                )
                completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs)
                choices = completion_batch.choices

                # Attach the request-level token usage to every choice of the batch.
                for choice in choices:
                    choice["total_tokens"] = completion_batch.usage.total_tokens
                completions.extend(choices)
                break
            except openai.error.OpenAIError as e:
                logging.warning(f"OpenAIError: {e}.")
                if "Please reduce your prompt" in str(e):
                    reduced_max_tokens = int(batch_decoding_args.max_tokens * 0.8)
                    if reduced_max_tokens < 1:
                        # Nothing left to shrink: this branch retries without sleeping, so continuing
                        # would spin forever against the API.
                        raise
                    batch_decoding_args.max_tokens = reduced_max_tokens
                    logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
                else:
                    logging.warning("Hit request rate limit; retrying...")
                    time.sleep(sleep_time)  # Annoying rate limit on requests.

    if return_text:
        completions = [completion.text for completion in completions]
    if decoding_args.n > 1:
        # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
        completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)]
    if is_single_prompt:
        # Return non-tuple if only 1 input and 1 generation.
        (completions,) = completions
    return completions
131 |
132 |
def _make_w_io_base(f, mode: str):
    """Return a file object to write to.

    An `io.IOBase` instance is handed back untouched. Any other value is
    treated as a path: its parent directory is created when the path has one,
    and the file is opened with `mode`.
    """
    if isinstance(f, io.IOBase):
        return f
    parent_dir = os.path.dirname(f)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)
    return open(f, mode=mode)
140 |
141 |
def _make_r_io_base(f, mode: str):
    """Return a file object to read from.

    An `io.IOBase` instance is handed back untouched; any other value is
    treated as a path and opened with `mode`.
    """
    if isinstance(f, io.IOBase):
        return f
    return open(f, mode=mode)
146 |
147 |
def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dictionary or list to a file in json format.

    Args:
        obj: An object to be written. Dicts and lists are serialized with
            `json.dump`; strings are written verbatim.
        f: A string path to the location on disk, or an already open file object.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close on every path so a failed dump does not leak the handle.
        f.close()
166 |
167 |
def jload(f, mode="r"):
    """Load a .json file and return the parsed content.

    Args:
        f: A string path or an already open file object.
        mode: Mode for opening the file when `f` is a path.
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close even when parsing fails so the handle is not leaked.
        f.close()
174 |
--------------------------------------------------------------------------------
/inference/utils.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | from typing import Optional, Sequence, Union
10 |
11 | import openai
12 | import tqdm
13 | from openai import openai_object
14 | import copy
15 |
# Alias used in the return annotation of `openai_completion`: either plain
# text or the object type exposed by `openai.openai_object`.
StrOrOpenAIObject = Union[str, openai_object.OpenAIObject]

# Runs at import time: when the OPENAI_ORG environment variable is set, its
# value is assigned to `openai.organization` and the switch is logged.
openai_org = os.getenv("OPENAI_ORG")
if openai_org is not None:
    openai.organization = openai_org
    logging.warning(f"Switching to organization: {openai_org} for OAI API key.")
22 |
23 |
@dataclasses.dataclass
class OpenAIDecodingArguments(object):
    """Decoding options for a completion request.

    `openai_completion` deep-copies an instance for every batch and unpacks
    all of its fields as keyword arguments of `openai.Completion.create`.
    """

    # Completion budget; `openai_completion` multiplies it by 0.8 whenever the
    # API error text contains "Please reduce your prompt".
    max_tokens: int = 1800
    temperature: float = 0.2
    top_p: float = 1.0
    # When greater than 1, `openai_completion` regroups its flat result list
    # into consecutive chunks of `n` entries.
    n: int = 1
    stream: bool = False
    stop: Optional[Sequence[str]] = None
    presence_penalty: float = 0.0
    frequency_penalty: float = 0.0
    suffix: Optional[str] = None
    logprobs: Optional[int] = None
    echo: bool = False
37 |
38 |
def openai_completion(
    prompts: Union[str, Sequence[str], Sequence[dict[str, str]], dict[str, str]],
    decoding_args: OpenAIDecodingArguments,
    model_name="text-davinci-003",
    sleep_time=2,
    batch_size=1,
    max_instances=sys.maxsize,
    max_batches=sys.maxsize,
    return_text=False,
    **decoding_kwargs,
) -> Union[Union[StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]:
    """Decode with OpenAI API.

    Args:
        prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted
            as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model
            it can also be a dictionary (or list thereof) as explained here:
            https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
        decoding_args: Decoding arguments. The instance is deep-copied per batch and never mutated.
        model_name: Model name. Can be either in the format of "org/model" or just "model".
        sleep_time: Seconds to sleep before retrying after an API error other than "Please reduce your prompt".
        batch_size: Number of prompts to send in a single request. Only for non chat model.
        max_instances: Maximum number of prompts to decode.
        max_batches: Maximum number of batches to decode. This argument will be deprecated in the future.
        return_text: If True, return text instead of full completion object (which contains things like logprob).
        decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them.

    Returns:
        A completion or a list of completions.
        Depending on return_text and decoding_args.n, the completion type can be one of
            - a string (if return_text is True)
            - an openai_object.OpenAIObject object (if return_text is False)
            - a list of objects of the above types (if decoding_args.n > 1)

    Raises:
        openai.error.OpenAIError: If the API still asks for a shorter prompt once `max_tokens` cannot be
            reduced any further; retrying in that state could never succeed.
    """
    is_single_prompt = isinstance(prompts, (str, dict))
    if is_single_prompt:
        prompts = [prompts]

    if max_batches < sys.maxsize:
        logging.warning(
            "`max_batches` will be deprecated in the future, please use `max_instances` instead. "
            "Setting `max_instances` to `max_batches * batch_size` for now."
        )
        max_instances = max_batches * batch_size

    prompts = prompts[:max_instances]
    prompt_batches = [prompts[start : start + batch_size] for start in range(0, len(prompts), batch_size)]

    completions = []
    for prompt_batch in tqdm.tqdm(prompt_batches, desc="prompt_batches", total=len(prompt_batches)):
        # Per-batch copy so a reduced `max_tokens` does not leak into later batches or the caller's object.
        batch_decoding_args = copy.deepcopy(decoding_args)

        while True:
            try:
                shared_kwargs = dict(
                    model=model_name,
                    **batch_decoding_args.__dict__,
                    **decoding_kwargs,
                )
                completion_batch = openai.Completion.create(prompt=prompt_batch, **shared_kwargs)
                choices = completion_batch.choices

                # Attach the request-level token usage to every choice of the batch.
                for choice in choices:
                    choice["total_tokens"] = completion_batch.usage.total_tokens
                completions.extend(choices)
                break
            except openai.error.OpenAIError as e:
                logging.warning(f"OpenAIError: {e}.")
                if "Please reduce your prompt" in str(e):
                    reduced_max_tokens = int(batch_decoding_args.max_tokens * 0.8)
                    if reduced_max_tokens < 1:
                        # Nothing left to shrink: this branch retries without sleeping, so continuing
                        # would spin forever against the API.
                        raise
                    batch_decoding_args.max_tokens = reduced_max_tokens
                    logging.warning(f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
                else:
                    logging.warning("Hit request rate limit; retrying...")
                    time.sleep(sleep_time)  # Annoying rate limit on requests.

    if return_text:
        completions = [completion.text for completion in completions]
    if decoding_args.n > 1:
        # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
        completions = [completions[i : i + decoding_args.n] for i in range(0, len(completions), decoding_args.n)]
    if is_single_prompt:
        # Return non-tuple if only 1 input and 1 generation.
        (completions,) = completions
    return completions
131 |
132 |
def _make_w_io_base(f, mode: str):
    """Return a file object to write to.

    An `io.IOBase` instance is handed back untouched. Any other value is
    treated as a path: its parent directory is created when the path has one,
    and the file is opened with `mode`.
    """
    if isinstance(f, io.IOBase):
        return f
    parent_dir = os.path.dirname(f)
    if parent_dir != "":
        os.makedirs(parent_dir, exist_ok=True)
    return open(f, mode=mode)
140 |
141 |
def _make_r_io_base(f, mode: str):
    """Return a file object to read from.

    An `io.IOBase` instance is handed back untouched; any other value is
    treated as a path and opened with `mode`.
    """
    if isinstance(f, io.IOBase):
        return f
    return open(f, mode=mode)
146 |
147 |
def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, dictionary or list to a file in json format.

    Args:
        obj: An object to be written. Dicts and lists are serialized with
            `json.dump`; strings are written verbatim.
        f: A string path to the location on disk, or an already open file object.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a dict, list or str.
    """
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, (dict, list)):
            json.dump(obj, f, indent=indent, default=default)
        elif isinstance(obj, str):
            f.write(obj)
        else:
            raise ValueError(f"Unexpected type: {type(obj)}")
    finally:
        # Close on every path so a failed dump does not leak the handle.
        f.close()
166 |
167 |
def jload(f, mode="r"):
    """Load a .json file and return the parsed content.

    Args:
        f: A string path or an already open file object.
        mode: Mode for opening the file when `f` is a path.
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close even when parsing fails so the handle is not leaked.
        f.close()
174 |
--------------------------------------------------------------------------------
/train/train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import copy
16 | import logging
17 | from dataclasses import dataclass, field
18 | from typing import Dict, Optional, Sequence
19 |
20 | import torch
21 | import transformers
22 | import utils
23 | from torch.utils.data import Dataset
24 | from transformers import Trainer
25 |
# Label value written over prompt tokens in `preprocess` and used as the
# padding value for labels in `DataCollatorForSupervisedDataset`.
IGNORE_INDEX = -100

# Fallback special tokens, registered by `train()` only when the loaded
# tokenizer lacks the corresponding token. They must be non-empty: an empty
# string cannot serve as a special token. The values below follow the usual
# SentencePiece/LLaMA spelling (NOTE(review): confirm against the target
# tokenizer's vocabulary).
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Prompt templates filled with `str.format_map` from each training example:
# "prompt_input" when the example has a non-empty "input" field,
# "prompt_no_input" otherwise.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
43 |
44 |
@dataclass
class ModelArguments:
    """Options that select which pretrained model to load."""

    # Passed to `from_pretrained` for both the model and the tokenizer in `train()`.
    model_name_or_path: Optional[str] = "facebook/opt-125m"
48 |
49 |
@dataclass
class DataArguments:
    """Options that describe the training data."""

    # The default is None, so the annotation is Optional[str] rather than str.
    data_path: Optional[str] = field(default=None, metadata={"help": "Path to the training data."})
53 |
54 |
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """`transformers.TrainingArguments` extended with the extra options this script reads."""

    # Forwarded as `cache_dir` to both `from_pretrained` calls in `train()`.
    cache_dir: Optional[str] = field(default=None)
    # NOTE(review): presumably replaces the parent class's default optimizer
    # choice — confirm against the installed transformers version.
    optim: str = field(default="adamw_torch")
    # Handed to the tokenizer as `model_max_length` in `train()`.
    model_max_length: int = field(
        default=1024,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
63 |
64 |
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Register special tokens and grow the model embeddings to match.

    The rows added for the new tokens, in both the input and the output
    embedding matrices, are set to the mean of the rows that existed before.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if added <= 0:
        return

    in_weights = model.get_input_embeddings().weight.data
    out_weights = model.get_output_embeddings().weight.data
    for weights in (in_weights, out_weights):
        # Average over the pre-existing rows only, then copy it into every new row.
        weights[-added:] = weights[:-added].mean(dim=0, keepdim=True)
86 |
87 |
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string in its own tokenizer call.

    Returns a dict with the token-id tensors ("input_ids" and "labels" refer
    to the same list) and the per-string count of non-pad tokens
    ("input_ids_lens" and "labels_lens" refer to the same list).
    """
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )
    token_ids = [enc.input_ids[0] for enc in encodings]
    lengths = [enc.input_ids.ne(tokenizer.pad_token_id).sum().item() for enc in encodings]
    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": lengths,
        "labels_lens": lengths,
    }
110 |
111 |
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+response pairs and mask the prompt part of the labels.

    Labels are a deep copy of the input ids in which the first
    `len(tokenized prompt)` positions are overwritten with IGNORE_INDEX.
    """
    full_texts = [prompt + response for prompt, response in zip(sources, targets)]
    examples_tokenized = _tokenize_fn(full_texts, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    for label, prompt_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:prompt_len] = IGNORE_INDEX
    return {"input_ids": input_ids, "labels": labels}
125 |
126 |
class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Loads a JSON list of examples, renders each into a prompt via PROMPT_DICT,
    appends the tokenizer's EOS token to every "output", and tokenizes the
    result once at construction time.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()
        logging.warning("Loading data...")
        records = utils.jload(data_path)

        logging.warning("Formatting inputs...")
        with_input = PROMPT_DICT["prompt_input"]
        without_input = PROMPT_DICT["prompt_no_input"]
        sources = []
        for record in records:
            # A missing or empty "input" field selects the shorter template.
            template = without_input if record.get("input", "") == "" else with_input
            sources.append(template.format_map(record))
        targets = [f"{record['output']}{tokenizer.eos_token}" for record in records]

        logging.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}
155 |
156 |
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Input ids are padded with the tokenizer's pad token id, labels with
    IGNORE_INDEX, and the attention mask marks every non-pad input position.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        pad_id = self.tokenizer.pad_token_id
        id_seqs = [item["input_ids"] for item in instances]
        label_seqs = [item["labels"] for item in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(id_seqs, batch_first=True, padding_value=pad_id)
        padded_labels = torch.nn.utils.rnn.pad_sequence(label_seqs, batch_first=True, padding_value=IGNORE_INDEX)
        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(pad_id),
        }
174 |
175 |
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Build the training dataset and collator as keyword arguments for `Trainer`.

    No evaluation dataset is produced; "eval_dataset" is always None.
    """
    dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {"train_dataset": dataset, "eval_dataset": None, "data_collator": collator}
181 |
182 |
def train():
    """Parse command-line arguments, fine-tune the model and save the result.

    Steps: load model and tokenizer, register any missing special tokens
    (resizing the embeddings accordingly), build the dataset, run the
    `Trainer`, then save the trainer state and the model to `output_dir`.
    """
    arg_parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = arg_parser.parse_args_into_dataclasses()

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    # Only tokens the tokenizer does not already define receive a default.
    fallback_tokens = (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
        ("unk_token", DEFAULT_UNK_TOKEN),
    )
    special_tokens_dict = {
        name: default for name, default in fallback_tokens if getattr(tokenizer, name) is None
    }

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
224 |
--------------------------------------------------------------------------------
/inference/train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import copy
16 | import logging
17 | from dataclasses import dataclass, field
18 | from typing import Dict, Optional, Sequence
19 |
20 | import torch
21 | import transformers
22 | import utils
23 | from torch.utils.data import Dataset
24 | from transformers import Trainer
25 |
# Label value written over prompt tokens in `preprocess` and used as the
# padding value for labels in `DataCollatorForSupervisedDataset`.
IGNORE_INDEX = -100

# Fallback special tokens, registered by `train()` only when the loaded
# tokenizer lacks the corresponding token. They must be non-empty: an empty
# string cannot serve as a special token. The values below follow the usual
# SentencePiece/LLaMA spelling (NOTE(review): confirm against the target
# tokenizer's vocabulary).
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Prompt templates filled with `str.format_map` from each training example:
# "prompt_input" when the example has a non-empty "input" field,
# "prompt_no_input" otherwise.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
43 |
44 |
@dataclass
class ModelArguments:
    """Options that select which pretrained model to load."""

    # Passed to `from_pretrained` for both the model and the tokenizer in `train()`.
    model_name_or_path: Optional[str] = "facebook/opt-125m"
48 |
49 |
@dataclass
class DataArguments:
    """Options that describe the training data."""

    # The default is None, so the annotation is Optional[str] rather than str.
    data_path: Optional[str] = field(default=None, metadata={"help": "Path to the training data."})
53 |
54 |
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """`transformers.TrainingArguments` extended with the extra options this script reads."""

    # Forwarded as `cache_dir` to both `from_pretrained` calls in `train()`.
    cache_dir: Optional[str] = field(default=None)
    # NOTE(review): presumably replaces the parent class's default optimizer
    # choice — confirm against the installed transformers version.
    optim: str = field(default="adamw_torch")
    # Handed to the tokenizer as `model_max_length` in `train()`.
    model_max_length: int = field(
        default=1024,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
63 |
64 |
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Register special tokens and grow the model embeddings to match.

    The rows added for the new tokens, in both the input and the output
    embedding matrices, are set to the mean of the rows that existed before.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    if added <= 0:
        return

    in_weights = model.get_input_embeddings().weight.data
    out_weights = model.get_output_embeddings().weight.data
    for weights in (in_weights, out_weights):
        # Average over the pre-existing rows only, then copy it into every new row.
        weights[-added:] = weights[:-added].mean(dim=0, keepdim=True)
86 |
87 |
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string in its own tokenizer call.

    Returns a dict with the token-id tensors ("input_ids" and "labels" refer
    to the same list) and the per-string count of non-pad tokens
    ("input_ids_lens" and "labels_lens" refer to the same list).
    """
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )
    token_ids = [enc.input_ids[0] for enc in encodings]
    lengths = [enc.input_ids.ne(tokenizer.pad_token_id).sum().item() for enc in encodings]
    return {
        "input_ids": token_ids,
        "labels": token_ids,
        "input_ids_lens": lengths,
        "labels_lens": lengths,
    }
110 |
111 |
def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt+response pairs and mask the prompt part of the labels.

    Labels are a deep copy of the input ids in which the first
    `len(tokenized prompt)` positions are overwritten with IGNORE_INDEX.
    """
    full_texts = [prompt + response for prompt, response in zip(sources, targets)]
    examples_tokenized = _tokenize_fn(full_texts, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    for label, prompt_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label[:prompt_len] = IGNORE_INDEX
    return {"input_ids": input_ids, "labels": labels}
125 |
126 |
class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Loads a JSON list of examples, keeps at most `max_examples` of them,
    renders each into a prompt via PROMPT_DICT, appends the tokenizer's EOS
    token to every "output", and tokenizes the result once at construction
    time.
    """

    def __init__(
        self,
        data_path: str,
        tokenizer: transformers.PreTrainedTokenizer,
        max_examples: Optional[int] = 1000,
    ):
        """Load and tokenize the data.

        Args:
            data_path: Path to the JSON file with the examples.
            tokenizer: Tokenizer used to encode prompts and responses.
            max_examples: Number of leading examples to keep. The default of
                1000 matches the previously hard-coded cap; pass None to use
                the whole file.
        """
        super().__init__()
        logging.warning("Loading data...")
        list_data_dict = utils.jload(data_path)
        # Slicing with None keeps everything, so None disables the cap.
        list_data_dict = list_data_dict[:max_examples]

        logging.warning("Formatting inputs...")
        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
        sources = [
            prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
            for example in list_data_dict
        ]
        targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]

        logging.warning("Tokenizing inputs... This may take some time...")
        data_dict = preprocess(sources, targets, tokenizer)

        self.input_ids = data_dict["input_ids"]
        self.labels = data_dict["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])
155 |
156 |
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Input ids are padded with the tokenizer's pad token id, labels with
    IGNORE_INDEX, and the attention mask marks every non-pad input position.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        pad_id = self.tokenizer.pad_token_id
        id_seqs = [item["input_ids"] for item in instances]
        label_seqs = [item["labels"] for item in instances]

        padded_ids = torch.nn.utils.rnn.pad_sequence(id_seqs, batch_first=True, padding_value=pad_id)
        padded_labels = torch.nn.utils.rnn.pad_sequence(label_seqs, batch_first=True, padding_value=IGNORE_INDEX)
        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(pad_id),
        }
174 |
175 |
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Build the training dataset and collator as keyword arguments for `Trainer`.

    No evaluation dataset is produced; "eval_dataset" is always None.
    """
    dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_args.data_path)
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {"train_dataset": dataset, "eval_dataset": None, "data_collator": collator}
181 |
182 |
def train():
    """Parse command-line arguments, fine-tune the model and save the result.

    Steps: load model and tokenizer, register any missing special tokens
    (resizing the embeddings accordingly), build the dataset, run the
    `Trainer`, then save the trainer state and the model to `output_dir`.
    """
    arg_parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = arg_parser.parse_args_into_dataclasses()

    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=False,
    )

    # Only tokens the tokenizer does not already define receive a default.
    fallback_tokens = (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
        ("unk_token", DEFAULT_UNK_TOKEN),
    )
    special_tokens_dict = {
        name: default for name, default in fallback_tokens if getattr(tokenizer, name) is None
    }

    smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )

    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
224 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MoDS: Model-oriented Data Selection for Instruction Tuning
2 |
3 | This repo contains the codes of MoDS to select valuable instruction data from large-scale datasets for a given LLM.
4 |
5 | ### Introduction
6 |
7 | Instruction tuning has become the de facto method to equip large language models (LLMs) with the ability to follow user instructions. Usually, hundreds of thousands or millions of instruction-following pairs are employed to fine-tune the foundation LLMs. Recently, some studies have shown that a small amount of high-quality instruction data is enough. However, how to select appropriate instruction data for a given LLM is still an open problem. To address this problem, in this paper we present a model-oriented data selection (MoDS) approach, which selects instruction data based on new criteria covering three aspects: quality, coverage and necessity. First, our approach utilizes a quality evaluation model to extract a high-quality subset from the original instruction dataset, and then applies an algorithm to further select from the high-quality subset a seed instruction dataset with good coverage. The seed dataset is used to fine-tune the foundation LLM to obtain an initial instruction-following LLM. Finally, we develop a necessity evaluation model to find the instructions on which the initial instruction-following LLM performs poorly, and consider them necessary instructions for further improving the LLM. In this way, we obtain a small, high-quality, broad-coverage and high-necessity subset of the original instruction datasets. Figure 1 shows the architecture of our approach.
8 |
9 |
10 |

11 |
12 |
13 |
14 | ### Environment Dependencies
15 |
16 | ```shell
17 | transformers==4.31.0
18 | json==2.0.9
19 | pytorch==2.0.1
20 | numpy==1.25.2
21 | sklearn==1.2.1
22 | ```
23 |
24 | ### Stage 1: Quality Evaluation
25 |
26 | The quality of instruction data plays a crucial role in the learning of instruction-following capabilities for LLMs. Therefore, to select effective instruction data, we first evaluate the qualities of instruction data and their corresponding response in the large-scale dataset, and then filter out the higher-quality data from it.
27 |
28 | - When assessing the quality of instruction data, we utilize the [reward-model-deberta-v3-large-v2](https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2) model, which is developed by OpenAssistant. This is a reward model based on the DeBERTa architecture and trained on four different types of human feedback data, endowing it with the abilities of QA model evaluation, reward scoring, and detecting potentially toxic responses via ranking. In this paper, we mainly adopt its reward scoring capability to generate a quality score for each (instruction, input, output) triplet in the large-scale dataset. Consequently, in this step we should download reward-model-deberta-v3-large-v2 and put it into the "models" folder.
29 |
30 | - For the json file from large-scale datasets, we can run the following script to process it and generate a new file with quality scores. "input.json" represents the file from large-scale datasets, while "quality-evaluation.json" represents the output results with quality scores. All files have the same format as [Alpaca](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json)
31 |
32 | ```shell
33 | cd quality-evaluation
34 | python quality-evaluation.py ./input.json ./quality-evaluation.json
35 | ```
36 |
37 | - After computing a quality score for each (instruction, input, output) triplet, we extract the high-quality instruction data with the following script. "high-quality-data.json" represents the high-quality data we extracted, and "0.0" is the threshold used to filter high-quality data.
38 |
39 | ```shell
40 | python high-quality-data-extraction.py ./quality-evaluation.json ./high-quality-data.json 0.0
41 | ```
42 |
43 |
44 |
45 |
46 | ### Stage 2: Diverse Data Selection for Seed Instructions
47 |
48 | After getting a high-quality instruction dataset, we further select data from it. In order to select diverse instruction data with the maximum coverage, we propose to use the K-Center greedy algorithm for data selection.
49 |
50 | - Step 1: In order to compute the sentence embeddings of different instructions, we first download the [bert-base-uncased](https://huggingface.co/bert-base-uncased) model and put it into the "models" folder.
51 |
52 | - Step 2: After downloading the bert-base-uncased model, we run the following script to select seed instructions from the high-quality dataset. "top_k" represents the number of seed instructions to be selected.
53 |
54 |
55 | ```shell
56 | cd diverse-data-selection
57 | python run.py ../quality-evaluation/high-quality-data.json ./seed-instructions.json top_k
58 | ```
59 |
60 | ### Stage 3: Augmented Data Selection
61 |
62 | For different LLMs, as the knowledge and capabilities they learned in the pre-training procedure are different, the instruction tuning data they require will be different as well. For one instruction, if the given LLM can generate a good response, it indicates that the given LLM already has the ability to handle this type of instruction, and this instruction data is not necessary for the fine-tuning of the LLM. Conversely, if the LLM cannot generate a good response, it suggests that the LLM cannot effectively process that type of instruction data, and the instruction data is very important and unique for the fine-tuning of the target LLM. In this stage, we extract these instructions with bad responses to build an augmented dataset for the given LLM.
63 |
64 | - Step 1: In order to find these missed instructions, we first fine-tune the pre-trained LLM with the seed instruction dataset, generating an initial LLM. Note that before the fine-tuning procedure, we should download the pre-trained [llama2](https://huggingface.co/meta-llama/Llama-2-7b-hf) model and put it into the "models" folder. We can run the following scripts to fine-tune the pre-trained LLM.
65 |
66 | ```
67 | cd train
68 | ./run.sh
69 | ```
70 |
71 | The hyperparameters of fine-tuning procedure are presented in the following:
72 |
73 | ```
74 | CUDA_VISIBLE_DEVICES=1,2,3,4 \
75 | torchrun --nproc_per_node=4 --master_port=4568 train.py \
76 | --model_name_or_path ../models/llama2-7b-hf/ \
77 | --data_path ../diverse-data-selection/seed-instructions.json \
78 | --bf16 True \
79 | --output_dir ../output/initial-model/ \
80 | --num_train_epochs 3 \
81 | --per_device_train_batch_size 8 \
82 | --per_device_eval_batch_size 8 \
83 | --gradient_accumulation_steps 16 \
84 | --evaluation_strategy "no" \
85 | --save_strategy "steps" \
86 | --save_steps 2000 \
87 | --save_total_limit 1 \
88 | --learning_rate 2e-5 \
89 | --weight_decay 0. \
90 | --warmup_ratio 0.03 \
91 | --lr_scheduler_type "cosine" \
92 | --logging_steps 1 \
93 | --fsdp "full_shard auto_wrap offload" \
94 | --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
95 | --tf32 True \
96 | --gradient_checkpointing True
97 | ```
98 |
99 | - Step 2: After getting the initial LLM, we use it to generate responses for all the high-quality instructions. The commands are as follows:
100 |
101 | ```
102 | cd ../inference
103 | ./instruction_filter.sh
104 | ```
105 |
106 | The parameters of the instruction_filter.sh file are as follows:
107 |
108 | ```
109 | CUDA_VISIBLE_DEVICES=5 \
110 | python instruction_filter.py --model_name_or_path ../output/initial-model --instruct_data ../quality-evaluation/high-quality-data.json --instruct_filtered ../necessity-evaluation/inference.json
111 | ```
112 |
113 | - Step 3: After getting the responses of all the high-quality instructions, we use the necessity evaluation module to compute a review score for each of them, and then extract the instructions with bad responses. "threshold" represents the value used to filter out instructions with bad responses.
114 |
115 | ```
116 | cd ../necessity-evaluation
117 | python necessity-evaluation.py ./inference.json ./inference-necessity-evaluation.json
118 | python necessity-instruction-extraction.py ./inference-necessity-evaluation.json ./necessity-instruction.json threshold
119 | ```
120 |
121 | ### Stage 4: Fine-tuning with Selected Instructions
122 |
123 | - Step 1: Merge the seed dataset and augmented dataset.
124 |
125 | ```shell
126 | cd necessity-evaluation
127 | python merge.py ../diverse-data-selection/seed-instructions.json ./necessity-instruction.json ./final-selected-dataset.json
128 | ```
129 |
130 | - Step 2: Fine-tune the raw LLM again, this time with the merged dataset.
131 |
132 | ```
133 | cd ../train
134 | ./run.sh
135 | ```
136 |
137 | The parameters of training are presented in the following:
138 |
139 | ```
140 | CUDA_VISIBLE_DEVICES=1,2,3,4 \
141 | torchrun --nproc_per_node=4 --master_port=4568 train.py \
142 | --model_name_or_path ../models/llama2-7b-hf/ \
143 | --data_path ../necessity-evaluation/final-selected-dataset.json \
144 | --bf16 True \
145 | --output_dir ../output/initial-model/ \
146 | --num_train_epochs 3 \
147 | --per_device_train_batch_size 8 \
148 | --per_device_eval_batch_size 8 \
149 | --gradient_accumulation_steps 16 \
150 | --evaluation_strategy "no" \
151 | --save_strategy "steps" \
152 | --save_steps 2000 \
153 | --save_total_limit 1 \
154 | --learning_rate 2e-5 \
155 | --weight_decay 0. \
156 | --warmup_ratio 0.03 \
157 | --lr_scheduler_type "cosine" \
158 | --logging_steps 1 \
159 | --fsdp "full_shard auto_wrap offload" \
160 | --fsdp_transformer_layer_cls_to_wrap 'LlamaDecoderLayer' \
161 | --tf32 True \
162 | --gradient_checkpointing True
163 | ```
164 |
165 | ## Citation
166 | Please cite the paper if you use the data or code in this repo.
167 |
168 | ```bibtex
169 | @misc{du2023mods,
170 | title={MoDS: Model-oriented Data Selection for Instruction Tuning},
171 | author={Qianlong Du and Chengqing Zong and Jiajun Zhang},
172 | year={2023},
173 | eprint={2311.15653},
174 | archivePrefix={arXiv},
175 | primaryClass={cs.CL}
176 | }
177 | ```
178 |
179 | Naturally, you should also cite the work of LLaMA2 and Alpaca.
--------------------------------------------------------------------------------
/inference/test.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "instruction": "Rate the given essay on a scale of 1 to 10.",
4 | "input": "This essay is discussing the benefits of exercising. Exercise improves the physical and mental health of individuals. Regular exercise can reduce the chance of developing certain diseases, such as heart disease, strokes and diabetes. It can also help to regulate mood and reduce stress. Exercising also helps to improve cognitive functioning, strengthens the bones and joints, and can lead to improved physical fitness and overall wellbeing.",
5 | "output": "I would rate this essay 9 out of 10. The essay clearly outlines the benefits of exercising and does so in a compact and organized manner. The evidence presented is relevant and supported. Plus, the author provides an understandable conclusion. My only critique is that the author should have provided a few examples to support some of their points."
6 | },
7 | {
8 | "instruction": "Generate a creative poem using natural language processing.",
9 | "input": "",
10 | "output": "The sun sets in glory\nThe sky ablaze in red\nThe light of day lingering\nFading, until nightfall ahead\n\nThe moonlight a lullaby\nGently singing a sweet serenade\nA somber peace upon the land\nAs the night moves in its shade."
11 | },
12 | {
13 | "instruction": "Given an English vocabulary word, define it in 1 to 2 sentences.",
14 | "input": "Abrogate",
15 | "output": "Abrogate is a verb that means to formally cancel or repeal a law, treaty, or agreement. It is commonly used to indicate an intentional abandonment or renunciation of rights, privileges, or obligations."
16 | },
17 | {
18 | "instruction": "Given a list of countries, look up their respective capitals.",
19 | "input": "Poland, Germany",
20 | "output": "The capital of Poland is Warsaw and the capital of Germany is Berlin."
21 | },
22 | {
23 | "instruction": "Generate a chart to summarize the following data",
24 | "input": "Books Sold \u2013 10,000\nBooks Returned \u2013 2,000\nBooks on Hand \u2013 8,000",
25 | "output": "Books Sold | Books Returned | Books on Hand\n------------ | ------------- | --------------\n10,000 | 2,000 | 8,000"
26 | },
27 | {
28 | "instruction": "What is the main benefit of having a healthy lifestyle?",
29 | "input": "",
30 | "output": "The primary benefit of having a healthy lifestyle is an increased sense of overall wellbeing. A healthy lifestyle can improve physical and mental health, boost energy levels, improve focus and concentration, reduce stress, boost immunity, and help to maintain a healthy weight. Additionally, leading a healthy lifestyle can also reduce the risk of developing chronic diseases, such as heart disease, stroke, diabetes, and cancer."
31 | },
32 | {
33 | "instruction": "Name three common methods of identity verification.",
34 | "input": "",
35 | "output": "Three common methods of identity verification are: passwords, security questions, and biometric authentication."
36 | },
37 | {
38 | "instruction": "List two benefits of cloud computing",
39 | "input": "",
40 | "output": "1. Cost efficiency: Cloud computing offers businesses a cost-effective way to store and access data. They don't need to invest in expensive hardware to increase their storage capacity.\n2. Scalability: Cloud computing makes it easier for businesses to scale up or down their data storage and computing power as needed. This makes it easier for businesses to respond quickly to changing market conditions and demands."
41 | },
42 | {
43 | "instruction": "Automate the following task: Every time someone sends an email to the customer service mailbox, send a reply thanking the customer for the message.",
44 | "input": "",
45 | "output": "Sorry, the task involves complex actions which must be take by a human. The GPT model cannot perform such actions."
46 | },
47 | {
48 | "instruction": "Summarize the main points of the given article in three sentences.",
49 | "input": "Article:\nClimate change has become a top global issue that is threatening the planet\u2019s fragile ecosystems and people\u2019s livelihoods. The impacts of global warming devastatingly are the most felt by vulnerable communities in the Global South. Everyone across the world is needed to take action to help stop climate change.",
50 | "output": "Climate change is having a profound effect on the planet and its people, particularly communities in the Global South. Everyone must take action to reduce the effects of climate change. This could include reducing carbon emissions, consuming less and using renewable energy sources."
51 | },
52 | {
53 | "instruction": "List 10 books that have won a Pulitzer Prize.",
54 | "input": "",
55 | "output": "- The Echo Maker by Richard Powers (2006)\n- The Hours by Michael Cunningham (1999)\n- Atonement by Ian McEwan (2002)\n- All the Light We Cannot See by Anthony Doerr (2015)\n- The Amazing Adventures of Kavalier and Clay by Michael Chabon (2001)\n- Rabbit is Rich by John Updike (1982)\n- The Sympathizer by Viet Thanh Nguyen (2016)\n- The Grapes of Wrath by John Steinbeck (1940)\n- Beloved by Toni Morrison (1988)\n- A Passage to India by E.M. Forster (1924)"
56 | },
57 | {
58 | "instruction": "In the following sentence, categorize the words under parts of speech:",
59 | "input": "Ants are marching in the garden.",
60 | "output": "Noun (Ants), Verb (are marching), Preposition (in), Noun (garden)."
61 | },
62 | {
63 | "instruction": "Translate \"Ich bin ein Berliner\" into English.",
64 | "input": "",
65 | "output": "I am a Berliner."
66 | },
67 | {
68 | "instruction": "Describe the view you will find at the beach.",
69 | "input": "",
70 | "output": "At the beach, you will find a vast expanse of sand, stretching as far as the eye can see. The shore is lined with invitingly warm, turquoise waters, dotted with sailboats and yachts. On the horizon, you can see the hazy silhouettes of other nearby islands and tall, snow-capped mountains in the distance."
71 | },
72 | {
73 | "instruction": "Compute the area of a triangle with the vertices (3,4), (5,6), (7,8)",
74 | "input": "",
75 | "output": "The area of the triangle is 10.5 units."
76 | },
77 | {
78 | "instruction": "Provide me 3 facts about penguins",
79 | "input": "",
80 | "output": "1. Penguins are flightless birds that live in the southern hemisphere. 2. Penguins do not have teeth, instead they have short, broad beaks. 3. There are more than 17 species of penguins, all found only in the Southern Hemisphere."
81 | },
82 | {
83 | "instruction": "Generate a teaser for the given film.",
84 | "input": "The Lord of the Rings.",
85 | "output": "From the creators of The Lord of the Rings comes a thrilling epic adventure like no other! Follow the journey of a daring fellowship battle against the forces of evil in Middle Earth. Discover the fate of a kingdom as they fight to restore the order between the races of men, elves, and dwarves. Join them in the greatest adventure yet - The Lord of the Rings!"
86 | },
87 | {
88 | "instruction": "Find the sentence which does not fit in to the context of the other sentences.",
89 | "input": "He decided to pack his new tent. He chose the lightest sleeping bag available. He packed his hiking boots. He selected a lunchbox. He ordered a new book.",
90 | "output": "He ordered a new book."
91 | },
92 | {
93 | "instruction": "Identify the type of the following sentence: \"Alice is really happy about the result.\"",
94 | "input": "",
95 | "output": "Declarative sentence."
96 | },
97 | {
98 | "instruction": "What is a trade deficit?",
99 | "input": "",
100 | "output": "A trade deficit is a situation in which a country imports more goods than it exports. It occurs when the value of the country's exports falls below the value of its imports. It is typically evaluated by looking at the total value of goods and services a country imports and exports over a certain period of time."
101 | },
102 | {
103 | "instruction": "Transport the following sentence from the present to the future tense:",
104 | "input": "I am going to the store.",
105 | "output": "I will go to the store."
106 | },
107 | {
108 | "instruction": "Create a list of skills an engineer needs to have.",
109 | "input": "",
110 | "output": "An engineer should possess a range of skills, including:\n- Technical Problem Solving \u2013 Ability to identify, analyze, and create solutions to technical problems.\n- Critical Thinking - Ability to think in a structured, analytical and creative manner, and to exercise judgment when making decisions.\n- Written and Verbal Communication - Ability to effectively communicate thoughts and ideas in written and in verbal formats.\n- Teamwork \u2013 Working collaboratively with others to solve complex problems.\n- Time Management - Ability to plan and prioritize work in order to meet deadlines.\n- Leadership \u2013 Ability to lead projects, teams and initiatives.\n- Research \u2013 Ability to conduct research and obtain necessary data to inform decisions.\n- Management and Planning \u2013 Ability to plan and manage projects and resources.\n- Analytical Skills \u2013 Ability to identify and analyze complex data, draw conclusions and make appropriate recommendations.\n- Computer Skills \u2013 Ability to use various computer programs such as Microsoft Office, MATLAB, AutoCAD, etc."
111 | },
112 | {
113 | "instruction": "Construct an example of a valid CSS rule.",
114 | "input": "",
115 | "output": "body {\n background-color: white;\n}"
116 | },
117 | {
118 | "instruction": "Explain what is machine learning and identify its application.",
119 | "input": "",
120 | "output": "Machine learning is a form of artificial intelligence that involves giving computers and machines the ability to learn from data, identify patterns and make decisions with minimal human intervention. It uses data-driven algorithms and computer models to learn from data and automate decisions to develop solutions. Machine learning has applications in many industries, including healthcare, finance, retail, education, and manufacturing. It is used for tasks like automated personalized recommendation systems, fraud and anomaly detection, natural language processing and image recognition."
121 | },
122 | {
123 | "instruction": "How do you go about finding the best deal when buying a car?",
124 | "input": "",
125 | "output": "When buying a car, it is important to research the different models, prices, and dealerships. Comparing prices and options online can help you find the best deal. You should also test drive the car to make sure it meets your needs and preferences. Additionally, it is beneficial to read reviews of the car or dealership before making a purchase."
126 | },
127 | {
128 | "instruction": "Sort the following vegetables by their colors.",
129 | "input": "Tomato, Carrot, Broccoli, Avocado",
130 | "output": "Red (Tomato), Orange (Carrot), Green (Broccoli, Avocado)"
131 | },
132 | {
133 | "instruction": "Explain why the following product has poor reviews.",
134 | "input": "Apple Airpods Pro",
135 | "output": "The Apple Airpods Pro have poor reviews because they have a lower sound quality than expected and are expensive compared to other products with similar features. Additionally, they have limited noise cancellation, have problems with connectivity, and have a shorter battery life than advertised."
136 | },
137 | {
138 | "instruction": "Reword the sentence by changing its tense to the past.",
139 | "input": "The driver carefully turns the corner.",
140 | "output": "The driver carefully turned the corner."
141 | },
142 | {
143 | "instruction": "Generate a suitable metaphor to convey the concept of 'breaking a barrier'",
144 | "input": "",
145 | "output": "Breaking a barrier is like knocking down a wall, shattering the limits that stand in the way of progress."
146 | },
147 | {
148 | "instruction": "Generate a short story about a person in survival mode.",
149 | "input": "",
150 | "output": "John had always been an adventurous spirit, and so when he heard about the remote wilderness region in Alaska, he jumped at the chance to explore it. He was not prepared, however, for the ferocity of the elements he encountered. Every day was a battle against sub-zero temperatures, relentless winds and snowdrifts, and an unforgiving landscape. But with his wits and determination, John managed to survive and eventually find himself in the safety of civilization. He had learned to appreciate the essentials of life, and he was sure he would never take little things like food and shelter for granted again."
151 | },
152 | {
153 | "instruction": "Make an analogy comparing two different concepts.",
154 | "input": "",
155 | "output": "Studying is like a journey - you learn new information as you go and eventually reach your destination of mastering the subject."
156 | },
157 | {
158 | "instruction": "Identify three ways in which the government can reduce the problem of air pollution.",
159 | "input": "",
160 | "output": "The government can reduce air pollution by encouraging carpooling and public transportation by providing incentives and implementing laws to reduce emissions from vehicles; regulating industrial pollutants by setting emissions standards for companies and enforcing those standards; and increasing the green spaces around cities through tree planting and land conservation, which can help to reduce ground level ozone and smog."
161 | },
162 | {
163 | "instruction": "Give a review of the movie \"The Martian\".",
164 | "input": "",
165 | "output": "The Martian is a thrilling and inspiring science fiction drama about an astronaut stranded on Mars who must find a way to survive until he can be rescued. The movie does an excellent job of showing the difficulty and ingenuity of the astronaut\u2019s struggle, making it a very exciting and intense story. The cast does a great job of embodying their characters and the spectacular CGI visuals create a convincing and detailed representation of Mars. The movie is very thought-provoking and will leave you feeling inspired and amazed by the human spirit. Highly recommended."
166 | },
167 | {
168 | "instruction": "Give two examples of how machine learning can be used at a business.",
169 | "input": "",
170 | "output": "1. Predictive analysis: Machine learning can be used to develop predictive models that allow businesses to anticipate customer purchases, identify sales trends, and target advertising campaigns. \n2. Customer segmentation: Machine learning can be used to sort customer data into segments based on customer behaviour, preferences, and purchasing histories so that businesses can tailor products or services to specific customer groups."
171 | },
172 | {
173 | "instruction": "Summarize the text into 25 words or fewer.",
174 | "input": "The global COVID-19 pandemic has been an unprecedented event that has had a wide-reaching impact on the entire world.",
175 | "output": "Global pandemic caused by COVID-19 has caused upheaval worldwide, with widespread implications."
176 | },
177 | {
178 | "instruction": "Replace the word \"diffferent\" with the correct word:",
179 | "input": "The group discussed the controversial topic in diffferent manners.",
180 | "output": "The group discussed the controversial topic in various manners."
181 | },
182 | {
183 | "instruction": "Create a questionnaire assessing safety in the workplace.",
184 | "input": "",
185 | "output": "1. Are employees aware of safety policies and regulations in the workplace?\n2. Are personal protective equipment (PPE) worn when necessary?\n3. Are tools, machines and equipment regularly maintained and inspected for safety?\n4. How do employees report any hazards, injuries or near misses?\n5. Are appropriate measures taken after reports of injuries, near misses or hazards?\n6.Are visitors alerted to possible hazards in the area?\n7. Are safety signs and signals installed in necessary places and are they clearly visible?\n8. Is safety training regularly provided to employees?\n9. Are emergency exits clearly marked and unobstructed?\n10. Are employees aware of the signs and symptoms of common occupational diseases?"
186 | },
187 | {
188 | "instruction": "Make a list of 3 ways to exercise with no equipment.",
189 | "input": "",
190 | "output": "1. Push-ups\n2. Running\n3. Squats"
191 | },
192 | {
193 | "instruction": "Describe how a Data Scientist can use a titanic dataset",
194 | "input": "",
195 | "output": "Data Scientists can use a titanic dataset to gain insights into passenger behavior and risk factors related to death on the voyage. By analyzing the data, they can identify characteristics and trends associated with mortality, such as gender, age, ticket class, and so on. They could also use the data to create predictive models that can accurately estimate the likelihood of death under certain conditions. Additionally, they can use data visualization techniques to better understand patterns within the data."
196 | },
197 | {
198 | "instruction": "Name 5 famous authors.",
199 | "input": "",
200 | "output": "William Shakespeare, J.K. Rowling, George Orwell, Stephen King, Ernest Hemingway."
201 | },
202 | {
203 | "instruction": "Who won the 2017 Oscars for Best Actor?",
204 | "input": "",
205 | "output": "The 2017 Academy Award for Best Actor was awarded to Gary Oldman for his performance in the film \"Darkest Hour\"."
206 | },
207 | {
208 | "instruction": "Compose a poem about winter using the given keywords.",
209 | "input": "snow, frosty, ice, chilly",
210 | "output": "A frosty winter night, \nThe sky glimmers with snow and ice.\nThe trees stand tall and chill,\nWrapped in an ethereal white crystal sill. \nThe cool air creates a mist,\nBlowing through gentle winds kissed. \nThe moon's reflection a sight to see,\nAs the stars twinkle above and below the deep sea. \nIt's time to tuck away tight,\nFor tonight there's a magical winter night."
211 | },
212 | {
213 | "instruction": "Write an informative introduction for a horror novel",
214 | "input": "",
215 | "output": "The dark, moonless night cloaked the small town of Strathfield in fear. For years, it had been a peaceful and tranquil corner of the world but now, no one knew what dangers lurked in the shadows. Strange lights had been seen in the sky, a peculiar mist had engulfed the town, and terrible cries were heard coming from the forest. It seemed like something sinister was lurking within the walls of Strathfield and it was up to a brave few to find out what it was."
216 | },
217 | {
218 | "instruction": "Given a list and a value, find the index of the given value in the list.",
219 | "input": "List: [3, 8, 10, 20], Value: 10",
220 | "output": "2"
221 | },
222 | {
223 | "instruction": "Provide an example of a polygon that has six sides.",
224 | "input": "",
225 | "output": "A hexagon is an example of a polygon with six sides."
226 | },
227 | {
228 | "instruction": "Take a text of any length and rephrase it as a haiku poem.",
229 | "input": "Life is too short to waste away.",
230 | "output": "Life fleeting always passes/Enjoy now for it slips away/Cherish moments worthwhile"
231 | },
232 | {
233 | "instruction": "Compose a 20 word description on Artificial Intelligence",
234 | "input": "",
235 | "output": "Artificial Intelligence (AI) is a powerful tool of computers and machines to analyze complex data and make decisions based on it. AI can be used to process large datasets, build models, and make predictions more efficiently. It has an impressive range of applications such as natural language processing, image recognition and more."
236 | },
237 | {
238 | "instruction": "Name two famous festivals celebrated in South America",
239 | "input": "",
240 | "output": "Two famous festivals celebrated in South America are Carnaval and Dia de los Muertos. Carnaval is an annual festival which is held in February before Ash Wednesday, and marks the beginning of Lent, a period of reflection in the Christian calendar. Dia de los Muertos is a festival, similar to Halloween, in which the living commemorate and celebrate the dead who, according to tradition, come back to the world of the living on that day. It is celebrated around November 1st."
241 | },
242 | {
243 | "instruction": "Explain the term \"self-fulfilling prophecy.\"",
244 | "input": "",
245 | "output": "A self-fulfilling prophecy is an expectation that comes true because it causes people to act as if it is true. It can be positive or negative and it occurs when a person\u2019s expectations influence their behavior, which in turn influences the outcome of the situation. For example, if a person believes that they are not capable of success, then they may behave in a way that leads to failure, thereby fulfilling the prophecy that was created by their initial belief."
246 | },
247 | {
248 | "instruction": "Generate a jingle that conveys the idea of safety.",
249 | "input": "",
250 | "output": "\"Safety, a priority, it's time to be wise\nAt home, work, and play, keep your guard and your eyes\nWash your hands and wear a mask, keep social distance as a task\nSafety, a priority, you'll be alright \nIt's just a matter of time, until this pandemic will end in sight!\""
251 | },
252 | {
253 | "instruction": "Given two sentences, produce a unified sentence by proper coordination and subordination",
254 | "input": "Alice makes coffee. Bob drinks tea.",
255 | "output": "Alice makes coffee while Bob drinks tea."
256 | },
257 | {
258 | "instruction": "Given a sentence, change the tense of the sentence.",
259 | "input": "She watched the movie.",
260 | "output": "She had watched the movie."
261 | },
262 | {
263 | "instruction": "Given a list of three ingredients and the name of a dish, edit the dish to include all the given ingredients.",
264 | "input": "Ingredients: garlic, mushrooms, tomatoes\nDish: Spaghetti Carbonara",
265 | "output": "Spaghetti Carbonara with Garlic, Mushrooms, and Tomatoes"
266 | },
267 | {
268 | "instruction": "Generate two questions about artificial intelligence for a quiz.",
269 | "input": "",
270 | "output": "Q1: What is the purpose of artificial intelligence?\nA1: The purpose of artificial intelligence is to solve complex problems, automate tasks, and make decisions in a way that mimics human intelligence. \n\nQ2: What are some popular machine learning algorithms?\nA2: Some popular machine learning algorithms include Support Vector Machines (SVMs), Decision Trees, Random Forests, k-Nearest Neighbor (kNN), Naive Bayes, and Neural Networks."
271 | },
272 | {
273 | "instruction": "Name three parts of a computer.",
274 | "input": "",
275 | "output": "The three main components of a computer are the central processing unit (CPU), the main memory (RAM), and the storage device (hard drive, SSD, or optical disc)."
276 | },
277 | {
278 | "instruction": "Construct a thesis statement for an essay about a rise in cybercrimes.",
279 | "input": "",
280 | "output": "The rise in cybercrimes is a major global concern, as it poses a grave threat to personal privacy, financial security, and national security."
281 | },
282 | {
283 | "instruction": "Name one type of food eaten in Mexico.",
284 | "input": "",
285 | "output": "Taco is a popular type of food in Mexico. It is a folded, lightly fried corn or wheat tortilla filled with any variety of ingredients. The fillings usually include meats such as beef, chicken, or pork, as well as beans, cheese, and vegetables. Salsa, avocado, sour cream, guacamole, lettuce, and onion can also be added for additional flavor."
286 | },
287 | {
288 | "instruction": "Given a table of data, identify the cause of the variation between the two variables.",
289 | "input": "[Attached table]",
290 | "output": "The variation between the two variables is most likely caused by a difference in their respective prices."
291 | },
292 | {
293 | "instruction": "Capitalize the first letter in each word of the sentence.",
294 | "input": "there is no reason for it to be this way",
295 | "output": "There Is No Reason For It To Be This Way"
296 | },
297 | {
298 | "instruction": "Given a programming language, generate a \"Hello World\" program.",
299 | "input": "Java",
300 | "output": "//This Java program prints \"Hello World\"\npublic class HelloWorld {\n public static void main(String[] args) {\n System.out.println(\"Hello World\");\n }\n}"
301 | },
302 | {
303 | "instruction": "Create a poem that expresses loneliness.",
304 | "input": "",
305 | "output": "I feel so alone in this world,\nNo one to catch me when I fall,\nSometimes I just want to curl up and disappear,\nAway from this bleakness that's everywhere.\n\nNo light to help me find my way,\nThe darkness just keeps growing day by day,\nI want to feel loved, I want to feel whole,\nBut no one knows how much I hurt in my soul.\n\nAll I can do is keep pushing on,\nHoping that one day I'll belong,\nIt's not easy but I must persist,\nAnd never forget that my worth is not a myth."
306 | },
307 | {
308 | "instruction": "Rewrite the following general statement into an assertion statement.",
309 | "input": "The internet has had a huge impact on our lives.",
310 | "output": "The internet has had a profound influence on our everyday lives."
311 | },
312 | {
313 | "instruction": "Write the time in a different format.",
314 | "input": "7:45am",
315 | "output": "07:45"
316 | },
317 | {
318 | "instruction": "What are the positives and negatives of online communication?",
319 | "input": "",
320 | "output": "The positives of online communication include bridging geographical barriers, facilitating communication between people around the world, allowing people to communicate quicker and more efficiently, and accessing more diverse perspectives. On the downside, online communication can lead to feelings of loneliness, distortion of authentic relationships, and issues of online privacy."
321 | },
322 | {
323 | "instruction": "Give an example of how someone could save money.",
324 | "input": "",
325 | "output": "One easy way to save money is to reduce the amount of unnecessary expenses. For example, instead of eating out every day, one could pack their lunch and bring it with them to work. Another way to save money is to increase income. This can be done by finding a second job or taking on freelance gigs. Lastly, one could also save by investing, either in the stock market or another financial instrument."
326 | },
327 | {
328 | "instruction": "What are the three primary goals of content marketing?",
329 | "input": "",
330 | "output": "The three primary goals of content marketing are to attract an audience, to educate the audience and to encourage them to take action. Content marketing is used to build trust and relationships with customers, to engage an audience and to drive conversions. Content can be used to provide helpful information, to demonstrate expert knowledge, to drive leads and sales, and to encourage customer loyalty."
331 | },
332 | {
333 | "instruction": "Propose a plan to reduce air pollution in urban areas.",
334 | "input": "",
335 | "output": "One strategy for reducing air pollution in urban areas is to transition away from fossil fuels and move toward cleaner sources of energy. This can be done by increasing investments in renewable energy sources, such as solar, wind, and hydro power. Additionally, cities can replace fossil fuel-powered vehicles with electric vehicles, or create incentives for residents to use public transportation. Cities can also create more green spaces, as trees and plants help to absorb and filter pollutants. Lastly, strict regulations and policies can be implemented to ensure that businesses are held accountable and comply with air quality standards."
336 | },
337 | {
338 | "instruction": "What is the main hypothesis in the following research paper?",
339 | "input": "This paper aims to investigate the role of the internet in information access and knowledge creation among teenagers in the US.",
340 | "output": "The main hypothesis of this research paper is that the internet plays an important role in increasing access to information and knowledge creation among teenagers in the US."
341 | },
342 | {
343 | "instruction": "What would be the most appropriate title for this article?",
344 | "input": "This article discusses how social media has become an important factor in business marketing and how businesses can reach a wider audience through effective usage of various tools and platforms.",
345 | "output": "\"Leveraging the Power of Social Media for Business Growth\""
346 | },
347 | {
348 | "instruction": "Name three trends in the given field.",
349 | "input": "AI",
350 | "output": "Three trends in AI are: \n1. Automation \u2013 AI is being used to automate many processes, from production lines in factories to customer service tasks in call centers.\n2. Natural language processing \u2013 AI is being used to interpret and respond to natural language, allowing computers to understand human speech.\n3. Personalization \u2013 AI is being used to personalize experiences, such as suggesting products or services based on user data."
351 | },
352 | {
353 | "instruction": "How do the jobs of scientists and engineers differ?",
354 | "input": "",
355 | "output": "Scientists and engineers serve different roles in the development of new technologies and solutions. Scientists conduct research and gather data to understand the physical and biological world, while engineers use this data to develop practical applications and systems. Scientists focus on exploring what is possible, whereas engineers focus on creating tangible products that meet specific requirements. Additionally, scientists typically work in a laboratory setting, while engineers typically work in industry or product design."
356 | },
357 | {
358 | "instruction": "Classify the following sentence as either a metaphor or hyperbole.",
359 | "input": "The room was so hot it felt like a sauna.",
360 | "output": "Metaphor."
361 | },
362 | {
363 | "instruction": "Generate an example of a task automation process involving natural language processing.",
364 | "input": "",
365 | "output": "An example of a task automation process involving natural language processing is automatic text summarization. Automatic text summarization is the process of condensing long pieces of text into shorter summaries by using natural language processing algorithms. It works by extracting key words, identifying important phrases, and selecting sentences that are relevant to the topic of the text. The result is a condensed version of the original text that captures the main points and is easier to understand."
366 | },
367 | {
368 | "instruction": "Determine the differences between a tortoise and a turtle",
369 | "input": "",
370 | "output": "The main difference between a tortoise and a turtle is that a tortoise is a land-dwelling animal, while a turtle can live in either water or on land. Tortoises have large, sturdy feet that are designed for walking on land, while turtles have flippers that make them better adapted for swimming in the water. Additionally, the shells of turtles are generally more streamlined, while tortoises have more dome-like shells."
371 | },
372 | {
373 | "instruction": "Generate a plausible sentence that reflects a certain feeling",
374 | "input": "",
375 | "output": "She felt a wave of peacefulness wash over her as she watched the clouds drift gently across the sky."
376 | },
377 | {
378 | "instruction": "Find the current temperature in Tokyo",
379 | "input": "",
380 | "output": "Currently, the temperature in Tokyo is around 18\u00b0C (64\u00b0F)."
381 | },
382 | {
383 | "instruction": "Given a list of task instructions, what type of task is each instruction?",
384 | "input": "What is the definition of \"computer\"; Remove all duplicate words from the given sentence",
385 | "output": "The first task is a classification task and the second task is an editing task."
386 | },
387 | {
388 | "instruction": "Generate a list of 10 items to place in an emergency kit.",
389 | "input": "",
390 | "output": "1. A first aid kit\n2. Flashlight and extra batteries\n3. Non-perishable food items \n4. Water sources \n5. Emergency blankets \n6. Reusable water bottle \n7. Whistle\n8. Rain ponchos\n9. Portable radio\n10. Maps and compass"
391 | },
392 | {
393 | "instruction": "Create an original list of questions for an interview with a famous person. Output the list.",
394 | "input": "No Input",
395 | "output": "Here is a list of questions for an interview with a famous person: \n1. What motivated you to pursue your goals in your career?\n2. What role does creativity play in your life and work?\n3. What challenges have you had to face along the way?\n4. What advice would you give to young people looking to follow in your footsteps?\n5. Are there any initiatives or causes you are passionate about?\n6. Do you have any inspirational quotes or words of advice to share?\n7. What projects are you currently working on and what are your future plans?\n8. What have you learned from your experiences that you would like to pass on to others?"
396 | },
397 | {
398 | "instruction": "Explain to me how a computer virus can be contracted.",
399 | "input": "",
400 | "output": "Computer viruses can be contracted in various ways. Common methods include downloading files from untrustworthy websites, opening email attachments from unknown sources, and visiting websites that contain malicious code. Additionally, sharing unsecured computer systems with others can allow for the virus to spread from one system to another."
401 | },
402 | {
403 | "instruction": "Describe a time you used your problem solving skills to solve an issue.",
404 | "input": "",
405 | "output": "During my summer internship, I encountered a problem with the project I was working on. I identified the root cause of the problem and developed a creative solution that involved utilizing multiple resources to find the best solution. I was able to complete the project on time and it was a success."
406 | },
407 | {
408 | "instruction": "Generate sentences that use the idiom \u2018once in a blue moon\u2019.",
409 | "input": "",
410 | "output": "He only comes home once in a blue moon.\nShe comes out of her shell once in a blue moon. \nThe train only passes by here once in a blue moon. \nWe get a chance to do this activity once in a blue moon."
411 | },
412 | {
413 | "instruction": "Classify news articles by their topics.",
414 | "input": "\"New Vaccine Shows Promising Results in Early Trials\"",
415 | "output": "Health and Medicine"
416 | },
417 | {
418 | "instruction": "Make a list of 10 ideas for starting an online business.",
419 | "input": "",
420 | "output": "Here are 10 ideas to start an online business:\n1. Sell handmade crafts and art. \n2. Become a virtual assistant. \n3. Offer freelance services such as writing, website design, social media management, or programming. \n4. Start an ecommerce store. \n5. Become an online tutor. \n6. Create a subscription service. \n7. Start an online consulting business. \n8. Start an online coaching program. \n9. Sell digital products such as ebooks and software. \n10. Launch an affiliate program to generate revenue from other businesses."
421 | },
422 | {
423 | "instruction": "Identify the incorrect statement: The Earth revolves around the moon. The moon revolves around the Sun. The Sun revolves around the Earth.",
424 | "input": "",
425 | "output": "The Earth revolves around the moon is the incorrect statement."
426 | },
427 | {
428 | "instruction": "What are some ways to be more efficient?",
429 | "input": "",
430 | "output": "1. Set deadlines.\n2. Use tools to automate daily tasks.\n3. Define priorities and focus on the most important tasks.\n4. Set achievable goals and review progress.\n5. Break tasks down into smaller chunks.\n6. Streamline processes and eliminate redundancies.\n7. Schedule breaks throughout the day and stay organized."
431 | },
432 | {
433 | "instruction": "Identify the political party of the given Member of Parliament.",
434 | "input": "Jim Murphy",
435 | "output": "Jim Murphy is a Labour Party MP in the United Kingdom. He represented the constituency of East Renfrewshire from 2005 to 2015 in the House of Commons."
436 | },
437 | {
438 | "instruction": "Replace the underlined word with a suitable adjective.",
439 | "input": "The chair was strong.",
440 | "output": "The chair was sturdy."
441 | },
442 | {
443 | "instruction": "Design a graphic t-shirt.",
444 | "input": "A funny pun like: \"Tuckered Out\".",
445 | "output": "A t-shirt with a graphic of a turtle wearing a backpack with a sign saying \"Tuckered Out\". The turtle is surrounded by the phrase \"Sometimes you just need a break\" in a creative font. Underneath it, the words \"This is why I always carry my own shell\" appear in a fun font. The colors for the shirt could be a light grey or khaki for the turtle and the backpack and a bright blue for the text."
446 | },
447 | {
448 | "instruction": "Compare and contrast the roles of nature versus nurture in human behavior.",
449 | "input": "",
450 | "output": "Nature versus nurture is the debate between whether human behavior is determined by inherited genes and traits or if it is shaped by environment and experiences. Nature-focused theorists believe that biological factors such as genetics and hormones are the primary influences on behavior, while nurture-focused theorists believe that behavior is determined by environmental influences such as parental rearing and social interaction. While it is clear that both nature and nurture play a role in determining human behavior, the exact balance and contribution of each is still an area of discussion and debate."
451 | },
452 | {
453 | "instruction": "Describe a common behavior that is indicative of trust.",
454 | "input": "",
455 | "output": "One common behavior that is indicative of trust is communication. When people trust each other, they can express their opinions and ideas openly in conversations. When people share their experiences, thoughts and feelings it fosters understanding and builds trust. People who trust each other also feel comfortable disclosing personal information and showing vulnerability, which can help to deepen their connection."
456 | },
457 | {
458 | "instruction": "Select the most suitable word to describe the following definition.\n\"A large group of people, animals, or things that are all similar in another way\"",
459 | "input": "",
460 | "output": "Herd."
461 | },
462 | {
463 | "instruction": "List five alternative sources of energy",
464 | "input": "",
465 | "output": "1. Solar energy\n2. Wind energy\n3. Hydropower\n4. Geothermal energy\n5. Biomass energy"
466 | },
467 | {
468 | "instruction": "Calculate the value of X in the equation 8X+6=14",
469 | "input": "",
470 | "output": "X = 1"
471 | },
472 | {
473 | "instruction": "Write an argument in favor of gun control laws.",
474 | "input": "",
475 | "output": "Gun control laws are necessary to protect the safety of citizens in the United States. Gun violence continues to plague our nation, and the only way to reduce it is to enact stricter regulations on how guns are obtained, used, and stored. Gun control laws can save lives by preventing criminals and people with mental illness from accessing firearms, reducing the number of mass shootings, and limiting access to firearms for potential suicide victims. Further, gun control laws can prevent unintentional shootings and reduce the risk of gun-related violence."
476 | },
477 | {
478 | "instruction": "What techniques help an individual lower their stress?",
479 | "input": "",
480 | "output": "Techniques that can help an individual lower their stress include setting aside time for relaxation, getting enough sleep, exercising regularly, eating healthy meals, engaging in positive self-talk, and engaging in mindfulness activities such as yoga or meditation."
481 | },
482 | {
483 | "instruction": "Arrange the given pieces of information into a logical sequence.",
484 | "input": "i. Verify the user's identity\nii. Process the payment\niii. Offer customer support\niv. Display order confirmation",
485 | "output": "i. Verify the user's identity\nii. Process the payment\niii. Display order confirmation\niv. Offer customer support"
486 | },
487 | {
488 | "instruction": "Take the following sentence and make it a description of a person",
489 | "input": "He studies physics",
490 | "output": "He is a physics student who is passionate about understanding the laws of nature."
491 | },
492 | {
493 | "instruction": "Change the given sentence so that it contains a negative term.",
494 | "input": "The view was magnificent.",
495 | "output": "The view was not unimpressive."
496 | },
497 | {
498 | "instruction": "Delete any words in the following sentence that are unnecessary.",
499 | "input": "This picture of an elephant is absolutely breathtaking.",
500 | "output": "This picture of an elephant is breathtaking."
501 | }
502 | ]
--------------------------------------------------------------------------------
/train/json4train/example.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "instruction": "Rate the given essay on a scale of 1 to 10.",
4 | "input": "This essay is discussing the benefits of exercising. Exercise improves the physical and mental health of individuals. Regular exercise can reduce the chance of developing certain diseases, such as heart disease, strokes and diabetes. It can also help to regulate mood and reduce stress. Exercising also helps to improve cognitive functioning, strengthens the bones and joints, and can lead to improved physical fitness and overall wellbeing.",
5 | "output": "I would rate this essay 9 out of 10. The essay clearly outlines the benefits of exercising and does so in a compact and organized manner. The evidence presented is relevant and supported. Plus, the author provides an understandable conclusion. My only critique is that the author should have provided a few examples to support some of their points."
6 | },
7 | {
8 | "instruction": "Generate a creative poem using natural language processing.",
9 | "input": "",
10 | "output": "The sun sets in glory\nThe sky ablaze in red\nThe light of day lingering\nFading, until nightfall ahead\n\nThe moonlight a lullaby\nGently singing a sweet serenade\nA somber peace upon the land\nAs the night moves in its shade."
11 | },
12 | {
13 | "instruction": "Given an English vocabulary word, define it in 1 to 2 sentences.",
14 | "input": "Abrogate",
15 | "output": "Abrogate is a verb that means to formally cancel or repeal a law, treaty, or agreement. It is commonly used to indicate an intentional abandonment or renunciation of rights, privileges, or obligations."
16 | },
17 | {
18 | "instruction": "Given a list of countries, look up their respective capitals.",
19 | "input": "Poland, Germany",
20 | "output": "The capital of Poland is Warsaw and the capital of Germany is Berlin."
21 | },
22 | {
23 | "instruction": "Generate a chart to summarize the following data",
24 | "input": "Books Sold \u2013 10,000\nBooks Returned \u2013 2,000\nBooks on Hand \u2013 8,000",
25 | "output": "Books Sold | Books Returned | Books on Hand\n------------ | ------------- | --------------\n10,000 | 2,000 | 8,000"
26 | },
27 | {
28 | "instruction": "What is the main benefit of having a healthy lifestyle?",
29 | "input": "",
30 | "output": "The primary benefit of having a healthy lifestyle is an increased sense of overall wellbeing. A healthy lifestyle can improve physical and mental health, boost energy levels, improve focus and concentration, reduce stress, boost immunity, and help to maintain a healthy weight. Additionally, leading a healthy lifestyle can also reduce the risk of developing chronic diseases, such as heart disease, stroke, diabetes, and cancer."
31 | },
32 | {
33 | "instruction": "Name three common methods of identity verification.",
34 | "input": "",
35 | "output": "Three common methods of identity verification are: passwords, security questions, and biometric authentication."
36 | },
37 | {
38 | "instruction": "List two benefits of cloud computing",
39 | "input": "",
40 | "output": "1. Cost efficiency: Cloud computing offers businesses a cost-effective way to store and access data. They don't need to invest in expensive hardware to increase their storage capacity.\n2. Scalability: Cloud computing makes it easier for businesses to scale up or down their data storage and computing power as needed. This makes it easier for businesses to respond quickly to changing market conditions and demands."
41 | },
42 | {
43 | "instruction": "Automate the following task: Every time someone sends an email to the customer service mailbox, send a reply thanking the customer for the message.",
44 | "input": "",
45 | "output": "Sorry, the task involves complex actions which must be taken by a human. The GPT model cannot perform such actions."
46 | },
47 | {
48 | "instruction": "Summarize the main points of the given article in three sentences.",
49 | "input": "Article:\nClimate change has become a top global issue that is threatening the planet\u2019s fragile ecosystems and people\u2019s livelihoods. The impacts of global warming devastatingly are the most felt by vulnerable communities in the Global South. Everyone across the world is needed to take action to help stop climate change.",
50 | "output": "Climate change is having a profound effect on the planet and its people, particularly communities in the Global South. Everyone must take action to reduce the effects of climate change. This could include reducing carbon emissions, consuming less and using renewable energy sources."
51 | },
52 | {
53 | "instruction": "List 10 books that have won a Pulitzer Prize.",
54 | "input": "",
55 | "output": "- The Echo Maker by Richard Powers (2006)\n- The Hours by Michael Cunningham (1999)\n- Atonement by Ian McEwan (2002)\n- All the Light We Cannot See by Anthony Doerr (2015)\n- The Amazing Adventures of Kavalier and Clay by Michael Chabon (2001)\n- Rabbit is Rich by John Updike (1982)\n- The Sympathizer by Viet Thanh Nguyen (2016)\n- The Grapes of Wrath by John Steinbeck (1940)\n- Beloved by Toni Morrison (1988)\n- A Passage to India by E.M. Forster (1924)"
56 | },
57 | {
58 | "instruction": "In the following sentence, categorize the words under parts of speech:",
59 | "input": "Ants are marching in the garden.",
60 | "output": "Noun (Ants), Verb (are marching), Preposition (in), Noun (garden)."
61 | },
62 | {
63 | "instruction": "Translate \"Ich bin ein Berliner\" into English.",
64 | "input": "",
65 | "output": "I am a Berliner."
66 | },
67 | {
68 | "instruction": "Describe the view you will find at the beach.",
69 | "input": "",
70 | "output": "At the beach, you will find a vast expanse of sand, stretching as far as the eye can see. The shore is lined with invitingly warm, turquoise waters, dotted with sailboats and yachts. On the horizon, you can see the hazy silhouettes of other nearby islands and tall, snow-capped mountains in the distance."
71 | },
72 | {
73 | "instruction": "Compute the area of a triangle with the vertices (3,4), (5,6), (7,8)",
74 | "input": "",
75 | "output": "The area of the triangle is 0 square units, because the three vertices are collinear (they all lie on the line y = x + 1)."
76 | },
77 | {
78 | "instruction": "Provide me 3 facts about penguins",
79 | "input": "",
80 | "output": "1. Penguins are flightless birds that live in the southern hemisphere. 2. Penguins do not have teeth, instead they have short, broad beaks. 3. There are more than 17 species of penguins, all found only in the Southern Hemisphere."
81 | },
82 | {
83 | "instruction": "Generate a teaser for the given film.",
84 | "input": "The Lord of the Rings.",
85 | "output": "From the creators of The Lord of the Rings comes a thrilling epic adventure like no other! Follow the journey of a daring fellowship as they battle against the forces of evil in Middle Earth. Discover the fate of a kingdom as they fight to restore the order between the races of men, elves, and dwarves. Join them in the greatest adventure yet - The Lord of the Rings!"
86 | },
87 | {
88 | "instruction": "Find the sentence which does not fit in to the context of the other sentences.",
89 | "input": "He decided to pack his new tent. He chose the lightest sleeping bag available. He packed his hiking boots. He selected a lunchbox. He ordered a new book.",
90 | "output": "He ordered a new book."
91 | },
92 | {
93 | "instruction": "Identify the type of the following sentence: \"Alice is really happy about the result.\"",
94 | "input": "",
95 | "output": "Declarative sentence."
96 | },
97 | {
98 | "instruction": "What is a trade deficit?",
99 | "input": "",
100 | "output": "A trade deficit is a situation in which a country imports more goods than it exports. It occurs when the value of the country's exports falls below the value of its imports. It is typically evaluated by looking at the total value of goods and services a country imports and exports over a certain period of time."
101 | },
102 | {
103 | "instruction": "Transport the following sentence from the present to the future tense:",
104 | "input": "I am going to the store.",
105 | "output": "I will go to the store."
106 | },
107 | {
108 | "instruction": "Create a list of skills an engineer needs to have.",
109 | "input": "",
110 | "output": "An engineer should possess a range of skills, including:\n- Technical Problem Solving \u2013 Ability to identify, analyze, and create solutions to technical problems.\n- Critical Thinking - Ability to think in a structured, analytical and creative manner, and to exercise judgment when making decisions.\n- Written and Verbal Communication - Ability to effectively communicate thoughts and ideas in written and in verbal formats.\n- Teamwork \u2013 Working collaboratively with others to solve complex problems.\n- Time Management - Ability to plan and prioritize work in order to meet deadlines.\n- Leadership \u2013 Ability to lead projects, teams and initiatives.\n- Research \u2013 Ability to conduct research and obtain necessary data to inform decisions.\n- Management and Planning \u2013 Ability to plan and manage projects and resources.\n- Analytical Skills \u2013 Ability to identify and analyze complex data, draw conclusions and make appropriate recommendations.\n- Computer Skills \u2013 Ability to use various computer programs such as Microsoft Office, MATLAB, AutoCAD, etc."
111 | },
112 | {
113 | "instruction": "Construct an example of a valid CSS rule.",
114 | "input": "",
115 | "output": "body {\n background-color: white;\n}"
116 | },
117 | {
118 | "instruction": "Explain what is machine learning and identify its application.",
119 | "input": "",
120 | "output": "Machine learning is a form of artificial intelligence that involves giving computers and machines the ability to learn from data, identify patterns and make decisions with minimal human intervention. It uses data-driven algorithms and computer models to learn from data and automate decisions to develop solutions. Machine learning has applications in many industries, including healthcare, finance, retail, education, and manufacturing. It is used for tasks like automated personalized recommendation systems, fraud and anomaly detection, natural language processing and image recognition."
121 | },
122 | {
123 | "instruction": "How do you go about finding the best deal when buying a car?",
124 | "input": "",
125 | "output": "When buying a car, it is important to research the different models, prices, and dealerships. Comparing prices and options online can help you find the best deal. You should also test drive the car to make sure it meets your needs and preferences. Additionally, it is beneficial to read reviews of the car or dealership before making a purchase."
126 | },
127 | {
128 | "instruction": "Sort the following vegetables by their colors.",
129 | "input": "Tomato, Carrot, Broccoli, Avocado",
130 | "output": "Red (Tomato), Orange (Carrot), Green (Broccoli, Avocado)"
131 | },
132 | {
133 | "instruction": "Explain why the following product has poor reviews.",
134 | "input": "Apple Airpods Pro",
135 | "output": "The Apple Airpods Pro have poor reviews because they have a lower sound quality than expected and are expensive compared to other products with similar features. Additionally, they have limited noise cancellation, have problems with connectivity, and have a shorter battery life than advertised."
136 | },
137 | {
138 | "instruction": "Reword the sentence by changing its tense to the past.",
139 | "input": "The driver carefully turns the corner.",
140 | "output": "The driver carefully turned the corner."
141 | },
142 | {
143 | "instruction": "Generate a suitable metaphor to convey the concept of 'breaking a barrier'",
144 | "input": "",
145 | "output": "Breaking a barrier is like knocking down a wall, shattering the limits that stand in the way of progress."
146 | },
147 | {
148 | "instruction": "Generate a short story about a person in survival mode.",
149 | "input": "",
150 | "output": "John had always been an adventurous spirit, and so when he heard about the remote wilderness region in Alaska, he jumped at the chance to explore it. He was not prepared, however, for the ferocity of the elements he encountered. Every day was a battle against sub-zero temperatures, relentless winds and snowdrifts, and an unforgiving landscape. But with his wits and determination, John managed to survive and eventually find himself in the safety of civilization. He had learned to appreciate the essentials of life, and he was sure he would never take little things like food and shelter for granted again."
151 | },
152 | {
153 | "instruction": "Make an analogy comparing two different concepts.",
154 | "input": "",
155 | "output": "Studying is like a journey - you learn new information as you go and eventually reach your destination of mastering the subject."
156 | },
157 | {
158 | "instruction": "Identify three ways in which the government can reduce the problem of air pollution.",
159 | "input": "",
160 | "output": "The government can reduce air pollution by encouraging carpooling and public transportation by providing incentives and implementing laws to reduce emissions from vehicles; regulating industrial pollutants by setting emissions standards for companies and enforcing those standards; and increasing the green spaces around cities through tree planting and land conservation, which can help to reduce ground level ozone and smog."
161 | },
162 | {
163 | "instruction": "Give a review of the movie \"The Martian\".",
164 | "input": "",
165 | "output": "The Martian is a thrilling and inspiring science fiction drama about an astronaut stranded on Mars who must find a way to survive until he can be rescued. The movie does an excellent job of showing the difficulty and ingenuity of the astronaut\u2019s struggle, making it a very exciting and intense story. The cast does a great job of embodying their characters and the spectacular CGI visuals create a convincing and detailed representation of Mars. The movie is very thought-provoking and will leave you feeling inspired and amazed by the human spirit. Highly recommended."
166 | },
167 | {
168 | "instruction": "Give two examples of how machine learning can be used at a business.",
169 | "input": "",
170 | "output": "1. Predictive analysis: Machine learning can be used to develop predictive models that allow businesses to anticipate customer purchases, identify sales trends, and target advertising campaigns. \n2. Customer segmentation: Machine learning can be used to sort customer data into segments based on customer behaviour, preferences, and purchasing histories so that businesses can tailor products or services to specific customer groups."
171 | },
172 | {
173 | "instruction": "Summarize the text into 25 words or fewer.",
174 | "input": "The global COVID-19 pandemic has been an unprecedented event that has had a wide-reaching impact on the entire world.",
175 | "output": "Global pandemic caused by COVID-19 has caused upheaval worldwide, with widespread implications."
176 | },
177 | {
178 | "instruction": "Replace the word \"diffferent\" with the correct word:",
179 | "input": "The group discussed the controversial topic in diffferent manners.",
180 | "output": "The group discussed the controversial topic in various manners."
181 | },
182 | {
183 | "instruction": "Create a questionnaire assessing safety in the workplace.",
184 | "input": "",
185 | "output": "1. Are employees aware of safety policies and regulations in the workplace?\n2. Are personal protective equipment (PPE) worn when necessary?\n3. Are tools, machines and equipment regularly maintained and inspected for safety?\n4. How do employees report any hazards, injuries or near misses?\n5. Are appropriate measures taken after reports of injuries, near misses or hazards?\n6.Are visitors alerted to possible hazards in the area?\n7. Are safety signs and signals installed in necessary places and are they clearly visible?\n8. Is safety training regularly provided to employees?\n9. Are emergency exits clearly marked and unobstructed?\n10. Are employees aware of the signs and symptoms of common occupational diseases?"
186 | },
187 | {
188 | "instruction": "Make a list of 3 ways to exercise with no equipment.",
189 | "input": "",
190 | "output": "1. Push-ups\n2. Running\n3. Squats"
191 | },
192 | {
193 | "instruction": "Describe how a Data Scientist can use a titanic dataset",
194 | "input": "",
195 | "output": "Data Scientists can use a titanic dataset to gain insights into passenger behavior and risk factors related to death on the voyage. By analyzing the data, they can identify characteristics and trends associated with mortality, such as gender, age, ticket class, and so on. They could also use the data to create predictive models that can accurately estimate the likelihood of death under certain conditions. Additionally, they can use data visualization techniques to better understand patterns within the data."
196 | },
197 | {
198 | "instruction": "Name 5 famous authors.",
199 | "input": "",
200 | "output": "William Shakespeare, J.K. Rowling, George Orwell, Stephen King, Ernest Hemingway."
201 | },
202 | {
203 | "instruction": "Who won the 2017 Oscars for Best Actor?",
204 | "input": "",
205 | "output": "The 2017 Academy Award for Best Actor was awarded to Gary Oldman for his performance in the film \"Darkest Hour\"."
206 | },
207 | {
208 | "instruction": "Compose a poem about winter using the given keywords.",
209 | "input": "snow, frosty, ice, chilly",
210 | "output": "A frosty winter night, \nThe sky glimmers with snow and ice.\nThe trees stand tall and chill,\nWrapped in an ethereal white crystal sill. \nThe cool air creates a mist,\nBlowing through gentle winds kissed. \nThe moon's reflection a sight to see,\nAs the stars twinkle above and below the deep sea. \nIt's time to tuck away tight,\nFor tonight there's a magical winter night."
211 | },
212 | {
213 | "instruction": "Write an informative introduction for a horror novel",
214 | "input": "",
215 | "output": "The dark, moonless night cloaked the small town of Strathfield in fear. For years, it had been a peaceful and tranquil corner of the world but now, no one knew what dangers lurked in the shadows. Strange lights had been seen in the sky, a peculiar mist had engulfed the town, and terrible cries were heard coming from the forest. It seemed like something sinister was lurking within the walls of Strathfield and it was up to a brave few to find out what it was."
216 | },
217 | {
218 | "instruction": "Given a list and a value, find the index of the given value in the list.",
219 | "input": "List: [3, 8, 10, 20], Value: 10",
220 | "output": "2"
221 | },
222 | {
223 | "instruction": "Provide an example of a polygon that has six sides.",
224 | "input": "",
225 | "output": "A hexagon is an example of a polygon with six sides."
226 | },
227 | {
228 | "instruction": "Take a text of any length and rephrase it as a haiku poem.",
229 | "input": "Life is too short to waste away.",
230 | "output": "Life fleeting always passes/Enjoy now for it slips away/Cherish moments worthwhile"
231 | },
232 | {
233 | "instruction": "Compose a 20 word description on Artificial Intelligence",
234 | "input": "",
235 | "output": "Artificial Intelligence (AI) is a powerful tool of computers and machines to analyze complex data and make decisions based on it. AI can be used to process large datasets, build models, and make predictions more efficiently. It has an impressive range of applications such as natural language processing, image recognition and more."
236 | },
237 | {
238 | "instruction": "Name two famous festivals celebrated in South America",
239 | "input": "",
240 | "output": "Two famous festivals celebrated in South America are Carnaval and Dia de los Muertos. Carnaval is an annual festival which is held in February before Ash Wednesday, and marks the beginning of Lent, a period of reflection in the Christian calendar. Dia de los Muertos is a festival, similar to Halloween, in which the living commemorate and celebrate the dead who, according to tradition, come back to the world of the living on that day. It is celebrated around November 1st."
241 | },
242 | {
243 | "instruction": "Explain the term \"self-fulfilling prophecy.\"",
244 | "input": "",
245 | "output": "A self-fulfilling prophecy is an expectation that comes true because it causes people to act as if it is true. It can be positive or negative and it occurs when a person\u2019s expectations influence their behavior, which in turn influences the outcome of the situation. For example, if a person believes that they are not capable of success, then they may behave in a way that leads to failure, thereby fulfilling the prophecy that was created by their initial belief."
246 | },
247 | {
248 | "instruction": "Generate a jingle that conveys the idea of safety.",
249 | "input": "",
250 | "output": "\"Safety, a priority, it's time to be wise\nAt home, work, and play, keep your guard and your eyes\nWash your hands and wear a mask, keep social distance as a task\nSafety, a priority, you'll be alright \nIt's just a matter of time, until this pandemic will end in sight!\""
251 | },
252 | {
253 | "instruction": "Given two sentences, produce a unified sentence by proper coordination and subordination",
254 | "input": "Alice makes coffee. Bob drinks tea.",
255 | "output": "Alice makes coffee while Bob drinks tea."
256 | },
257 | {
258 | "instruction": "Given a sentence, change the tense of the sentence.",
259 | "input": "She watched the movie.",
260 | "output": "She had watched the movie."
261 | },
262 | {
263 | "instruction": "Given a list of three ingredients and the name of a dish, edit the dish to include all the given ingredients.",
264 | "input": "Ingredients: garlic, mushrooms, tomatoes\nDish: Spaghetti Carbonara",
265 | "output": "Spaghetti Carbonara with Garlic, Mushrooms, and Tomatoes"
266 | },
267 | {
268 | "instruction": "Generate two questions about artificial intelligence for a quiz.",
269 | "input": "",
270 | "output": "Q1: What is the purpose of artificial intelligence?\nA1: The purpose of artificial intelligence is to solve complex problems, automate tasks, and make decisions in a way that mimics human intelligence. \n\nQ2: What are some popular machine learning algorithms?\nA2: Some popular machine learning algorithms include Support Vector Machines (SVMs), Decision Trees, Random Forests, k-Nearest Neighbor (kNN), Naive Bayes, and Neural Networks."
271 | },
272 | {
273 | "instruction": "Name three parts of a computer.",
274 | "input": "",
275 | "output": "The three main components of a computer are the central processing unit (CPU), the main memory (RAM), and the storage device (hard drive, SSD, or optical disc)."
276 | },
277 | {
278 | "instruction": "Construct a thesis statement for an essay about a rise in cybercrimes.",
279 | "input": "",
280 | "output": "The rise in cybercrimes is a major global concern, as it poses a grave threat to personal privacy, financial security, and national security."
281 | },
282 | {
283 | "instruction": "Name one type of food eaten in Mexico.",
284 | "input": "",
285 | "output": "Taco is a popular type of food in Mexico. It is a folded, lightly fried corn or wheat tortilla filled with any variety of ingredients. The fillings usually include meats such as beef, chicken, or pork, as well as beans, cheese, and vegetables. Salsa, avocado, sour cream, guacamole, lettuce, and onion can also be added for additional flavor."
286 | },
287 | {
288 | "instruction": "Given a table of data, identify the cause of the variation between the two variables.",
289 | "input": "[Attached table]",
290 | "output": "The variation between the two variables is most likely caused by a difference in their respective prices."
291 | },
292 | {
293 | "instruction": "Capitalize the first letter in each word of the sentence.",
294 | "input": "there is no reason for it to be this way",
295 | "output": "There Is No Reason For It To Be This Way"
296 | },
297 | {
298 | "instruction": "Given a programming language, generate a \"Hello World\" program.",
299 | "input": "Java",
300 | "output": "//This Java program prints \"Hello World\npublic class HelloWorld {\n public static void main(String[] args) {\n System.out.println(\"Hello World\");\n }\n}"
301 | },
302 | {
303 | "instruction": "Create a poem that expresses loneliness.",
304 | "input": "",
305 | "output": "I feel so alone in this world,\nNo one to catch me when I fall,\nSometimes I just want to curl up and disappear,\nAway from this bleakness that's everywhere.\n\nNo light to help me find my way,\nThe darkness just keeps growing day by day,\nI want to feel loved, I want to feel whole,\nBut no one knows how much I hurt in my soul.\n\nAll I can do is keep pushing on,\nHoping that one day I'll belong,\nIt's not easy but I must persist,\nAnd never forget that my worth is not a myth."
306 | },
307 | {
308 | "instruction": "Rewrite the following general statement into an assertion statement.",
309 | "input": "The internet has had a huge impact on our lives.",
310 | "output": "The internet has had a profound influence on our everyday lives."
311 | },
312 | {
313 | "instruction": "Write the time in a different format.",
314 | "input": "7:45am",
315 | "output": "07:45"
316 | },
317 | {
318 | "instruction": "What are the positives and negatives of online communication?",
319 | "input": "",
320 | "output": "The positives of online communication include bridging geographical barriers, facilitating communication between people around the world, allowing people to communicate quicker and more efficiently, and accessing more diverse perspectives. On the downside, online communication can lead to feelings of loneliness, distortion of authentic relationships, and issue of online privacy."
321 | },
322 | {
323 | "instruction": "Give an example of how someone could save money.",
324 | "input": "",
325 | "output": "One easy way to save money is to reduce the amount of unnecessary expenses. For example, instead of eating out every day, one could pack their lunch and bring it with them to work. Another way to save money is to increase income. This can be done by finding a second job or taking on freelance gigs. Lastly, one could also save by investing, either in the stock market or another financial instrument."
326 | },
327 | {
328 | "instruction": "What are the three primary goals of content marketing?",
329 | "input": "",
330 | "output": "The three primary goals of content marketing are to attract an audience, to educate the audience and to encourage them to take action. Content marketing is used to build trust and relationships with customers, to engage an audience and to drive conversions. Content can be used to provide helpful information, to demonstrate expert knowledge, to drive leads and sales, and to encourage customer loyalty."
331 | },
332 | {
333 | "instruction": "Propose a plan to reduce air pollution in urban areas.",
334 | "input": "",
335 | "output": "One strategy for reducing air pollution in urban areas is to transition away from fossil fuels and move toward cleaner sources of energy. This can be done by increasing investments in renewable energy sources, such as solar, wind, and hydro power. Additionally, cities can replace fossil fuel-powered vehicles with electric vehicles, or create incentives for residents to use public transportation. Cities can also create more green spaces, as trees and plants help to absorb and filter pollutants. Lastly, strict regulations and policies can be implemented to ensure that businesses are held accountable and comply with air quality standards."
336 | },
337 | {
338 | "instruction": "What is the main hypothesis in the following research paper?",
339 | "input": "This paper aims to investigate the role of the internet in information access and knowledge creation among teenagers in the US.",
340 | "output": "The main hypothesis of this research paper is that the internet plays an important role in increasing access to information and knowledge creation among teenagers in the US."
341 | },
342 | {
343 | "instruction": "What would be the most appropriate title for this article?",
344 | "input": "This article discusses how social media has become an important factor in business marketing and how businesses can reach a wider audience through effective usage of various tools and platforms.",
345 | "output": "\"Leveraging the Power of Social Media for Business Growth\""
346 | },
347 | {
348 | "instruction": "Name three trends in the given field.",
349 | "input": "AI",
350 | "output": "Three trends in AI are: \n1. Automation \u2013 AI is being used to automate many processes, from production lines in factories to customer service tasks in call centers.\n2. Natural language processing \u2013 AI is being used to interpret and respond to natural language, allowing computers to understand human speech.\n3. Personalization \u2013 AI is being used to personalize experiences, such as suggesting products or services based on user data."
351 | },
352 | {
353 | "instruction": "How do the jobs of scientists and engineers differ?",
354 | "input": "",
355 | "output": "Scientists and engineers serve different roles in the development of new technologies and solutions. Scientists conduct research and gather data to understand the physical and biological world, while engineers use this data to develop practical applications and systems. Scientists focus on exploring what is possible, whereas engineers focus on creating tangible products that meet specific requirements. Additionally, scientists typically work in a laboratory setting, while engineers typically work in industry or product design."
356 | },
357 | {
358 | "instruction": "Classify the following sentence as either a metaphor or hyperbole.",
359 | "input": "The room was so hot it felt like a sauna.",
360 | "output": "Metaphor."
361 | },
362 | {
363 | "instruction": "Generate an example of a task automation process involving natural language processing.",
364 | "input": "",
365 | "output": "An example of a task automation process involving natural language processing is automatic text summarization. Automatic text summarization is the process of condensing long pieces of text into shorter summaries by using natural language processing algorithms. It works by extracting key words, identifying important phrases, and selecting sentences that are relevant to the topic of the text. The result is a condensed version of the original text that captures the main points and is easier to understand."
366 | },
367 | {
368 | "instruction": "Determine the differences between a tortoise and a turtle",
369 | "input": "",
370 | "output": "The main difference between a tortoise and a turtle is that a tortoise is a land-dwelling animal, while a turtle can live in either water or on land. Tortoises have large, sturdy feet that are designed for walking on land, while turtles have flippers that make them better adapted for swimming in the water. Additionally, the shells of turtles are generally more streamlined, while tortoises have more dome-like shells."
371 | },
372 | {
373 | "instruction": "Generate a plausible sentence that reflects a certain feeling",
374 | "input": "",
375 | "output": "She felt a wave of peacefulness wash over her as she watched the clouds drift gently across the sky."
376 | },
377 | {
378 | "instruction": "Find the current temperature in Tokyo",
379 | "input": "",
380 | "output": "Currently, the temperature in Tokyo is around 18\u00b0C (64\u00b0F)."
381 | },
382 | {
383 | "instruction": "Given a list of task instructions, what type of task is each instruction?",
384 | "input": "What is the definition of \"computer\"; Remove all duplicate words from the given sentence",
385 | "output": "The first task is a classification task and the second task is an editing task."
386 | },
387 | {
388 | "instruction": "Generate a list of 10 items to place in an emergency kit.",
389 | "input": "",
390 | "output": "1. A first aid kit\n2. Flashlight and extra batteries\n3. Non-perishable food items \n4. Water sources \n5. Emergency blankets \n6. Reusable water bottle \n7. Whistle\n8. Rain ponchos\n9. Portable radio\n10. Maps and compass"
391 | },
392 | {
393 | "instruction": "Create an original list of questions for an interview with a famous person. Output the list.",
394 | "input": "No Input",
395 | "output": "Here is a list of questions for an interview with a famous person: \n1. What motivated you to pursue your goals in your career?\n2. What role does creativity play in your life and work?\n3. What challenges have you had to face along the way?\n4. What advice would you give to young people looking to follow in your footsteps?\n5. Are there any initiatives or causes you are passionate about?\n6. Do you have any inspirational quotes or words of advice to share?\n7. What projects are you currently working on and what are your future plans?\n8. What have you learned from your experiences that you would like to pass on to others?"
396 | },
397 | {
398 | "instruction": "Explain to me how a computer virus can be contracted.",
399 | "input": "",
400 | "output": "Computer viruses can be contracted in various ways. Common methods include downloading files from untrustworthy websites, opening email attachments from unknown sources, and visiting websites that contain malicious code. Additionally, sharing unsecured computer systems with others can allow for the virus to spread from one system to another."
401 | },
402 | {
403 | "instruction": "Describe a time you used your problem solving skills to solve an issue.",
404 | "input": "",
405 | "output": "During my summer internship, I encountered a problem with the project I was working on. I identified the root cause of the problem and developed a creative solution that involved utilizing multiple resources to find the best solution. I was able to complete the project on time and it was a success."
406 | },
407 | {
408 | "instruction": "Generate sentences that use the idiom \u2018once in a blue moon\u2019.",
409 | "input": "",
410 | "output": "He only comes home once in a blue moon.\nShe comes out of her shell once in a blue moon. \nThe train only passes by here once in a blue moon. \nWe get a chance to do this activity once in a blue moon."
411 | },
412 | {
413 | "instruction": "Classify news articles by their topics.",
414 | "input": "\"New Vaccine Shows Promising Results in Early Trials\"",
415 | "output": "Health and Medicine"
416 | },
417 | {
418 | "instruction": "Make a list of 10 ideas for starting an online business.",
419 | "input": "",
420 | "output": "Here are 10 ideas to start an online business:\n1. Sell handmade crafts and art. \n2. Become a virtual assistant. \n3. Offer freelance services such as writing, website design, social media management, or programming. \n4. Start an ecommerce store. \n5. Become an online tutor. \n6. Create a subscription service. \n7. Start an online consulting business. \n8. Start an online coaching program. \n9. Sell digital products such as ebooks and software. \n10. Launch an affiliate program to generate revenue from other businesses."
421 | },
422 | {
423 | "instruction": "Identify the incorrect statement: The Earth revolves around the moon. The moon revolves around the Sun. The Sun revolves around the Earth.",
424 | "input": "",
425 | "output": "The Earth revolves around the moon is the incorrect statement."
426 | },
427 | {
428 | "instruction": "What are some ways to be more efficient?",
429 | "input": "",
430 | "output": "1. Set deadlines.\n2. Use tools to automate daily tasks.\n3. Define priorities and focus on the most important tasks.\n4. Set achievable goals and review progress.\n5. Break tasks down into smaller chunks.\n6. Streamline processes and eliminate redundancies.\n7. Schedule breaks throughout the day and stay organized."
431 | },
432 | {
433 | "instruction": "Identify the political party of the given Member of Parliament.",
434 | "input": "Jim Murphy",
435 | "output": "Jim Murphy is a Labour Party MP in the United Kingdom. He represented the constituency of East Renfrewshire from 2005 to 2015 in the House of Commons."
436 | },
437 | {
438 | "instruction": "Replace the underlined word with a suitable adjective.",
439 | "input": "The chair was strong.",
440 | "output": "The chair was sturdy."
441 | },
442 | {
443 | "instruction": "Design a graphic t-shirt.",
444 | "input": "A funny pun like: \"Tuckered Out\".",
445 | "output": "A t-shirt with a graphic of a turtle wearing a backpack with a sign saying \"Tuckered Out\". The turtle is surrounded by the phrase \"Sometimes you just need a break\" in a creative font. Underneath it, the words \"This is why I always carry my own shell\" appear in a fun font. The colors for the shirt could be a light grey or khaki for the turtle and the backpack and a bright blue for the text."
446 | },
447 | {
448 | "instruction": "Compare and contrast the roles of nature versus nurture in human behavior.",
449 | "input": "",
450 | "output": "Nature versus nurture is the debate between whether human behavior is determined by inherited genes and traits or if it is shaped by environment and experiences. Nature-focused theorists believe that biological factors such as genetics and hormones are the primary influences on behavior, while nurture-focused theorists believe that behavior is determined by environmental influences such as parental rearing and social interaction. While it is clear that both nature and nurture play a role in determining human behavior, the exact balance and contribution of each is still an area of discussion and debate."
451 | },
452 | {
453 | "instruction": "Describe a common behavior that is indicative of trust.",
454 | "input": "",
455 | "output": "One common behavior that is indicative of trust is communication. When people trust each other, they can express their opinions and ideas openly in conversations. When people share their experiences, thoughts and feelings it fosters understanding and builds trust. People who trust each other also feel comfortable disclosing personal information and showing vulnerability, which can help to deepen their connection."
456 | },
457 | {
458 | "instruction": "Select the most suitable word to describe the following definition.\n\"A large group of people, animals, or things that are all similar in another way\"",
459 | "input": "",
460 | "output": "Herd."
461 | },
462 | {
463 | "instruction": "List five alternative sources of energy",
464 | "input": "",
465 | "output": "1. Solar energy\n2. Wind energy\n3. Hydropower\n4. Geothermal energy\n5. Biomass energy"
466 | },
467 | {
468 | "instruction": "Calculate the value of X in the equation 8X+6=14",
469 | "input": "",
470 | "output": "X = 1"
471 | },
472 | {
473 | "instruction": "Write an argument in favor of gun control laws.",
474 | "input": "",
475 | "output": "Gun control laws are necessary to protect the safety of citizens in the United States. Gun violence continues to plague our nation, and the only way to reduce it is to enact stricter regulations on how guns are obtained, used, and stored. Gun control laws can save lives by preventing criminals and people with mental illness from accessing firearms, reducing the number of mass shootings, and limiting access to firearms for potential suicide victims. Further, gun control laws can prevent unintentional shootings and reduce the risk of gun-related violence."
476 | },
477 | {
478 | "instruction": "What techniques help an individual lower their stress?",
479 | "input": "",
480 | "output": "Techniques that can help an individual lower their stress include setting aside time for relaxation, getting enough sleep, exercising regularly, eating healthy meals, engaging in positive self-talk, and engaging in mindfulness activities such as yoga or meditation."
481 | },
482 | {
483 | "instruction": "Arrange the given pieces of information into a logical sequence.",
484 | "input": "i. Verify the user's identity\nii. Process the payment\niii. Offer customer support\niv. Display order confirmation",
485 | "output": "i. Verify the user's identity\nii. Process the payment\niii. Display order confirmation\niv. Offer customer support"
486 | },
487 | {
488 | "instruction": "Take the following sentence and make it a description of a person",
489 | "input": "He studies physics",
490 | "output": "He is a physics student who is passionate about understanding the laws of nature."
491 | },
492 | {
493 | "instruction": "Change the given sentence so that it contains a negative term.",
494 | "input": "The view was magnificent.",
495 | "output": "The view was not unimpressive."
496 | },
497 | {
498 | "instruction": "Delete any words in the following sentence that are unnecessary.",
499 | "input": "This picture of an elephant is absolutely breathtaking.",
500 | "output": "This picture of an elephant is breathtaking."
501 | }
502 | ]
--------------------------------------------------------------------------------