├── models ├── __init__.py ├── mpt │ ├── generation_config.json │ ├── special_tokens_map.json │ ├── tokenizer_config.json │ ├── custom_embedding.py │ ├── config.json │ ├── adapt_tokenizer.py │ ├── blocks.py │ └── norm.py └── patching_utils.py ├── modules ├── __init__.py ├── logits_processor.py └── layers.py ├── general_util ├── __init__.py ├── tokenization_utils.py ├── logger.py ├── mixin.py ├── average_meter.py ├── dist_utils.py └── lightseq_utils.py ├── data ├── preprocessor │ ├── __init__.py │ └── mmlu_merge.py ├── __init__.py ├── files │ ├── wudao │ │ ├── partition_000.json │ │ └── file_samples_50.json │ └── c4 │ │ └── en │ │ ├── partition_1100.json │ │ ├── p25 │ │ ├── partition_1025.json │ │ ├── partition_100.json │ │ ├── partition_1000.json │ │ ├── partition_125.json │ │ ├── partition_150.json │ │ ├── partition_175.json │ │ ├── partition_200.json │ │ ├── partition_225.json │ │ ├── partition_25.json │ │ ├── partition_250.json │ │ ├── partition_275.json │ │ ├── partition_300.json │ │ ├── partition_325.json │ │ ├── partition_350.json │ │ ├── partition_375.json │ │ ├── partition_400.json │ │ ├── partition_425.json │ │ ├── partition_450.json │ │ ├── partition_475.json │ │ ├── partition_50.json │ │ ├── partition_500.json │ │ ├── partition_525.json │ │ ├── partition_550.json │ │ ├── partition_575.json │ │ ├── partition_600.json │ │ ├── partition_625.json │ │ ├── partition_650.json │ │ ├── partition_675.json │ │ ├── partition_700.json │ │ ├── partition_725.json │ │ ├── partition_75.json │ │ ├── partition_750.json │ │ ├── partition_775.json │ │ ├── partition_800.json │ │ ├── partition_825.json │ │ ├── partition_850.json │ │ ├── partition_875.json │ │ ├── partition_900.json │ │ ├── partition_925.json │ │ ├── partition_950.json │ │ └── partition_975.json │ │ └── p50 │ │ ├── partition_1050.json │ │ ├── partition_100.json │ │ ├── partition_1000.json │ │ ├── partition_150.json │ │ ├── partition_200.json │ │ ├── partition_250.json │ │ ├── partition_300.json │ │ ├── 
partition_350.json │ │ ├── partition_400.json │ │ ├── partition_450.json │ │ ├── partition_50.json │ │ ├── partition_500.json │ │ ├── partition_550.json │ │ ├── partition_600.json │ │ ├── partition_650.json │ │ ├── partition_700.json │ │ ├── partition_750.json │ │ ├── partition_800.json │ │ ├── partition_850.json │ │ ├── partition_900.json │ │ └── partition_950.json ├── test.py ├── flan_combine.py ├── collators │ └── __init__.py ├── flan_sample.py └── strategy_qa.py ├── panda_logo.PNG ├── requirements.txt ├── post_processors ├── dist_mixin.py └── bleu.py ├── seed_multi_run.sh ├── make_delta.py ├── .gitignore ├── conf ├── base.yaml ├── roberta_split_fact_v1_1.yaml └── llama │ └── wiki │ ├── test.yaml │ └── llama_7b_flan_v1_0.yaml └── convert2hf.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /general_util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/preprocessor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Write your own datasets under this directory. 
3 | """ -------------------------------------------------------------------------------- /panda_logo.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dandelionsllm/pandallm/HEAD/panda_logo.PNG -------------------------------------------------------------------------------- /models/mpt/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "transformers_version": "4.28.1", 4 | "use_cache": false 5 | } 6 | -------------------------------------------------------------------------------- /models/mpt/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "bos_token": "<|endoftext|>", 3 | "eos_token": "<|endoftext|>", 4 | "unk_token": "<|endoftext|>" 5 | } 6 | -------------------------------------------------------------------------------- /models/mpt/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "add_prefix_space": false, 3 | "bos_token": "<|endoftext|>", 4 | "clean_up_tokenization_spaces": true, 5 | "eos_token": "<|endoftext|>", 6 | "model_max_length": 8192, 7 | "tokenizer_class": "GPTNeoXTokenizer", 8 | "unk_token": "<|endoftext|>" 9 | } 10 | -------------------------------------------------------------------------------- /data/files/wudao/partition_000.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/baike2018/baike2018qa_train.json", 3 | "/opt/ml/input/data/train/news_2016/news2016_train.json", 4 | "/opt/ml/input/data/train/translate/translate_train.json", 5 | "/opt/ml/input/data/train/webtext/web_text_2019_train.json", 6 | "/opt/ml/input/data/train/wikizh/wikizh.json" 7 | ] -------------------------------------------------------------------------------- /models/mpt/custom_embedding.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | 7 | class SharedEmbedding(nn.Embedding): 8 | 9 | def forward(self, input: Tensor, unembed: bool = False) -> Tensor: 10 | if unembed: 11 | return F.linear(input, self.weight) 12 | return super().forward(input) 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wandb 2 | nltk 3 | tensorboard 4 | sentencepiece 5 | https://download.pytorch.org/whl/cu117/torch-2.0.1%2Bcu117-cp39-cp39-linux_x86_64.whl 6 | hydra-core 7 | fairscale 8 | deepspeed==0.9.5 9 | datasets 10 | bitsandbytes 11 | transformers 12 | git+https://github.com/huggingface/peft.git 13 | git+https://github.com/huggingface/accelerate.git 14 | einops 15 | tensor-parallel -------------------------------------------------------------------------------- /post_processors/dist_mixin.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | from typing import List, Any 3 | 4 | 5 | class DistGatherMixin: 6 | def gather(self): 7 | pass 8 | 9 | @staticmethod 10 | def gather_object(objects: List[Any]): 11 | output = [None for _ in range(dist.get_world_size())] 12 | dist.gather_object(objects, 13 | object_gather_list=output if dist.get_rank() == 0 else None, 14 | dst=0) 15 | 16 | if dist.get_rank() == 0: 17 | return output 18 | else: 19 | return None 20 | -------------------------------------------------------------------------------- /data/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class TestDataset(Dataset): 6 | def __init__(self, file_path, tokenizer, pseudo_dataset_len: int = -1): 7 | super().__init__() 8 | 
#!/usr/bin/env bash
# Launch trainer_base_fsdp_v3.py once per seed, sequentially.
#
# Usage: seed_multi_run.sh CONF_PATH CONF_NAME NUM_RANK GPU PORT [SEED ...]
#   CONF_PATH  config directory passed to the trainer as -cp
#   CONF_NAME  config name passed to the trainer as -cn
#   NUM_RANK   processes per node; values > 1 launch through torch.distributed.run
#   GPU        value exported as CUDA_VISIBLE_DEVICES
#   PORT       master port (only used by the distributed launch)
#   SEED ...   one run is started per seed, in the order given

if [ "$#" -lt 5 ]; then
  echo "Usage: $0 CONF_PATH CONF_NAME NUM_RANK GPU PORT [SEED ...]" >&2
  exit 1
fi

conf=$1
conf_name=$2
num_rank=$3
gpu=$4
port=$5
# Drop the five fixed arguments so that "$@" holds only the seeds.
shift 5

for seed in "$@"; do
  if [ "$num_rank" -gt "1" ]; then
    echo "CUDA_VISIBLE_DEVICES=$gpu python -m torch.distributed.run --nproc_per_node $num_rank --master_port $port trainer_base_fsdp_v3.py -cp $conf -cn $conf_name seed=${seed}"

    # Quote every expansion so paths or names containing spaces stay one argument.
    CUDA_VISIBLE_DEVICES="$gpu" python -m torch.distributed.run --nproc_per_node "$num_rank" --master_port "$port" trainer_base_fsdp_v3.py -cp "$conf" -cn "$conf_name" "seed=${seed}"
  else
    echo "CUDA_VISIBLE_DEVICES=$gpu python trainer_base_fsdp_v3.py -cp $conf -cn $conf_name seed=${seed}"

    CUDA_VISIBLE_DEVICES="$gpu" python trainer_base_fsdp_v3.py -cp "$conf" -cn "$conf_name" "seed=${seed}"
  fi
done
"dialog_zs_noopt_train.jsonl.gz", 14 | "dialog_zs_opt_train.jsonl.gz", 15 | ], 16 | "dialog_fs_noopt_train.jsonl.gz", 17 | "dialog_fs_opt_train.jsonl.gz", 18 | "flan_fs_noopt_train.jsonl.gz", 19 | "flan_fs_opt_train_part1.jsonl.gz", 20 | "flan_fs_opt_train_part2.jsonl.gz", 21 | "flan_fs_opt_train_part3.jsonl.gz", 22 | "flan_zs_noopt_train.jsonl.gz", 23 | "flan_zs_opt_train.jsonl.gz", 24 | "t0_fs_noopt_train.jsonl.gz", 25 | "t0_zs_noopt_train.jsonl.gz", 26 | "t0_zs_opt_train.jsonl.gz", 27 | ] 28 | 29 | 30 | def obtain_flan_collection_group(): 31 | return data_group 32 | -------------------------------------------------------------------------------- /modules/logits_processor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers.generation_logits_process import LogitsProcessor 3 | 4 | from modules.trie import Trie 5 | 6 | 7 | class TrieConstrainedLogitsProcessor(LogitsProcessor): 8 | def __init__(self, trie: Trie, sent_mode: bool = False): 9 | self.trie = trie 10 | # If `sent_mode` is `True`, please ensure that each sentence in trie has two copies, 11 | # one ends with `` and the other one ends with `<\s>` (eos token). 
def merge_data(data_dir, type):
    """Merge all CSV files of one MMLU split into a single inputs/targets CSV.

    Every ``*.csv`` file in ``{data_dir}/{type}`` is read without a header row and
    concatenated. For each row, column 0 (the question) and columns 1-4 (the
    options, labelled ``A``-``D``) are joined with newlines into ``inputs``; the
    last column becomes ``targets``. The result is written to
    ``{data_dir}/{type}.csv``.

    Args:
        data_dir: Directory that contains one sub-directory per split.
        type: Name of the split sub-directory to merge. The parameter name is kept
            for backward compatibility even though it shadows the builtin.
    """
    files = [f for f in os.listdir(f'{data_dir}/{type}') if f.endswith('.csv')]
    dfs = [pd.read_csv(f'{data_dir}/{type}/{f}', header=None) for f in files]
    df_all = pd.concat(dfs)

    option_names = ['A', 'B', 'C', 'D']
    # Collect plain Python lists and build the frame once at the end.
    # ``DataFrame.append`` was removed in pandas 2.0, and calling it per row
    # copied the whole frame on every iteration.
    inputs = []
    targets = []
    for i in tqdm(range(df_all.shape[0])):
        question = df_all.iloc[i, 0]
        choices = df_all.iloc[i, 1:5].values
        lines = [question] + [f'{name}. {opt}' for name, opt in zip(option_names, choices)]
        inputs.append('\n'.join(lines))
        targets.append(df_all.iloc[i, -1])

    df_new = pd.DataFrame({'inputs': inputs, 'targets': targets})

    df_new.to_csv(f'{data_dir}/{type}.csv')
    print(f'{type} dataset merged successfully ...')
"4.28.1", 49 | "use_cache": false, 50 | "verbose": 0, 51 | "vocab_size": 50432 52 | } 53 | -------------------------------------------------------------------------------- /data/files/wudao/file_samples_50.json: -------------------------------------------------------------------------------- 1 | [ 2 | "part-2021022097.json", 3 | "part-2021023489.json", 4 | "part-2021012504.json", 5 | "part-2021022428.json", 6 | "part-2021012526.json", 7 | "part-2021023008.json", 8 | "part-2021022959.json", 9 | "part-2021024569.json", 10 | "part-2021023736.json", 11 | "part-2021024167.json", 12 | "part-2021022328.json", 13 | "part-2021021914.json", 14 | "part-2021013704.json", 15 | "part-2021022050.json", 16 | "part-2021012514.json", 17 | "part-2021023855.json", 18 | "part-2021016902.json", 19 | "part-2021022805.json", 20 | "part-2021022364.json", 21 | "part-2021021957.json", 22 | "part-2021014840.json", 23 | "part-2021022605.json", 24 | "part-2021023247.json", 25 | "part-2021022649.json", 26 | "part-2021020076.json", 27 | "part-2021016146.json", 28 | "part-2021024834.json", 29 | "part-2021012506.json", 30 | "part-2021021896.json", 31 | "part-2021012713.json", 32 | "part-2021022694.json", 33 | "part-2021023747.json", 34 | "part-2021012518.json", 35 | "part-2021023507.json", 36 | "part-2021019390.json", 37 | "part-2021017289.json", 38 | "part-2021023649.json", 39 | "part-2021023020.json", 40 | "part-2021012510.json", 41 | "part-2021023741.json", 42 | "part-2021012613.json", 43 | "part-2021023988.json", 44 | "part-2021013835.json", 45 | "part-2021021921.json", 46 | "part-2021022921.json", 47 | "part-2021020428.json", 48 | "part-2021023078.json", 49 | "part-2021012502.json", 50 | "part-2021022891.json", 51 | "part-2021022198.json" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/partition_1100.json: -------------------------------------------------------------------------------- 1 | 
["/opt/ml/input/data/train/c4/en/c4-train.00649-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00231-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00779-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00808-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00789-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00846-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00503-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00060-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00082-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00256-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00387-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00138-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00930-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00163-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00622-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00056-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00349-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00461-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00282-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00673-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00224-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00098-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00179-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00080-of-01024.json.gz"] -------------------------------------------------------------------------------- /data/collators/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Write your own collators under this directory. 
class DictTensorDataset(Dataset):
    """Map-style dataset over a dict of equally long fields, with optional metadata.

    Item ``idx`` is a dict holding ``v[idx]`` for every field ``v`` of ``data``.
    When ``meta_data`` is given, its ``idx``-th entry is attached under the
    ``"meta_data"`` key. ``"index"`` is set to ``torch.LongTensor([idx])`` when the
    item has no ``"index"`` field, or when metadata is present but has no
    ``"index"`` key.
    """

    def __init__(self, data: Union[Dict[str, Tensor], "BatchEncoding"],
                 meta_data: Union[List[Dict[str, Any]], None] = None):
        """Store the fields and check that they all describe the same number of samples.

        Args:
            data: Mapping from field name to a tensor; the first field must support
                ``.size(0)``, which defines the dataset length.
            meta_data: Optional list with one metadata dict per sample.

        Raises:
            AssertionError: If a field's length differs from ``len(meta_data)`` (when
                given) or from the length of the first field.
        """
        self.data = data
        self.meta_data = meta_data
        self.keys = list(self.data.keys())
        for v in self.data.values():
            if meta_data is not None:
                assert len(v) == len(meta_data)
            else:
                assert len(v) == self.data[self.keys[0]].size(0)

    def __len__(self):
        """Return the number of samples, taken from the first field."""
        return self.data[self.keys[0]].size(0)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict (see the class docstring for its keys)."""
        res = {k: v[idx] for k, v in self.data.items()}
        has_meta = self.meta_data is not None
        if has_meta:
            res["meta_data"] = self.meta_data[idx]
        # Only consult the metadata when it exists. Looking up res["meta_data"]
        # unconditionally raised KeyError for datasets built without metadata whose
        # fields already contain "index".
        if "index" not in res or (has_meta and "index" not in res["meta_data"]):
            res["index"] = torch.LongTensor([idx])
        return res


class MetaCollator:
    """Collate samples with ``default_collate`` while keeping ``meta_data`` as a plain list."""

    def __call__(self, batch):
        """Collate ``batch``; metadata dicts are gathered into a list instead of being collated.

        The presence of ``"meta_data"`` is decided from the first sample only. When it
        is present, the key is popped from every sample dict before collation.
        """
        if "meta_data" not in batch[0]:
            return default_collate(batch)

        meta_data = [b.pop("meta_data") for b in batch]
        batch = default_collate(batch)
        batch["meta_data"] = meta_data
        return batch
"/opt/ml/input/data/train/c4/en/c4-train.00688-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.01017-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00050-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.01004-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00740-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00796-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00831-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00485-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00677-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00357-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00537-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00884-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00073-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00297-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00317-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00192-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00323-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00462-of-01024.json.gz" 26 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_1050.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00791-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00553-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00213-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00177-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00134-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00128-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00069-of-01024.json.gz", 9 | 
"/opt/ml/input/data/train/c4/en/c4-train.00846-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00339-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00802-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00205-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00291-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00868-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00029-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00579-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00761-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00930-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00201-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00879-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00478-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00259-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00543-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00215-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00504-of-01024.json.gz" 26 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_100.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00435-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00218-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00981-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00175-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00079-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00662-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00344-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00090-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00918-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00155-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00131-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00576-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00604-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00774-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00659-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00808-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00026-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00115-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00467-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00583-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00933-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00907-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00105-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00869-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00658-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_1000.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00657-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00597-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00514-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00390-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00773-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00931-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00858-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00852-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00783-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00994-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00042-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00503-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00260-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00243-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00614-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00706-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00536-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00502-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00039-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00627-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00118-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00712-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00356-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00845-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00013-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_125.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00791-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00553-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00213-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00177-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00134-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00128-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00069-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00846-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00339-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00802-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00205-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00291-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00868-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00029-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00579-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00761-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00930-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00201-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00879-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00478-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00259-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00543-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00215-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00504-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00951-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_150.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00145-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00906-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00934-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00938-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00355-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00784-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00246-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00408-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00955-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00103-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00978-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00197-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00967-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00021-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.01008-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00979-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00922-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00645-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00915-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00432-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00096-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00447-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00491-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00556-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00031-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_175.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00062-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00282-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00670-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00698-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00385-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00707-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00837-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00329-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00248-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.01006-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00671-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.01021-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00550-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00615-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00696-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00893-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.01003-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00782-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00007-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00595-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00224-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00969-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00508-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00151-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00624-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_200.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00236-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00104-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00652-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00596-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00720-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00110-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00944-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00607-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00334-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00527-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00270-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00676-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00908-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00445-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00146-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00722-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00693-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00396-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00141-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00269-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00496-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00387-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00745-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00132-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00261-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_225.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00647-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00420-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00014-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00665-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00345-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00621-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.01015-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00947-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00094-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.01000-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00574-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00421-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00962-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00481-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00905-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00158-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00866-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00038-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00540-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00551-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00874-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00341-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00140-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00753-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00750-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_25.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00086-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00895-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00988-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00200-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00081-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00210-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00411-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00812-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00410-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00775-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00591-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00901-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00349-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00655-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00511-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00450-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00430-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00882-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00559-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00598-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00120-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00816-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00780-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00395-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00726-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_250.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00065-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00719-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00704-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00126-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00174-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00950-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00284-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00343-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00727-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00795-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00535-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00899-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00465-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00382-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00697-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00531-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00638-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00180-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00179-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00800-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00208-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00006-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00188-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00827-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00814-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_275.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00593-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00870-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00985-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00701-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00348-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00635-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00754-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00960-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00046-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00080-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00059-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00877-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00875-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00452-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00817-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00640-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00759-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00479-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00861-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00758-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00247-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00011-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00572-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00644-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00735-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_300.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00283-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00887-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00288-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00649-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00428-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00173-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00156-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00330-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00833-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00053-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00199-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00377-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00082-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00097-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00279-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00828-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00829-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00771-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00611-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00716-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00483-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00842-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00996-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00290-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00036-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_325.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00076-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00687-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00834-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00880-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00176-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00089-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00932-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00793-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00602-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00228-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00675-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00085-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00752-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00715-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00911-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00238-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00919-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00002-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00785-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00636-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00402-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00772-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00459-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00476-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00904-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_350.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00857-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00321-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00957-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00100-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00966-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.01020-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00113-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.01016-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00070-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00557-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00244-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00945-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00181-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00225-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00167-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00946-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00847-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00746-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00891-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00373-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00358-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00807-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00873-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00016-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00129-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_375.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00903-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00669-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00272-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00034-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00047-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00660-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00999-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00912-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00308-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00732-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00440-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00012-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00643-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.01019-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00219-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00304-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00253-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00484-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00335-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00507-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00285-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00926-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00325-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00354-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00311-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_400.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00150-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00137-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00299-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00954-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00631-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00404-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00054-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00694-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00767-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00910-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00642-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00600-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00751-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00490-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00948-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00017-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00650-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00987-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00970-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00865-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00522-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00589-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00510-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00468-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00204-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_425.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00801-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00456-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00449-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00580-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00418-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00063-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00451-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00028-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00972-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00963-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00512-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00391-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00769-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00900-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00427-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00379-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00613-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00399-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00227-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00853-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00588-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00045-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00189-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00538-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00859-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_450.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00298-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00342-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00372-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00961-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00736-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00639-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00892-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00249-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00610-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00609-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00368-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00381-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00654-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00820-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00832-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00982-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00333-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00501-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00965-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00562-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00517-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00191-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00690-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00806-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00500-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_475.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.01010-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.01012-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00590-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00122-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00709-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00000-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00287-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00281-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00986-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00737-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00098-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00241-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00976-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00705-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00876-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00760-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00923-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00713-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00599-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00804-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00894-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00830-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00971-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00584-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00770-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_50.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00273-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00768-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00190-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00499-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00458-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00506-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00061-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00005-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00889-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00442-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00617-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00786-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00187-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00839-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00405-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00555-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00133-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00634-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00434-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00519-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00340-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00810-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00217-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00087-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00302-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_500.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00679-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00616-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00561-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00184-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00072-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00347-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00949-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00051-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00401-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00567-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00305-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00498-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00896-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00101-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00169-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00984-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00815-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00262-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00164-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00077-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00710-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00723-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00172-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00558-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00993-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_525.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00392-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00123-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00790-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00083-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00024-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00792-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00245-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00648-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00939-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00964-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00153-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00867-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00545-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00119-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00509-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00416-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00040-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.01018-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.01002-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00369-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.01013-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00968-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00666-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00102-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00928-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_550.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00068-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00313-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00294-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00851-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00794-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00678-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00681-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00637-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00747-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00935-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00168-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00263-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00223-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00841-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00563-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00787-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00980-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00338-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00286-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00049-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00403-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00393-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00492-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00656-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00840-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_575.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00902-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00351-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00378-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00757-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00942-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00441-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00147-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00471-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00015-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00075-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00618-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00818-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00324-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00370-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00413-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00139-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00674-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00114-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00463-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00917-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00154-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00549-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00296-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00419-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00310-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_600.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00157-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00469-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00521-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00699-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00730-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00603-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00781-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00777-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00560-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00159-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00032-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00107-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00064-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00346-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00571-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00714-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00135-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00554-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00533-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00071-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00628-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00235-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00487-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00303-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00683-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_625.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00565-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00663-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00019-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00052-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00909-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.01009-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00439-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00620-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00766-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00350-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00797-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00526-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00232-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00362-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00060-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00044-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00256-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00196-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00622-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00216-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00242-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00470-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00058-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00363-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00843-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_650.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00493-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00916-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00983-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00544-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00778-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00318-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00454-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00239-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00220-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00762-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00927-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00532-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.01014-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00257-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00160-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00489-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00048-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00568-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00528-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00692-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00729-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00001-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00623-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00711-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00027-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_675.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00265-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00755-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00667-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00921-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00233-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00940-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00937-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00161-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00433-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00171-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00055-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00525-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00230-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00524-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00300-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00193-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00301-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00422-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00809-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00668-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00023-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00924-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00633-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00414-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.01011-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_700.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00819-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.01022-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00995-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00586-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00004-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00763-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00673-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00295-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00423-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00925-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00474-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00183-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00898-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00626-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00084-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00425-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00229-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00826-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00764-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00415-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00121-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00569-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00337-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00530-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00444-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_725.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00756-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00601-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00307-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00221-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00182-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00207-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00632-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00186-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00914-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00513-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00448-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00612-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00700-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00254-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00523-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00138-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00731-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00166-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00871-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00920-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00293-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00365-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00546-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00407-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00682-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_75.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00226-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00315-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00371-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00280-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00258-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00231-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00018-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00888-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00721-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00020-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00144-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00209-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00789-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00264-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00431-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00811-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00863-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00212-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00862-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00685-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00897-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00974-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00630-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00292-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00198-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_750.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00078-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00581-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00424-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00641-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00453-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00608-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00653-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00977-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00992-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00495-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00738-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00312-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00552-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00397-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00267-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00529-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00494-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00566-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00505-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00276-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00619-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00733-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00488-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00486-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00587-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_775.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00941-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00821-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00708-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00822-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00366-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00394-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00250-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00389-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00353-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00035-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00202-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00008-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00364-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00717-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00222-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00178-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00240-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00398-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00672-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00278-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00959-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00429-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00592-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00327-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00686-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_800.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00952-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00516-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00460-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00748-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00165-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00309-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00929-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00734-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00956-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00143-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00041-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00022-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00438-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00776-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00170-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00646-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00573-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00997-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00823-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00206-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00547-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00194-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00130-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00067-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00765-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_825.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00799-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00466-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00680-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00277-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00885-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00548-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00703-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00374-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00779-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00881-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00749-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00234-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00844-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00352-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00943-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00088-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00195-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00864-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00066-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00743-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00437-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00606-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00728-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00361-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00095-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_850.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00541-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00539-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00142-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00913-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00718-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00446-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00091-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00332-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00625-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00803-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00813-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00975-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00724-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00412-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00582-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00074-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00112-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00883-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00739-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00443-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00266-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00203-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00872-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00211-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00855-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_875.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00136-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00695-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00271-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00824-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00367-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00010-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00125-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00436-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00958-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00289-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00742-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00056-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.01001-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00360-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00856-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00989-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00991-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00585-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00534-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00388-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00578-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00383-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00149-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.01005-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.01023-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_900.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00106-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00043-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00314-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00409-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00331-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00850-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00542-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00953-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00316-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00426-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00251-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00594-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00990-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00689-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00725-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00570-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00461-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00805-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00275-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00691-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00860-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00384-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00030-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00237-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00359-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_925.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00400-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00386-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00473-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00127-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00684-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00162-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00108-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00320-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00099-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00111-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00375-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00328-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00148-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00117-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00664-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00472-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00124-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00702-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00886-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00651-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00838-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00336-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00274-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00255-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00025-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_950.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00836-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00575-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00319-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00455-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.01007-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00482-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00464-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00214-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00849-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00878-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00163-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00577-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00629-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00661-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00093-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00057-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00477-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00037-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00457-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00973-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00788-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00848-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00564-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00406-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00109-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p25/partition_975.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00520-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00322-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00380-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00518-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00854-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00092-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00741-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00515-of-01024.json.gz", 10 | 
"/opt/ml/input/data/train/c4/en/c4-train.00417-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00326-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00033-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00268-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00835-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00376-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00798-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00744-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00009-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00152-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00116-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00497-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00475-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00003-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00890-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00936-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00998-of-01024.json.gz" 27 | ] -------------------------------------------------------------------------------- /general_util/tokenization_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import PreTrainedTokenizer 2 | import os 3 | 4 | from data.data_utils import tokenizer_get_name 5 | from general_util.logger import get_child_logger 6 | 7 | DEFAULT_PAD_TOKEN = "[PAD]" 8 | DEFAULT_EOS_TOKEN = "" 9 | DEFAULT_BOS_TOKEN = "" 10 | DEFAULT_UNK_TOKEN = "" 11 | 12 | logger = get_child_logger(__name__) 13 | 14 | 15 | def expand_special_tokenizer(tokenizer: PreTrainedTokenizer): 16 | if "llama" in tokenizer_get_name(tokenizer): 17 | special_tokens_map = {} 18 | eos_token = os.environ.get("EOS_TOKEN", None) 19 | if eos_token or (not tokenizer.eos_token): 20 
| special_tokens_map["eos_token"] = eos_token if eos_token else DEFAULT_EOS_TOKEN 21 | 22 | bos_token = os.environ.get("BOS_TOKEN", None) 23 | if bos_token or (not tokenizer.bos_token): 24 | special_tokens_map["bos_token"] = bos_token if bos_token else DEFAULT_BOS_TOKEN 25 | 26 | unk_token = os.environ.get("UNK_TOKEN", None) 27 | if not tokenizer.unk_token: 28 | special_tokens_map["unk_token"] = unk_token if unk_token else DEFAULT_UNK_TOKEN 29 | 30 | pad_token = os.environ.get("PAD_TOKEN", None) 31 | if not tokenizer.pad_token: 32 | special_tokens_map["pad_token"] = pad_token if pad_token else DEFAULT_PAD_TOKEN 33 | 34 | new_tokens = tokenizer.add_special_tokens( 35 | special_tokens_dict=special_tokens_map 36 | ) 37 | # new_tokens = tokenizer.add_special_tokens(special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN)) 38 | # tokenizer.pad_token = tokenizer.eos_token 39 | # tokenizer.pad_token_id = tokenizer.eos_token_id 40 | # assert new_tokens == 1 41 | logger.info(tokenizer) 42 | logger.info(f"PAD TOKEN ID = {tokenizer.pad_token_id}") 43 | -------------------------------------------------------------------------------- /post_processors/bleu.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Any 2 | 3 | from nltk import word_tokenize 4 | from nltk.translate.bleu_score import sentence_bleu 5 | from torch import distributed as dist 6 | 7 | from post_processors.dist_mixin import DistGatherMixin 8 | 9 | 10 | class BLEUMetric(DistGatherMixin): 11 | def __init__(self): 12 | self.predictions = [] 13 | 14 | def __call__(self, meta_data: List[Dict[str, Any]], batch_model_outputs: Dict[str, Any], ddp: bool = False): 15 | sources = [] 16 | targets = [] 17 | for item in meta_data: 18 | sources.append(item["src"]) 19 | if "tgt" in item and item["tgt"]: 20 | targets.append(item["tgt"]) 21 | else: 22 | targets.append("") 23 | 24 | pred_seq = batch_model_outputs["generated_seq"] 25 | predictions = [ 26 | { 27 | 
_root_name = 'FK'


def get_child_logger(child_name):
    """Return the logger named ``<_root_name>.<child_name>``.

    Child loggers propagate to the root project logger configured by
    `setting_logger`, so they share its level and file handler.
    """
    return logging.getLogger(_root_name + '.' + child_name)


def setting_logger(log_file: str, local_rank: int = -1):
    """Configure the project root logger and attach a file handler to it.

    Args:
        log_file: Path-like string identifying the run. Its first
            ``/``- or whitespace-separated component is dropped and the
            rest are joined with ``-`` to form the log file name.
        local_rank: Process rank. Ranks ``-1`` and ``0`` log at INFO,
            every other rank at WARNING.

    Returns:
        The root project logger (named by ``_root_name``).
    """
    model_name = "-".join(log_file.replace('/', ' ').split()[1:])
    level = logging.INFO if local_rank in [-1, 0] else logging.WARNING

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=level)

    logger = logging.getLogger(_root_name)
    logger.setLevel(level)

    output_dir = './log_dir'
    # exist_ok avoids the check-then-create race when several ranks start at once.
    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, model_name + '-output.log')

    # FileHandler stores the absolute path in `baseFilename`; compare against
    # it so a second call for the same file does not attach a duplicate
    # handler (which would write every record twice).
    abs_log_path = os.path.abspath(log_path)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == abs_log_path
        for handler in logger.handlers
    )
    if not already_attached:
        f_handler = logging.FileHandler(log_path)
        f_handler.setLevel(logging.INFO)
        f_handler.setFormatter(logging.Formatter(fmt="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
                                                 datefmt='%m/%d/%Y %H:%M:%S'))
        logger.addHandler(f_handler)

    return logger
class LogMixin:
    """Mixin that owns a ``LogMetric`` and renders its averages for logging."""

    # Stays None until `init_metric` is called.
    eval_metrics: "LogMetric" = None

    def init_metric(self, *metric_names):
        """Create a fresh ``LogMetric`` tracking the given metric names."""
        self.eval_metrics = LogMetric(*metric_names)

    def get_eval_log(self, reset=False, ddp=False, device='cpu'):
        """Return the current metric averages as ``(text, mapping)``.

        Args:
            reset: When true, reset every meter after reading it.
            ddp: When true, call ``gather(device=device)`` on every meter
                before reading.
            device: Device passed through to each meter's ``gather``.

        Returns:
            A tuple of a tab-separated ``"name: value"`` string and the
            mapping returned by ``eval_metrics.get_log()``. If
            ``init_metric`` was never called, a warning is logged and
            ``("", {})`` is returned.
        """
        if self.eval_metrics is None:
            logger.warning("The `eval_metrics` attribute hasn't been initialized.")
            # Previously execution fell through and crashed on `None.get_log()`.
            return "", {}

        if ddp:
            for metric in self.eval_metrics.metrics.values():
                metric.gather(device=device)

        results = self.eval_metrics.get_log()

        _eval_metric_log = '\t'.join([f"{k}: {v}" for k, v in results.items()])

        if reset:
            self.eval_metrics.reset()

        return _eval_metric_log, results


class MetricMixin:
    """Mixin holding one ``AverageMeter`` per configured metric."""

    # TODO: How can hydra be used to decouple the metric computation from the model?
    def __init__(self, metrics: List[Tuple[str, str, str, str]]):
        """Build the metric table.

        Args:
            metrics: Tuples of ``(key, val, func, name)``; ``name`` becomes
                the dictionary key of the entry.
        """
        self.metrics = {
            name: {
                "key": key,
                "val": val,
                "func": func,
                "meter": AverageMeter()
            } for key, val, func, name in metrics
        }


class PredictionMixin:
    """Mixin that accumulates prediction tensors as plain Python lists."""

    # Class-level default kept so direct attribute reads still work. The
    # methods below never mutate it; they use a per-instance dict so that
    # separate objects do not share accumulated predictions.
    tensor_dict: Dict[str, List] = defaultdict(list)

    def _instance_tensor_dict(self):
        """Return this instance's own dict, creating it on first use."""
        if "tensor_dict" not in vars(self):
            self.tensor_dict = defaultdict(list)
        return self.tensor_dict

    def reset_predict_tensors(self):
        """Discard everything accumulated so far on this instance."""
        self.tensor_dict = defaultdict(list)

    def concat_predict_tensors(self, **tensors: torch.Tensor):
        """Append each tensor's rows (detached, on CPU, as lists) under its keyword name."""
        store = self._instance_tensor_dict()
        for k, v in tensors.items():
            store[k].extend(v.detach().cpu().tolist())

    def get_predict_tensors(self):
        """Return the mapping from name to the accumulated list."""
        return self._instance_tensor_dict()
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id=None):
    """Write ``target - base`` weights (plus the target tokenizer) to ``delta_path``.

    Parameters whose name contains ``embed_tokens`` or ``lm_head.weight`` are
    left as the target's values; every other parameter is replaced in place
    by its difference from the base model.

    Args:
        base_model_path: Path or hub id of the base causal LM.
        target_model_path: Path or hub id of the fine-tuned causal LM.
        delta_path: Directory the delta model and tokenizer are saved to.
        hub_repo_id: When set, also push to this Hugging Face Hub repo.
            Previously this was read from the module-global ``args``, so the
            function only worked when the file was run as a script.

    Raises:
        ValueError: If subtracting the base weights fails for a parameter.
    """
    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True
    )

    print(f"Loading the target model from {target_model_path}")
    target = AutoModelForCausalLM.from_pretrained(
        target_model_path, low_cpu_mem_usage=True
    )
    target_tokenizer = AutoTokenizer.from_pretrained(
        target_model_path, use_fast=False
    )

    print("Calculating the delta")
    # Build the base state dict once instead of on every loop iteration.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        assert name in base_state, f"{name} is missing from the base model"
        if "embed_tokens" in name or "lm_head.weight" in name:
            continue
        try:
            param.data -= base_state[name]
        except Exception as err:
            print(name)
            raise ValueError(f"Failed to compute the delta for parameter {name}") from err

    print(f"Saving the delta to {delta_path}")
    if hub_repo_id:
        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
    else:
        kwargs = {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer.save_pretrained(delta_path, **kwargs)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model-path", type=str, required=True)
    parser.add_argument("--target-model-path", type=str, required=True)
    parser.add_argument("--delta-path", type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str)
    args = parser.parse_args()

    make_delta(args.base_model_path, args.target_model_path, args.delta_path,
               hub_repo_id=args.hub_repo_id)
# Each submix lists (file name, relative weight inside the submix).
cot_zs_submix = (
    ("cot_zs_noopt_train.jsonl.gz", 1),
    ("cot_zs_opt_train.jsonl.gz", 1),
)

dialog_zs_submix = (
    ("dialog_zs_noopt_train.jsonl.gz", 1),
    ("dialog_zs_opt_train.jsonl.gz", 1),
)

flan_zs_submix = (
    ("flan_zs_noopt_train.jsonl.gz", 1),
    ("flan_zs_opt_train.jsonl.gz", 1),
)

niv2_zs_submix = (
    ("niv2_zs_noopt_train.jsonl.gz", 1),
    ("niv2_zs_opt_train.jsonl.gz", 1),
)

t0_zs_submix = (
    ("t0_zs_noopt_train.jsonl.gz", 1),
    ("t0_zs_opt_train.jsonl.gz", 1),
)

flan_v2_submix = (
    (flan_zs_submix, 0.4),  # mixing weight = 40%
    (t0_zs_submix, 0.32),  # mixing weight = 32%
    (niv2_zs_submix, 0.2),  # mixing weight = 20%
    (cot_zs_submix, 0.05),  # mixing weight = 5%
    (dialog_zs_submix, 0.03),  # mixing weight = 3%
)


def _read_jsonl_head(path, limit):
    """Parse and return at most ``limit`` leading lines of a gzipped JSONL file."""
    from itertools import islice

    # The context manager closes the handle even if a line fails to parse.
    with gzip.open(path, "rt") as reader:
        return [json.loads(line) for line in islice(reader, max(limit, 0))]


def sample_flan_mixture(input_dir, total_data_num, mixture=flan_v2_submix):
    """Collect the leading examples of every file according to the mixture weights.

    Args:
        input_dir: Directory holding the ``*.jsonl.gz`` files.
        total_data_num: Target number of examples over the whole mixture.
        mixture: Tuples of ``(submix, ratio)`` where ``submix`` is a tuple of
            ``(file name, weight)``. Each submix gets
            ``int(total_data_num * ratio)`` examples, split between its files
            in proportion to their weights.

    Returns:
        A list of the decoded JSON objects, in mixture order.
    """
    all_data = []
    for dataset, ratio in mixture:
        data_num = int(total_data_num * ratio)
        weight_sum = sum(sub_ratio for _, sub_ratio in dataset)

        for file, sub_ratio in dataset:
            sub_data_num = int(data_num * sub_ratio / weight_sum)
            sub_data = _read_jsonl_head(os.path.join(input_dir, file), sub_data_num)
            print(f"Read {len(sub_data)} lines from {file}")

            # A zero quota or an empty file yields nothing to preview;
            # indexing [0] unconditionally used to raise IndexError here.
            if sub_data:
                print(sub_data[0])
                print(sub_data[-1])
            all_data.extend(sub_data)

    return all_data


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", type=str, default="data")
    parser.add_argument("--total_data_num", type=int, default=500000)
    parser.add_argument("--output_file", type=str, default="data")
    args = parser.parse_args()

    all_data = sample_flan_mixture(args.input_dir, args.total_data_num)

    print(f"Total data num: {len(all_data)}")
    torch.save(all_data, args.output_file)
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
def _collect_gold_para_ids(evidence_annotations, train_paragraphs):
    """Return the known paragraph ids referenced by the evidence, de-duplicated in first-seen order."""
    # A dict is used as an ordered set: a plain `set` of strings iterates in
    # an order that changes between interpreter runs (hash randomisation),
    # which made the concatenated context non-reproducible.
    para_ids = {}
    for evidence in evidence_annotations:
        for annotation in evidence:
            for evi_item in annotation:
                if isinstance(evi_item, list):
                    for para_id in evi_item:
                        if para_id in train_paragraphs:
                            para_ids[para_id] = None
                        else:
                            logger.warning(f"Cannot find paragraph with id: {para_id}")
                else:
                    assert evi_item in ["operation", "no_evidence"], evi_item
    return list(para_ids)


def split_get_tensor_with_gold_para(file_path: str, tokenizer: PreTrainedTokenizer,
                                    train_para_file: str, max_seq_length: int, use_fact: bool = False):
    """Tokenize (context, question) pairs with yes/no labels into a tensor dataset.

    Args:
        file_path: JSON file with a list of items, each having ``question``,
            ``answer`` and either ``facts`` or ``evidence``.
        tokenizer: Tokenizer applied to the (context, question) pairs.
        train_para_file: JSON file mapping paragraph id to an object with a
            ``content`` field.
        max_seq_length: Passed to the tokenizer as ``max_length``.
        use_fact: When true the context is the item's ``facts``; otherwise it
            is the content of the paragraphs referenced by ``evidence``.

    Returns:
        A ``DictTensorDataset`` wrapping the tokenizer output plus ``labels``.
    """
    # Context managers close both files; the bare `open` calls leaked them.
    with open(file_path) as data_f:
        data = json.load(data_f)
    with open(train_para_file) as para_f:
        train_paragraphs = json.load(para_f)

    text_inputs_a = []
    text_inputs_b = []
    labels = []
    for item in data:
        question = item["question"]
        label = int(item["answer"])

        if use_fact:
            paragraphs = item["facts"]
        else:
            para_ids = _collect_gold_para_ids(item["evidence"], train_paragraphs)
            paragraphs = [train_paragraphs[para_id]["content"] for para_id in para_ids]

        context = " ".join(paragraphs)

        text_inputs_a.append(context)
        text_inputs_b.append(question)
        labels.append(label)

    model_inputs = tokenizer(text_inputs_a,
                             text_pair=text_inputs_b,
                             truncation=TruncationStrategy.LONGEST_FIRST,
                             padding=PaddingStrategy.LONGEST,
                             max_length=max_seq_length,
                             return_tensors="pt")
    model_inputs["labels"] = torch.tensor(labels, dtype=torch.long)

    dataset = DictTensorDataset(model_inputs)

    logger.info(f"Max seq length: {model_inputs['input_ids'].size(1)}")

    return dataset
def _cast_if_autocast_enabled(tensor):
    """Return ``tensor`` cast to the active autocast dtype, or unchanged when autocast is off.

    Raises:
        NotImplementedError: If autocast is on and the tensor lives on a
            device other than ``cuda`` or ``cpu``.
    """
    if not torch.is_autocast_enabled():
        return tensor

    device_type = tensor.device.type
    if device_type == 'cuda':
        target_dtype = torch.get_autocast_gpu_dtype()
    elif device_type == 'cpu':
        target_dtype = torch.get_autocast_cpu_dtype()
    else:
        raise NotImplementedError()
    return tensor.to(dtype=target_dtype)


class LPLayerNorm(torch.nn.LayerNorm):
    """LayerNorm that runs on autocast-dtype inputs with autocast disabled."""

    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
        super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)

    def forward(self, x):
        """Cast input and affine parameters first, then normalise outside autocast."""
        inputs = _cast_if_autocast_enabled(x)
        weight = self.weight
        if weight is not None:
            weight = _cast_if_autocast_enabled(weight)
        bias = self.bias
        if bias is not None:
            bias = _cast_if_autocast_enabled(bias)
        with torch.autocast(enabled=False, device_type=x.device.type):
            return torch.nn.functional.layer_norm(inputs, self.normalized_shape, weight, bias, self.eps)


def rms_norm(x, weight=None, eps=1e-05):
    """Scale ``x`` by the reciprocal root-mean-square of its last dimension.

    The result is multiplied by ``weight`` when one is given.
    """
    mean_square = x.pow(2).mean(-1, keepdim=True)
    normed = x * torch.rsqrt(mean_square + eps)
    if weight is None:
        return normed
    return normed * weight


class RMSNorm(torch.nn.Module):
    """RMS normalisation computed in float32 and cast back to the input dtype."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        super().__init__()
        self.eps = eps
        if not weight:
            # Register the name so `self.weight` exists and is None.
            self.register_parameter('weight', None)
            return
        self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))

    def forward(self, x):
        normed = rms_norm(x.float(), self.weight, self.eps)
        return normed.to(dtype=x.dtype)
def compute_flash_attention(flash_attn, q, k, v, attention_mask=None, head_mask=None):
    """Run ``flash_attn`` on packed q/k/v, handling an optional padding mask.

    Args:
        flash_attn: Callable invoked as
            ``flash_attn(qkv, cu_seqlens=..., max_seqlen=...)``.
        q, k, v: Query/key/value tensors; they are stacked along ``dim=2``
            and cast to float16 before the call.
        attention_mask: Optional per-token mask. Positions whose value is
            ``>= 0`` are treated as kept.
            NOTE(review): presumably an additive mask (0 = keep, large
            negative = masked) — confirm against the caller.
        head_mask: Accepted but not used by this function.

    Returns:
        Without a mask, whatever ``flash_attn`` returns. With a mask, the
        per-sequence outputs zero-padded back to the input sequence length
        and stacked over the batch.
    """
    # q, k, v: [bs, seq_len, num_attention_heads, attn_head_size]
    # attention_mask (float): [bs, seq_len]
    batch_size, max_len = q.size(0), q.size(1)

    qkv = torch.stack([q, k, v], dim=2).to(torch.float16)  # need to truncate in case input is fp32
    cu_seqlens, max_seqlen = None, None

    if attention_mask is None:
        return flash_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
    else:
        # Limitation: non-contiguous attention mask will not be handled correctly
        # model will be able to pay attention between the first and last non-masked token, i.e. left- and right-side padding is supported.
        # Running count of kept positions per row.
        csums = (attention_mask >= 0).cumsum(dim=1)
        # Index where the running count first reaches its maximum, plus one
        # (assumes argmax returns the first maximal index — TODO confirm for
        # the torch version in use).
        ends = csums.argmax(dim=1) + 1
        # Step back by the number of kept positions to get the start index.
        starts = ends - csums.max(dim=1).values
        seqlens = ends - starts

        # Concatenate only the [start, end) slice of every row.
        qkv = torch.cat([qkv[i, starts[i]: ends[i]] for i in range(batch_size)], dim=0)
        zero = torch.zeros_like(seqlens[:1])  # torch.tensor([0]) with correct dtype and device
        # Cumulative sequence boundaries into the concatenated tensor.
        cu_seqlens = torch.cat([zero, seqlens.cumsum(dim=0)], dim=0).to(torch.int32)
        max_seqlen = seqlens.max().item()

        out = flash_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
        # out: [num_unmasked_tokens, num_attention_heads, attn_head_size]

        # Split the packed output back into one chunk per batch row.
        seqs = [out[start:end] for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:])]
        # stack and pad sequences together
        # F.pad takes pads from the last dim backwards: (0, 0) for every
        # trailing dim, then (left, right) zeros on the first (token) dim.
        padded_seqs = [
            F.pad(seqs[i], (0, 0) * (seqs[i].dim() - 1) + (starts[i], max_len - ends[i]), value=0.0)
            for i in range(batch_size)
        ]
        out = torch.stack(padded_seqs)
        return out
class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        """Zero the last value, running sum, count and average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times; tensors are converted with ``.item()``."""
        if isinstance(val, torch.Tensor):
            val = val.item()
        if isinstance(n, torch.Tensor):
            n = n.item()

        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count > 0 else 0

    def save(self):
        """Return the meter state as a plain dict."""
        return {
            'val': self.val,
            'avg': self.avg,
            'sum': self.sum,
            'count': self.count
        }

    def load(self, value: dict):
        """Restore state from a dict produced by `save`.

        Missing keys default to 0. ``None`` resets the meter; previously the
        method reset and then still indexed into ``None``, raising TypeError.
        """
        if value is None:
            self.reset()
            return
        self.val = value.get('val', 0)
        self.avg = value.get('avg', 0)
        self.sum = value.get('sum', 0)
        self.count = value.get('count', 0)

    def gather(self, device):
        """Sum ``sum`` and ``count`` across all distributed ranks and refresh ``avg``.

        Requires an initialised ``torch.distributed`` process group.
        """
        # Force a float tensor: a meter holding only ints (e.g. never
        # updated) would otherwise produce an int64 tensor that does not
        # match the float receive buffers.
        tensor = torch.tensor([self.sum, self.count], dtype=torch.float, device=device)
        tensor_list = [torch.zeros_like(tensor) for _ in range(dist.get_world_size())]
        dist.all_gather(tensor_list, tensor)

        all_tensor = torch.stack(tensor_list, dim=0)
        self.sum = all_tensor[:, 0].sum().item()
        self.count = all_tensor[:, 1].sum().item()
        self.avg = self.sum / self.count if self.count > 0 else 0
def vanilla_torch_dist(cfg: DictConfig, backend="nccl"):
    """Fill in the device fields of ``cfg`` for a torchrun-style launch.

    A ``LOCAL_RANK`` environment variable other than ``"-1"`` overrides
    ``cfg.local_rank``. With ``local_rank == -1`` or ``cfg.no_cuda`` no process
    group is created; otherwise the current CUDA device is selected and the
    default process group is initialised with a 7200 second timeout.
    """
    env_rank = os.environ.get("LOCAL_RANK")
    if env_rank is not None and env_rank not in [-1, "-1"]:
        cfg.local_rank = int(env_rank)

    distributed = not (cfg.local_rank == -1 or cfg.no_cuda)
    if distributed:
        # One process per GPU: bind this process to its device, then join the group.
        torch.cuda.set_device(cfg.local_rank)
        device = str(torch.device("cuda", cfg.local_rank))
        dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=7200))
        cfg.n_gpu = 1
        # NOTE(review): assumed to belong to the distributed branch, since
        # get_world_size() needs an initialised process group - confirm
        # against the original layout.
        cfg.world_size = dist.get_world_size()
    else:
        cuda_usable = torch.cuda.is_available() and not cfg.no_cuda
        device = str(torch.device("cuda" if cuda_usable else "cpu"))
        cfg.n_gpu = torch.cuda.device_count()
    cfg.device = device


def setup_slurm_distributed(cfg: DictConfig, backend="nccl", port=None):
    """Configure ``cfg`` and ``torch.distributed`` from SLURM environment variables.

    Adapted from
    https://github.com/BIGBALLON/distribuuuu/blob/master/tutorial/mnmc_ddp_slurm.py.

    Three cases are handled:

    * at most one visible GPU, or ``cfg.no_cuda``: single-device run, no process group;
    * several GPUs but ``SLURM_NTASKS == 1``: one process drives all GPUs, no process group;
    * otherwise: one process per SLURM task. The rendezvous variables
      (``MASTER_ADDR``, ``MASTER_PORT``, ``WORLD_SIZE``, ``LOCAL_RANK``, ``RANK``)
      are exported and the process group is initialised. ``cfg.local_rank``
      ends up holding the value of ``dist.get_rank()``.

    ``port`` overrides ``MASTER_PORT``; when neither is given, 29500 is used.
    """
    gpu_count = torch.cuda.device_count()
    print(gpu_count)
    if gpu_count <= 1 or cfg.no_cuda:
        cfg.local_rank = -1
        cuda_usable = torch.cuda.is_available() and not cfg.no_cuda
        cfg.device = str(torch.device("cuda" if cuda_usable else "cpu"))
        cfg.n_gpu = min(gpu_count, 1)
        cfg.ddp_eval = False
        return

    task_count = int(os.environ["SLURM_NTASKS"])
    if task_count == 1:
        # Data Parallel or Model Parallel on multiple GPUs with a single task.
        cfg.n_gpu = gpu_count
        cfg.ddp_eval = False
        cfg.device = str(torch.device("cuda"))
        cfg.local_rank = -1
        return

    global_rank = int(os.environ["SLURM_PROCID"])
    node_list = os.environ["SLURM_NODELIST"]
    device_index = global_rank % gpu_count

    torch.cuda.set_device(device_index)

    # The first host in the allocation serves as the rendezvous master.
    first_host = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1")
    if port is not None:
        os.environ["MASTER_PORT"] = str(port)
    else:
        os.environ.setdefault("MASTER_PORT", "29500")
    os.environ.setdefault("MASTER_ADDR", first_host)

    os.environ["WORLD_SIZE"] = str(task_count)
    os.environ["LOCAL_RANK"] = str(device_index)
    os.environ["RANK"] = str(global_rank)

    cfg.n_gpu = 1
    cfg.local_rank = device_index
    cfg.world_size = task_count
    cfg.device = str(torch.device("cuda", cfg.local_rank))

    dist.init_process_group(backend=backend, world_size=task_count, rank=global_rank)

    # From here on cfg.local_rank carries the rank reported by the process group.
    cfg.local_rank = dist.get_rank()
./ 4 | 5 | train_file: 6 | dev_file: 7 | test_file: 8 | 9 | # Model 10 | model: 11 | _target_: models.roberta_baseline.RobertaForMultipleChoiceForZeroShot.from_pretrained 12 | 13 | # Data loading 14 | read_tensor: 15 | _target_: data.reclor_sentence_prefix.convert_examples_into_features 16 | max_seq_length: 256 17 | num_workers: 2 18 | token_num: 5 19 | 20 | extended_vocab: ${read_tensor.token_num} 21 | 22 | # Data collator 23 | collator: 24 | _target_: data.collators.ReClorSentenceCollator 25 | 26 | # Dataloader 27 | num_workers: 4 28 | prefetch_factor: 2 29 | 30 | # Wiki path pretrain v8.2 31 | model_name_or_path: experiments/roberta.large.wiki_erica_path_v7_v8.2.2.1aug.ctx.1k.2080Ti/checkpoint-500 32 | pretrain: 33 | 34 | output_dir: 35 | 36 | 37 | do_train: True 38 | evaluate_during_training: True 39 | 40 | do_eval: True 41 | eval_sub_path: 42 | 43 | # Training hyper-parameters 44 | per_gpu_train_batch_size: 1 45 | per_gpu_eval_batch_size: 1 46 | learning_rate: 1.5e-5 47 | gradient_accumulation_steps: 12 48 | weight_decay: 0.01 49 | adam_epsilon: 1e-6 50 | adam_betas: "(0.9, 0.98)" 51 | max_grad_norm: 0.0 52 | num_train_epochs: 10 53 | max_steps: 0 54 | warmup_proportion: 0.1 55 | warmup_steps: 0 56 | 57 | 58 | logging_steps: 5 59 | save_steps: -1 60 | save_best: True 61 | eval_steps: 100 62 | no_cuda: False 63 | seed: 42 64 | local_rank: -1 65 | fp16: True 66 | fp16_opt_level: O1 67 | 68 | # Prediction config 69 | prediction_cfg: 70 | metric: "acc" 71 | measure: 1 72 | best_checkpoint: 73 | best_result: 74 | 75 | # fairscale.FullyShardedDP 76 | fairscale_config: 77 | _target_: general_util.fsdp_utils.default_initialize 78 | fp16: ${fp16} 79 | reshard_after_forward: False 80 | cpu_offload: False 81 | move_grads_to_cpu: False 82 | move_params_to_cpu: False 83 | 84 | # Deepspeed config 85 | ds_cfg: 86 | train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size} 87 | gradient_accumulation_steps: ${gradient_accumulation_steps} 88 | optimizer: 89 | type: AdamW 90 | 
def read_json(path):
    """Parse and return the JSON document stored at ``path`` (read as UTF-8)."""
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def write_json(text, path):
    """Serialise ``text`` (any JSON-serialisable object) to ``path`` as UTF-8."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(text, f)
def load_weights(checkpoint_dir, n_layers: int):
    """Collect per-layer checkpoint files into one Hugging Face style state dict.

    Every file in ``checkpoint_dir`` whose name starts with ``layer_`` is loaded
    on CPU and mapped by the integer index that follows the prefix:

    * ``0`` -> ``model.embed_tokens.weight``
    * ``1 .. n_layers`` -> ``model.layers.<index - 1>.<key>``
    * ``n_layers + 1`` -> ``model.norm.weight``
    * ``n_layers + 2`` -> ``lm_head.weight``

    Other files are ignored.

    Args:
        checkpoint_dir: Directory holding the ``layer_*`` files.
        n_layers: Number of transformer layers in the model.

    Returns:
        A dict mapping parameter names to the loaded tensors.

    Raises:
        ValueError: If a ``layer_`` file name has no numeric index.
        AssertionError: If a layer index falls outside the expected range.
    """
    import re

    index_pattern = re.compile(r"layer_(\d+)")
    state_dict = {}
    # Sorted so the log output and insertion order are deterministic.
    for pt in sorted(Path(checkpoint_dir).iterdir()):
        print("Processing ", pt.name)
        if not pt.name.startswith('layer_'):
            continue

        matched = index_pattern.match(pt.name)
        if matched is None:
            raise ValueError(f"Cannot parse a layer index from file name: {pt.name}")
        # Compare integers, not string prefixes, so zero-padded names
        # (e.g. layer_03) are classified correctly for any n_layers.
        file_idx = int(matched.group(1))

        sd = torch.load(pt, map_location="cpu")

        if file_idx == 0:
            print(f"{pt.name} -> model.embed_tokens.weight")
            state_dict["model.embed_tokens.weight"] = sd["weight"]
        elif file_idx == n_layers + 1:
            print(f"{pt.name} -> model.norm.weight")
            state_dict["model.norm.weight"] = sd["weight"]
        elif file_idx == n_layers + 2:
            print(f"{pt.name} -> lm_head.weight")
            state_dict["lm_head.weight"] = sd["weight"]
        else:
            layer_idx = file_idx - 1
            assert 0 <= layer_idx < n_layers, f"Unexpected layer index in {pt.name}"
            for k, v in sd.items():
                state_dict[f"model.layers.{layer_idx}.{k}"] = v
            print(f"{pt.name} -> model.layers.{layer_idx}")
    return state_dict


def write_model(input_base_path, model_size, config_dir):
    """Convert layer-wise checkpoints into Hugging Face checkpoints.

    Args:
        input_base_path: An existing checkpoint directory, or a glob pattern
            matching several of them.
        model_size: Key of ``PARAM_MAP`` that selects the number of layers.
        config_dir: Location of the Hugging Face model config.

    Each converted model is saved next to its checkpoint directory (in the
    parent directory), sharded at 3GB.
    """
    assert model_size in PARAM_MAP
    config = transformers.AutoConfig.from_pretrained(config_dir)
    # Build the model without allocating weights; the real tensors are supplied
    # through ``state_dict`` when saving.
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)

    n_layers = PARAM_MAP[model_size]["n_layers"]

    if os.path.exists(input_base_path):
        checkpoint_dirs = [input_base_path]
    else:
        checkpoint_dirs = glob(input_base_path, recursive=True)
    print(f"Found checkpoints: {checkpoint_dirs}")

    for checkpoint_dir in checkpoint_dirs:
        checkpoint_state_dict = load_weights(checkpoint_dir, n_layers)
        # A bare relative name has an empty parent; fall back to the current directory.
        output_dir = os.path.dirname(checkpoint_dir) or "."
        model.save_pretrained(output_dir, state_dict=checkpoint_state_dict, max_shard_size="3GB")


def main():
    """Parse the command line and run the conversion."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir",
        required=True,
        help="Location of LLaMA weights, which contains tokenizer.model and model folders",
    )
    parser.add_argument(
        "--model_size",
        required=True,
        choices=["7B", "13B", "30B", "65B"],
    )
    parser.add_argument(
        "--config_dir",
        required=True,
    )
    args = parser.parse_args()
    write_model(
        input_base_path=args.input_dir,
        model_size=args.model_size,
        config_dir=args.config_dir,
    )


if __name__ == "__main__":
    main()
22 | "/opt/ml/input/data/train/c4/en/c4-train.00120-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00816-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00780-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00395-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00726-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00273-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00768-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00190-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00499-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00458-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00506-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00061-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00005-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00889-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00442-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00617-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00786-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00187-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00839-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00405-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00555-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00133-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00634-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00434-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00519-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00340-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00810-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00217-of-01024.json.gz", 50 | 
"/opt/ml/input/data/train/c4/en/c4-train.00087-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00302-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_1000.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00226-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00315-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00371-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00280-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00258-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00231-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00018-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00888-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00721-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00020-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00144-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00209-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00789-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00264-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00431-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00811-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00863-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00212-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00862-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00685-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00897-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00974-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00630-of-01024.json.gz", 25 | 
"/opt/ml/input/data/train/c4/en/c4-train.00292-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00198-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00435-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00218-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00981-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00175-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00079-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00662-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00344-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00090-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00918-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00155-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00131-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00576-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00604-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00774-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00659-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00808-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00026-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00115-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00467-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00583-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00933-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00907-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00105-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00869-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00658-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- 
/data/files/c4/en/p50/partition_150.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00951-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00145-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00906-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00934-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00938-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00355-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00784-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00246-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00408-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00955-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00103-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00978-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00197-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00967-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00021-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.01008-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00979-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00922-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00645-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00915-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00432-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00096-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00447-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00491-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00556-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00031-of-01024.json.gz", 28 | 
"/opt/ml/input/data/train/c4/en/c4-train.00062-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00282-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00670-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00698-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00385-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00707-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00837-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00329-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00248-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.01006-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00671-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.01021-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00550-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00615-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00696-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00893-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.01003-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00782-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00007-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00595-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00224-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00969-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00508-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00151-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_200.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00624-of-01024.json.gz", 3 | 
"/opt/ml/input/data/train/c4/en/c4-train.00236-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00104-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00652-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00596-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00720-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00110-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00944-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00607-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00334-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00527-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00270-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00676-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00908-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00445-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00146-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00722-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00693-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00396-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00141-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00269-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00496-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00387-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00745-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00132-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00261-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00647-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00420-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00014-of-01024.json.gz", 31 | 
"/opt/ml/input/data/train/c4/en/c4-train.00665-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00345-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00621-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.01015-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00947-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00094-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.01000-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00574-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00421-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00962-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00481-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00905-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00158-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00866-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00038-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00540-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00551-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00874-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00341-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00140-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00753-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_250.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00750-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00065-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00719-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00704-of-01024.json.gz", 6 | 
"/opt/ml/input/data/train/c4/en/c4-train.00126-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00174-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00950-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00284-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00343-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00727-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00795-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00535-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00899-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00465-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00382-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00697-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00531-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00638-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00180-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00179-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00800-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00208-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00006-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00188-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00827-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00814-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00593-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00870-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00985-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00701-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00348-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00635-of-01024.json.gz", 34 | 
"/opt/ml/input/data/train/c4/en/c4-train.00754-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00960-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00046-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00080-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00059-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00877-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00875-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00452-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00817-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00640-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00759-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00479-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00861-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00758-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00247-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00011-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00572-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00644-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_300.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00735-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00283-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00887-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00288-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00649-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00428-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00173-of-01024.json.gz", 9 | 
"/opt/ml/input/data/train/c4/en/c4-train.00156-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00330-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00833-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00053-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00199-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00377-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00082-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00097-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00279-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00828-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00829-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00771-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00611-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00716-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00483-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00842-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00996-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00290-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00036-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00076-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00687-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00834-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00880-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00176-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00089-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00932-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00793-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00602-of-01024.json.gz", 37 | 
"/opt/ml/input/data/train/c4/en/c4-train.00228-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00675-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00085-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00752-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00715-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00911-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00238-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00919-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00002-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00785-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00636-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00402-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00772-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00459-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00476-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_350.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00904-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00857-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00321-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00957-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00100-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00966-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.01020-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00113-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.01016-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00070-of-01024.json.gz", 12 | 
"/opt/ml/input/data/train/c4/en/c4-train.00557-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00244-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00945-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00181-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00225-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00167-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00946-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00847-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00746-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00891-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00373-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00358-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00807-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00873-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00016-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00129-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00903-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00669-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00272-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00034-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00047-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00660-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00999-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00912-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00308-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00732-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00440-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00012-of-01024.json.gz", 40 | 
"/opt/ml/input/data/train/c4/en/c4-train.00643-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.01019-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00219-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00304-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00253-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00484-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00335-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00507-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00285-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00926-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00325-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00354-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_400.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00311-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00150-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00137-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00299-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00954-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00631-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00404-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00054-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00694-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00767-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00910-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00642-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00600-of-01024.json.gz", 15 | 
"/opt/ml/input/data/train/c4/en/c4-train.00751-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00490-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00948-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00017-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00650-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00987-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00970-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00865-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00522-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00589-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00510-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00468-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00204-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00801-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00456-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00449-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00580-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00418-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00063-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00451-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00028-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00972-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00963-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00512-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00391-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00769-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00900-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00427-of-01024.json.gz", 43 | 
"/opt/ml/input/data/train/c4/en/c4-train.00379-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00613-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00399-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00227-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00853-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00588-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00045-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00189-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00538-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_450.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00859-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00298-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00342-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00372-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00961-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00736-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00639-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00892-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00249-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00610-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00609-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00368-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00381-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00654-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00820-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00832-of-01024.json.gz", 18 | 
"/opt/ml/input/data/train/c4/en/c4-train.00982-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00333-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00501-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00965-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00562-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00517-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00191-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00690-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00806-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00500-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.01010-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.01012-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00590-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00122-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00709-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00000-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00287-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00281-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00986-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00737-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00098-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00241-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00976-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00705-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00876-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00760-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00923-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00713-of-01024.json.gz", 46 | 
"/opt/ml/input/data/train/c4/en/c4-train.00599-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00804-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00894-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00830-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00971-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00584-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_50.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00770-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00679-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00616-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00561-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00184-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00072-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00347-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00949-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00051-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00401-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00567-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00305-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00498-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00896-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00101-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00169-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00984-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00815-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00262-of-01024.json.gz", 21 | 
"/opt/ml/input/data/train/c4/en/c4-train.00164-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00077-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00710-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00723-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00172-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00558-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00993-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00392-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00123-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00790-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00083-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00024-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00792-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00245-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00648-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00939-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00964-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00153-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00867-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00545-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00119-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00509-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00416-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00040-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.01018-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.01002-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00369-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.01013-of-01024.json.gz", 49 | 
"/opt/ml/input/data/train/c4/en/c4-train.00968-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00666-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00102-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_500.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00928-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00068-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00313-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00294-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00851-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00794-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00678-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00681-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00637-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00747-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00935-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00168-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00263-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00223-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00841-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00563-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00787-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00980-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00338-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00286-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00049-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00403-of-01024.json.gz", 24 | 
"/opt/ml/input/data/train/c4/en/c4-train.00393-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00492-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00656-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00840-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00902-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00351-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00378-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00757-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00942-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00441-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00147-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00471-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00015-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00075-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00618-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00818-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00324-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00370-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00413-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00139-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00674-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00114-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00463-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00917-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00154-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00549-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00296-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00419-of-01024.json.gz" 52 | ] 
-------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_550.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00310-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00157-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00469-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00521-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00699-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00730-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00603-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00781-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00777-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00560-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00159-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00032-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00107-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00064-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00346-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00571-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00714-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00135-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00554-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00533-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00071-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00628-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00235-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00487-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00303-of-01024.json.gz", 27 | 
"/opt/ml/input/data/train/c4/en/c4-train.00683-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00565-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00663-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00019-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00052-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00909-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.01009-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00439-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00620-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00766-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00350-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00797-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00526-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00232-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00362-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00060-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00044-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00256-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00196-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00622-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00216-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00242-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00470-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00058-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00363-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_600.json: -------------------------------------------------------------------------------- 1 | [ 2 | 
"/opt/ml/input/data/train/c4/en/c4-train.00843-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00493-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00916-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00983-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00544-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00778-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00318-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00454-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00239-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00220-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00762-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00927-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00532-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.01014-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00257-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00160-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00489-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00048-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00568-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00528-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00692-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00729-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00001-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00623-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00711-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00027-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00265-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00755-of-01024.json.gz", 30 | 
"/opt/ml/input/data/train/c4/en/c4-train.00667-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00921-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00233-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00940-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00937-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00161-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00433-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00171-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00055-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00525-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00230-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00524-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00300-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00193-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00301-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00422-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00809-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00668-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00023-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00924-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00633-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00414-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_650.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.01011-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00819-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.01022-of-01024.json.gz", 5 | 
"/opt/ml/input/data/train/c4/en/c4-train.00995-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00586-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00004-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00763-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00673-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00295-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00423-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00925-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00474-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00183-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00898-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00626-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00084-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00425-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00229-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00826-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00764-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00415-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00121-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00569-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00337-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00530-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00444-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00756-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00601-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00307-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00221-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00182-of-01024.json.gz", 33 | 
"/opt/ml/input/data/train/c4/en/c4-train.00207-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00632-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00186-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00914-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00513-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00448-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00612-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00700-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00254-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00523-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00138-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00731-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00166-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00871-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00920-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00293-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00365-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00546-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00407-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_700.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00682-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00078-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00581-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00424-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00641-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00453-of-01024.json.gz", 8 | 
"/opt/ml/input/data/train/c4/en/c4-train.00608-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00653-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00977-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00992-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00495-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00738-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00312-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00552-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00397-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00267-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00529-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00494-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00566-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00505-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00276-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00619-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00733-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00488-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00486-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00587-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00941-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00821-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00708-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00822-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00366-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00394-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00250-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00389-of-01024.json.gz", 36 | 
"/opt/ml/input/data/train/c4/en/c4-train.00353-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00035-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00202-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00008-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00364-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00717-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00222-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00178-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00240-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00398-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00672-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00278-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00959-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00429-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00592-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00327-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_750.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00686-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00952-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00516-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00460-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00748-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00165-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00309-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00929-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00734-of-01024.json.gz", 11 | 
"/opt/ml/input/data/train/c4/en/c4-train.00956-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00143-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00041-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00022-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00438-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00776-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00170-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00646-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00573-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00997-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00823-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00206-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00547-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00194-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00130-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00067-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00765-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00799-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00466-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00680-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00277-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00885-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00548-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00703-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00374-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00779-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00881-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00749-of-01024.json.gz", 39 | 
"/opt/ml/input/data/train/c4/en/c4-train.00234-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00844-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00352-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00943-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00088-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00195-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00864-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00066-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00743-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00437-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00606-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00728-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00361-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_800.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00095-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00541-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00539-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00142-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00913-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00718-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00446-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00091-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00332-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00625-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00803-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00813-of-01024.json.gz", 14 | 
"/opt/ml/input/data/train/c4/en/c4-train.00975-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00724-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00412-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00582-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00074-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00112-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00883-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00739-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00443-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00266-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00203-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00872-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00211-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00855-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00136-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00695-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00271-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00824-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00367-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00010-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00125-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00436-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00958-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00289-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00742-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00056-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.01001-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00360-of-01024.json.gz", 42 | 
"/opt/ml/input/data/train/c4/en/c4-train.00856-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00989-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00991-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00585-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00534-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00388-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00578-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00383-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00149-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.01005-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_850.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.01023-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00106-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00043-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00314-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00409-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00331-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00850-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00542-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00953-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00316-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00426-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00251-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00594-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00990-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00689-of-01024.json.gz", 17 | 
"/opt/ml/input/data/train/c4/en/c4-train.00725-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00570-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00461-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00805-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00275-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00691-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00860-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00384-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00030-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00237-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00359-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00400-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00386-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00473-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00127-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00684-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00162-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00108-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00320-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00099-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00111-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00375-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00328-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00148-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00117-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00664-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00472-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00124-of-01024.json.gz", 45 | 
"/opt/ml/input/data/train/c4/en/c4-train.00702-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00886-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00651-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00838-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00336-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00274-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00255-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_900.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00025-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00836-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00575-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00319-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00455-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.01007-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00482-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00464-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00214-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00849-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00878-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00163-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00577-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00629-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00661-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00093-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00057-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00477-of-01024.json.gz", 20 | 
"/opt/ml/input/data/train/c4/en/c4-train.00037-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00457-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00973-of-01024.json.gz", 23 | "/opt/ml/input/data/train/c4/en/c4-train.00788-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00848-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00564-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00406-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00109-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00520-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00322-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00380-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00518-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00854-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00092-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00741-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.00515-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00417-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.00326-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00033-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00268-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00835-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00376-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00798-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00744-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00009-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00152-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00116-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00497-of-01024.json.gz", 48 | 
"/opt/ml/input/data/train/c4/en/c4-train.00475-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00003-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00890-of-01024.json.gz", 51 | "/opt/ml/input/data/train/c4/en/c4-train.00936-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /data/files/c4/en/p50/partition_950.json: -------------------------------------------------------------------------------- 1 | [ 2 | "/opt/ml/input/data/train/c4/en/c4-train.00998-of-01024.json.gz", 3 | "/opt/ml/input/data/train/c4/en/c4-train.00657-of-01024.json.gz", 4 | "/opt/ml/input/data/train/c4/en/c4-train.00597-of-01024.json.gz", 5 | "/opt/ml/input/data/train/c4/en/c4-train.00514-of-01024.json.gz", 6 | "/opt/ml/input/data/train/c4/en/c4-train.00390-of-01024.json.gz", 7 | "/opt/ml/input/data/train/c4/en/c4-train.00773-of-01024.json.gz", 8 | "/opt/ml/input/data/train/c4/en/c4-train.00931-of-01024.json.gz", 9 | "/opt/ml/input/data/train/c4/en/c4-train.00858-of-01024.json.gz", 10 | "/opt/ml/input/data/train/c4/en/c4-train.00852-of-01024.json.gz", 11 | "/opt/ml/input/data/train/c4/en/c4-train.00783-of-01024.json.gz", 12 | "/opt/ml/input/data/train/c4/en/c4-train.00994-of-01024.json.gz", 13 | "/opt/ml/input/data/train/c4/en/c4-train.00042-of-01024.json.gz", 14 | "/opt/ml/input/data/train/c4/en/c4-train.00503-of-01024.json.gz", 15 | "/opt/ml/input/data/train/c4/en/c4-train.00260-of-01024.json.gz", 16 | "/opt/ml/input/data/train/c4/en/c4-train.00243-of-01024.json.gz", 17 | "/opt/ml/input/data/train/c4/en/c4-train.00614-of-01024.json.gz", 18 | "/opt/ml/input/data/train/c4/en/c4-train.00706-of-01024.json.gz", 19 | "/opt/ml/input/data/train/c4/en/c4-train.00536-of-01024.json.gz", 20 | "/opt/ml/input/data/train/c4/en/c4-train.00502-of-01024.json.gz", 21 | "/opt/ml/input/data/train/c4/en/c4-train.00039-of-01024.json.gz", 22 | "/opt/ml/input/data/train/c4/en/c4-train.00627-of-01024.json.gz", 23 | 
"/opt/ml/input/data/train/c4/en/c4-train.00118-of-01024.json.gz", 24 | "/opt/ml/input/data/train/c4/en/c4-train.00712-of-01024.json.gz", 25 | "/opt/ml/input/data/train/c4/en/c4-train.00356-of-01024.json.gz", 26 | "/opt/ml/input/data/train/c4/en/c4-train.00845-of-01024.json.gz", 27 | "/opt/ml/input/data/train/c4/en/c4-train.00013-of-01024.json.gz", 28 | "/opt/ml/input/data/train/c4/en/c4-train.00480-of-01024.json.gz", 29 | "/opt/ml/input/data/train/c4/en/c4-train.00605-of-01024.json.gz", 30 | "/opt/ml/input/data/train/c4/en/c4-train.00825-of-01024.json.gz", 31 | "/opt/ml/input/data/train/c4/en/c4-train.00252-of-01024.json.gz", 32 | "/opt/ml/input/data/train/c4/en/c4-train.00185-of-01024.json.gz", 33 | "/opt/ml/input/data/train/c4/en/c4-train.00306-of-01024.json.gz", 34 | "/opt/ml/input/data/train/c4/en/c4-train.00688-of-01024.json.gz", 35 | "/opt/ml/input/data/train/c4/en/c4-train.01017-of-01024.json.gz", 36 | "/opt/ml/input/data/train/c4/en/c4-train.00050-of-01024.json.gz", 37 | "/opt/ml/input/data/train/c4/en/c4-train.01004-of-01024.json.gz", 38 | "/opt/ml/input/data/train/c4/en/c4-train.00740-of-01024.json.gz", 39 | "/opt/ml/input/data/train/c4/en/c4-train.00796-of-01024.json.gz", 40 | "/opt/ml/input/data/train/c4/en/c4-train.00831-of-01024.json.gz", 41 | "/opt/ml/input/data/train/c4/en/c4-train.00485-of-01024.json.gz", 42 | "/opt/ml/input/data/train/c4/en/c4-train.00677-of-01024.json.gz", 43 | "/opt/ml/input/data/train/c4/en/c4-train.00357-of-01024.json.gz", 44 | "/opt/ml/input/data/train/c4/en/c4-train.00537-of-01024.json.gz", 45 | "/opt/ml/input/data/train/c4/en/c4-train.00884-of-01024.json.gz", 46 | "/opt/ml/input/data/train/c4/en/c4-train.00073-of-01024.json.gz", 47 | "/opt/ml/input/data/train/c4/en/c4-train.00297-of-01024.json.gz", 48 | "/opt/ml/input/data/train/c4/en/c4-train.00317-of-01024.json.gz", 49 | "/opt/ml/input/data/train/c4/en/c4-train.00192-of-01024.json.gz", 50 | "/opt/ml/input/data/train/c4/en/c4-train.00323-of-01024.json.gz", 51 | 
"/opt/ml/input/data/train/c4/en/c4-train.00462-of-01024.json.gz" 52 | ] -------------------------------------------------------------------------------- /conf/roberta_split_fact_v1_1.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ./ 4 | 5 | train_file: strategyqa/sub_data/train.json 6 | dev_file: strategyqa/sub_data/dev.json 7 | test_file: strategyqa/sub_data/test.json 8 | 9 | # Model 10 | model: 11 | _target_: models.roberta.RobertaForSequenceClassification.from_pretrained 12 | 13 | # Data loading 14 | read_tensor: 15 | _target_: data.strategy_qa.split_get_tensor_with_gold_para 16 | train_para_file: strategyqa/strategyqa_train_paragraphs.json 17 | max_seq_length: 512 18 | use_fact: True 19 | 20 | extended_vocab: 21 | 22 | # Data collator 23 | collator: 24 | _target_: data.collators.dict2dict.MetaCollator 25 | 26 | # Dataloader 27 | num_workers: 4 28 | prefetch_factor: 2 29 | 30 | # Wiki path pretrain v8.2 31 | model_name_or_path: pretrained-models/roberta-large 32 | pretrain: 33 | 34 | output_dir: experiments/strategy_qa.roberta.large.w_fact.w1.A40.v1.1.s${seed} 35 | 36 | do_train: True 37 | evaluate_during_training: True 38 | 39 | do_eval: True 40 | eval_sub_path: 41 | 42 | # Training hyper-parameters 43 | per_gpu_train_batch_size: 32 44 | per_gpu_eval_batch_size: 32 45 | learning_rate: 1e-5 46 | #learning_rate: 5e-6 47 | gradient_accumulation_steps: 1 48 | weight_decay: 0.01 49 | adam_epsilon: 1e-6 50 | adam_betas: "(0.9, 0.98)" 51 | #adam_betas: "(0.9, 0.999)" 52 | max_grad_norm: 0.0 53 | #max_grad_norm: 1.0 54 | num_train_epochs: 20 55 | max_steps: 0 56 | warmup_proportion: 0.1 57 | warmup_steps: 0 58 | 59 | 60 | logging_steps: 5 61 | save_steps: -1 62 | save_best: True 63 | eval_steps: 100 64 | ddp_eval: True 65 | no_cuda: False 66 | seed: 42 67 | local_rank: -1 68 | fp16: True 69 | fp16_opt_level: O1 70 | 71 | # Prediction config 72 | prediction_cfg: 73 | metric: "acc" 74 | measure: 1 75 | 
best_checkpoint: 76 | best_result: 77 | generator: False 78 | post_process: 79 | 80 | # fairscale.FullyShardedDP 81 | fairscale_config: 82 | _target_: general_util.fsdp_utils.default_initialize 83 | fp16: ${fp16} 84 | reshard_after_forward: False 85 | move_grads_to_cpu: False 86 | move_params_to_cpu: False 87 | 88 | # Deepspeed config 89 | ds_cfg: 90 | train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size} 91 | gradient_accumulation_steps: ${gradient_accumulation_steps} 92 | optimizer: 93 | type: AdamW 94 | params: 95 | lr: ${learning_rate} 96 | betas: [0.9, 0.999] 97 | eps: ${adam_epsilon} 98 | weight_decay: ${weight_decay} 99 | scheduler: 100 | type: WarmupDecayLR 101 | params: 102 | total_num_steps: 103 | warmup_max_lr: ${learning_rate} 104 | warmup_num_steps: 105 | warmup_type: linear 106 | gradient_clipping: ${max_grad_norm} 107 | fp16: 108 | enabled: ${fp16} 109 | initial_scale_power: 12 110 | zero_optimization: 111 | stage: 3 112 | # offload_optimizer: 113 | # device: cpu 114 | # pin_memory: True 115 | # offload_param: 116 | # device: cpu 117 | # pin_memory: True 118 | # activation_checkpointing: 119 | # partition_activations: True 120 | # cpu_checkpointing: True 121 | # contiguous_memory_optimization: False 122 | # number_checkpoints: False 123 | # synchronize_checkpoint_boundary: False 124 | # profile: False 125 | steps_per_print: 1024 126 | 127 | summary_helper: 128 | _target_: general_util.tensorboard_helper.SummaryWriterHelper 129 | batch_index_or_keys: 130 | outputs_index_or_keys: 131 | 132 | # Temporary variables 133 | n_gpu: 134 | device: 135 | train_batch_size: 136 | eval_batch_size: 137 | world_size: 138 | -------------------------------------------------------------------------------- /general_util/lightseq_utils.py: -------------------------------------------------------------------------------- 1 | from omegaconf import DictConfig 2 | 3 | from general_util.logger import get_child_logger 4 | 5 | from 
class LSHFTransformerEncoderLayer(LSTransformerEncoderLayer):
    """
    ``LSTransformerEncoderLayer`` whose ``forward`` tolerates extra arguments and returns a 4-tuple.

    Any additional positional / keyword arguments passed to ``forward`` are ignored, and the
    layer output is returned as the first element of ``(output, None, None, None)``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
        # NOTE(review): presumably the incoming mask is additive (0 for kept tokens, -10000 for
        # padded ones), so the division maps it to 0 / 1 -- confirm against the caller.
        # ``squeeze()`` removes every singleton dimension of the rescaled mask.
        padding_mask = (encoder_padding_mask / -10000.0).squeeze()
        layer_output = super().forward(hidden_states, padding_mask)
        return layer_output, None, None, None
def extract_sent_tokens(source: Tensor, sentence_index: Tensor, sent_token_mask: Tensor, sentence_ids: Tensor, sentence_ids_mask: Tensor):
    """
    Gather, for every selected sentence, the entries of ``source`` at that sentence's token positions.

    :param source: [batch, seq_len]
    :param sentence_index: [batch, max_sent_num, max_sent_len]
    :param sent_token_mask: [batch, max_sent_num, max_sent_len]
    :param sentence_ids: [batch, path_len]
    :param sentence_ids_mask: [batch, path_len]
    :return: ``(tokens, mask)``, both [batch, path_len, max_sent_len]. ``mask`` is the
        element-wise ``&`` of the gathered token mask and the (broadcast) sentence mask.
    """
    sent_len = sentence_index.size(-1)
    out_shape = (sentence_index.size(0), sentence_ids.size(1), sent_len)

    # Repeat the per-sentence ids / mask along the token axis: [batch, path_len, max_sent_len]
    sent_selector = sentence_ids.unsqueeze(-1).expand(-1, -1, sent_len)
    path_mask = sentence_ids_mask.unsqueeze(-1).expand(-1, -1, sent_len)

    # Pick the rows of the selected sentences.
    token_positions = torch.gather(sentence_index, dim=1, index=sent_selector)
    token_mask = torch.gather(sent_token_mask, dim=1, index=sent_selector)

    # Flatten to [batch, path_len * max_sent_len] so the positions can index ``source`` along dim 1.
    flat_positions = token_positions.reshape(out_shape[0], -1)
    tokens = torch.gather(source, dim=1, index=flat_positions).reshape(out_shape)

    return tokens, token_mask & path_mask
def get_precision_recall(logits: Tensor, labels: Tensor, pad_id: int = -1, positive_id: int = 1):
    """
    Compute precision and recall of the ``positive_id`` class, counted along ``dim=1`` and averaged.

    :param logits: [..., num_classes]; the prediction is the arg-max over the last dimension.
    :param labels: same shape as ``logits`` without its last dimension. It needs at least two
        dimensions because the counts are reduced over ``dim=1``. Positions equal to ``pad_id``
        are excluded from every count.
    :param pad_id: label value marking padded positions.
    :param positive_id: class id treated as the positive class.
    :return: ``(precision, recall, labels.size(0))``. When no label differs from ``pad_id``
        the result is ``(0., 0., 0)`` so that callers can always unpack three values.
    """
    assert logits.size()[:-1] == labels.size()

    valid = labels != pad_id
    if valid.sum().item() == 0:
        # Keep the arity identical to the regular return below.
        return 0., 0., 0

    _, pred = logits.max(dim=-1)

    # Restrict both sides to non-padded positions with an explicit mask. Overwriting padded
    # entries with a constant would mis-count them whenever that constant equals ``positive_id``.
    pred_positive = (pred == positive_id) & valid
    label_positive = (labels == positive_id) & valid

    tp = (pred_positive & label_positive).sum(dim=1)

    # Rows without any predicted positive would divide by zero; they contribute 0 to the mean.
    tp_fp = pred_positive.sum(dim=1)
    precision = tp / tp_fp
    precision.masked_fill_(tp_fp == 0, 0)
    precision = precision.mean().item()

    # Likewise, rows without any positive label contribute 0 recall.
    tp_fn = label_positive.sum(dim=1)
    recall = tp / tp_fn
    recall.masked_fill_(tp_fn == 0, 0)
    recall = recall.mean().item()

    return precision, recall, labels.size(0)
max_seq_length: 2048 44 | tokenizer: ${model_name_or_path} 45 | decoder_only: True 46 | collator: 47 | 48 | # Dataloader 49 | num_workers: 4 50 | prefetch_factor: 2 51 | 52 | do_preprocess: False 53 | 54 | # Wiki path pretrain v8.2 55 | model_name_or_path: /home/tianze/other/llama-7b-hf 56 | pretrain: 57 | 58 | output_dir: /home/tianze/other/llama-7b-hf 59 | 60 | do_train: False 61 | evaluate_during_training: False 62 | 63 | do_eval: True 64 | eval_sub_path: 65 | 66 | # Training hyper-parameters 67 | per_gpu_train_batch_size: 1 68 | per_gpu_eval_batch_size: 1 69 | learning_rate: 1e-4 70 | gradient_accumulation_steps: 512 71 | weight_decay: 0.00 72 | adam_epsilon: 1e-6 73 | adam_betas: "(0.9, 0.95)" 74 | max_grad_norm: 5.0 75 | num_train_epochs: 1 76 | max_steps: 0 77 | warmup_proportion: 0.05 78 | warmup_steps: 0 79 | 80 | # Optimizer 81 | optimizer: 82 | use_nvlamb: 83 | bit_training: 84 | 85 | 86 | logging_steps: 1 87 | #save_best: True 88 | save_best: False 89 | save_steps: 100 90 | eval_steps: 100 91 | ddp_eval: True 92 | no_cuda: False 93 | seed: 42 94 | local_rank: -1 95 | fp16: True 96 | fp16_opt_level: O1 97 | fp16_bfloat16: True 98 | 99 | # Prediction config 100 | prediction_cfg: 101 | metric: "acc" 102 | measure: 1 103 | best_checkpoint: 104 | best_result: 105 | eval_forward_fn: 106 | _target_: general_util.evaluator.DiscriminatorForwardFn 107 | post_process: 108 | 109 | #dist_init: 110 | # _target_: general_util.dist_utils.setup_slurm_distributed 111 | 112 | 113 | # fairscale.FullyShardedDP 114 | fairscale_config: 115 | # _target_: general_util.fsdp_utils.recursive_initialize 116 | # _target_: general_util.fsdp_utils.default_initialize 117 | # _target_: general_util.fsdp_utils.default_initialize_v2 118 | _target_: general_util.torch_fsdp_utils.torch_fsdp_transformer_init 119 | fp16: ${fp16} 120 | # move_grads_to_cpu: False 121 | # move_params_to_cpu: False 122 | # flatten_parameters: False 123 | fp16_bfloat16: ${fp16_bfloat16} 124 | cpu_offload: False 
125 | # disable_reshard_on_root: False 126 | 127 | 128 | # Lightseq config 129 | with_lightseq: False 130 | 131 | 132 | summary_helper: 133 | _target_: general_util.tensorboard_helper.SummaryWriterHelper 134 | batch_index_or_keys: 135 | # "train/pair_value_num": pair_value_num 136 | # "train/pair_label_num": pair_label_num 137 | # "train/dropped_op_cnt": dropped_op_cnt 138 | # "train/invalid_path": invalid_path 139 | outputs_index_or_keys: 140 | # "train/mlm_loss": mlm_loss 141 | # "train/cls_loss": cls_loss 142 | # "train/tagging_loss": tagging_loss 143 | # "train/path_gen_loss": path_gen_loss 144 | 145 | # Temporary variables 146 | n_gpu: 147 | device: 148 | train_batch_size: 149 | eval_batch_size: 150 | world_size: 151 | -------------------------------------------------------------------------------- /conf/llama/wiki/llama_7b_flan_v1_0.yaml: -------------------------------------------------------------------------------- 1 | hydra: 2 | run: 3 | dir: ./ 4 | 5 | train_file: ../research.data/flan_v2_shuffle/*.pt 6 | dev_file: 7 | test_file: 8 | 9 | # Model 10 | model: 11 | _target_: models.llama.LlamaForConditionalGeneration.from_pretrained 12 | use_peft: False 13 | # lora_config: 14 | # _target_: models.llama.LoraConfig 15 | # task_type: CAUSAL_LM 16 | # inference_mode: False 17 | # target_modules: ["q_proj", "v_proj"] 18 | # r: 8 19 | # lora_alpha: 16 20 | # lora_dropout: 0.1 21 | 22 | #model_eval: 23 | # _target_: models.llama.LlamaForMultipleChoiceCausalLM.from_pretrained_peft_eval 24 | # base_model_name_or_path: ${model_name_or_path} 25 | 26 | 27 | # Data loading 28 | read_tensor: 29 | _target_: data.collators.flan.FlanCollectionGroupDataset 30 | 31 | 32 | extended_vocab: 33 | 34 | # Data collator 35 | collator: 36 | _target_: data.collators.flan.FlanCollatorOverCollator 37 | collator: 38 | max_seq_length: 1024 39 | tokenizer: pretrained-models/LLaMA/llama-7b 40 | decoder_only: True 41 | 42 | # Dataloader 43 | num_workers: 4 44 | prefetch_factor: 2 45 | 46 | 
do_preprocess: False 47 | 48 | # Wiki path pretrain v8.2 49 | model_name_or_path: pretrained-models/LLaMA/llama-7b 50 | pretrain: 51 | 52 | output_dir: experiments/llama.7b.flan.v1.0.seq1024.w8.adamw.500steps.NA100.0401 53 | 54 | do_train: True 55 | evaluate_during_training: False 56 | 57 | do_eval: True 58 | eval_sub_path: checkpoint-* 59 | 60 | # Training hyper-parameters 61 | per_gpu_train_batch_size: 1 62 | per_gpu_eval_batch_size: 1 63 | learning_rate: 1e-4 64 | gradient_accumulation_steps: 512 65 | weight_decay: 0.00 66 | adam_epsilon: 1e-6 67 | adam_betas: "(0.9, 0.999)" 68 | max_grad_norm: 1.0 69 | num_train_epochs: 1 70 | total_dataset_len: 760000000 71 | max_steps: 0 72 | warmup_proportion: 0 73 | warmup_steps: 5000 74 | 75 | # Optimizer 76 | optimizer: 77 | use_nvlamb: 78 | bit_training: 79 | 80 | 81 | logging_steps: 1 82 | #save_best: True 83 | save_best: False 84 | save_steps: 500 85 | eval_steps: 500 86 | ddp_eval: True 87 | no_cuda: False 88 | seed: 42 89 | local_rank: -1 90 | fp16: True 91 | fp16_opt_level: O1 92 | fp16_bfloat16: True 93 | 94 | # Prediction config 95 | prediction_cfg: 96 | metric: "acc" 97 | measure: 1 98 | best_checkpoint: 99 | best_result: 100 | eval_forward_fn: 101 | _target_: general_util.evaluator.DiscriminatorForwardFn 102 | post_process: 103 | 104 | #dist_init: 105 | # _target_: general_util.dist_utils.setup_slurm_distributed 106 | 107 | 108 | # fairscale.FullyShardedDP 109 | fairscale_config: 110 | # _target_: general_util.fsdp_utils.recursive_initialize 111 | _target_: general_util.fsdp_utils.default_initialize 112 | # _target_: general_util.fsdp_utils.default_initialize_v2 113 | # _target_: general_util.torch_fsdp_utils.torch_fsdp_transformer_init 114 | # _target_: general_util.torch_fsdp_utils.torch_fsdp_auto_wrap 115 | fp16: ${fp16} 116 | move_grads_to_cpu: False 117 | move_params_to_cpu: False 118 | flatten_parameters: False 119 | # fp16_bfloat16: ${fp16_bfloat16} 120 | # cpu_offload: False 121 | # 
disable_reshard_on_root: False 122 | 123 | 124 | # Lightseq config 125 | with_lightseq: False 126 | 127 | 128 | summary_helper: 129 | _target_: general_util.tensorboard_helper.SummaryWriterHelper 130 | batch_index_or_keys: 131 | # "train/pair_value_num": pair_value_num 132 | # "train/pair_label_num": pair_label_num 133 | # "train/dropped_op_cnt": dropped_op_cnt 134 | # "train/invalid_path": invalid_path 135 | outputs_index_or_keys: 136 | # "train/mlm_loss": mlm_loss 137 | # "train/cls_loss": cls_loss 138 | # "train/tagging_loss": tagging_loss 139 | # "train/path_gen_loss": path_gen_loss 140 | 141 | # Temporary variables 142 | n_gpu: 143 | device: 144 | train_batch_size: 145 | eval_batch_size: 146 | world_size: 147 | --------------------------------------------------------------------------------