├── models
├── __init__.py
├── mpt
│ ├── generation_config.json
│ ├── special_tokens_map.json
│ ├── tokenizer_config.json
│ ├── custom_embedding.py
│ ├── config.json
│ ├── adapt_tokenizer.py
│ ├── blocks.py
│ └── norm.py
└── patching_utils.py
├── modules
├── __init__.py
├── logits_processor.py
└── layers.py
├── general_util
├── __init__.py
├── tokenization_utils.py
├── logger.py
├── mixin.py
├── average_meter.py
├── dist_utils.py
└── lightseq_utils.py
├── data
├── preprocessor
│ ├── __init__.py
│ └── mmlu_merge.py
├── __init__.py
├── files
│ ├── wudao
│ │ ├── partition_000.json
│ │ └── file_samples_50.json
│ └── c4
│ │ └── en
│ │ ├── partition_1100.json
│ │ ├── p25
│ │ ├── partition_1025.json
│ │ ├── partition_100.json
│ │ ├── partition_1000.json
│ │ ├── partition_125.json
│ │ ├── partition_150.json
│ │ ├── partition_175.json
│ │ ├── partition_200.json
│ │ ├── partition_225.json
│ │ ├── partition_25.json
│ │ ├── partition_250.json
│ │ ├── partition_275.json
│ │ ├── partition_300.json
│ │ ├── partition_325.json
│ │ ├── partition_350.json
│ │ ├── partition_375.json
│ │ ├── partition_400.json
│ │ ├── partition_425.json
│ │ ├── partition_450.json
│ │ ├── partition_475.json
│ │ ├── partition_50.json
│ │ ├── partition_500.json
│ │ ├── partition_525.json
│ │ ├── partition_550.json
│ │ ├── partition_575.json
│ │ ├── partition_600.json
│ │ ├── partition_625.json
│ │ ├── partition_650.json
│ │ ├── partition_675.json
│ │ ├── partition_700.json
│ │ ├── partition_725.json
│ │ ├── partition_75.json
│ │ ├── partition_750.json
│ │ ├── partition_775.json
│ │ ├── partition_800.json
│ │ ├── partition_825.json
│ │ ├── partition_850.json
│ │ ├── partition_875.json
│ │ ├── partition_900.json
│ │ ├── partition_925.json
│ │ ├── partition_950.json
│ │ └── partition_975.json
│ │ └── p50
│ │ ├── partition_1050.json
│ │ ├── partition_100.json
│ │ ├── partition_1000.json
│ │ ├── partition_150.json
│ │ ├── partition_200.json
│ │ ├── partition_250.json
│ │ ├── partition_300.json
│ │ ├── partition_350.json
│ │ ├── partition_400.json
│ │ ├── partition_450.json
│ │ ├── partition_50.json
│ │ ├── partition_500.json
│ │ ├── partition_550.json
│ │ ├── partition_600.json
│ │ ├── partition_650.json
│ │ ├── partition_700.json
│ │ ├── partition_750.json
│ │ ├── partition_800.json
│ │ ├── partition_850.json
│ │ ├── partition_900.json
│ │ └── partition_950.json
├── test.py
├── flan_combine.py
├── collators
│ └── __init__.py
├── flan_sample.py
└── strategy_qa.py
├── panda_logo.PNG
├── requirements.txt
├── post_processors
├── dist_mixin.py
└── bleu.py
├── seed_multi_run.sh
├── make_delta.py
├── .gitignore
├── conf
├── base.yaml
├── roberta_split_fact_v1_1.yaml
└── llama
│ └── wiki
│ ├── test.yaml
│ └── llama_7b_flan_v1_0.yaml
└── convert2hf.py
/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/modules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/general_util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/preprocessor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Write your own datasets under this directory.
3 | """
--------------------------------------------------------------------------------
/panda_logo.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dandelionsllm/pandallm/HEAD/panda_logo.PNG
--------------------------------------------------------------------------------
/models/mpt/generation_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_from_model_config": true,
3 | "transformers_version": "4.28.1",
4 | "use_cache": false
5 | }
6 |
--------------------------------------------------------------------------------
/models/mpt/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "bos_token": "<|endoftext|>",
3 | "eos_token": "<|endoftext|>",
4 | "unk_token": "<|endoftext|>"
5 | }
6 |
--------------------------------------------------------------------------------
/models/mpt/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "add_prefix_space": false,
3 | "bos_token": "<|endoftext|>",
4 | "clean_up_tokenization_spaces": true,
5 | "eos_token": "<|endoftext|>",
6 | "model_max_length": 8192,
7 | "tokenizer_class": "GPTNeoXTokenizer",
8 | "unk_token": "<|endoftext|>"
9 | }
10 |
--------------------------------------------------------------------------------
/data/files/wudao/partition_000.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/baike2018/baike2018qa_train.json",
3 | "/opt/ml/input/data/train/news_2016/news2016_train.json",
4 | "/opt/ml/input/data/train/translate/translate_train.json",
5 | "/opt/ml/input/data/train/webtext/web_text_2019_train.json",
6 | "/opt/ml/input/data/train/wikizh/wikizh.json"
7 | ]
--------------------------------------------------------------------------------
/models/mpt/custom_embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch import Tensor
5 |
6 |
class SharedEmbedding(nn.Embedding):
    """Embedding table whose weight can also serve as the output projection."""

    def forward(self, input: Tensor, unembed: bool = False) -> Tensor:
        """Look up embeddings, or project with the same weight when unembedding.

        Args:
            input: Token ids for the lookup path, or the tensor to project
                when ``unembed`` is true.
            unembed: When true, return ``F.linear(input, self.weight)``
                instead of performing the embedding lookup.

        Returns:
            The result of the embedding lookup or of the linear projection.
        """
        if not unembed:
            return super().forward(input)
        return F.linear(input, self.weight)
13 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wandb
2 | nltk
3 | tensorboard
4 | sentencepiece
5 | https://download.pytorch.org/whl/cu117/torch-2.0.1%2Bcu117-cp39-cp39-linux_x86_64.whl
6 | hydra-core
7 | fairscale
8 | deepspeed==0.9.5
9 | datasets
10 | bitsandbytes
11 | transformers
12 | git+https://github.com/huggingface/peft.git
13 | git+https://github.com/huggingface/accelerate.git
14 | einops
15 | tensor-parallel
--------------------------------------------------------------------------------
/post_processors/dist_mixin.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as dist
2 | from typing import List, Any
3 |
4 |
class DistGatherMixin:
    """Mixin offering a helper to collect Python objects on rank 0."""

    def gather(self):
        """Placeholder hook; does nothing in this base mixin."""
        pass

    @staticmethod
    def gather_object(objects: List[Any]):
        """Gather ``objects`` from every rank onto rank 0.

        Returns:
            On rank 0, a list with one entry per rank (sized by
            ``dist.get_world_size()``); ``None`` on every other rank.
        """
        world_size = dist.get_world_size()
        gathered = [None] * world_size
        is_destination = dist.get_rank() == 0

        # Only the destination rank passes a receive list.
        dist.gather_object(
            objects,
            object_gather_list=gathered if is_destination else None,
            dst=0,
        )

        return gathered if is_destination else None
20 |
--------------------------------------------------------------------------------
/data/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | from torch.utils.data import Dataset
3 |
4 |
class TestDataset(Dataset):
    """Dummy dataset that returns the same fixed sentence for every index."""

    def __init__(self, file_path, tokenizer, pseudo_dataset_len: int = -1):
        """Create the dataset.

        Args:
            file_path: Accepted but not used by this class.
            tokenizer: Accepted but not used by this class.
            pseudo_dataset_len: Reported length when positive; otherwise a
                fixed large length is reported.
        """
        super().__init__()
        self.data = ["My name is Jiao Fangkai."]
        self.pseudo_dataset_len = pseudo_dataset_len

    def __len__(self):
        """Return the configured pseudo length, or 100000000 if not positive."""
        return self.pseudo_dataset_len if self.pseudo_dataset_len > 0 else 100000000

    def __getitem__(self, index):
        """Return the fixed sentence as both input and target, plus ``index``."""
        sentence = self.data[0]
        return {
            "flan": {"inputs": sentence, "targets": sentence},
            "index": index,
        }
25 |
--------------------------------------------------------------------------------
/seed_multi_run.sh:
--------------------------------------------------------------------------------
# Run trainer_base_fsdp_v3.py once per seed.
#
# Usage: seed_multi_run.sh <conf> <conf_name> <num_rank> <gpu> <port> <seed>...
#   conf       value passed to the trainer's -cp option
#   conf_name  value passed to the trainer's -cn option
#   num_rank   processes per node; values > 1 launch via torch.distributed.run
#   gpu        value exported as CUDA_VISIBLE_DEVICES
#   port       master port for torch.distributed.run
#   seed...    every argument after the fifth is used as one seed
conf=$1
conf_name=$2
num_rank=$3
gpu=$4
port=$5

count=1
for arg in "$@"; do
    # The first five positional arguments are settings, not seeds.
    if [ "$count" -gt 5 ]; then
        if [ "$num_rank" -gt 1 ]; then
            echo "CUDA_VISIBLE_DEVICES=$gpu python -m torch.distributed.run --nproc_per_node $num_rank --master_port $port trainer_base_fsdp_v3.py -cp $conf -cn $conf_name seed=${arg}"

            # Expansions are quoted so values with spaces or glob characters stay intact.
            CUDA_VISIBLE_DEVICES="$gpu" python -m torch.distributed.run --nproc_per_node "$num_rank" --master_port "$port" trainer_base_fsdp_v3.py -cp "$conf" -cn "$conf_name" "seed=${arg}"
        else
            echo "CUDA_VISIBLE_DEVICES=$gpu python trainer_base_fsdp_v3.py -cp $conf -cn $conf_name seed=${arg}"

            CUDA_VISIBLE_DEVICES="$gpu" python trainer_base_fsdp_v3.py -cp "$conf" -cn "$conf_name" "seed=${arg}"
        fi
    fi
    count=$((count + 1))
done
22 |
23 |
--------------------------------------------------------------------------------
/data/flan_combine.py:
--------------------------------------------------------------------------------
# File names of the FLAN collection shards, arranged in groups.
# NOTE(review): the first two entries are lists of file names while the
# remaining entries are bare strings — confirm that callers handle both forms.
data_group = [
    # Group: chain-of-thought (cot) and niv2 shards.
    [
        "cot_fs_noopt_train.jsonl.gz",
        "cot_fs_opt_train.jsonl.gz",
        "cot_zs_noopt_train.jsonl.gz",
        "cot_zs_opt_train.jsonl.gz",
        "niv2_fs_noopt_train.jsonl.gz",
        "niv2_fs_opt_train.jsonl.gz",
        "niv2_zs_noopt_train.jsonl.gz",
        "niv2_zs_opt_train.jsonl.gz",
    ],
    # Group: zero-shot dialog shards.
    [
        "dialog_zs_noopt_train.jsonl.gz",
        "dialog_zs_opt_train.jsonl.gz",
    ],
    # Stand-alone entries.
    "dialog_fs_noopt_train.jsonl.gz",
    "dialog_fs_opt_train.jsonl.gz",
    "flan_fs_noopt_train.jsonl.gz",
    "flan_fs_opt_train_part1.jsonl.gz",
    "flan_fs_opt_train_part2.jsonl.gz",
    "flan_fs_opt_train_part3.jsonl.gz",
    "flan_zs_noopt_train.jsonl.gz",
    "flan_zs_opt_train.jsonl.gz",
    "t0_fs_noopt_train.jsonl.gz",
    "t0_zs_noopt_train.jsonl.gz",
    "t0_zs_opt_train.jsonl.gz",
]


def obtain_flan_collection_group():
    """Return the module-level ``data_group`` list (the same object, not a copy)."""
    return data_group
32 |
--------------------------------------------------------------------------------
/modules/logits_processor.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers.generation_logits_process import LogitsProcessor
3 |
4 | from modules.trie import Trie
5 |
6 |
class TrieConstrainedLogitsProcessor(LogitsProcessor):
    """Logits processor that penalises every token not returned by a trie lookup.

    For each sequence in the batch, the token ids returned by
    ``trie.get(prefix)`` keep their scores; all other scores are lowered by
    10000.
    """

    def __init__(self, trie: Trie, sent_mode: bool = False):
        """Store the trie and, in sentence mode, its separator token id.

        Args:
            trie: Object providing ``get(sequence)`` and, for sentence mode,
                a non-``None`` ``sep_token_id``.
            sent_mode: When true, only the tokens after the last separator
                token are used as the lookup prefix.
        """
        self.trie = trie
        # If `sent_mode` is `True`, please ensure that each sentence in the trie has two copies:
        # one ending with the separator token (NOTE(review): the token text was lost from the
        # original comment; presumably `trie.sep_token_id` — confirm) and one ending with
        # `</s>` (eos token).
        self.sent_mode = sent_mode
        if sent_mode:
            assert self.trie.sep_token_id is not None
            self.sep_token_id = self.trie.sep_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Add a -10000 penalty to every score whose token id the trie does not return.

        Args:
            input_ids: Batch of already generated token ids; converted with
                ``tolist()`` and processed one sequence at a time.
            scores: Scores indexed as ``[sequence, token_id]``.

        Returns:
            ``scores`` plus a mask that is 0.0 for returned token ids and
            -10000.0 elsewhere.
        """
        sequence_ls = input_ids.tolist()
        scores_mask = scores.new_zeros(scores.size()).fill_(-10000.0)
        for seq_id, seq in enumerate(sequence_ls):
            if self.sent_mode:
                # Scan backwards for the last separator and keep only what follows it.
                for idx in range(len(seq) - 1, -1, -1):
                    if seq[idx] == self.sep_token_id:
                        seq = seq[(idx + 1):]
                        # Stop here: `seq` is now shorter, so continuing with the
                        # old indices would raise IndexError.
                        break
            # Presumably the token ids allowed after this prefix — verify against Trie.get.
            output = self.trie.get(seq)
            scores_mask[seq_id, output] = 0.0
        return scores + scores_mask
28 |
--------------------------------------------------------------------------------
/data/preprocessor/mmlu_merge.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from tqdm import tqdm
4 | import warnings
5 | warnings.simplefilter(action='ignore', category=FutureWarning)
6 |
7 |
def merge_data(data_dir, type):
    """Merge every CSV under ``{data_dir}/{type}`` into ``{data_dir}/{type}.csv``.

    Each source row is read without a header. Column 0 is the question,
    columns 1-4 are the four options and the last column is the answer. The
    merged file has an ``inputs`` column (question followed by the options
    labelled ``A.``-``D.``, joined by newlines) and a ``targets`` column.

    Args:
        data_dir: Directory containing the split sub-directory; the merged
            CSV is also written here.
        type: Name of the split sub-directory (e.g. ``'test'``). The name
            shadows the builtin but is kept for caller compatibility.

    Raises:
        ValueError: If the split directory contains no ``.csv`` file.
    """
    # Sorted so the merged row order does not depend on filesystem listing order.
    files = sorted(f for f in os.listdir(f'{data_dir}/{type}') if f.endswith('.csv'))
    if not files:
        raise ValueError(f'No .csv files found in {data_dir}/{type}')
    dfs = [pd.read_csv(f'{data_dir}/{type}/{f}', header=None) for f in files]
    df_all = pd.concat(dfs)

    option_names = ['A', 'B', 'C', 'D']
    # Collect rows in plain lists and build the frame once: `DataFrame.append`
    # was removed in pandas 2.0 and copied the whole frame on every call.
    inputs, targets = [], []
    for i in tqdm(range(df_all.shape[0])):
        query, options = df_all.iloc[i, 0], df_all.iloc[i, 1:5].values
        options = [f'{name}. {opt}' for name, opt in zip(option_names, options)]
        inputs.append('\n'.join([query] + options))
        targets.append(df_all.iloc[i, -1])

    df_new = pd.DataFrame({'inputs': inputs, 'targets': targets})
    df_new.to_csv(f'{data_dir}/{type}.csv')
    print(f'{type} dataset merged successfully ...')
26 |
27 |
28 |
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the dataset directory;
    # the previous hard-coded location remains the default.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else '/home/tianze/datasets/MMLU'

    # Merge each split in turn.
    for split in ['test', 'val', 'dev', 'auxiliary_train']:
        merge_data(data_dir, split)
34 |
--------------------------------------------------------------------------------
/models/mpt/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "MPTForCausalLM"
4 | ],
5 | "attn_config": {
6 | "alibi": true,
7 | "alibi_bias_max": 8,
8 | "attn_impl": "torch",
9 | "attn_pdrop": 0,
10 | "attn_type": "multihead_attention",
11 | "attn_uses_sequence_id": false,
12 | "clip_qkv": null,
13 | "prefix_lm": false,
14 | "qk_ln": false,
15 | "softmax_scale": null
16 | },
17 | "auto_map": {
18 | "AutoConfig": "configuration_mpt.MPTConfig",
19 | "AutoModelForCausalLM": "modeling_mpt.MPTForCausalLM"
20 | },
21 | "d_model": 7168,
22 | "emb_pdrop": 0,
23 | "embedding_fraction": 1.0,
24 | "expansion_ratio": 4,
25 | "init_config": {
26 | "emb_init_std": null,
27 | "emb_init_uniform_lim": null,
28 | "fan_mode": "fan_in",
29 | "init_div_is_residual": true,
30 | "init_gain": 0.0,
31 | "init_nonlinearity": "relu",
32 | "init_std": null,
33 | "name": "kaiming_normal_",
34 | "verbose": 0
35 | },
36 | "init_device": "cpu",
37 | "learned_pos_emb": true,
38 | "logit_scale": null,
39 | "max_seq_len": 8192,
40 | "model_type": "mpt",
41 | "n_heads": 64,
42 | "n_layers": 48,
43 | "no_bias": true,
44 | "norm_type": "low_precision_layernorm",
45 | "resid_pdrop": 0,
46 | "tokenizer_name": "EleutherAI/gpt-neox-20b",
47 | "torch_dtype": "bfloat16",
48 | "transformers_version": "4.28.1",
49 | "use_cache": false,
50 | "verbose": 0,
51 | "vocab_size": 50432
52 | }
53 |
--------------------------------------------------------------------------------
/data/files/wudao/file_samples_50.json:
--------------------------------------------------------------------------------
1 | [
2 | "part-2021022097.json",
3 | "part-2021023489.json",
4 | "part-2021012504.json",
5 | "part-2021022428.json",
6 | "part-2021012526.json",
7 | "part-2021023008.json",
8 | "part-2021022959.json",
9 | "part-2021024569.json",
10 | "part-2021023736.json",
11 | "part-2021024167.json",
12 | "part-2021022328.json",
13 | "part-2021021914.json",
14 | "part-2021013704.json",
15 | "part-2021022050.json",
16 | "part-2021012514.json",
17 | "part-2021023855.json",
18 | "part-2021016902.json",
19 | "part-2021022805.json",
20 | "part-2021022364.json",
21 | "part-2021021957.json",
22 | "part-2021014840.json",
23 | "part-2021022605.json",
24 | "part-2021023247.json",
25 | "part-2021022649.json",
26 | "part-2021020076.json",
27 | "part-2021016146.json",
28 | "part-2021024834.json",
29 | "part-2021012506.json",
30 | "part-2021021896.json",
31 | "part-2021012713.json",
32 | "part-2021022694.json",
33 | "part-2021023747.json",
34 | "part-2021012518.json",
35 | "part-2021023507.json",
36 | "part-2021019390.json",
37 | "part-2021017289.json",
38 | "part-2021023649.json",
39 | "part-2021023020.json",
40 | "part-2021012510.json",
41 | "part-2021023741.json",
42 | "part-2021012613.json",
43 | "part-2021023988.json",
44 | "part-2021013835.json",
45 | "part-2021021921.json",
46 | "part-2021022921.json",
47 | "part-2021020428.json",
48 | "part-2021023078.json",
49 | "part-2021012502.json",
50 | "part-2021022891.json",
51 | "part-2021022198.json"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/partition_1100.json:
--------------------------------------------------------------------------------
1 | ["/opt/ml/input/data/train/c4/en/c4-train.00649-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00231-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00779-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00808-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00789-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00846-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00503-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00060-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00082-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00256-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00387-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00138-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00930-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00163-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00622-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00056-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00349-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00461-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00282-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00673-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00224-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00098-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00179-of-01024.json.gz", "/opt/ml/input/data/train/c4/en/c4-train.00080-of-01024.json.gz"]
--------------------------------------------------------------------------------
/data/collators/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Write your own collators under this directory.
3 | """
4 |
5 | from typing import Dict, Union, Any, List
6 |
7 | import torch
8 | from torch import Tensor
9 | from torch.utils.data import Dataset
10 | from torch.utils.data.dataloader import default_collate
11 | from transformers.tokenization_utils import BatchEncoding
12 |
13 |
class DictTensorDataset(Dataset):
    """Dataset over a dict of equally sized tensors with optional per-item meta data."""

    def __init__(self, data: Union[Dict[str, Tensor], BatchEncoding], meta_data: List[Dict[str, Any]] = None):
        """Store the data and check that all fields have the same length.

        Args:
            data: Mapping from field name to a tensor indexed along dim 0.
            meta_data: Optional list with one dict per item; when given, every
                field must have the same length as this list.
        """
        self.data = data
        self.meta_data = meta_data
        self.keys = list(self.data.keys())
        for v in self.data.values():
            if meta_data is not None:
                assert len(v) == len(meta_data)
            else:
                assert len(v) == self.data[self.keys[0]].size(0)

    def __len__(self):
        """Return the size of the first field along dim 0."""
        return self.data[self.keys[0]].size(0)

    def __getitem__(self, idx):
        """Return the ``idx``-th slice of every field, plus meta data and index.

        ``"index"`` is set to ``torch.LongTensor([idx])`` unless the data
        already has an ``"index"`` field and either no meta data exists or
        the item's meta data also contains ``"index"``.
        """
        res = {k: v[idx] for k, v in self.data.items()}
        meta = None
        if self.meta_data is not None:
            meta = self.meta_data[idx]
            res["meta_data"] = meta
        # Only consult the meta data when it exists; the previous check read
        # res["meta_data"] unconditionally and raised KeyError for datasets
        # without meta data whose data already had an "index" field.
        # NOTE(review): when meta data exists but lacks "index", an "index"
        # field from the data is still overwritten, as before — confirm intent.
        if "index" not in res or (meta is not None and "index" not in meta):
            res["index"] = torch.LongTensor([idx])
        return res
35 |
36 |
class MetaCollator:
    """Collate a batch with ``default_collate`` while keeping ``meta_data`` as a plain list."""

    def __call__(self, batch):
        """Collate ``batch``.

        If the first item has a ``"meta_data"`` key, that key is popped from
        every item (mutating the items), the remaining fields are collated,
        and the popped values are attached as a list under ``"meta_data"``.
        Otherwise the batch is passed straight to ``default_collate``.
        """
        if "meta_data" in batch[0]:
            metas = [item.pop("meta_data") for item in batch]
            collated = default_collate(batch)
            collated["meta_data"] = metas
            return collated

        return default_collate(batch)
46 |
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_1025.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00480-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00605-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00825-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00252-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00185-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00306-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00688-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.01017-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00050-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.01004-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00740-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00796-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00831-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00485-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00677-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00357-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00537-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00884-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00073-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00297-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00317-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00192-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00323-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00462-of-01024.json.gz"
26 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_1050.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00791-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00553-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00213-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00177-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00134-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00128-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00069-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00846-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00339-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00802-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00205-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00291-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00868-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00029-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00579-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00761-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00930-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00201-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00879-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00478-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00259-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00543-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00215-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00504-of-01024.json.gz"
26 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_100.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00435-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00218-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00981-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00175-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00079-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00662-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00344-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00090-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00918-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00155-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00131-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00576-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00604-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00774-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00659-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00808-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00026-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00115-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00467-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00583-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00933-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00907-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00105-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00869-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00658-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_1000.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00657-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00597-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00514-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00390-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00773-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00931-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00858-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00852-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00783-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00994-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00042-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00503-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00260-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00243-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00614-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00706-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00536-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00502-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00039-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00627-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00118-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00712-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00356-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00845-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00013-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_125.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00791-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00553-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00213-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00177-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00134-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00128-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00069-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00846-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00339-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00802-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00205-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00291-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00868-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00029-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00579-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00761-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00930-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00201-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00879-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00478-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00259-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00543-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00215-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00504-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00951-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_150.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00145-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00906-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00934-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00938-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00355-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00784-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00246-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00408-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00955-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00103-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00978-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00197-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00967-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00021-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.01008-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00979-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00922-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00645-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00915-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00432-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00096-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00447-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00491-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00556-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00031-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_175.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00062-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00282-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00670-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00698-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00385-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00707-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00837-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00329-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00248-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.01006-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00671-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.01021-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00550-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00615-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00696-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00893-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.01003-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00782-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00007-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00595-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00224-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00969-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00508-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00151-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00624-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_200.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00236-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00104-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00652-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00596-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00720-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00110-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00944-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00607-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00334-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00527-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00270-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00676-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00908-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00445-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00146-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00722-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00693-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00396-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00141-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00269-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00496-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00387-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00745-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00132-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00261-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_225.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00647-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00420-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00014-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00665-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00345-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00621-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.01015-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00947-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00094-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.01000-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00574-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00421-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00962-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00481-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00905-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00158-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00866-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00038-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00540-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00551-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00874-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00341-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00140-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00753-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00750-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_25.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00086-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00895-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00988-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00200-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00081-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00210-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00411-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00812-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00410-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00775-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00591-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00901-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00349-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00655-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00511-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00450-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00430-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00882-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00559-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00598-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00120-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00816-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00780-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00395-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00726-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_250.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00065-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00719-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00704-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00126-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00174-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00950-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00284-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00343-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00727-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00795-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00535-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00899-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00465-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00382-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00697-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00531-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00638-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00180-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00179-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00800-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00208-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00006-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00188-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00827-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00814-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_275.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00593-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00870-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00985-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00701-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00348-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00635-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00754-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00960-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00046-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00080-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00059-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00877-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00875-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00452-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00817-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00640-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00759-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00479-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00861-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00758-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00247-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00011-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00572-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00644-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00735-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_300.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00283-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00887-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00288-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00649-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00428-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00173-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00156-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00330-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00833-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00053-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00199-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00377-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00082-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00097-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00279-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00828-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00829-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00771-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00611-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00716-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00483-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00842-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00996-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00290-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00036-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_325.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00076-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00687-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00834-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00880-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00176-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00089-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00932-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00793-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00602-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00228-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00675-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00085-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00752-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00715-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00911-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00238-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00919-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00002-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00785-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00636-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00402-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00772-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00459-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00476-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00904-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_350.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00857-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00321-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00957-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00100-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00966-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.01020-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00113-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.01016-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00070-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00557-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00244-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00945-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00181-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00225-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00167-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00946-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00847-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00746-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00891-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00373-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00358-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00807-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00873-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00016-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00129-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_375.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00903-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00669-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00272-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00034-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00047-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00660-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00999-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00912-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00308-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00732-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00440-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00012-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00643-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.01019-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00219-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00304-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00253-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00484-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00335-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00507-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00285-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00926-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00325-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00354-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00311-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_400.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00150-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00137-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00299-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00954-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00631-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00404-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00054-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00694-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00767-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00910-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00642-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00600-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00751-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00490-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00948-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00017-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00650-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00987-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00970-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00865-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00522-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00589-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00510-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00468-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00204-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_425.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00801-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00456-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00449-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00580-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00418-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00063-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00451-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00028-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00972-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00963-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00512-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00391-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00769-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00900-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00427-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00379-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00613-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00399-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00227-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00853-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00588-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00045-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00189-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00538-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00859-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_450.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00298-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00342-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00372-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00961-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00736-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00639-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00892-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00249-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00610-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00609-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00368-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00381-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00654-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00820-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00832-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00982-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00333-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00501-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00965-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00562-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00517-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00191-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00690-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00806-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00500-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_475.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.01010-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.01012-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00590-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00122-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00709-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00000-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00287-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00281-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00986-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00737-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00098-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00241-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00976-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00705-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00876-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00760-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00923-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00713-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00599-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00804-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00894-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00830-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00971-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00584-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00770-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_50.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00273-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00768-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00190-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00499-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00458-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00506-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00061-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00005-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00889-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00442-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00617-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00786-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00187-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00839-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00405-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00555-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00133-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00634-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00434-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00519-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00340-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00810-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00217-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00087-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00302-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_500.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00679-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00616-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00561-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00184-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00072-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00347-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00949-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00051-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00401-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00567-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00305-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00498-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00896-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00101-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00169-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00984-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00815-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00262-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00164-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00077-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00710-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00723-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00172-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00558-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00993-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_525.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00392-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00123-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00790-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00083-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00024-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00792-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00245-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00648-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00939-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00964-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00153-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00867-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00545-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00119-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00509-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00416-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00040-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.01018-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.01002-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00369-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.01013-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00968-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00666-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00102-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00928-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_550.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00068-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00313-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00294-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00851-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00794-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00678-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00681-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00637-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00747-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00935-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00168-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00263-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00223-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00841-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00563-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00787-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00980-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00338-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00286-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00049-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00403-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00393-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00492-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00656-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00840-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_575.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00902-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00351-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00378-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00757-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00942-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00441-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00147-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00471-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00015-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00075-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00618-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00818-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00324-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00370-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00413-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00139-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00674-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00114-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00463-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00917-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00154-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00549-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00296-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00419-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00310-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_600.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00157-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00469-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00521-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00699-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00730-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00603-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00781-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00777-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00560-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00159-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00032-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00107-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00064-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00346-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00571-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00714-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00135-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00554-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00533-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00071-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00628-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00235-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00487-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00303-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00683-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_625.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00565-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00663-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00019-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00052-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00909-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.01009-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00439-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00620-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00766-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00350-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00797-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00526-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00232-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00362-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00060-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00044-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00256-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00196-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00622-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00216-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00242-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00470-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00058-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00363-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00843-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_650.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00493-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00916-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00983-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00544-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00778-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00318-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00454-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00239-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00220-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00762-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00927-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00532-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.01014-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00257-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00160-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00489-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00048-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00568-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00528-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00692-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00729-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00001-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00623-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00711-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00027-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_675.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00265-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00755-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00667-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00921-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00233-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00940-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00937-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00161-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00433-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00171-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00055-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00525-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00230-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00524-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00300-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00193-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00301-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00422-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00809-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00668-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00023-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00924-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00633-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00414-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.01011-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_700.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00819-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.01022-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00995-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00586-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00004-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00763-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00673-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00295-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00423-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00925-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00474-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00183-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00898-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00626-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00084-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00425-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00229-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00826-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00764-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00415-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00121-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00569-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00337-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00530-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00444-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_725.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00756-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00601-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00307-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00221-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00182-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00207-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00632-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00186-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00914-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00513-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00448-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00612-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00700-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00254-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00523-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00138-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00731-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00166-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00871-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00920-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00293-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00365-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00546-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00407-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00682-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_75.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00226-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00315-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00371-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00280-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00258-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00231-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00018-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00888-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00721-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00020-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00144-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00209-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00789-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00264-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00431-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00811-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00863-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00212-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00862-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00685-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00897-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00974-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00630-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00292-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00198-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_750.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00078-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00581-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00424-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00641-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00453-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00608-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00653-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00977-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00992-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00495-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00738-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00312-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00552-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00397-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00267-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00529-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00494-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00566-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00505-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00276-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00619-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00733-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00488-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00486-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00587-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_775.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00941-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00821-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00708-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00822-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00366-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00394-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00250-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00389-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00353-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00035-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00202-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00008-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00364-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00717-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00222-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00178-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00240-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00398-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00672-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00278-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00959-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00429-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00592-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00327-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00686-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_800.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00952-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00516-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00460-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00748-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00165-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00309-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00929-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00734-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00956-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00143-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00041-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00022-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00438-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00776-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00170-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00646-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00573-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00997-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00823-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00206-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00547-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00194-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00130-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00067-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00765-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_825.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00799-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00466-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00680-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00277-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00885-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00548-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00703-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00374-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00779-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00881-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00749-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00234-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00844-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00352-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00943-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00088-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00195-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00864-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00066-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00743-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00437-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00606-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00728-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00361-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00095-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_850.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00541-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00539-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00142-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00913-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00718-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00446-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00091-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00332-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00625-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00803-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00813-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00975-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00724-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00412-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00582-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00074-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00112-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00883-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00739-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00443-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00266-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00203-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00872-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00211-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00855-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_875.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00136-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00695-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00271-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00824-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00367-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00010-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00125-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00436-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00958-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00289-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00742-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00056-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.01001-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00360-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00856-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00989-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00991-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00585-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00534-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00388-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00578-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00383-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00149-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.01005-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.01023-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_900.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00106-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00043-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00314-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00409-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00331-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00850-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00542-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00953-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00316-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00426-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00251-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00594-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00990-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00689-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00725-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00570-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00461-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00805-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00275-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00691-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00860-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00384-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00030-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00237-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00359-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_925.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00400-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00386-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00473-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00127-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00684-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00162-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00108-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00320-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00099-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00111-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00375-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00328-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00148-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00117-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00664-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00472-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00124-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00702-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00886-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00651-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00838-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00336-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00274-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00255-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00025-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_950.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00836-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00575-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00319-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00455-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.01007-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00482-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00464-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00214-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00849-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00878-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00163-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00577-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00629-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00661-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00093-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00057-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00477-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00037-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00457-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00973-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00788-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00848-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00564-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00406-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00109-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p25/partition_975.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00520-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00322-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00380-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00518-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00854-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00092-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00741-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00515-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00417-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00326-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00033-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00268-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00835-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00376-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00798-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00744-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00009-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00152-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00116-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00497-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00475-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00003-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00890-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00936-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00998-of-01024.json.gz"
27 | ]
--------------------------------------------------------------------------------
/general_util/tokenization_utils.py:
--------------------------------------------------------------------------------
1 | from transformers import PreTrainedTokenizer
2 | import os
3 |
4 | from data.data_utils import tokenizer_get_name
5 | from general_util.logger import get_child_logger
6 |
# Fallback special-token strings used by `expand_special_tokenizer` when the
# tokenizer lacks the token and no environment override is supplied.
DEFAULT_PAD_TOKEN = "[PAD]"
# NOTE(review): the three defaults below are empty strings, so a tokenizer that
# is missing one of these tokens would be given "" as its special token. They
# look like angle-bracket literals that were lost somewhere upstream - confirm
# the intended values before relying on them.
DEFAULT_EOS_TOKEN = ""
DEFAULT_BOS_TOKEN = ""
DEFAULT_UNK_TOKEN = ""
11 |
12 | logger = get_child_logger(__name__)
13 |
14 |
def expand_special_tokenizer(tokenizer: PreTrainedTokenizer):
    """Register missing / overridden special tokens on a LLaMA tokenizer.

    Nothing happens unless ``tokenizer_get_name(tokenizer)`` contains
    ``"llama"``. For matching tokenizers:

    * ``EOS_TOKEN`` / ``BOS_TOKEN`` environment variables, when non-empty,
      always replace the tokenizer's token; otherwise the module default is
      used only if the tokenizer has no such token.
    * ``UNK_TOKEN`` / ``PAD_TOKEN`` environment variables are consulted only
      when the tokenizer has no such token; the module default is the fallback.

    The collected mapping is passed to ``tokenizer.add_special_tokens`` and the
    tokenizer plus its pad token id are logged.
    """
    if "llama" not in tokenizer_get_name(tokenizer):
        return

    additions = {}

    # EOS / BOS: an environment override wins even when the token already exists.
    eos_override = os.environ.get("EOS_TOKEN", None)
    if eos_override:
        additions["eos_token"] = eos_override
    elif not tokenizer.eos_token:
        additions["eos_token"] = DEFAULT_EOS_TOKEN

    bos_override = os.environ.get("BOS_TOKEN", None)
    if bos_override:
        additions["bos_token"] = bos_override
    elif not tokenizer.bos_token:
        additions["bos_token"] = DEFAULT_BOS_TOKEN

    # UNK / PAD: only filled in when the tokenizer does not define them.
    unk_override = os.environ.get("UNK_TOKEN", None)
    if not tokenizer.unk_token:
        additions["unk_token"] = unk_override or DEFAULT_UNK_TOKEN

    pad_override = os.environ.get("PAD_TOKEN", None)
    if not tokenizer.pad_token:
        additions["pad_token"] = pad_override or DEFAULT_PAD_TOKEN

    tokenizer.add_special_tokens(special_tokens_dict=additions)

    logger.info(tokenizer)
    logger.info(f"PAD TOKEN ID = {tokenizer.pad_token_id}")
43 |
--------------------------------------------------------------------------------
/post_processors/bleu.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Any
2 |
3 | from nltk import word_tokenize
4 | from nltk.translate.bleu_score import sentence_bleu
5 | from torch import distributed as dist
6 |
7 | from post_processors.dist_mixin import DistGatherMixin
8 |
9 |
class BLEUMetric(DistGatherMixin):
    """Collect generated sequences batch by batch and score them with sentence BLEU.

    Call the instance once per batch to record predictions, then call
    ``get_results`` to obtain the averaged score and the raw records.
    """

    def __init__(self):
        # One dict per example: {"source", "target", "prediction"}.
        self.predictions = []

    def __call__(self, meta_data: List[Dict[str, Any]], batch_model_outputs: Dict[str, Any], ddp: bool = False):
        """Record the predictions of one batch.

        Args:
            meta_data: One dict per example; ``"src"`` is required, ``"tgt"`` is
                optional and a missing / empty target is stored as ``""``.
            batch_model_outputs: Must contain ``"generated_seq"``, which is zipped
                with ``meta_data`` (presumably one decoded string per example).
            ddp: When true, the batch records are gathered through
                ``self.gather_object``; rank 0 stores the flattened gathered
                records while other ranks keep only their local ones.
        """
        pred_seq = batch_model_outputs["generated_seq"]
        predictions = [
            {
                "source": item["src"],
                # Missing or falsy targets are normalised to an empty string.
                "target": item.get("tgt") or "",
                "prediction": pred,
            }
            for item, pred in zip(meta_data, pred_seq)
        ]

        if ddp:
            gather_res = self.gather_object(predictions)
            if dist.get_rank() == 0:
                # assumes gather_object returns one list of records per rank - TODO confirm
                predictions = [record for rank_records in gather_res for record in rank_records]

        self.predictions.extend(predictions)

    def get_results(self):
        """Return ``({"bleu": mean sentence BLEU}, recorded predictions)``.

        With no recorded predictions the score is reported as ``0.0`` instead of
        failing with a division by zero.
        """
        if not self.predictions:
            return {"bleu": 0.0}, self.predictions

        total = sum(
            sentence_bleu([word_tokenize(pred["target"])], word_tokenize(pred["prediction"]))
            for pred in self.predictions
        )
        bleu = total * 1.0 / len(self.predictions)

        return {"bleu": bleu}, self.predictions
53 |
--------------------------------------------------------------------------------
/general_util/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 |
5 | _root_name = 'FK'
6 |
7 |
def get_child_logger(child_name):
    """Return the logger named ``"<_root_name>.<child_name>"``.

    The name is dot-joined so the returned logger is a child of the logger
    named ``_root_name`` in the ``logging`` hierarchy. (An earlier variant that
    also embedded the local rank in the name is intentionally not used.)
    """
    qualified_name = '.'.join((_root_name, child_name))
    return logging.getLogger(qualified_name)
15 |
16 |
def setting_logger(log_file: str, local_rank: int = -1):
    """Configure logging for a run and return the project root logger.

    Args:
        log_file: Slash-separated path; every component after the first is
            joined with ``-`` to build the log file name
            ``./log_dir/<name>-output.log``.
        local_rank: ``-1`` or ``0`` logs at INFO; any other rank logs at
            WARNING.

    Returns:
        The logger named ``_root_name`` with a file handler attached.
    """
    model_name = "-".join(log_file.replace('/', ' ').split()[1:])

    log_format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
    date_format = '%m/%d/%Y %H:%M:%S'
    level = logging.INFO if local_rank in [-1, 0] else logging.WARNING

    # basicConfig installs the console handler on the root logger, so no extra
    # stream handler is created here (the previous one was built but never
    # attached to any logger).
    logging.basicConfig(format=log_format, datefmt=date_format, level=level)

    logger = logging.getLogger(_root_name)
    logger.setLevel(level)

    output_dir = './log_dir'
    # exist_ok avoids the check-then-create race when several ranks start at once.
    os.makedirs(output_dir, exist_ok=True)

    f_handler = logging.FileHandler(os.path.join(output_dir, model_name + '-output.log'))
    f_handler.setLevel(logging.INFO)
    f_handler.setFormatter(logging.Formatter(fmt=log_format, datefmt=date_format))

    logger.addHandler(f_handler)
    return logger
46 |
--------------------------------------------------------------------------------
/models/mpt/adapt_tokenizer.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 | from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
3 | Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
4 | NUM_SENTINEL_TOKENS: int = 100
5 |
def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
    """Adds sentinel tokens and padding token (if missing).

    Expands the tokenizer vocabulary to include sentinel tokens
    used in mixture-of-denoiser tasks as well as a padding token.

    All added tokens are added as special tokens. No tokens are
    added if sentinel tokens and padding token already exist.

    Side effects on ``tokenizer``: may grow the vocabulary, may set
    ``pad_token``, and always sets ``sentinel_token_ids`` to the ids obtained
    by tokenizing the concatenated sentinel strings.
    """
    # The sentinel / pad literals had been reduced to empty strings, which
    # registered 100 empty tokens and tokenized an empty string below.
    sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
    if tokenizer.pad_token is None:
        tokenizer.add_tokens('<pad>', special_tokens=True)
        tokenizer.pad_token = '<pad>'
        assert tokenizer.pad_token_id is not None
    # Tokenize all sentinels in one call to recover their ids in order.
    sentinels = ''.join(sentinels_to_add)
    _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
    tokenizer.sentinel_token_ids = _sentinel_token_ids
24 |
class AutoTokenizerForMOD(AutoTokenizer):
    """AutoTokenizer that applies the MOD (mixture-of-denoisers) adaptation.

    Thin convenience wrapper: the tokenizer is loaded exactly as
    ``AutoTokenizer`` would load it and is then passed through
    ``adapt_tokenizer_for_denoising``, which adds sentinel tokens, ensures a
    padding token, and sets a ``sentinel_token_ids`` attribute.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """See `AutoTokenizer.from_pretrained` docstring."""
        adapted = super(AutoTokenizerForMOD, cls).from_pretrained(*args, **kwargs)
        # Adaptation mutates the tokenizer in place.
        adapt_tokenizer_for_denoising(adapted)
        return adapted
--------------------------------------------------------------------------------
/general_util/mixin.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from typing import Dict, List, Tuple
3 |
4 | import torch
5 |
6 | from general_util.average_meter import LogMetric, AverageMeter
7 | from general_util.logger import get_child_logger
8 |
9 | logger = get_child_logger("Mixin")
10 |
11 |
class LogMixin:
    """Mixin that owns a ``LogMetric`` tracker and renders it for logging."""

    # Stays None until `init_metric` is called.
    eval_metrics: LogMetric = None

    def init_metric(self, *metric_names):
        """Create the tracker for the given metric names."""
        self.eval_metrics = LogMetric(*metric_names)

    def get_eval_log(self, reset=False, ddp=False, device='cpu'):
        """Return ``(tab-separated "name: value" string, results mapping)``.

        Args:
            reset: Reset the tracker after reading it.
            ddp: Call ``gather(device=device)`` on every tracked metric first.
            device: Forwarded to each metric's ``gather``.

        A warning is logged when the tracker was never initialised; the
        subsequent attribute access then fails as before.
        """
        tracker = self.eval_metrics

        if tracker is None:
            logger.warning("The `eval_metrics` attribute hasn't been initialized.")

        if ddp:
            for meter in tracker.metrics.values():
                meter.gather(device=device)

        results = tracker.get_log()
        rendered = '\t'.join(f"{k}: {v}" for k, v in results.items())

        if reset:
            tracker.reset()

        return rendered, results
35 |
36 |
class MetricMixin:
    """Mixin holding a name-indexed table of metric specifications."""

    # TODO: how can hydra be used to decouple the metric computation from the model?
    def __init__(self, metrics: List[Tuple[str, str, str, str]]):
        """Build ``self.metrics`` from ``(key, val, func, name)`` tuples.

        Each entry is stored under ``name`` together with a fresh
        ``AverageMeter``; a repeated ``name`` replaces the earlier entry.
        """
        table = {}
        for key, val, func, name in metrics:
            table[name] = {
                "key": key,
                "val": val,
                "func": func,
                "meter": AverageMeter(),
            }
        self.metrics = table
48 |
49 |
class PredictionMixin:
    """Mixin that accumulates prediction tensors as plain Python lists per key.

    Each instance owns its accumulator. Previously the class-level
    ``defaultdict`` was mutated directly until ``reset_predict_tensors`` had
    been called, so values leaked between instances (and subclasses).
    """

    # Class-level default kept so the attribute still exists on the class;
    # instances replace it with their own dict on first use.
    tensor_dict: Dict[str, List] = defaultdict(list)

    def _own_tensor_dict(self):
        """Return this instance's accumulator, creating it on first use."""
        if "tensor_dict" not in self.__dict__:
            self.tensor_dict = defaultdict(list)
        return self.tensor_dict

    def reset_predict_tensors(self):
        """Discard everything accumulated so far on this instance."""
        self.tensor_dict = defaultdict(list)

    def concat_predict_tensors(self, **tensors: torch.Tensor):
        """Append each tensor's values (detached, moved to CPU, as a list) under its keyword."""
        store = self._own_tensor_dict()
        for k, v in tensors.items():
            store[k].extend(v.detach().cpu().tolist())

    def get_predict_tensors(self):
        """Return the mapping from key to the accumulated list of values."""
        return self._own_tensor_dict()
62 |
--------------------------------------------------------------------------------
/make_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Code is modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/model/make_delta.py.
3 |
4 | Make the delta weights by subtracting base weights.
5 |
6 | Usage:
python3 make_delta.py --base-model-path ~/model_weights/llama-13b --target-model-path ~/model_weights/vicuna-13b --delta-path ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
8 | """
9 | import argparse
10 |
11 | import torch
12 | from tqdm import tqdm
13 | from transformers import AutoTokenizer, AutoModelForCausalLM
14 |
15 |
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id=None):
    """Write ``target - base`` weights (plus the target tokenizer) to ``delta_path``.

    Every parameter of the target model is reduced in place by the matching
    base parameter, except those whose name contains ``embed_tokens`` or
    ``lm_head.weight``, which are saved unchanged.

    Args:
        base_model_path: Path / id of the base model.
        target_model_path: Path / id of the target model; its tokenizer is
            saved alongside the delta.
        delta_path: Output directory.
        hub_repo_id: Optional Hub repository to push to. When omitted, the
            module-level ``args.hub_repo_id`` is used if the script's argument
            namespace exists, so command-line behaviour is unchanged while the
            function no longer fails when imported and called directly.

    Raises:
        ValueError: If a parameter cannot be subtracted (the original error is
            chained as the cause).
    """
    if hub_repo_id is None:
        # Script-mode fallback; absent when this module is imported.
        hub_repo_id = getattr(globals().get("args"), "hub_repo_id", None)

    print(f"Loading the base model from {base_model_path}")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, low_cpu_mem_usage=True
    )

    print(f"Loading the target model from {target_model_path}")
    target = AutoModelForCausalLM.from_pretrained(
        target_model_path, low_cpu_mem_usage=True
    )
    target_tokenizer = AutoTokenizer.from_pretrained(
        target_model_path, use_fast=False
    )

    print("Calculating the delta")
    # Build the base state dict once instead of once (or twice) per parameter.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        assert name in base_state, name
        if "embed_tokens" in name or "lm_head.weight" in name:
            continue
        try:
            param.data -= base_state[name]
        except Exception as err:
            raise ValueError(
                f"Failed to subtract base weights for parameter {name!r}"
            ) from err

    print(f"Saving the delta to {delta_path}")
    kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} if hub_repo_id else {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer.save_pretrained(delta_path, **kwargs)
48 |
49 |
if __name__ == "__main__":
    # Command-line entry point: three required paths plus an optional Hub repo id.
    parser = argparse.ArgumentParser()
    for required_flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        parser.add_argument(required_flag, type=str, required=True)
    parser.add_argument("--hub-repo-id", type=str)
    # `args` stays a module-level name because make_delta reads it for the Hub id.
    args = parser.parse_args()

    make_delta(
        args.base_model_path,
        args.target_model_path,
        args.delta_path,
    )
59 |
--------------------------------------------------------------------------------
/data/flan_sample.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gzip
3 | import json
4 | import os.path
5 |
6 | import torch
7 |
8 | cot_zs_submix = (
9 | ("cot_zs_noopt_train.jsonl.gz", 1),
10 | ("cot_zs_opt_train.jsonl.gz", 1),
11 | )
12 |
13 | dialog_zs_submix = (
14 | ("dialog_zs_noopt_train.jsonl.gz", 1),
15 | ("dialog_zs_opt_train.jsonl.gz", 1),
16 | )
17 |
18 | flan_zs_submix = (
19 | ("flan_zs_noopt_train.jsonl.gz", 1),
20 | ("flan_zs_opt_train.jsonl.gz", 1),
21 | )
22 |
23 | niv2_zs_submix = (
24 | ("niv2_zs_noopt_train.jsonl.gz", 1),
25 | ("niv2_zs_opt_train.jsonl.gz", 1),
26 | )
27 |
28 | t0_zs_submix = (
29 | ("t0_zs_noopt_train.jsonl.gz", 1),
30 | ("t0_zs_opt_train.jsonl.gz", 1),
31 | )
32 |
33 | flan_v2_submix = (
34 | (flan_zs_submix, 0.4), # mixing weight = 40%
35 | (t0_zs_submix, 0.32), # mixing weight = 32%
36 | (niv2_zs_submix, 0.2), # mixing weight = 20%
37 | (cot_zs_submix, 0.05), # mixing weight = 5%
38 | (dialog_zs_submix, 0.03), # mixing weight = 3%
39 | )
40 |
if __name__ == '__main__':
    # Sample a fixed-size FLAN v2 mixture: each submix receives
    # total_data_num * ratio examples, split across its files in proportion to
    # the per-file weights, taking the first N lines of every file.
    from itertools import islice

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_dir", type=str, default="data")
    parser.add_argument("--total_data_num", type=int, default=500000)
    parser.add_argument("--output_file", type=str, default="data")
    args = parser.parse_args()

    all_data = []
    for dataset, ratio in flan_v2_submix:
        data_num = int(args.total_data_num * ratio)
        weight_sum = sum(sub_ratio for _, sub_ratio in dataset)

        for file, sub_ratio in dataset:
            sub_data_num = int(data_num * sub_ratio / weight_sum)

            # The context manager closes the archive even if a line fails to
            # parse; islice stops after the quota without reading further.
            with gzip.open(os.path.join(args.input_dir, file), "rt") as sub_data_f:
                sub_data = [json.loads(line) for line in islice(sub_data_f, sub_data_num)]
            print(f"Read {len(sub_data)} lines from {file}")

            # A zero quota or an empty file yields no samples; skip the preview
            # instead of failing with IndexError.
            if sub_data:
                print(sub_data[0])
                print(sub_data[-1])
            all_data.extend(sub_data)

    print(f"Total data num: {len(all_data)}")
    torch.save(all_data, args.output_file)
77 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/data/strategy_qa.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import torch
4 | from transformers import PreTrainedTokenizer
5 | from transformers.tokenization_utils import TruncationStrategy, PaddingStrategy
6 |
7 | from data.collators.dict2dict import DictTensorDataset
8 | from general_util.logger import get_child_logger
9 |
10 | logger = get_child_logger(__name__)
11 |
12 |
def split_get_tensor_with_gold_para(file_path: str, tokenizer: PreTrainedTokenizer,
                                    train_para_file: str, max_seq_length: int, use_fact: bool = False):
    """Tokenize (context, question) pairs with labels into a ``DictTensorDataset``.

    Args:
        file_path: JSON file holding a list of items with ``"question"``,
            ``"answer"`` and either ``"facts"`` or ``"evidence"``.
        tokenizer: Tokenizer applied to the (context, question) pairs.
        train_para_file: JSON mapping from paragraph id to an object with a
            ``"content"`` field.
        max_seq_length: Truncation length passed to the tokenizer.
        use_fact: Use ``item["facts"]`` as the context instead of the gold
            evidence paragraphs.

    Returns:
        A ``DictTensorDataset`` built from the tokenizer output plus a
        ``"labels"`` tensor.

    Gold paragraphs are de-duplicated in first-seen order, so the concatenated
    context (and therefore what gets truncated) is the same on every run; the
    previous ``set`` iteration order could change between interpreter runs.
    """
    # Context managers so the file handles are closed deterministically.
    with open(file_path) as data_f:
        data = json.load(data_f)
    with open(train_para_file) as para_f:
        train_paragraphs = json.load(para_f)

    text_inputs_a = []
    text_inputs_b = []
    labels = []
    for item in data:
        question = item["question"]
        label = int(item["answer"])

        if use_fact:
            paragraphs = item["facts"]
        else:
            # dict keys act as an insertion-ordered set of paragraph ids.
            para_ids = {}
            for evidence in item["evidence"]:
                for annotation in evidence:
                    for evi_item in annotation:
                        if isinstance(evi_item, list):
                            for para_id in evi_item:
                                if para_id in train_paragraphs:
                                    para_ids[para_id] = None
                                else:
                                    logger.warning(f"Cannot find paragraph with id: {para_id}")
                        else:
                            # Non-list entries are expected to be one of these markers.
                            assert evi_item in ["operation", "no_evidence"], evi_item
            paragraphs = [train_paragraphs[para_id]["content"] for para_id in para_ids]

        context = " ".join(paragraphs)

        text_inputs_a.append(context)
        text_inputs_b.append(question)
        labels.append(label)

    model_inputs = tokenizer(text_inputs_a,
                             text_pair=text_inputs_b,
                             truncation=TruncationStrategy.LONGEST_FIRST,
                             padding=PaddingStrategy.LONGEST,
                             max_length=max_seq_length,
                             return_tensors="pt")
    model_inputs["labels"] = torch.tensor(labels, dtype=torch.long)

    dataset = DictTensorDataset(model_inputs)

    logger.info(f"Max seq length: {model_inputs['input_ids'].size(1)}")

    return dataset
63 |
--------------------------------------------------------------------------------
/models/mpt/blocks.py:
--------------------------------------------------------------------------------
1 | """GPT Blocks used for the GPT Model."""
2 | from typing import Dict, Optional, Tuple
3 | import torch
4 | import torch.nn as nn
5 | from .attention import ATTN_CLASS_REGISTRY
6 | from .norm import NORM_CLASS_REGISTRY
7 |
class MPTMLP(nn.Module):
    """Feed-forward sub-layer: Linear up-projection, exact GELU, Linear down-projection."""

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
        """Create the two projections; the hidden width is ``expansion_ratio * d_model``."""
        super().__init__()
        hidden_dim = expansion_ratio * d_model
        self.up_proj = nn.Linear(d_model, hidden_dim, device=device)
        self.act = nn.GELU(approximate='none')
        self.down_proj = nn.Linear(hidden_dim, d_model, device=device)
        # Marker attribute on the output projection; its consumer is not in this block.
        self.down_proj._is_residual = True

    def forward(self, x):
        """Apply up-projection, activation and down-projection to ``x``."""
        hidden = self.up_proj(x)
        hidden = self.act(hidden)
        return self.down_proj(hidden)
19 |
class MPTBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each wrapped in a residual connection."""

    def __init__(
        self,
        d_model: int,
        n_heads: int,
        expansion_ratio: int,
        attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8},
        resid_pdrop: float=0.0,
        norm_type: str='low_precision_layernorm',
        verbose: int=0,
        device: Optional[str]=None,
        **kwargs,
    ):
        """Build the block.

        ``norm_type`` (lower-cased) and ``attn_config['attn_type']`` select the
        classes from their registries. Only the ``attn_impl``, ``clip_qkv``,
        ``qk_ln``, ``softmax_scale`` and ``attn_pdrop`` entries of
        ``attn_config`` are forwarded to the attention class. Extra keyword
        arguments are accepted and ignored.
        """
        del kwargs
        super().__init__()
        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]

        self.norm_1 = norm_class(d_model, device=device)
        forwarded_attn_options = {
            option: attn_config[option]
            for option in ('attn_impl', 'clip_qkv', 'qk_ln', 'softmax_scale', 'attn_pdrop')
        }
        self.attn = attn_class(
            d_model=d_model,
            n_heads=n_heads,
            verbose=verbose,
            device=device,
            **forwarded_attn_options,
        )
        self.norm_2 = norm_class(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        """Run the block and return ``(hidden states, attention weights, past_key_value)``."""
        # Attention sub-layer on the normalised input, added back to the residual stream.
        normed = self.norm_1(x)
        (attn_out, attn_weights, past_key_value) = self.attn(
            normed,
            past_key_value=past_key_value,
            attn_bias=attn_bias,
            attention_mask=attention_mask,
            is_causal=is_causal,
        )
        x = x + self.resid_attn_dropout(attn_out)

        # Feed-forward sub-layer, same pre-norm residual pattern.
        ffn_out = self.ffn(self.norm_2(x))
        x = x + self.resid_ffn_dropout(ffn_out)
        return (x, attn_weights, past_key_value)
--------------------------------------------------------------------------------
/models/mpt/norm.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def _cast_if_autocast_enabled(tensor):
4 | if torch.is_autocast_enabled():
5 | if tensor.device.type == 'cuda':
6 | dtype = torch.get_autocast_gpu_dtype()
7 | elif tensor.device.type == 'cpu':
8 | dtype = torch.get_autocast_cpu_dtype()
9 | else:
10 | raise NotImplementedError()
11 | return tensor.to(dtype=dtype)
12 | return tensor
13 |
class LPLayerNorm(torch.nn.LayerNorm):
    """LayerNorm that casts input and parameters to the autocast dtype, then runs with autocast off."""

    def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
        """Forward every argument unchanged to ``torch.nn.LayerNorm``."""
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            elementwise_affine=elementwise_affine,
            device=device,
            dtype=dtype,
        )

    def forward(self, x):
        """Normalise ``x`` using down-cast copies of the input, weight and bias."""
        device_type = x.device.type
        inputs = _cast_if_autocast_enabled(x)

        # Parameters may be None (e.g. elementwise_affine=False); pass None through.
        weight = self.weight
        if weight is not None:
            weight = _cast_if_autocast_enabled(weight)
        bias = self.bias
        if bias is not None:
            bias = _cast_if_autocast_enabled(bias)

        # Autocast is disabled so layer_norm runs in the dtype chosen above.
        with torch.autocast(enabled=False, device_type=device_type):
            return torch.nn.functional.layer_norm(inputs, self.normalized_shape, weight, bias, self.eps)
26 |
def rms_norm(x, weight=None, eps=1e-05):
    """Scale ``x`` by the reciprocal root-mean-square of its last dimension.

    ``eps`` is added to the mean of squares before the reciprocal square root.
    When ``weight`` is given, the normalised result is multiplied by it.
    """
    mean_square = x.pow(2).mean(-1, keepdim=True)
    normalised = x * torch.rsqrt(mean_square + eps)
    if weight is None:
        return normalised
    return normalised * weight
32 |
class RMSNorm(torch.nn.Module):
    """RMS normalisation computed in float32 and cast back to the input dtype."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        """Store ``eps`` and create a ones-initialised ``weight`` parameter unless ``weight`` is false."""
        super().__init__()
        self.eps = eps
        if not weight:
            # Registered as None so the attribute exists without a parameter.
            self.register_parameter('weight', None)
        else:
            initial = torch.ones(normalized_shape, dtype=dtype, device=device)
            self.weight = torch.nn.Parameter(initial)

    def forward(self, x):
        """Normalise ``x.float()`` and return the result in ``x``'s dtype."""
        normalised = rms_norm(x.float(), self.weight, self.eps)
        return normalised.to(dtype=x.dtype)
45 |
class LPRMSNorm(RMSNorm):
    """RMSNorm variant that works in the autocast dtype instead of float32."""

    def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
        """Forward every argument unchanged to ``RMSNorm``."""
        super().__init__(
            normalized_shape=normalized_shape,
            eps=eps,
            weight=weight,
            dtype=dtype,
            device=device,
        )

    def forward(self, x):
        """Normalise down-cast copies of ``x`` and the weight; return in ``x``'s dtype."""
        inputs = _cast_if_autocast_enabled(x)
        weight = self.weight
        if weight is not None:
            weight = _cast_if_autocast_enabled(weight)
        # Autocast is disabled so the computation keeps the dtype chosen above.
        with torch.autocast(enabled=False, device_type=x.device.type):
            normalised = rms_norm(inputs, weight, self.eps)
            return normalised.to(dtype=x.dtype)
# Lookup table from a lower-case normalization name to the module class that implements it.
NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
--------------------------------------------------------------------------------
/models/patching_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
def compute_flash_attention(flash_attn, q, k, v, attention_mask=None, head_mask=None):
    """Run ``flash_attn`` on packed q/k/v, un-padding and re-padding around the call.

    Args:
        flash_attn: Callable invoked as
            ``flash_attn(qkv, cu_seqlens=..., max_seqlen=...)``.
        q, k, v: [bs, seq_len, num_attention_heads, attn_head_size].
        attention_mask: Optional float mask of shape [bs, seq_len]; positions
            with a value ``>= 0`` are counted as unmasked.
        head_mask: Accepted but not used by this function.

    Returns:
        The raw ``flash_attn`` output when no mask is given; otherwise the
        per-sequence outputs zero-padded back to ``seq_len`` and stacked over
        the batch.
    """
    # q, k, v: [bs, seq_len, num_attention_heads, attn_head_size]
    # attention_mask (float): [bs, seq_len]
    batch_size, max_len = q.size(0), q.size(1)

    qkv = torch.stack([q, k, v], dim=2).to(torch.float16)  # need to truncate in case input is fp32
    cu_seqlens, max_seqlen = None, None

    if attention_mask is None:
        return flash_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
    else:
        # Limitation: non-contiguous attention mask will not be handled correctly
        # model will be able to pay attention between the first and last non-masked token, i.e. left- and right-side padding is supported.
        # Running count of unmasked positions along the sequence.
        csums = (attention_mask >= 0).cumsum(dim=1)
        # NOTE(review): relies on argmax returning the first index at which the
        # count reaches its maximum, i.e. the last unmasked position - confirm.
        ends = csums.argmax(dim=1) + 1
        starts = ends - csums.max(dim=1).values
        seqlens = ends - starts

        # Pack the unmasked span of every sequence into one token dimension.
        qkv = torch.cat([qkv[i, starts[i]: ends[i]] for i in range(batch_size)], dim=0)
        zero = torch.zeros_like(seqlens[:1])  # torch.tensor([0]) with correct dtype and device
        # Cumulative sequence boundaries with a leading 0, as int32.
        cu_seqlens = torch.cat([zero, seqlens.cumsum(dim=0)], dim=0).to(torch.int32)
        max_seqlen = seqlens.max().item()

        out = flash_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
        # out: [num_unmasked_tokens, num_attention_heads, attn_head_size]

        # Split the packed output back into one chunk per sequence.
        seqs = [out[start:end] for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:])]
        # stack and pad sequences together
        # Only the first (token) dimension is padded: `starts[i]` zeros in
        # front and `max_len - ends[i]` zeros behind.
        padded_seqs = [
            F.pad(seqs[i], (0, 0) * (seqs[i].dim() - 1) + (starts[i], max_len - ends[i]), value=0.0)
            for i in range(batch_size)
        ]
        out = torch.stack(padded_seqs)
        return out
39 |
40 | # if __name__ == "__main__":
41 | # from flash_attn.modules.mha import FlashSelfAttention
42 | #
43 | # flash_attn = FlashSelfAttention(causal=True)
44 | #
45 | # dtype = torch.float16
46 | # device = torch.device("cuda:0")
47 | #
48 | # batch_size, seq_len, num_heads, head_size = 4, 18, 8, 32
49 | # q = torch.randn(batch_size, seq_len, num_heads, head_size, dtype=dtype, device=device)
50 | # k = torch.randn(batch_size, seq_len, num_heads, head_size, dtype=dtype, device=device)
51 | # v = torch.randn(batch_size, seq_len, num_heads, head_size, dtype=dtype, device=device)
52 | #
53 | # attn_mask = torch.randn(batch_size, seq_len, dtype=dtype, device=device).abs().cumsum(dim=1)
54 | # attn_mask = ((attn_mask > 3) & (attn_mask < 10)).int().log()
55 | #
56 | # out = compute_flash_attention(flash_attn, q, k, v, attention_mask=attn_mask)
57 |
--------------------------------------------------------------------------------
/general_util/average_meter.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 |
3 | import torch
4 | import torch.distributed as dist
5 |
6 |
class AverageMeter(object):
    """Computes and stores the average and current value."""

    def __init__(self):
        self.val = 0  # most recent value passed to ``update``
        self.avg = 0  # ``sum / count`` (0 while ``count`` is not positive)
        self.sum = 0  # weighted sum of all values
        self.count = 0  # total weight

    def reset(self):
        """Set every statistic back to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def _refresh_avg(self):
        """Recompute ``avg`` from ``sum`` and ``count``, guarding against a zero count."""
        if self.count > 0:
            self.avg = self.sum / self.count
        else:
            self.avg = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n``.

        Tensors are accepted for both arguments and are converted to Python
        scalars with ``.item()``.
        """
        if isinstance(val, torch.Tensor):
            val = val.item()
        if isinstance(n, torch.Tensor):
            n = n.item()

        self.val = val
        self.sum += val * n
        self.count += n
        self._refresh_avg()

    def save(self):
        """Return the current statistics as a plain dict."""
        return {
            'val': self.val,
            'avg': self.avg,
            'sum': self.sum,
            'count': self.count
        }

    def load(self, value: dict):
        """Restore statistics from a dict produced by ``save``.

        ``None`` resets the meter. Keys missing from ``value`` default to 0.
        """
        if value is None:
            self.reset()
            # Nothing to read from; without this return the lookups below
            # would raise TypeError on ``None``.
            return
        self.val = value.get('val', 0)
        self.avg = value.get('avg', 0)
        self.sum = value.get('sum', 0)
        self.count = value.get('count', 0)

    def gather(self, device):
        """Sum ``sum`` and ``count`` over all ranks and refresh ``avg``.

        Calls ``torch.distributed`` collectives, so a process group must be
        initialized. ``val`` is left untouched.
        """
        tensor_list = [torch.zeros(2, device=device) for _ in range(dist.get_world_size())]
        # Use the buffers' dtype for the payload: with an inferred dtype, a
        # meter holding only Python ints would produce an int64 tensor that
        # does not match the float receive buffers.
        tensor = torch.tensor([self.sum, self.count], dtype=tensor_list[0].dtype, device=device)
        dist.all_gather(tensor_list, tensor)

        all_tensor = torch.stack(tensor_list, dim=0)
        self.sum = all_tensor[:, 0].sum().item()
        self.count = all_tensor[:, 1].sum().item()
        self._refresh_avg()

        del all_tensor
66 |
67 |
class LogMetric(object):
    """
    Hold one ``AverageMeter`` per metric name so they can be logged together.
    """

    def __init__(self, *metric_names):
        meters: Dict[str, AverageMeter] = {}
        for name in metric_names:
            meters[name] = AverageMeter()
        self.metrics = meters

    def update(self, metric_name, val, n=1):
        """Forward ``val`` and its weight ``n`` to the meter named ``metric_name``."""
        meter = self.metrics[metric_name]
        meter.update(val, n)

    def reset(self, metric_name=None):
        """Reset the named meter, or every meter when no name is given."""
        if metric_name is not None:
            self.metrics[metric_name].reset()
            return

        for meter in self.metrics.values():
            meter.reset()

    def get_log(self):
        """Return a dict mapping each metric name to its current average."""
        return {name: meter.avg for name, meter in self.metrics.items()}
97 |
--------------------------------------------------------------------------------
/general_util/dist_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 | import subprocess
4 |
5 | import torch
6 | import torch.distributed as dist
7 | from omegaconf import DictConfig
8 |
9 |
def vanilla_torch_dist(cfg: DictConfig, backend="nccl"):
    """Fill the device fields of ``cfg`` and start torch.distributed when needed.

    ``cfg.local_rank`` is overwritten from the ``LOCAL_RANK`` environment
    variable when that variable is set to something other than ``-1``.

    * ``cfg.local_rank == -1`` or ``cfg.no_cuda``: no process group is
      created; ``cfg.n_gpu`` is the visible CUDA device count.
    * otherwise: the CUDA device ``cfg.local_rank`` is selected, the process
      group is initialized with ``backend`` and a 7200 second timeout,
      ``cfg.n_gpu`` is 1 and ``cfg.world_size`` is read from the group.

    In both cases ``cfg.device`` receives the device as a string.
    """
    env_rank = os.environ.get("LOCAL_RANK")
    if env_rank is not None and env_rank not in [-1, "-1"]:
        cfg.local_rank = int(env_rank)

    distributed = not (cfg.local_rank == -1 or cfg.no_cuda)
    if distributed:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(cfg.local_rank)
        device = str(torch.device("cuda", cfg.local_rank))
        dist.init_process_group(backend=backend, timeout=datetime.timedelta(seconds=7200))
        cfg.n_gpu = 1
        cfg.world_size = dist.get_world_size()
    else:
        use_cuda = torch.cuda.is_available() and not cfg.no_cuda
        device = str(torch.device("cuda" if use_cuda else "cpu"))
        cfg.n_gpu = torch.cuda.device_count()
    cfg.device = device
24 |
25 |
def setup_slurm_distributed(cfg: DictConfig, backend="nccl", port=None):
    """
    Configure ``cfg`` and torch.distributed for a job launched through SLURM.

    Most code are copied from https://github.com/BIGBALLON/distribuuuu/blob/master/tutorial/mnmc_ddp_slurm.py.

    Three cases are handled:

    * at most one visible GPU, or ``cfg.no_cuda``: no process group, ``cfg.local_rank = -1``;
    * several GPUs but ``SLURM_NTASKS == 1``: no process group, ``cfg.n_gpu`` is the GPU count;
    * otherwise: the rendezvous environment variables are exported, the process
      group is initialized and ``cfg.local_rank`` ends up as ``dist.get_rank()``.

    ``port`` overrides ``MASTER_PORT``; without it an existing ``MASTER_PORT`` is
    kept and ``"29500"`` is the fallback.
    """
    env = os.environ
    num_gpus = torch.cuda.device_count()
    print(num_gpus)

    if num_gpus <= 1 or cfg.no_cuda:
        cfg.local_rank = -1
        use_cuda = torch.cuda.is_available() and not cfg.no_cuda
        cfg.device = str(torch.device("cuda" if use_cuda else "cpu"))
        cfg.n_gpu = min(num_gpus, 1)
        cfg.ddp_eval = False
        return

    # Data Parallel or Model Parallel on multiple GPUs with single task.
    if int(env["SLURM_NTASKS"]) == 1:
        cfg.n_gpu = num_gpus
        cfg.ddp_eval = False
        cfg.device = str(torch.device("cuda"))
        cfg.local_rank = -1
        return

    proc_id = int(env["SLURM_PROCID"])
    n_tasks = int(env["SLURM_NTASKS"])
    node_list = env["SLURM_NODELIST"]

    local_gpu = proc_id % num_gpus
    torch.cuda.set_device(local_gpu)

    # The first host of the allocation serves as the default master address.
    addr = subprocess.getoutput(f"scontrol show hostname {node_list} | head -n1")

    # specify master port
    if port is None:
        env.setdefault("MASTER_PORT", "29500")
    else:
        env["MASTER_PORT"] = str(port)
    env.setdefault("MASTER_ADDR", addr)

    env["WORLD_SIZE"] = str(n_tasks)
    env["LOCAL_RANK"] = str(local_gpu)
    env["RANK"] = str(proc_id)

    cfg.n_gpu = 1
    cfg.local_rank = local_gpu
    cfg.world_size = n_tasks
    cfg.device = str(torch.device("cuda", cfg.local_rank))

    dist.init_process_group(backend=backend, world_size=n_tasks, rank=proc_id)

    # From here on ``cfg.local_rank`` holds the rank reported by the process group.
    cfg.local_rank = dist.get_rank()
78 |
--------------------------------------------------------------------------------
/conf/base.yaml:
--------------------------------------------------------------------------------
1 | hydra:
2 | run:
3 | dir: ./
4 |
5 | train_file:
6 | dev_file:
7 | test_file:
8 |
9 | # Model
10 | model:
11 | _target_: models.roberta_baseline.RobertaForMultipleChoiceForZeroShot.from_pretrained
12 |
13 | # Data loading
14 | read_tensor:
15 | _target_: data.reclor_sentence_prefix.convert_examples_into_features
16 | max_seq_length: 256
17 | num_workers: 2
18 | token_num: 5
19 |
20 | extended_vocab: ${read_tensor.token_num}
21 |
22 | # Data collator
23 | collator:
24 | _target_: data.collators.ReClorSentenceCollator
25 |
26 | # Dataloader
27 | num_workers: 4
28 | prefetch_factor: 2
29 |
30 | # Wiki path pretrain v8.2
31 | model_name_or_path: experiments/roberta.large.wiki_erica_path_v7_v8.2.2.1aug.ctx.1k.2080Ti/checkpoint-500
32 | pretrain:
33 |
34 | output_dir:
35 |
36 |
37 | do_train: True
38 | evaluate_during_training: True
39 |
40 | do_eval: True
41 | eval_sub_path:
42 |
43 | # Training hyper-parameters
44 | per_gpu_train_batch_size: 1
45 | per_gpu_eval_batch_size: 1
46 | learning_rate: 1.5e-5
47 | gradient_accumulation_steps: 12
48 | weight_decay: 0.01
49 | adam_epsilon: 1e-6
50 | adam_betas: "(0.9, 0.98)"
51 | max_grad_norm: 0.0
52 | num_train_epochs: 10
53 | max_steps: 0
54 | warmup_proportion: 0.1
55 | warmup_steps: 0
56 |
57 |
58 | logging_steps: 5
59 | save_steps: -1
60 | save_best: True
61 | eval_steps: 100
62 | no_cuda: False
63 | seed: 42
64 | local_rank: -1
65 | fp16: True
66 | fp16_opt_level: O1
67 |
68 | # Prediction config
69 | prediction_cfg:
70 | metric: "acc"
71 | measure: 1
72 | best_checkpoint:
73 | best_result:
74 |
75 | # fairscale.FullyShardedDP
76 | fairscale_config:
77 | _target_: general_util.fsdp_utils.default_initialize
78 | fp16: ${fp16}
79 | reshard_after_forward: False
80 | cpu_offload: False
81 | move_grads_to_cpu: False
82 | move_params_to_cpu: False
83 |
84 | # Deepspeed config
85 | ds_cfg:
86 | train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
87 | gradient_accumulation_steps: ${gradient_accumulation_steps}
88 | optimizer:
89 | type: AdamW
90 | params:
91 | lr: ${learning_rate}
92 | betas: [0.9, 0.999]
93 | eps: ${adam_epsilon}
94 | weight_decay: ${weight_decay}
95 | scheduler:
96 | type: WarmupDecayLR
97 | params:
98 | total_num_steps:
99 | warmup_max_lr: ${learning_rate}
100 | warmup_num_steps:
101 | warmup_type: linear
102 | gradient_clipping: ${max_grad_norm}
103 | fp16:
104 | enabled: ${fp16}
105 | initial_scale_power: 12
106 | zero_optimization:
107 | stage: 3
108 | # offload_optimizer:
109 | # device: cpu
110 | # pin_memory: True
111 | # offload_param:
112 | # device: cpu
113 | # pin_memory: True
114 | # activation_checkpointing:
115 | # partition_activations: True
116 | # cpu_checkpointing: True
117 | # contiguous_memory_optimization: False
118 | # number_checkpoints: False
119 | # synchronize_checkpoint_boundary: False
120 | # profile: False
121 | steps_per_print: 1024
122 |
123 | summary_helper:
124 | _target_: general_util.tensorboard_helper.SummaryWriterHelper
125 | batch_index_or_keys:
126 | outputs_index_or_keys:
127 |
128 | # Temporary variables
129 | n_gpu:
130 | device:
131 | train_batch_size:
132 | eval_batch_size:
133 | world_size:
134 |
--------------------------------------------------------------------------------
/convert2hf.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | from glob import glob
5 | from pathlib import Path
6 |
7 | import torch
8 | import transformers
9 | from accelerate import init_empty_weights
10 | from transformers import AutoModelForCausalLM
11 |
# Per-model-size parameters; ``n_layers`` is the layer count used when
# mapping checkpoint files to ``model.layers.<idx>``.
PARAM_MAP = {
    size: {"n_layers": depth}
    for size, depth in (("7B", 32), ("13B", 40), ("30B", 60), ("65B", 80))
}

ORIGINAL_TOKENIZER_SIZE = 32000
28 |
29 |
def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded as UTF-8 rather than with the platform's locale
    encoding, so non-ASCII content reads the same on every system.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
33 |
34 |
def write_json(text, path):
    """Serialize ``text`` as JSON into the file at ``path``, overwriting it."""
    with open(path, "w") as out:
        out.write(json.dumps(text))
38 |
39 |
def load_weights(checkpoint_dir, n_layers: int):
    """Merge the per-layer checkpoint files of ``checkpoint_dir`` into one state dict.

    Only files whose name starts with ``layer_`` are read. The number that
    follows selects the target:

    * ``00``            -> ``model.embed_tokens.weight``
    * ``n_layers + 1``  -> ``model.norm.weight``
    * ``n_layers + 2``  -> ``lm_head.weight``
    * anything else     -> ``model.layers.<number - 1>.<key>`` for every key in the file

    Args:
        checkpoint_dir: Directory containing the ``layer_*`` files.
        n_layers: Number of layers of the model.

    Returns:
        A dict mapping parameter names to the loaded values.

    Raises:
        AssertionError: if a layer file's index falls outside ``[0, n_layers)``.
    """
    # File names zero-pad the index to two digits (see ``layer_00``), so the
    # padded spelling is matched as well; the unpadded one alone misses the
    # norm / lm_head files whenever their index is below 10.
    norm_prefixes = (f"layer_{n_layers + 1}", f"layer_{n_layers + 1:02d}")
    head_prefixes = (f"layer_{n_layers + 2}", f"layer_{n_layers + 2:02d}")

    state_dict = {}
    for pt in Path(checkpoint_dir).iterdir():
        print("Processing ", pt.name)
        if not pt.name.startswith('layer_'):
            continue

        # NOTE: torch.load unpickles the file; only use it on trusted checkpoints.
        sd = torch.load(pt, map_location="cpu")

        if pt.name.startswith("layer_00"):
            print(f"{pt.name} -> model.embed_tokens.weight")
            state_dict["model.embed_tokens.weight"] = sd["weight"]
        elif pt.name.startswith(norm_prefixes):
            print(f"{pt.name} -> model.norm.weight")
            state_dict["model.norm.weight"] = sd["weight"]
        elif pt.name.startswith(head_prefixes):
            print(f"{pt.name} -> lm_head.weight")
            state_dict["lm_head.weight"] = sd["weight"]
        else:
            # File indices are 1-based for the layers (index 0 is the embedding).
            layer_idx = int(pt.name[len("layer_"):].split("-")[0]) - 1
            assert 0 <= layer_idx < n_layers
            for k, v in sd.items():
                state_dict[f"model.layers.{layer_idx}.{k}"] = v
            print(f"{pt.name} -> model.layers.{layer_idx}")
    return state_dict
65 |
66 |
def write_model(input_base_path, model_size, config_dir):
    """Convert every matching checkpoint directory and save it with ``save_pretrained``.

    Args:
        input_base_path: An existing path, or a glob pattern (expanded with
            ``recursive=True``) when no such path exists.
        model_size: Key into ``PARAM_MAP``; selects the layer count.
        config_dir: Location passed to ``AutoConfig.from_pretrained``.

    For each checkpoint directory the weights are collected with
    ``load_weights`` and written, in shards of at most 3GB, to the path
    obtained by dropping the last ``/``-separated component.
    """
    assert model_size in PARAM_MAP

    config = transformers.AutoConfig.from_pretrained(config_dir)
    # The model object is only used for saving; weights come from the loaded state dict.
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(config)

    n_layers = PARAM_MAP[model_size]["n_layers"]

    checkpoint_dirs = (
        [input_base_path]
        if os.path.exists(input_base_path)
        else glob(input_base_path, recursive=True)
    )
    print(f"Found checkpoints: {checkpoint_dirs}")

    for checkpoint_dir in checkpoint_dirs:
        weights = load_weights(checkpoint_dir, n_layers)
        parent_dir = "/".join(checkpoint_dir.split("/")[:-1])
        model.save_pretrained(parent_dir, state_dict=weights, max_shard_size="3GB")
85 |
86 |
def main():
    """Parse the command-line options and run ``write_model`` with them."""
    parser = argparse.ArgumentParser()
    options = (
        ("--input_dir", {"help": "Location of LLaMA weights, which contains tokenizer.model and model folders"}),
        ("--model_size", {"choices": ["7B", "13B", "30B", "65B"]}),
        ("--config_dir", {}),
    )
    for flag, settings in options:
        parser.add_argument(flag, **settings)

    args = parser.parse_args()
    write_model(input_base_path=args.input_dir, model_size=args.model_size, config_dir=args.config_dir)


if __name__ == "__main__":
    main()
110 |
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_100.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00086-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00895-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00988-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00200-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00081-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00210-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00411-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00812-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00410-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00775-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00591-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00901-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00349-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00655-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00511-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00450-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00430-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00882-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00559-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00598-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00120-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00816-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00780-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00395-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00726-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00273-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00768-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00190-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00499-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00458-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00506-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00061-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00005-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00889-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00442-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00617-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00786-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00187-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00839-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00405-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00555-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00133-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00634-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00434-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00519-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00340-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00810-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00217-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00087-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00302-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_1000.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00226-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00315-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00371-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00280-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00258-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00231-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00018-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00888-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00721-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00020-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00144-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00209-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00789-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00264-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00431-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00811-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00863-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00212-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00862-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00685-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00897-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00974-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00630-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00292-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00198-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00435-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00218-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00981-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00175-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00079-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00662-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00344-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00090-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00918-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00155-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00131-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00576-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00604-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00774-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00659-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00808-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00026-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00115-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00467-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00583-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00933-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00907-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00105-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00869-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00658-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_150.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00951-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00145-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00906-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00934-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00938-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00355-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00784-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00246-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00408-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00955-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00103-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00978-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00197-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00967-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00021-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.01008-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00979-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00922-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00645-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00915-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00432-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00096-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00447-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00491-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00556-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00031-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00062-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00282-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00670-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00698-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00385-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00707-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00837-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00329-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00248-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.01006-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00671-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.01021-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00550-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00615-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00696-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00893-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.01003-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00782-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00007-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00595-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00224-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00969-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00508-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00151-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_200.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00624-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00236-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00104-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00652-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00596-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00720-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00110-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00944-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00607-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00334-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00527-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00270-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00676-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00908-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00445-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00146-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00722-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00693-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00396-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00141-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00269-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00496-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00387-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00745-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00132-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00261-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00647-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00420-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00014-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00665-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00345-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00621-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.01015-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00947-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00094-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.01000-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00574-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00421-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00962-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00481-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00905-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00158-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00866-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00038-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00540-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00551-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00874-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00341-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00140-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00753-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_250.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00750-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00065-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00719-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00704-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00126-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00174-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00950-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00284-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00343-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00727-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00795-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00535-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00899-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00465-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00382-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00697-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00531-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00638-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00180-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00179-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00800-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00208-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00006-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00188-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00827-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00814-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00593-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00870-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00985-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00701-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00348-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00635-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00754-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00960-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00046-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00080-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00059-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00877-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00875-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00452-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00817-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00640-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00759-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00479-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00861-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00758-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00247-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00011-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00572-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00644-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_300.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00735-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00283-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00887-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00288-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00649-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00428-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00173-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00156-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00330-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00833-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00053-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00199-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00377-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00082-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00097-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00279-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00828-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00829-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00771-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00611-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00716-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00483-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00842-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00996-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00290-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00036-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00076-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00687-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00834-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00880-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00176-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00089-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00932-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00793-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00602-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00228-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00675-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00085-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00752-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00715-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00911-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00238-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00919-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00002-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00785-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00636-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00402-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00772-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00459-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00476-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_350.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00904-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00857-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00321-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00957-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00100-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00966-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.01020-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00113-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.01016-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00070-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00557-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00244-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00945-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00181-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00225-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00167-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00946-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00847-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00746-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00891-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00373-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00358-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00807-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00873-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00016-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00129-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00903-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00669-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00272-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00034-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00047-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00660-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00999-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00912-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00308-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00732-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00440-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00012-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00643-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.01019-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00219-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00304-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00253-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00484-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00335-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00507-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00285-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00926-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00325-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00354-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_400.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00311-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00150-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00137-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00299-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00954-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00631-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00404-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00054-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00694-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00767-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00910-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00642-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00600-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00751-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00490-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00948-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00017-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00650-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00987-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00970-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00865-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00522-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00589-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00510-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00468-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00204-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00801-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00456-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00449-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00580-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00418-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00063-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00451-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00028-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00972-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00963-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00512-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00391-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00769-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00900-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00427-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00379-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00613-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00399-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00227-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00853-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00588-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00045-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00189-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00538-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_450.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00859-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00298-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00342-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00372-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00961-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00736-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00639-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00892-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00249-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00610-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00609-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00368-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00381-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00654-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00820-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00832-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00982-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00333-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00501-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00965-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00562-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00517-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00191-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00690-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00806-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00500-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.01010-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.01012-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00590-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00122-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00709-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00000-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00287-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00281-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00986-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00737-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00098-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00241-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00976-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00705-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00876-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00760-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00923-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00713-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00599-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00804-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00894-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00830-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00971-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00584-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_50.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00770-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00679-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00616-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00561-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00184-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00072-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00347-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00949-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00051-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00401-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00567-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00305-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00498-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00896-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00101-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00169-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00984-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00815-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00262-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00164-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00077-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00710-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00723-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00172-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00558-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00993-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00392-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00123-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00790-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00083-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00024-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00792-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00245-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00648-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00939-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00964-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00153-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00867-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00545-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00119-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00509-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00416-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00040-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.01018-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.01002-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00369-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.01013-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00968-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00666-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00102-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_500.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00928-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00068-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00313-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00294-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00851-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00794-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00678-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00681-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00637-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00747-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00935-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00168-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00263-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00223-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00841-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00563-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00787-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00980-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00338-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00286-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00049-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00403-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00393-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00492-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00656-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00840-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00902-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00351-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00378-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00757-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00942-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00441-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00147-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00471-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00015-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00075-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00618-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00818-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00324-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00370-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00413-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00139-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00674-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00114-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00463-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00917-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00154-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00549-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00296-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00419-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_550.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00310-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00157-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00469-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00521-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00699-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00730-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00603-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00781-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00777-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00560-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00159-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00032-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00107-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00064-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00346-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00571-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00714-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00135-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00554-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00533-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00071-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00628-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00235-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00487-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00303-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00683-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00565-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00663-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00019-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00052-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00909-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.01009-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00439-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00620-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00766-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00350-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00797-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00526-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00232-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00362-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00060-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00044-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00256-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00196-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00622-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00216-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00242-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00470-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00058-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00363-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_600.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00843-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00493-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00916-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00983-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00544-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00778-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00318-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00454-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00239-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00220-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00762-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00927-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00532-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.01014-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00257-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00160-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00489-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00048-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00568-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00528-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00692-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00729-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00001-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00623-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00711-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00027-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00265-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00755-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00667-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00921-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00233-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00940-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00937-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00161-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00433-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00171-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00055-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00525-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00230-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00524-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00300-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00193-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00301-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00422-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00809-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00668-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00023-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00924-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00633-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00414-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_650.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.01011-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00819-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.01022-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00995-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00586-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00004-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00763-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00673-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00295-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00423-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00925-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00474-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00183-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00898-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00626-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00084-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00425-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00229-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00826-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00764-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00415-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00121-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00569-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00337-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00530-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00444-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00756-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00601-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00307-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00221-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00182-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00207-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00632-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00186-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00914-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00513-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00448-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00612-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00700-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00254-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00523-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00138-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00731-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00166-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00871-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00920-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00293-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00365-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00546-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00407-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_700.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00682-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00078-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00581-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00424-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00641-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00453-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00608-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00653-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00977-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00992-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00495-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00738-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00312-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00552-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00397-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00267-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00529-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00494-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00566-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00505-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00276-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00619-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00733-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00488-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00486-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00587-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00941-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00821-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00708-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00822-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00366-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00394-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00250-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00389-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00353-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00035-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00202-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00008-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00364-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00717-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00222-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00178-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00240-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00398-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00672-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00278-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00959-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00429-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00592-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00327-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_750.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00686-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00952-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00516-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00460-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00748-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00165-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00309-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00929-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00734-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00956-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00143-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00041-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00022-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00438-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00776-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00170-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00646-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00573-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00997-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00823-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00206-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00547-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00194-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00130-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00067-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00765-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00799-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00466-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00680-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00277-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00885-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00548-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00703-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00374-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00779-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00881-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00749-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00234-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00844-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00352-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00943-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00088-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00195-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00864-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00066-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00743-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00437-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00606-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00728-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00361-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_800.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00095-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00541-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00539-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00142-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00913-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00718-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00446-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00091-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00332-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00625-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00803-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00813-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00975-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00724-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00412-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00582-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00074-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00112-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00883-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00739-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00443-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00266-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00203-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00872-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00211-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00855-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00136-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00695-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00271-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00824-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00367-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00010-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00125-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00436-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00958-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00289-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00742-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00056-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.01001-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00360-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00856-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00989-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00991-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00585-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00534-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00388-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00578-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00383-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00149-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.01005-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_850.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.01023-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00106-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00043-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00314-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00409-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00331-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00850-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00542-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00953-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00316-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00426-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00251-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00594-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00990-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00689-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00725-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00570-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00461-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00805-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00275-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00691-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00860-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00384-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00030-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00237-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00359-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00400-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00386-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00473-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00127-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00684-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00162-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00108-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00320-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00099-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00111-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00375-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00328-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00148-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00117-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00664-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00472-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00124-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00702-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00886-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00651-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00838-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00336-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00274-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00255-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_900.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00025-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00836-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00575-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00319-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00455-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.01007-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00482-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00464-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00214-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00849-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00878-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00163-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00577-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00629-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00661-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00093-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00057-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00477-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00037-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00457-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00973-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00788-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00848-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00564-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00406-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00109-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00520-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00322-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00380-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00518-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00854-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00092-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00741-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.00515-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00417-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.00326-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00033-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00268-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00835-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00376-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00798-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00744-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00009-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00152-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00116-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00497-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00475-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00003-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00890-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00936-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/data/files/c4/en/p50/partition_950.json:
--------------------------------------------------------------------------------
1 | [
2 | "/opt/ml/input/data/train/c4/en/c4-train.00998-of-01024.json.gz",
3 | "/opt/ml/input/data/train/c4/en/c4-train.00657-of-01024.json.gz",
4 | "/opt/ml/input/data/train/c4/en/c4-train.00597-of-01024.json.gz",
5 | "/opt/ml/input/data/train/c4/en/c4-train.00514-of-01024.json.gz",
6 | "/opt/ml/input/data/train/c4/en/c4-train.00390-of-01024.json.gz",
7 | "/opt/ml/input/data/train/c4/en/c4-train.00773-of-01024.json.gz",
8 | "/opt/ml/input/data/train/c4/en/c4-train.00931-of-01024.json.gz",
9 | "/opt/ml/input/data/train/c4/en/c4-train.00858-of-01024.json.gz",
10 | "/opt/ml/input/data/train/c4/en/c4-train.00852-of-01024.json.gz",
11 | "/opt/ml/input/data/train/c4/en/c4-train.00783-of-01024.json.gz",
12 | "/opt/ml/input/data/train/c4/en/c4-train.00994-of-01024.json.gz",
13 | "/opt/ml/input/data/train/c4/en/c4-train.00042-of-01024.json.gz",
14 | "/opt/ml/input/data/train/c4/en/c4-train.00503-of-01024.json.gz",
15 | "/opt/ml/input/data/train/c4/en/c4-train.00260-of-01024.json.gz",
16 | "/opt/ml/input/data/train/c4/en/c4-train.00243-of-01024.json.gz",
17 | "/opt/ml/input/data/train/c4/en/c4-train.00614-of-01024.json.gz",
18 | "/opt/ml/input/data/train/c4/en/c4-train.00706-of-01024.json.gz",
19 | "/opt/ml/input/data/train/c4/en/c4-train.00536-of-01024.json.gz",
20 | "/opt/ml/input/data/train/c4/en/c4-train.00502-of-01024.json.gz",
21 | "/opt/ml/input/data/train/c4/en/c4-train.00039-of-01024.json.gz",
22 | "/opt/ml/input/data/train/c4/en/c4-train.00627-of-01024.json.gz",
23 | "/opt/ml/input/data/train/c4/en/c4-train.00118-of-01024.json.gz",
24 | "/opt/ml/input/data/train/c4/en/c4-train.00712-of-01024.json.gz",
25 | "/opt/ml/input/data/train/c4/en/c4-train.00356-of-01024.json.gz",
26 | "/opt/ml/input/data/train/c4/en/c4-train.00845-of-01024.json.gz",
27 | "/opt/ml/input/data/train/c4/en/c4-train.00013-of-01024.json.gz",
28 | "/opt/ml/input/data/train/c4/en/c4-train.00480-of-01024.json.gz",
29 | "/opt/ml/input/data/train/c4/en/c4-train.00605-of-01024.json.gz",
30 | "/opt/ml/input/data/train/c4/en/c4-train.00825-of-01024.json.gz",
31 | "/opt/ml/input/data/train/c4/en/c4-train.00252-of-01024.json.gz",
32 | "/opt/ml/input/data/train/c4/en/c4-train.00185-of-01024.json.gz",
33 | "/opt/ml/input/data/train/c4/en/c4-train.00306-of-01024.json.gz",
34 | "/opt/ml/input/data/train/c4/en/c4-train.00688-of-01024.json.gz",
35 | "/opt/ml/input/data/train/c4/en/c4-train.01017-of-01024.json.gz",
36 | "/opt/ml/input/data/train/c4/en/c4-train.00050-of-01024.json.gz",
37 | "/opt/ml/input/data/train/c4/en/c4-train.01004-of-01024.json.gz",
38 | "/opt/ml/input/data/train/c4/en/c4-train.00740-of-01024.json.gz",
39 | "/opt/ml/input/data/train/c4/en/c4-train.00796-of-01024.json.gz",
40 | "/opt/ml/input/data/train/c4/en/c4-train.00831-of-01024.json.gz",
41 | "/opt/ml/input/data/train/c4/en/c4-train.00485-of-01024.json.gz",
42 | "/opt/ml/input/data/train/c4/en/c4-train.00677-of-01024.json.gz",
43 | "/opt/ml/input/data/train/c4/en/c4-train.00357-of-01024.json.gz",
44 | "/opt/ml/input/data/train/c4/en/c4-train.00537-of-01024.json.gz",
45 | "/opt/ml/input/data/train/c4/en/c4-train.00884-of-01024.json.gz",
46 | "/opt/ml/input/data/train/c4/en/c4-train.00073-of-01024.json.gz",
47 | "/opt/ml/input/data/train/c4/en/c4-train.00297-of-01024.json.gz",
48 | "/opt/ml/input/data/train/c4/en/c4-train.00317-of-01024.json.gz",
49 | "/opt/ml/input/data/train/c4/en/c4-train.00192-of-01024.json.gz",
50 | "/opt/ml/input/data/train/c4/en/c4-train.00323-of-01024.json.gz",
51 | "/opt/ml/input/data/train/c4/en/c4-train.00462-of-01024.json.gz"
52 | ]
--------------------------------------------------------------------------------
/conf/roberta_split_fact_v1_1.yaml:
--------------------------------------------------------------------------------
1 | hydra:
2 | run:
3 | dir: ./
4 |
5 | train_file: strategyqa/sub_data/train.json
6 | dev_file: strategyqa/sub_data/dev.json
7 | test_file: strategyqa/sub_data/test.json
8 |
9 | # Model
10 | model:
11 | _target_: models.roberta.RobertaForSequenceClassification.from_pretrained
12 |
13 | # Data loading
14 | read_tensor:
15 | _target_: data.strategy_qa.split_get_tensor_with_gold_para
16 | train_para_file: strategyqa/strategyqa_train_paragraphs.json
17 | max_seq_length: 512
18 | use_fact: True
19 |
20 | extended_vocab:
21 |
22 | # Data collator
23 | collator:
24 | _target_: data.collators.dict2dict.MetaCollator
25 |
26 | # Dataloader
27 | num_workers: 4
28 | prefetch_factor: 2
29 |
30 | # Wiki path pretrain v8.2
31 | model_name_or_path: pretrained-models/roberta-large
32 | pretrain:
33 |
34 | output_dir: experiments/strategy_qa.roberta.large.w_fact.w1.A40.v1.1.s${seed}
35 |
36 | do_train: True
37 | evaluate_during_training: True
38 |
39 | do_eval: True
40 | eval_sub_path:
41 |
42 | # Training hyper-parameters
43 | per_gpu_train_batch_size: 32
44 | per_gpu_eval_batch_size: 32
45 | learning_rate: 1e-5
46 | #learning_rate: 5e-6
47 | gradient_accumulation_steps: 1
48 | weight_decay: 0.01
49 | adam_epsilon: 1e-6
50 | adam_betas: "(0.9, 0.98)"
51 | #adam_betas: "(0.9, 0.999)"
52 | max_grad_norm: 0.0
53 | #max_grad_norm: 1.0
54 | num_train_epochs: 20
55 | max_steps: 0
56 | warmup_proportion: 0.1
57 | warmup_steps: 0
58 |
59 |
60 | logging_steps: 5
61 | save_steps: -1
62 | save_best: True
63 | eval_steps: 100
64 | ddp_eval: True
65 | no_cuda: False
66 | seed: 42
67 | local_rank: -1
68 | fp16: True
69 | fp16_opt_level: O1
70 |
71 | # Prediction config
72 | prediction_cfg:
73 | metric: "acc"
74 | measure: 1
75 | best_checkpoint:
76 | best_result:
77 | generator: False
78 | post_process:
79 |
80 | # fairscale.FullyShardedDP
81 | fairscale_config:
82 | _target_: general_util.fsdp_utils.default_initialize
83 | fp16: ${fp16}
84 | reshard_after_forward: False
85 | move_grads_to_cpu: False
86 | move_params_to_cpu: False
87 |
88 | # Deepspeed config
89 | ds_cfg:
90 | train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
91 | gradient_accumulation_steps: ${gradient_accumulation_steps}
92 | optimizer:
93 | type: AdamW
94 | params:
95 | lr: ${learning_rate}
96 | betas: [0.9, 0.999]
97 | eps: ${adam_epsilon}
98 | weight_decay: ${weight_decay}
99 | scheduler:
100 | type: WarmupDecayLR
101 | params:
102 | total_num_steps:
103 | warmup_max_lr: ${learning_rate}
104 | warmup_num_steps:
105 | warmup_type: linear
106 | gradient_clipping: ${max_grad_norm}
107 | fp16:
108 | enabled: ${fp16}
109 | initial_scale_power: 12
110 | zero_optimization:
111 | stage: 3
112 | # offload_optimizer:
113 | # device: cpu
114 | # pin_memory: True
115 | # offload_param:
116 | # device: cpu
117 | # pin_memory: True
118 | # activation_checkpointing:
119 | # partition_activations: True
120 | # cpu_checkpointing: True
121 | # contiguous_memory_optimization: False
122 | # number_checkpoints: False
123 | # synchronize_checkpoint_boundary: False
124 | # profile: False
125 | steps_per_print: 1024
126 |
127 | summary_helper:
128 | _target_: general_util.tensorboard_helper.SummaryWriterHelper
129 | batch_index_or_keys:
130 | outputs_index_or_keys:
131 |
132 | # Temporary variables
133 | n_gpu:
134 | device:
135 | train_batch_size:
136 | eval_batch_size:
137 | world_size:
138 |
--------------------------------------------------------------------------------
/general_util/lightseq_utils.py:
--------------------------------------------------------------------------------
1 | from omegaconf import DictConfig
2 |
3 | from general_util.logger import get_child_logger
4 |
5 | from lightseq.training.ops.pytorch.transformer_encoder_layer import (
6 | LSTransformerEncoderLayer,
7 | )
8 |
9 | logger = get_child_logger("LightSeqUtils")
10 |
11 |
class LSHFTransformerEncoderLayer(LSTransformerEncoderLayer):
    """``LSTransformerEncoderLayer`` with an adapted ``forward`` signature.

    ``forward`` accepts (and ignores) any extra positional/keyword arguments
    and returns a 4-tuple whose first item is the parent layer's output.
    """

    def __init__(self, *args, **kwargs):
        # All construction arguments are forwarded to the LightSeq layer.
        super().__init__(*args, **kwargs)

    def forward(self, hidden_states, encoder_padding_mask, *args, **kwargs):
        """Run the parent layer with a rescaled, squeezed padding mask.

        :param hidden_states: passed unchanged to the parent ``forward``.
        :param encoder_padding_mask: mask that is divided by ``-10000.0`` and
            stripped of its size-1 dimensions before being handed to the parent.
        :param args: ignored.
        :param kwargs: ignored.
        :return: ``(output, None, None, None)``
        """
        # NOTE(review): presumably the incoming mask is additive with -10000.0
        # at padded positions, so the division yields 1.0 there and 0.0
        # elsewhere -- confirm against the caller.
        # NOTE(review): ``squeeze()`` without a dim removes every size-1 axis,
        # so a batch of one also loses its batch axis -- confirm the parent
        # layer accepts that shape.
        rescaled_mask = (encoder_padding_mask / -10000.0).squeeze()
        encoded = super().forward(hidden_states, rescaled_mask)
        return encoded, None, None, None
21 |
22 |
def gen_bert_config(cfg: DictConfig, config):
    """Create a LightSeq encoder-layer config from run and model settings.

    :param cfg: run configuration; ``fp16`` and ``local_rank`` are read from it.
    :param config: model configuration; sequence length, hidden sizes, head
        count and dropout probabilities are read from it.
    :return: whatever ``LSTransformerEncoderLayer.get_config`` returns.
    """
    layer_kwargs = dict(
        max_batch_tokens=4096,
        max_seq_len=config.max_position_embeddings,
        hidden_size=config.hidden_size,
        intermediate_size=config.intermediate_size,
        nhead=config.num_attention_heads,
        attn_prob_dropout_ratio=config.attention_probs_dropout_prob,
        # The hidden dropout probability is used for both of the next ratios.
        activation_dropout_ratio=config.hidden_dropout_prob,
        hidden_dropout_ratio=config.hidden_dropout_prob,
        pre_layer_norm=False,
        fp16=cfg.fp16,
        local_rank=cfg.local_rank,
        activation_fn="gelu",
    )
    return LSTransformerEncoderLayer.get_config(**layer_kwargs)
39 |
40 |
def get_hf_bert_enc_layer_params(layer):
    """Collect detached copies of an encoder layer's weights and biases.

    The sub-modules are visited in a fixed order: attention query, key, value,
    attention output dense, attention output LayerNorm, intermediate dense,
    output dense, output LayerNorm.

    :param layer: object exposing ``attention.self.{query,key,value}``,
        ``attention.output.{dense,LayerNorm}``, ``intermediate.dense`` and
        ``output.{dense,LayerNorm}``, each with ``weight`` and ``bias``.
    :return: ``(init_ws, init_bs)`` -- two lists of eight tensors each, in the
        order above; every tensor is a detached clone of the original.
    """
    attention = layer.attention
    ordered_modules = (
        attention.self.query,
        attention.self.key,
        attention.self.value,
        attention.output.dense,
        attention.output.LayerNorm,
        layer.intermediate.dense,
        layer.output.dense,
        layer.output.LayerNorm,
    )

    # detach().clone() yields copies that share neither storage nor autograd
    # history with the source parameters.
    init_ws = [module.weight.detach().clone() for module in ordered_modules]
    init_bs = [module.bias.detach().clone() for module in ordered_modules]
    return init_ws, init_bs
64 |
65 |
def inject_ls_enc_layer(model, cfg, config):
    """Replace every ``model.bert.encoder.layer[i]`` with a LightSeq layer.

    Each replacement is initialised from the weights and biases of the layer
    it replaces and is moved to CUDA. The model is modified in place.

    :param model: model exposing ``bert.encoder.layer`` as an indexable,
        assignable container of encoder layers.
    :param cfg: run configuration forwarded to ``gen_bert_config``.
    :param config: model configuration; ``num_hidden_layers`` sets how many
        layers are replaced.
    :return: None
    """
    encoder_layers = model.bert.encoder.layer
    for idx in range(config.num_hidden_layers):
        # A fresh config object is generated for every layer.
        ls_config = gen_bert_config(cfg, config)
        weights, biases = get_hf_bert_enc_layer_params(encoder_layers[idx])
        ls_layer = LSHFTransformerEncoderLayer(ls_config, weights, biases)
        encoder_layers[idx] = ls_layer.cuda()
73 |
74 |
def inject_ls_roberta_enc_layer(model, cfg, config):
    """Replace every ``model.roberta.encoder.layer[i]`` with a LightSeq layer.

    Each replacement is initialised from the weights and biases of the layer
    it replaces. The model is modified in place.

    :param model: model exposing ``roberta.encoder.layer`` as an indexable,
        assignable container of encoder layers.
    :param cfg: run configuration forwarded to ``gen_bert_config``.
    :param config: model configuration; ``num_hidden_layers`` sets how many
        layers are replaced.
    :return: None
    """
    encoder_layers = model.roberta.encoder.layer
    for idx in range(config.num_hidden_layers):
        # A fresh config object is generated for every layer.
        ls_config = gen_bert_config(cfg, config)
        weights, biases = get_hf_bert_enc_layer_params(encoder_layers[idx])
        # NOTE(review): the new layer is not explicitly moved to CUDA here --
        # confirm that device placement is handled by the caller.
        encoder_layers[idx] = LSHFTransformerEncoderLayer(ls_config, weights, biases)
82 |
83 |
--------------------------------------------------------------------------------
/modules/layers.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import Tensor
3 |
4 |
def fold_tensor(x: Tensor):
    """Flatten every leading dimension of ``x`` into one, keeping the last dimension.

    :param x: tensor to fold, or ``None``.
    :return: ``x`` reshaped to ``[-1, x.size(-1)]``; ``None`` is passed through unchanged.
    """
    return None if x is None else x.reshape(-1, x.size(-1))
9 |
10 |
def extract_sent_tokens(source: Tensor, sentence_index: Tensor, sent_token_mask: Tensor, sentence_ids: Tensor, sentence_ids_mask: Tensor):
    """
    Gather the tokens of the selected sentences from ``source``.

    :param source: [batch, seq_len]
    :param sentence_index: [batch, max_sent_num, max_sent_len], positions into ``source``
    :param sent_token_mask: [batch, max_sent_num, max_sent_len]
    :param sentence_ids: [batch, path_len], indices of the sentences to pick
    :param sentence_ids_mask: [batch, path_len]
    :return: ``(tokens, mask)``, both [batch, path_len, max_sent_len]; ``mask`` is
        the element-wise AND of the gathered token mask and the broadcast
        sentence-id mask.
    """
    batch_size = sentence_index.size(0)
    sent_len = sentence_index.size(-1)
    num_selected = sentence_ids.size(1)

    def _spread(per_sentence: Tensor) -> Tensor:
        # [batch, path_len] -> [batch, path_len, max_sent_len] (expanded view, no copy).
        return per_sentence.unsqueeze(-1).expand(-1, -1, sent_len)

    selector = _spread(sentence_ids)
    selector_mask = _spread(sentence_ids_mask)

    # Pick the per-sentence token positions and token masks of the selected sentences.
    token_positions = torch.gather(sentence_index, 1, selector)
    token_mask = torch.gather(sent_token_mask, 1, selector)

    # Look the positions up in ``source`` over a flattened [batch, path_len * max_sent_len] view.
    flat_positions = token_positions.reshape(batch_size, -1)
    tokens = torch.gather(source, 1, flat_positions).reshape(batch_size, num_selected, sent_len)

    return tokens, token_mask & selector_mask
33 |
34 |
def keep_grad_prompt(input_embeds: Tensor, prompt_pos: Tensor):
    """Let gradients flow only through the embeddings at ``prompt_pos``.

    The returned tensor has the same values as ``input_embeds``; positions not
    listed in ``prompt_pos`` are taken from a detached copy, so they contribute
    no gradient.

    :param input_embeds: embeddings whose leading two dims are indexed by
        ``prompt_pos`` along dim 1 (presumably [batch, seq_len, hidden]).
    :param prompt_pos: index tensor of positions along dim 1 to keep trainable.
    :return: tensor with the same shape as ``input_embeds``.
    """
    # One scalar per position; ``1`` marks a position whose gradient is kept.
    position_shape = input_embeds.size()[:-1]
    grad_mask = input_embeds.new_zeros(position_shape)
    grad_mask = torch.scatter(grad_mask, dim=1, index=prompt_pos, value=1.0).unsqueeze(-1)

    frozen = input_embeds.detach()
    return grad_mask * input_embeds + (1 - grad_mask) * frozen
43 |
44 |
def get_accuracy(logits: Tensor, labels: Tensor, pad_id: int = -1):
    """Compute accuracy over the non-padding positions of ``labels``.

    :param logits: scores whose last dimension is the class dimension; all
        leading dimensions must match ``labels``.
    :param labels: gold class ids; positions equal to ``pad_id`` are ignored.
    :param pad_id: label value that marks padding.
    :return: ``(accuracy, true_label_num)``; ``(0, 0)`` when every label is padding.
    """
    assert logits.size()[:-1] == labels.size()

    _, pred = logits.max(dim=-1)
    valid = labels != pad_id
    true_label_num = valid.sum().item()
    if true_label_num == 0:
        return 0, 0

    # Restrict matches to real labels: when ``pad_id`` is itself a valid class
    # index, a prediction equal to ``pad_id`` on a padded position must not be
    # counted as correct (it would push accuracy above 1).
    correct = ((pred == labels) & valid).sum().item()
    acc = correct * 1.0 / true_label_num
    return acc, true_label_num
56 |
57 |
def get_precision_recall(logits: Tensor, labels: Tensor, pad_id: int = -1, positive_id: int = 1):
    """Compute row-averaged precision and recall for the ``positive_id`` class.

    Counts are taken per row (``sum(dim=1)``), a row's precision / recall is set
    to ``0`` when its denominator is zero, and the per-row values are averaged.

    :param logits: scores whose last dimension is the class dimension; all
        leading dimensions must match ``labels``.
    :param labels: gold class ids with at least two dimensions (counts are
        reduced over dim 1); positions equal to ``pad_id`` are ignored.
    :param pad_id: label value that marks padding.
    :param positive_id: class id treated as the positive class.
    :return: ``(precision, recall, num_rows)`` where ``num_rows`` is
        ``labels.size(0)``; ``(0., 0., 0)`` when every label is padding, so the
        result always unpacks into three values.
    """
    assert logits.size()[:-1] == labels.size()

    _, pred = logits.max(dim=-1)
    valid = labels != pad_id

    if valid.sum().item() == 0:
        # Same arity as the normal path; a zero count lets callers that weight
        # by the third value ignore this batch.
        return 0., 0., 0

    gold_positive = (labels == positive_id) & valid
    # Mask with ``valid`` instead of overwriting padding with 0, so padded
    # positions are never counted as positives even when ``positive_id == 0``.
    pred_positive = (pred == positive_id) & valid

    tp = ((pred == labels) & gold_positive).sum(dim=1)

    tp_fp = pred_positive.sum(dim=1)
    precision = tp / tp_fp
    precision.masked_fill_(tp_fp == 0, 0)  # rows with no positive prediction: 0/0 -> 0
    precision = precision.mean().item()

    tp_fn = gold_positive.sum(dim=1)
    recall = tp / tp_fn
    recall.masked_fill_(tp_fn == 0, 0)  # rows with no positive label: 0/0 -> 0
    recall = recall.mean().item()

    return precision, recall, labels.size(0)
84 |
85 |
def freeze_module(module: torch.nn.Module):
    """Disable gradient tracking for every parameter of ``module`` (in place).

    :param module: module whose parameters, as yielded by ``module.parameters()``,
        are frozen.
    """
    for weight in module.parameters():
        weight.requires_grad_(False)
89 |
--------------------------------------------------------------------------------
/conf/llama/wiki/test.yaml:
--------------------------------------------------------------------------------
1 | hydra:
2 | run:
3 | dir: ./
4 |
5 | exp_name: test
6 |
7 | train_file:
8 | _target_: data.flan_combine.obtain_flan_collection_group
9 | dev_file:
10 | test_file: /home/tianze/datasets/NaturalQuestions/v1.0-simplified_nq-dev-all.jsonl
11 |
12 |
13 | # Model
14 | model:
15 | _target_: models.llama.LlamaForConditionalGeneration.from_pretrained
16 | num_hidden_layers: 1
17 | use_peft: False
18 |
19 | # lora_config:
20 | # _target_: models.llama.LoraConfig
21 | # task_type: CAUSAL_LM
22 | # inference_mode: False
23 | # target_modules: ["q_proj", "v_proj"]
24 | # r: 8
25 | # lora_alpha: 16
26 | # lora_dropout: 0.1
27 |
28 | #model_eval:
29 | # _target_: models.llama.LlamaForMultipleChoiceCausalLM.from_pretrained_peft_eval
30 | # base_model_name_or_path: ${model_name_or_path}
31 |
32 |
33 | # Data loading
34 | read_tensor:
35 | _target_: data.collators.misc.NaturalQuestionsDataset
36 |
37 |
38 | extended_vocab:
39 |
40 | # Data collator
41 | collator:
42 | _target_: data.collators.misc.GeneralCollatorOverCollator
43 | max_seq_length: 2048
44 | tokenizer: ${model_name_or_path}
45 | decoder_only: True
46 | collator:
47 |
48 | # Dataloader
49 | num_workers: 4
50 | prefetch_factor: 2
51 |
52 | do_preprocess: False
53 |
54 | # Wiki path pretrain v8.2
55 | model_name_or_path: /home/tianze/other/llama-7b-hf
56 | pretrain:
57 |
58 | output_dir: /home/tianze/other/llama-7b-hf
59 |
60 | do_train: False
61 | evaluate_during_training: False
62 |
63 | do_eval: True
64 | eval_sub_path:
65 |
66 | # Training hyper-parameters
67 | per_gpu_train_batch_size: 1
68 | per_gpu_eval_batch_size: 1
69 | learning_rate: 1e-4
70 | gradient_accumulation_steps: 512
71 | weight_decay: 0.00
72 | adam_epsilon: 1e-6
73 | adam_betas: "(0.9, 0.95)"
74 | max_grad_norm: 5.0
75 | num_train_epochs: 1
76 | max_steps: 0
77 | warmup_proportion: 0.05
78 | warmup_steps: 0
79 |
80 | # Optimizer
81 | optimizer:
82 | use_nvlamb:
83 | bit_training:
84 |
85 |
86 | logging_steps: 1
87 | #save_best: True
88 | save_best: False
89 | save_steps: 100
90 | eval_steps: 100
91 | ddp_eval: True
92 | no_cuda: False
93 | seed: 42
94 | local_rank: -1
95 | fp16: True
96 | fp16_opt_level: O1
97 | fp16_bfloat16: True
98 |
99 | # Prediction config
100 | prediction_cfg:
101 | metric: "acc"
102 | measure: 1
103 | best_checkpoint:
104 | best_result:
105 | eval_forward_fn:
106 | _target_: general_util.evaluator.DiscriminatorForwardFn
107 | post_process:
108 |
109 | #dist_init:
110 | # _target_: general_util.dist_utils.setup_slurm_distributed
111 |
112 |
113 | # fairscale.FullyShardedDP
114 | fairscale_config:
115 | # _target_: general_util.fsdp_utils.recursive_initialize
116 | # _target_: general_util.fsdp_utils.default_initialize
117 | # _target_: general_util.fsdp_utils.default_initialize_v2
118 | _target_: general_util.torch_fsdp_utils.torch_fsdp_transformer_init
119 | fp16: ${fp16}
120 | # move_grads_to_cpu: False
121 | # move_params_to_cpu: False
122 | # flatten_parameters: False
123 | fp16_bfloat16: ${fp16_bfloat16}
124 | cpu_offload: False
125 | # disable_reshard_on_root: False
126 |
127 |
128 | # Lightseq config
129 | with_lightseq: False
130 |
131 |
132 | summary_helper:
133 | _target_: general_util.tensorboard_helper.SummaryWriterHelper
134 | batch_index_or_keys:
135 | # "train/pair_value_num": pair_value_num
136 | # "train/pair_label_num": pair_label_num
137 | # "train/dropped_op_cnt": dropped_op_cnt
138 | # "train/invalid_path": invalid_path
139 | outputs_index_or_keys:
140 | # "train/mlm_loss": mlm_loss
141 | # "train/cls_loss": cls_loss
142 | # "train/tagging_loss": tagging_loss
143 | # "train/path_gen_loss": path_gen_loss
144 |
145 | # Temporary variables
146 | n_gpu:
147 | device:
148 | train_batch_size:
149 | eval_batch_size:
150 | world_size:
151 |
--------------------------------------------------------------------------------
/conf/llama/wiki/llama_7b_flan_v1_0.yaml:
--------------------------------------------------------------------------------
1 | hydra:
2 | run:
3 | dir: ./
4 |
5 | train_file: ../research.data/flan_v2_shuffle/*.pt
6 | dev_file:
7 | test_file:
8 |
9 | # Model
10 | model:
11 | _target_: models.llama.LlamaForConditionalGeneration.from_pretrained
12 | use_peft: False
13 | # lora_config:
14 | # _target_: models.llama.LoraConfig
15 | # task_type: CAUSAL_LM
16 | # inference_mode: False
17 | # target_modules: ["q_proj", "v_proj"]
18 | # r: 8
19 | # lora_alpha: 16
20 | # lora_dropout: 0.1
21 |
22 | #model_eval:
23 | # _target_: models.llama.LlamaForMultipleChoiceCausalLM.from_pretrained_peft_eval
24 | # base_model_name_or_path: ${model_name_or_path}
25 |
26 |
27 | # Data loading
28 | read_tensor:
29 | _target_: data.collators.flan.FlanCollectionGroupDataset
30 |
31 |
32 | extended_vocab:
33 |
34 | # Data collator
35 | collator:
36 | _target_: data.collators.flan.FlanCollatorOverCollator
37 | collator:
38 | max_seq_length: 1024
39 | tokenizer: pretrained-models/LLaMA/llama-7b
40 | decoder_only: True
41 |
42 | # Dataloader
43 | num_workers: 4
44 | prefetch_factor: 2
45 |
46 | do_preprocess: False
47 |
48 | # Wiki path pretrain v8.2
49 | model_name_or_path: pretrained-modelscl/LLaMA/llama-7b
50 | pretrain:
51 |
52 | output_dir: experiments/llama.7b.flan.v1.0.seq1024.w8.adamw.500steps.NA100.0401
53 |
54 | do_train: True
55 | evaluate_during_training: False
56 |
57 | do_eval: True
58 | eval_sub_path: checkpoint-*
59 |
60 | # Training hyper-parameters
61 | per_gpu_train_batch_size: 1
62 | per_gpu_eval_batch_size: 1
63 | learning_rate: 1e-4
64 | gradient_accumulation_steps: 512
65 | weight_decay: 0.00
66 | adam_epsilon: 1e-6
67 | adam_betas: "(0.9, 0.999)"
68 | max_grad_norm: 1.0
69 | num_train_epochs: 1
70 | total_dataset_len: 760000000
71 | max_steps: 0
72 | warmup_proportion: 0
73 | warmup_steps: 5000
74 |
75 | # Optimizer
76 | optimizer:
77 | use_nvlamb:
78 | bit_training:
79 |
80 |
81 | logging_steps: 1
82 | #save_best: True
83 | save_best: False
84 | save_steps: 500
85 | eval_steps: 500
86 | ddp_eval: True
87 | no_cuda: False
88 | seed: 42
89 | local_rank: -1
90 | fp16: True
91 | fp16_opt_level: O1
92 | fp16_bfloat16: True
93 |
94 | # Prediction config
95 | prediction_cfg:
96 | metric: "acc"
97 | measure: 1
98 | best_checkpoint:
99 | best_result:
100 | eval_forward_fn:
101 | _target_: general_util.evaluator.DiscriminatorForwardFn
102 | post_process:
103 |
104 | #dist_init:
105 | # _target_: general_util.dist_utils.setup_slurm_distributed
106 |
107 |
108 | # fairscale.FullyShardedDP
109 | fairscale_config:
110 | # _target_: general_util.fsdp_utils.recursive_initialize
111 | _target_: general_util.fsdp_utils.default_initialize
112 | # _target_: general_util.fsdp_utils.default_initialize_v2
113 | # _target_: general_util.torch_fsdp_utils.torch_fsdp_transformer_init
114 | # _target_: general_util.torch_fsdp_utils.torch_fsdp_auto_wrap
115 | fp16: ${fp16}
116 | move_grads_to_cpu: False
117 | move_params_to_cpu: False
118 | flatten_parameters: False
119 | # fp16_bfloat16: ${fp16_bfloat16}
120 | # cpu_offload: False
121 | # disable_reshard_on_root: False
122 |
123 |
124 | # Lightseq config
125 | with_lightseq: False
126 |
127 |
128 | summary_helper:
129 | _target_: general_util.tensorboard_helper.SummaryWriterHelper
130 | batch_index_or_keys:
131 | # "train/pair_value_num": pair_value_num
132 | # "train/pair_label_num": pair_label_num
133 | # "train/dropped_op_cnt": dropped_op_cnt
134 | # "train/invalid_path": invalid_path
135 | outputs_index_or_keys:
136 | # "train/mlm_loss": mlm_loss
137 | # "train/cls_loss": cls_loss
138 | # "train/tagging_loss": tagging_loss
139 | # "train/path_gen_loss": path_gen_loss
140 |
141 | # Temporary variables
142 | n_gpu:
143 | device:
144 | train_batch_size:
145 | eval_batch_size:
146 | world_size:
147 |
--------------------------------------------------------------------------------