├── .gitignore ├── README.md ├── circle-construction ├── diffusion │ ├── .gitignore │ ├── LICENSE │ ├── catsample.py │ ├── configs │ │ ├── config.yaml │ │ └── model │ │ │ ├── medium.yaml │ │ │ └── small.yaml │ ├── data.py │ ├── eval.sh │ ├── eval_qa.py │ ├── graph_lib.py │ ├── load_model.py │ ├── losses.py │ ├── model │ │ ├── __init__.py │ │ ├── ema.py │ │ ├── fused_add_dropout_scale.py │ │ ├── rotary.py │ │ ├── transformer.py │ │ └── utils.py │ ├── noise_lib.py │ ├── run_eval.sh │ ├── run_sample.py │ ├── run_sample_cond.py │ ├── run_train.py │ ├── run_train.sh │ ├── sampling.py │ ├── test.py │ ├── train.py │ └── utils.py ├── ntp │ ├── circle.ipynb │ ├── eval.sh │ ├── eval_qa.py │ ├── main.py │ ├── run_eval.sh │ ├── run_train.sh │ ├── train.sh │ └── utils.py └── teacherless │ ├── circle_hybrid.ipynb │ ├── eval.sh │ ├── eval_qa.py │ ├── main.py │ ├── run_eval.sh │ ├── run_train.sh │ ├── train.sh │ └── utils.py ├── docs └── teaser.png ├── line-construction ├── diffusion │ ├── .gitignore │ ├── LICENSE │ ├── catsample.py │ ├── configs │ │ ├── config.yaml │ │ └── model │ │ │ ├── medium.yaml │ │ │ └── small.yaml │ ├── data.py │ ├── eval.sh │ ├── eval_qa.py │ ├── graph_lib.py │ ├── load_model.py │ ├── losses.py │ ├── model │ │ ├── __init__.py │ │ ├── ema.py │ │ ├── fused_add_dropout_scale.py │ │ ├── rotary.py │ │ ├── transformer.py │ │ └── utils.py │ ├── noise_lib.py │ ├── run_eval.sh │ ├── run_sample.py │ ├── run_sample_cond.py │ ├── run_train.py │ ├── run_train.sh │ ├── sampling.py │ ├── test.py │ ├── train.py │ └── utils.py ├── ntp │ ├── eval.sh │ ├── eval_qa.py │ ├── line.ipynb │ ├── main.py │ ├── run_eval.sh │ ├── run_train.sh │ ├── train.sh │ └── utils.py └── teacherless │ ├── eval.sh │ ├── eval_qa.py │ ├── line_hybrid.ipynb │ ├── main.py │ ├── run_eval.sh │ ├── run_train.sh │ ├── train.sh │ └── utils.py ├── sibling-discovery ├── diffusion │ ├── .gitignore │ ├── LICENSE │ ├── catsample.py │ ├── configs │ │ ├── config.yaml │ │ └── model │ │ │ ├── medium.yaml │ │ │ 
└── small.yaml │ ├── data.py │ ├── eval.sh │ ├── eval_qa.py │ ├── graph_lib.py │ ├── load_model.py │ ├── losses.py │ ├── model │ │ ├── __init__.py │ │ ├── ema.py │ │ ├── fused_add_dropout_scale.py │ │ ├── rotary.py │ │ ├── transformer.py │ │ └── utils.py │ ├── noise_lib.py │ ├── run_eval.sh │ ├── run_sample.py │ ├── run_sample_cond.py │ ├── run_train.py │ ├── run_train.sh │ ├── sampling.py │ ├── test.py │ ├── train.py │ └── utils.py ├── ntp │ ├── eval.sh │ ├── eval_qa.py │ ├── main.py │ ├── run_eval.sh │ ├── run_train.sh │ ├── sibling.ipynb │ ├── sibling_no_hash.ipynb │ ├── train.sh │ └── utils.py └── teacherless │ ├── eval.sh │ ├── eval_qa.py │ ├── main.py │ ├── run_eval.sh │ ├── run_train.sh │ ├── sibling_hybrid.ipynb │ ├── train.sh │ └── utils.py ├── simpletransformers ├── .all-contributorsrc ├── .github │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE │ │ ├── bug_report.md │ │ └── feature_request.md │ ├── stale.yml │ └── workflows │ │ └── pythonpublish.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── bin │ └── simple-viewer ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── simpletransformers │ ├── __init__.py │ ├── classification │ │ ├── __init__.py │ │ ├── classification_model.py │ │ ├── classification_utils.py │ │ ├── multi_label_classification_model.py │ │ ├── multi_modal_classification_model.py │ │ └── transformer_models │ │ │ ├── __init__.py │ │ │ ├── albert_model.py │ │ │ ├── bert_model.py │ │ │ ├── camembert_model.py │ │ │ ├── distilbert_model.py │ │ │ ├── electra_model.py │ │ │ ├── flaubert_model.py │ │ │ ├── layoutlm_model.py │ │ │ ├── longformer_model.py │ │ │ ├── mmbt_model.py │ │ │ ├── mobilebert_model.py │ │ │ ├── roberta_model.py │ │ │ ├── xlm_model.py │ │ │ ├── xlm_roberta_model.py │ │ │ └── xlnet_model.py │ ├── config │ │ ├── __init__.py │ │ ├── global_args.py │ │ ├── model_args.py │ │ └── utils.py │ ├── conv_ai │ │ ├── __init__.py │ │ ├── conv_ai_model.py │ │ └── conv_ai_utils.py │ ├── custom_models │ │ ├── __init__.py 
│ │ └── models.py │ ├── experimental │ │ ├── __init__.py │ │ └── classification │ │ │ ├── __init__.py │ │ │ ├── classification_model.py │ │ │ ├── classification_utils.py │ │ │ ├── multi_label_classification_model.py │ │ │ └── transformer_models │ │ │ ├── __init__.py │ │ │ ├── albert_model.py │ │ │ ├── bert_model.py │ │ │ ├── camembert_model.py │ │ │ ├── distilbert_model.py │ │ │ ├── roberta_model.py │ │ │ ├── xlm_model.py │ │ │ └── xlnet_model.py │ ├── language_generation │ │ ├── __init__.py │ │ ├── language_generation_model.py │ │ └── language_generation_utils.py │ ├── language_modeling │ │ ├── __init__.py │ │ ├── language_modeling_model.py │ │ └── language_modeling_utils.py │ ├── language_representation │ │ ├── __init__.py │ │ ├── representation_model.py │ │ └── transformer_models │ │ │ ├── __init__.py │ │ │ ├── bert_model.py │ │ │ └── gpt2_model.py │ ├── losses │ │ ├── __init__.py │ │ ├── dice_loss.py │ │ ├── focal_loss.py │ │ ├── loss_utils.py │ │ └── tversky_loss.py │ ├── model.py │ ├── ner │ │ ├── __init__.py │ │ ├── ner_dataset_loading_script │ │ │ └── ner_dataset_loading_script.py │ │ ├── ner_model.py │ │ └── ner_utils.py │ ├── question_answering │ │ ├── __init__.py │ │ ├── qa_dataset_loading_script │ │ │ └── qa_dataset_loading_script.py │ │ ├── question_answering_model.py │ │ └── question_answering_utils.py │ ├── retrieval │ │ ├── __init__.py │ │ ├── retrieval_dataset_loading_script │ │ │ └── retrieval_dataset_loading_script.py │ │ ├── retrieval_model.py │ │ └── retrieval_utils.py │ ├── seq2seq │ │ ├── __init__.py │ │ ├── seq2seq_model.py │ │ └── seq2seq_utils.py │ ├── streamlit │ │ ├── __init__.py │ │ ├── classification_view.py │ │ ├── ner_view.py │ │ ├── qa_view.py │ │ ├── simple_view.py │ │ ├── streamlit_utils.py │ │ └── t5_view.py │ └── t5 │ │ ├── __init__.py │ │ ├── t5_model.py │ │ └── t5_utils.py ├── tests │ ├── language_modeling │ │ └── test_language_modeling_only.py │ ├── test_classification.py │ ├── test_language_modeling.py │ ├── 
test_language_representation.py │ ├── test_named_entity_recognition.py │ ├── test_question_answering.py │ ├── test_seq2seq.py │ ├── test_t5.py │ └── train.txt └── train.txt └── triangle-discovery ├── diffusion ├── .gitignore ├── LICENSE ├── catsample.py ├── configs │ ├── config.yaml │ └── model │ │ ├── medium.yaml │ │ └── small.yaml ├── data.py ├── eval.sh ├── eval_qa.py ├── graph_lib.py ├── load_model.py ├── losses.py ├── model │ ├── __init__.py │ ├── ema.py │ ├── fused_add_dropout_scale.py │ ├── rotary.py │ ├── transformer.py │ └── utils.py ├── noise_lib.py ├── run_eval.sh ├── run_sample.py ├── run_sample_cond.py ├── run_train.py ├── run_train.sh ├── sampling.py ├── test.py ├── train.py └── utils.py ├── ntp ├── eval.sh ├── eval_qa.py ├── main.py ├── run_eval.sh ├── run_train.sh ├── train.sh ├── triangle.ipynb ├── triangle_no_hash.ipynb └── utils.py └── teacherless ├── eval.sh ├── eval_qa.py ├── main.py ├── run_eval.sh ├── run_train.sh ├── train.sh ├── triangle_hybrid.ipynb └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # tests and logs 12 | tests/fixtures/cached_*_text.txt 13 | logs/ 14 | lightning_logs/ 15 | lang_code_data/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | 124 | # vscode 125 | .vs 126 | .vscode 127 | 128 | # Pycharm 129 | .idea 130 | 131 | # TF code 132 | tensorflow_code 133 | 134 | # Models 135 | proc_data 136 | 137 | # examples 138 | runs 139 | /runs_old 140 | /wandb 141 | /examples/runs 142 | /examples/**/*.args 143 | /examples/rag/sweep 144 | 145 | # data 146 | serialization_dir 147 | 148 | # emacs 149 | *.*~ 150 | debug.env 151 | 152 | # vim 153 | .*.swp 154 | 155 | #ctags 156 | tags 157 | 158 | # .lock 159 | *.lock 160 | 161 | # DS_Store (MacOS) 162 | .DS_Store 163 | # RL pipelines may produce mp4 outputs 164 | *.mp4 165 | 166 | # dependencies 167 | /transformers 168 | 169 | # ruff 170 | .ruff_cache 171 | 172 | wandb 173 | 174 | 
# checkpoints 175 | */checkpoint-* 176 | 177 | # temporarily ignore 178 | default-outputs/* 179 | outputs/* 180 | log_files/* 181 | step_test/* 182 | 183 | **/render* 184 | **/*.zip 185 | 186 | exp_data 187 | 188 | # data 189 | 190 | # results 191 | 192 | **/*.pt -------------------------------------------------------------------------------- /circle-construction/diffusion/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | build/ 3 | dist/ 4 | *.egg-info/ 5 | __pycache__ 6 | .ipynb_checkpoints/ 7 | .DS_Store 8 | **.pyc 9 | *.png 10 | *.txt 11 | **/outputs/ 12 | **/wandb/ 13 | **/exp/ 14 | **/exp_local/ 15 | data/ 16 | eval/ 17 | assets/ 18 | **.pth 19 | **.npz 20 | core 21 | **.log 22 | *.jsonl -------------------------------------------------------------------------------- /circle-construction/diffusion/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Aaron Lou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
    """Draw a Gumbel-softmax sample from a tensor of categorical probabilities.

    Args:
        categorical_probs: Tensor of probabilities; categories are indexed by
            the last dimension (the default ``dim`` of ``F.gumbel_softmax``).
        hard: Forwarded to ``F.gumbel_softmax``; when True the returned sample
            is discretized to one-hot.
        eps: Lower clamp applied to the probabilities before the log so that
            zero probabilities do not turn into ``-inf`` logits.

    Returns:
        Tensor with the same shape as ``categorical_probs``.
    """
    # ``eps`` is honored here; its default equals the value that used to be
    # hard-coded in the clamp, so default behavior is unchanged.
    logits = categorical_probs.clamp(min=eps).log()
    return F.gumbel_softmax(logits, hard=hard)


def sample_categorical(categorical_probs, method="hard"):
    """Sample category indices from (unnormalized) categorical probabilities.

    Each probability is divided by ``-log(U)`` noise (``U`` uniform, with small
    offsets to keep the log finite and the divisor positive) and the argmax
    over the last dimension is returned.

    Args:
        categorical_probs: Tensor whose last dimension indexes the categories.
        method: Sampling method; only ``"hard"`` is supported.

    Returns:
        Integer tensor of sampled indices with the last dimension removed.

    Raises:
        ValueError: If ``method`` is not ``"hard"``.
    """
    if method != "hard":
        raise ValueError(f"Method {method} for sampling categorical variables is not valid.")
    gumbel_norm = 1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log()
    return (categorical_probs / gumbel_norm).argmax(dim=-1)
eval: 43 | batch_size: 32 44 | perplexity: False 45 | perplexity_batch_size: 32 46 | 47 | optim: 48 | weight_decay: 0 49 | optimizer: AdamW 50 | lr: 1e-4 51 | beta1: 0.9 52 | beta2: 0.999 53 | eps: 1e-8 54 | warmup: 2500 55 | grad_clip: 1. 56 | 57 | 58 | hydra: 59 | run: 60 | dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S} -------------------------------------------------------------------------------- /circle-construction/diffusion/configs/model/medium.yaml: -------------------------------------------------------------------------------- 1 | name: medium 2 | type: ddit 3 | hidden_size: 1024 4 | cond_dim: 128 5 | length: 29 6 | n_blocks: 24 7 | n_heads: 16 8 | scale_by_sigma: True 9 | dropout: 0.1 -------------------------------------------------------------------------------- /circle-construction/diffusion/configs/model/small.yaml: -------------------------------------------------------------------------------- 1 | name: small 2 | type: ddit 3 | hidden_size: 768 4 | cond_dim: 128 5 | length: 29 6 | n_blocks: 12 7 | n_heads: 12 8 | scale_by_sigma: True 9 | dropout: 0.1 -------------------------------------------------------------------------------- /circle-construction/diffusion/eval.sh: -------------------------------------------------------------------------------- 1 | # $1: dataset 2 | # $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=creativity_results/creativity_data/circle.10.9.0.10000/train.json/train/checkpoint_outputs 6 | 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 8 | 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR 10 | -------------------------------------------------------------------------------- /circle-construction/diffusion/load_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from model import SEDD 4 | import utils 5 | from model.ema import 
def load_model_hf(dir, device):
    """Load a pretrained SEDD model together with its graph and noise schedule.

    Args:
        dir: Identifier or path handed to ``SEDD.from_pretrained``.
        device: Device the model and the noise module are moved to.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    score_model = SEDD.from_pretrained(dir).to(device)
    graph = graph_lib.get_graph(score_model.config, device)
    noise = noise_lib.get_noise(score_model.config).to(device)
    return score_model, graph, noise


def load_model_local(root_dir, ckpt_dir, added_tokens, device):
    """Rebuild a model from a local training run and load a checkpoint into it.

    Args:
        root_dir: Run directory passed to ``utils.load_hydra_config_from_run``.
        ckpt_dir: Path of the checkpoint file given to ``torch.load``.
        added_tokens: Extra vocabulary entries; when non-empty, their count is
            added to ``cfg.tokens`` before the graph and model are built.
        device: Device for the model, the noise module and the loaded tensors.

    Returns:
        Tuple ``(score_model, graph, noise)`` with the EMA weights copied into
        ``score_model``.
    """
    cfg = utils.load_hydra_config_from_run(root_dir)
    if added_tokens:
        cfg.tokens = cfg.tokens + len(added_tokens)
    graph = graph_lib.get_graph(cfg, device)
    noise = noise_lib.get_noise(cfg).to(device)
    score_model = SEDD(cfg).to(device)
    ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema)

    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    loaded_state = torch.load(ckpt_dir, map_location=device)

    score_model.load_state_dict(loaded_state['model'])
    ema.load_state_dict(loaded_state['ema'])

    # Keep a copy of the raw weights, then overwrite the model with the
    # averaged weights.
    ema.store(score_model.parameters())
    ema.copy_to(score_model.parameters())
    return score_model, graph, noise


def load_model(root_dir, ckpt_dir, added_tokens, device):
    """Load a model, trying the pretrained path first and a local run second.

    Args:
        root_dir: Pretrained identifier/path, or a local run directory.
        ckpt_dir: Checkpoint file used only by the local fallback.
        added_tokens: Extra vocabulary entries, used only by the local fallback.
        device: Target device.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # Deliberate best-effort fallback: any failure of the pretrained path
        # leads to the local loader. ``Exception`` (rather than a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)
class ExponentialMovingAverage:
    """
    Maintains (exponential) moving average of a set of parameters.
    """

    def __init__(self, parameters, decay, use_num_updates=True):
        """
        Args:
            parameters: Iterable of `torch.nn.Parameter`; usually the result of
                `model.parameters()`.
            decay: The exponential decay.
            use_num_updates: Whether to use number of updates when computing
                averages.

        Raises:
            ValueError: If `decay` is outside [0, 1].
        """
        if decay < 0.0 or decay > 1.0:
            raise ValueError('Decay must be between 0 and 1')
        self.decay = decay
        self.num_updates = 0 if use_num_updates else None
        # Only trainable parameters are tracked.
        self.shadow_params = [p.clone().detach()
                              for p in parameters if p.requires_grad]
        self.collected_params = []

    def update(self, parameters):
        """
        Update currently maintained parameters.

        Call this every time the parameters are updated, such as the result of
        the `optimizer.step()` call.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; usually the same set of
                parameters used to initialize this object.
        """
        decay = self.decay
        if self.num_updates is not None:
            self.num_updates += 1
            # Cap the decay while few updates have been seen.
            decay = min(decay, (1 + self.num_updates) /
                        (10 + self.num_updates))
        one_minus_decay = 1.0 - decay
        with torch.no_grad():
            trainable = [p for p in parameters if p.requires_grad]
            for s_param, param in zip(self.shadow_params, trainable):
                s_param.sub_(one_minus_decay * (s_param - param))

    def copy_to(self, parameters):
        """
        Copy current parameters into given collection of parameters.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                updated with the stored moving averages.
        """
        # The filter already guarantees requires_grad, so no per-item re-check.
        trainable = [p for p in parameters if p.requires_grad]
        for s_param, param in zip(self.shadow_params, trainable):
            param.data.copy_(s_param.data)

    def store(self, parameters):
        """
        Save the current parameters for restoring later.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                temporarily stored.
        """
        # Detach before cloning so the saved copies are plain tensors that do
        # not stay attached to the autograd graph of the live parameters.
        self.collected_params = [param.detach().clone() for param in parameters]

    def restore(self, parameters):
        """
        Restore the parameters stored with the `store` method.
        Useful to validate the model with EMA parameters without affecting the
        original optimization process. Store the parameters before the
        `copy_to` method. After validation (or model saving), use this to
        restore the former parameters.

        Args:
            parameters: Iterable of `torch.nn.Parameter`; the parameters to be
                updated with the stored parameters.
        """
        for c_param, param in zip(self.collected_params, parameters):
            param.data.copy_(c_param.data)

    def state_dict(self):
        """Return the decay, update counter and shadow parameters as a dict."""
        return dict(decay=self.decay, num_updates=self.num_updates,
                    shadow_params=self.shadow_params)

    def load_state_dict(self, state_dict):
        """Load decay, update counter and shadow parameters from `state_dict`."""
        self.decay = state_dict['decay']
        self.num_updates = state_dict['num_updates']
        self.shadow_params = state_dict['shadow_params']
torch._C._jit_override_can_fuse_on_gpu(True) 11 | 12 | 13 | def bias_dropout_add_scale( 14 | x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool 15 | ) -> Tensor: 16 | if bias is not None: 17 | out = scale * F.dropout(x + bias, p=prob, training=training) 18 | else: 19 | out = scale * F.dropout(x, p=prob, training=training) 20 | 21 | if residual is not None: 22 | out = residual + out 23 | return out 24 | 25 | 26 | def get_bias_dropout_add_scale(training): 27 | def _bias_dropout_add(x, bias, scale, residual, prob): 28 | return bias_dropout_add_scale(x, bias, scale, residual, prob, training) 29 | 30 | return _bias_dropout_add 31 | 32 | 33 | def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor: 34 | return x * (1 + scale) + shift 35 | 36 | 37 | @torch.jit.script 38 | def bias_dropout_add_scale_fused_train( 39 | x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float 40 | ) -> Tensor: 41 | return bias_dropout_add_scale(x, bias, scale, residual, prob, True) 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_scale_fused_inference( 46 | x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add_scale(x, bias, scale, residual, prob, False) 49 | 50 | @torch.jit.script 51 | def modulate_fused(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor: 52 | return modulate(x, shift, scale) -------------------------------------------------------------------------------- /circle-construction/diffusion/model/rotary.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class Rotary(torch.nn.Module): 6 | def __init__(self, dim, base=10_000): 7 | super().__init__() 8 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) 9 | self.register_buffer("inv_freq", inv_freq) 10 | self.seq_len_cached = None 11 | 
self.cos_cached = None 12 | self.sin_cached = None 13 | 14 | def forward(self, x, seq_dim=1): 15 | seq_len = x.shape[seq_dim] 16 | if seq_len != self.seq_len_cached: 17 | self.seq_len_cached = seq_len 18 | t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) 19 | freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone()) 20 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 21 | # dims are: batch, seq_len, qkv, head, dim 22 | self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1) 23 | self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1) 24 | # This makes the transformation on v an identity. 25 | self.cos_cached[:,:,2,:,:].fill_(1.) 26 | self.sin_cached[:,:,2,:,:].fill_(0.) 27 | 28 | return self.cos_cached, self.sin_cached 29 | 30 | 31 | def rotate_half(x): 32 | x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] 33 | return torch.cat( 34 | (-x2, x1), dim=-1 35 | ) 36 | 37 | 38 | @torch.jit.script 39 | def _apply_rotary_pos_emb_torchscript(qkv, cos, sin): 40 | return (qkv * cos) + (rotate_half(qkv) * sin) 41 | 42 | 43 | def apply_rotary_pos_emb(qkv, cos, sin): 44 | try: 45 | import flash_attn.layers.rotary 46 | cos = cos[0,:,0,0,:cos.shape[-1]//2] 47 | sin = sin[0,:,0,0,:sin.shape[-1]//2] 48 | return flash_attn.layers.rotary.apply_rotary_emb_qkv_( 49 | qkv, cos, sin 50 | ) 51 | except: 52 | return _apply_rotary_pos_emb_torchscript(qkv, cos, sin) -------------------------------------------------------------------------------- /circle-construction/diffusion/model/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def get_model_fn(model, train=False): 6 | """Create a function to give the output of the score-based model. 7 | 8 | Args: 9 | model: The score model. 10 | train: `True` for training and `False` for evaluation. 
def get_noise(config):
    """Build the noise schedule selected by ``config.noise.type``.

    Args:
        config: Object exposing ``noise.type`` and, for the geometric schedule,
            ``noise.sigma_min`` and ``noise.sigma_max``.

    Returns:
        A ``GeometricNoise`` or ``LogLinearNoise`` instance.

    Raises:
        ValueError: If ``config.noise.type`` is not a known schedule.
    """
    if config.noise.type == "geometric":
        return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)
    elif config.noise.type == "loglinear":
        return LogLinearNoise()
    else:
        raise ValueError(f"{config.noise.type} is not a valid noise")


class Noise(abc.ABC, nn.Module):
    """
    Baseline forward method to get the total + rate of noise at a timestep
    """
    def forward(self, t):
        """Return ``(total_noise(t), rate_noise(t))``."""
        return self.total_noise(t), self.rate_noise(t)

    # Assume time goes from 0 to 1
    @abc.abstractmethod
    def rate_noise(self, t):
        """
        Rate of change of noise ie g(t)
        """
        pass

    @abc.abstractmethod
    def total_noise(self, t):
        # Raw string: ``\int`` would otherwise be an invalid escape sequence.
        r"""
        Total noise ie \int_0^t g(t) dt + g(0)
        """
        pass


class GeometricNoise(Noise, nn.Module):
    """Geometric interpolation between ``sigma_min`` (t=0) and ``sigma_max`` (t=1).

    Args:
        sigma_min: Total noise at ``t = 0``.
        sigma_max: Total noise at ``t = 1``.
        learnable: If True, the two sigmas are stored as an ``nn.Parameter``.
    """
    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        super().__init__()
        self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
        if learnable:
            self.sigmas = nn.Parameter(self.sigmas)
        # NOTE(review): placeholder parameter; presumably so the module always
        # owns at least one parameter - confirm against the training code.
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Derivative of ``total_noise`` with respect to ``t``."""
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (self.sigmas[1].log() - self.sigmas[0].log())

    def total_noise(self, t):
        """Return ``sigma_min ** (1 - t) * sigma_max ** t``."""
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t


class LogLinearNoise(Noise, nn.Module):
    """
    Log Linear noise schedule built so that 1 - 1/e^(n(t)) interpolates between 0 and ~1
    when t goes from 0 to 1. Used for absorbing

    Total noise is -log(1 - (1 - eps) * t), so the sigma will be (1 - eps) * t
    """
    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps
        # NOTE(review): placeholder parameter; presumably so the module always
        # owns at least one parameter - confirm against the training code.
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Return ``(1 - eps) / (1 - (1 - eps) * t)``."""
        return (1 - self.eps) / (1 - (1 - self.eps) * t)

    def total_noise(self, t):
        """Return ``-log(1 - (1 - eps) * t)``, computed with ``log1p``."""
        return -torch.log1p(-(1 - self.eps) * t)
def main():
    """Sample text conditioned on a fixed prefix and suffix.

    Command-line options select the model, batch size, number of sampling
    steps, the prefix/suffix text and an optional JSON file of extra tokens.
    The decoded samples are printed to stdout.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    # Only needed when --model_path is a local run directory; it is forwarded
    # as the checkpoint argument of load_model's local fallback.
    parser.add_argument("--checkpoint", default=None, type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--prefix", type=str, default="Hi, my name is")
    parser.add_argument("--suffix", type=str, default=" and that's why I'm late.")
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    # Always defined so it can be handed to load_model below.
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    prefix_ids = tokenizer(args.prefix).input_ids
    suffix_ids = tokenizer(args.suffix).input_ids
    input_ids = prefix_ids + suffix_ids
    # The prefix occupies the first positions and the suffix the last ones of
    # the 1024-token sample.
    input_locs = list(range(len(prefix_ids))) + list(range(1024-len(suffix_ids), 1024))

    # More generally, conditioning can be defined with something like:
    # input_ids = [0, 1, 512, 8080, 50256, 20000]
    # input_locs = [5, 6, 19, 20, 1000, 1001]

    device = torch.device('cuda')
    input_ids = torch.tensor(input_ids, device=device)[None].repeat(args.batch_size, 1)

    def proj_fun(x):
        # Overwrite the conditioned positions with the fixed token ids.
        x[:, input_locs] = input_ids
        return x

    # load_model takes (root_dir, ckpt_dir, added_tokens, device); the previous
    # two-argument call raised TypeError before any sampling happened.
    model, graph, noise = load_model(args.model_path, args.checkpoint, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device, proj_fun=proj_fun
    )

    samples = proj_fun(sampling_fn(model))

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")


if __name__ == "__main__":
    main()
parser.add_argument("--model_checkpoint_dir", default="", type=str) 16 | parser.add_argument("--dataset", default=None, type=str) 17 | parser.add_argument("--steps", type=int, default=128) 18 | parser.add_argument("--add_vocab", type=str, default=None) 19 | args = parser.parse_args() 20 | 21 | tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') 22 | if args.add_vocab: 23 | with open(args.add_vocab, 'r') as file: 24 | added_tokens = json.load(file) 25 | tokenizer.add_tokens(added_tokens) 26 | else: 27 | added_tokens = [] 28 | 29 | # Load the dataset 30 | with open(os.path.join(args.dataset, "test.json"), "r") as f: 31 | test_data = json.load(f) 32 | 33 | # List all files under the model checkpoint directory 34 | checkpoints = [os.path.join(args.model_checkpoint_dir, "checkpoints", f) for f in os.listdir(os.path.join(args.model_checkpoint_dir, "checkpoints"))] 35 | print(checkpoints) 36 | 37 | for checkpoint in checkpoints: 38 | device = torch.device('cuda') 39 | model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device) 40 | # Create a checkpoint_dir for the current checkpoint 41 | checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint)) 42 | if os.path.exists(checkpoint_dir): 43 | print(f"Skipping {checkpoint_dir} because it already exists") 44 | continue 45 | os.makedirs(checkpoint_dir, exist_ok=True) 46 | 47 | def generate_output(input_text): 48 | prefix_ids = tokenizer(input_text).input_ids 49 | # suffix_ids = tokenizer("<|endoftext|>").input_ids 50 | input_ids = prefix_ids 51 | input_locs = list(range(len(prefix_ids))) 52 | 53 | input_ids = torch.tensor(input_ids, device="cuda")[None].repeat(1, 1) 54 | 55 | def proj_fun(x): 56 | x[:, input_locs] = input_ids 57 | return x 58 | 59 | sampling_fn = sampling.get_pc_sampler( 60 | graph, noise, (1, 29), 'analytic', args.steps, device=device, proj_fun=proj_fun 61 | ) 62 | 63 | samples = proj_fun(sampling_fn(model)) 64 | 65 | 
text_samples = tokenizer.batch_decode(samples) 66 | assert len(text_samples) == 1 67 | text_samples = text_samples[0].split("<|endoftext|>")[0] 68 | return text_samples 69 | 70 | all_items = [] 71 | for sample in tqdm(test_data): 72 | item = {} 73 | item["input_text"] = sample["input_text"] 74 | item["target_text"] = sample["target_text"] 75 | item["type"] = sample["type"] 76 | 77 | output = generate_output(sample["input_text"]) 78 | print(sample["input_text"]) 79 | print(sample["target_text"]) 80 | print(output) 81 | print() 82 | item["model_output"] = output 83 | all_items.append(item) 84 | 85 | # Save the results 86 | with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f: 87 | json.dump(all_items, f, indent=4) 88 | 89 | 90 | if __name__=="__main__": 91 | main() -------------------------------------------------------------------------------- /circle-construction/diffusion/train.py: -------------------------------------------------------------------------------- 1 | """Training and evaluation""" 2 | 3 | import hydra 4 | import os 5 | import numpy as np 6 | import run_train 7 | import utils 8 | import torch.multiprocessing as mp 9 | from hydra.core.hydra_config import HydraConfig 10 | from hydra.types import RunMode 11 | from omegaconf import OmegaConf, open_dict 12 | 13 | 14 | @hydra.main(version_base=None, config_path="configs", config_name="config") 15 | def main(cfg): 16 | ngpus = cfg.ngpus 17 | if "load_dir" in cfg: 18 | hydra_cfg_path = os.path.join(cfg.load_dir, ".hydra/hydra.yaml") 19 | hydra_cfg = OmegaConf.load(hydra_cfg_path).hydra 20 | 21 | cfg = utils.load_hydra_config_from_run(cfg.load_dir) 22 | 23 | work_dir = cfg.work_dir 24 | utils.makedirs(work_dir) 25 | else: 26 | hydra_cfg = HydraConfig.get() 27 | work_dir = hydra_cfg.run.dir if hydra_cfg.mode == RunMode.RUN else os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir) 28 | utils.makedirs(work_dir) 29 | 30 | with open_dict(cfg): 31 | cfg.ngpus = ngpus 32 | cfg.work_dir = work_dir 
33 | cfg.wandb_name = os.path.basename(os.path.normpath(work_dir)) 34 | 35 | # Run the training pipeline 36 | port = int(np.random.randint(10000, 20000)) 37 | logger = utils.get_logger(os.path.join(work_dir, "logs")) 38 | 39 | hydra_cfg = HydraConfig.get() 40 | if hydra_cfg.mode != RunMode.RUN: 41 | logger.info(f"Run id: {hydra_cfg.job.id}") 42 | 43 | try: 44 | mp.set_start_method("forkserver") 45 | mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True) 46 | except Exception as e: 47 | logger.critical(e, exc_info=True) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() -------------------------------------------------------------------------------- /circle-construction/diffusion/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import os 4 | import logging 5 | from omegaconf import OmegaConf, open_dict 6 | 7 | 8 | def load_hydra_config_from_run(load_dir): 9 | cfg_path = os.path.join(load_dir, ".hydra/config.yaml") 10 | cfg = OmegaConf.load(cfg_path) 11 | return cfg 12 | 13 | 14 | def makedirs(dirname): 15 | os.makedirs(dirname, exist_ok=True) 16 | 17 | 18 | def get_logger(logpath, package_files=[], displaying=True, saving=True, debug=False): 19 | logger = logging.getLogger() 20 | if debug: 21 | level = logging.DEBUG 22 | else: 23 | level = logging.INFO 24 | 25 | if (logger.hasHandlers()): 26 | logger.handlers.clear() 27 | 28 | logger.setLevel(level) 29 | formatter = logging.Formatter('%(asctime)s - %(message)s') 30 | if saving: 31 | info_file_handler = logging.FileHandler(logpath, mode="a") 32 | info_file_handler.setLevel(level) 33 | info_file_handler.setFormatter(formatter) 34 | logger.addHandler(info_file_handler) 35 | if displaying: 36 | console_handler = logging.StreamHandler() 37 | console_handler.setLevel(level) 38 | console_handler.setFormatter(formatter) 39 | logger.addHandler(console_handler) 40 | 41 | for f in package_files: 42 | logger.info(f) 43 | with 
open(f, "r") as package_f: 44 | logger.info(package_f.read()) 45 | 46 | return logger 47 | 48 | 49 | def restore_checkpoint(ckpt_dir, state, device): 50 | if not os.path.exists(ckpt_dir): 51 | makedirs(os.path.dirname(ckpt_dir)) 52 | logging.warning(f"No checkpoint found at {ckpt_dir}. Returned the same state as input") 53 | return state 54 | else: 55 | loaded_state = torch.load(ckpt_dir, map_location=device) 56 | state['optimizer'].load_state_dict(loaded_state['optimizer']) 57 | state['model'].module.load_state_dict(loaded_state['model'], strict=False) 58 | state['ema'].load_state_dict(loaded_state['ema']) 59 | state['step'] = loaded_state['step'] 60 | return state 61 | 62 | 63 | def save_checkpoint(ckpt_dir, state): 64 | saved_state = { 65 | 'optimizer': state['optimizer'].state_dict(), 66 | 'model': state['model'].module.state_dict(), 67 | 'ema': state['ema'].state_dict(), 68 | 'step': state['step'] 69 | } 70 | torch.save(saved_state, ckpt_dir) -------------------------------------------------------------------------------- /circle-construction/ntp/eval.sh: -------------------------------------------------------------------------------- 1 | # $1: dataset 2 | # $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=../creativity 6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 7 | 8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR 9 | -------------------------------------------------------------------------------- /circle-construction/ntp/run_eval.sh: -------------------------------------------------------------------------------- 1 | bash eval.sh circle.10.9.10.10000 0.0 12 2 | -------------------------------------------------------------------------------- /circle-construction/ntp/run_train.sh: -------------------------------------------------------------------------------- 1 | bash train.sh circle.10.9.10.10000 0.0 12 0 -------------------------------------------------------------------------------- 
/circle-construction/ntp/train.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH=gpt2 2 | 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/ 4 | WEIGHT_DECAY=$2 5 | N_LAYERS=$3 6 | GPU=$4 7 | 8 | EXP_DIR=../creativity 9 | 10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3 11 | 12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \ 13 | --data_dir $DATASET \ 14 | --model_name_or_path ${MODEL_PATH} \ 15 | --weight_decay $WEIGHT_DECAY \ 16 | --output_dir $OUTPUT_DIR \ 17 | --max_seq_length 128 \ 18 | --max_length 128 \ 19 | --block_size 128 \ 20 | --train_batch_size 64 \ 21 | --eval_batch_size 64 \ 22 | --learning_rate 1e-4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --save_step 5000 \ 25 | --save_step_dense 1000 \ 26 | --max_steps 40000 \ 27 | --do_train \ 28 | --scheduler constant_schedule_with_warmup \ 29 | --fp16 \ 30 | --evaluate_during_training \ 31 | --predict_during_training \ 32 | --add_tokens \ 33 | --n_layer $N_LAYERS 34 | -------------------------------------------------------------------------------- /circle-construction/ntp/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None): 9 | """ 10 | file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys 11 | """ 12 | with open(file_name, 'r', encoding='utf-8') as f: 13 | data = json.load(f) 14 | 15 | if cutoff is not None: 16 | data = data[:cutoff] 17 | 18 | if return_json: 19 | if return_num: 20 | return data, len(data) 21 | return data 22 | 23 | keys = list(data[0].keys()) 24 | source_target_pair = [] 25 | for item in data: 26 | source_target_pair.append([item[key] for key in keys]) 27 | 28 | if return_num: 29 | return pd.DataFrame(source_target_pair, 
columns=keys), len(data) 30 | return pd.DataFrame(source_target_pair, columns=keys) 31 | -------------------------------------------------------------------------------- /circle-construction/teacherless/eval.sh: -------------------------------------------------------------------------------- 1 | # $1: dataset 2 | # $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=../creativity 6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 7 | 8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR 9 | -------------------------------------------------------------------------------- /circle-construction/teacherless/run_eval.sh: -------------------------------------------------------------------------------- 1 | bash eval.sh circle_hybrid.10.9.10.10000 0.0 12 -------------------------------------------------------------------------------- /circle-construction/teacherless/run_train.sh: -------------------------------------------------------------------------------- 1 | bash train.sh circle_hybrid.10.9.10.10000 0.0 12 0 -------------------------------------------------------------------------------- /circle-construction/teacherless/train.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH=gpt2 2 | 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/ 4 | WEIGHT_DECAY=$2 5 | N_LAYERS=$3 6 | GPU=$4 7 | 8 | EXP_DIR=../creativity 9 | 10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3 11 | 12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \ 13 | --data_dir $DATASET \ 14 | --model_name_or_path ${MODEL_PATH} \ 15 | --weight_decay $WEIGHT_DECAY \ 16 | --output_dir $OUTPUT_DIR \ 17 | --max_seq_length 128 \ 18 | --max_length 128 \ 19 | --block_size 128 \ 20 | --train_batch_size 64 \ 21 | --eval_batch_size 64 \ 22 | --learning_rate 1e-4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --save_step 5000 \ 25 | --save_step_dense 1000 \ 26 | --max_steps 40000 \ 27 | --do_train \ 28 | --scheduler 
constant_schedule_with_warmup \ 29 | --fp16 \ 30 | --evaluate_during_training \ 31 | --predict_during_training \ 32 | --add_tokens \ 33 | --n_layer $N_LAYERS 34 | -------------------------------------------------------------------------------- /circle-construction/teacherless/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None): 9 | """ 10 | file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys 11 | """ 12 | with open(file_name, 'r', encoding='utf-8') as f: 13 | data = json.load(f) 14 | 15 | if cutoff is not None: 16 | data = data[:cutoff] 17 | 18 | if return_json: 19 | if return_num: 20 | return data, len(data) 21 | return data 22 | 23 | keys = list(data[0].keys()) 24 | source_target_pair = [] 25 | for item in data: 26 | source_target_pair.append([item[key] for key in keys]) 27 | 28 | if return_num: 29 | return pd.DataFrame(source_target_pair, columns=keys), len(data) 30 | return pd.DataFrame(source_target_pair, columns=keys) 31 | -------------------------------------------------------------------------------- /docs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/docs/teaser.png -------------------------------------------------------------------------------- /line-construction/diffusion/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | build/ 3 | dist/ 4 | *.egg-info/ 5 | __pycache__ 6 | .ipynb_checkpoints/ 7 | .DS_Store 8 | **.pyc 9 | *.png 10 | *.txt 11 | **/outputs/ 12 | **/wandb/ 13 | **/exp/ 14 | **/exp_local/ 15 | data/ 16 | eval/ 17 | assets/ 18 | 
**.pth 19 | **.npz 20 | core 21 | **.log 22 | *.jsonl -------------------------------------------------------------------------------- /line-construction/diffusion/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Aaron Lou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /line-construction/diffusion/catsample.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def gumbel_softmax(categorical_probs, hard=False, eps=1e-9): 6 | logits = categorical_probs.clamp(min=1e-9).log() 7 | return F.gumbel_softmax(logits, hard=hard) 8 | 9 | 10 | def sample_categorical(categorical_probs, method="hard"): 11 | if method == "hard": 12 | gumbel_norm = 1e-10 - (torch.rand_like(categorical_probs) + 1e-10).log() 13 | return (categorical_probs / gumbel_norm).argmax(dim=-1) 14 | else: 15 | raise ValueError(f"Method {method} for sampling categorical variables is not valid.") 16 | -------------------------------------------------------------------------------- /line-construction/diffusion/configs/config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - model: small 4 | - override hydra/launcher: submitit_slurm 5 | 6 | ngpus: 1 7 | tokens: 50257 8 | add_vocab: "" 9 | 10 | training: 11 | batch_size: 64 # 512 12 | accum: 1 13 | n_iters: 400000 14 | snapshot_freq: 10000 15 | log_freq: 50 16 | eval_freq: 500 17 | snapshot_freq_for_preemption: 10000 18 | weight: standard 19 | snapshot_sampling: True 20 | ema: 0.9999 21 | 22 | data: 23 | train: openwebtext 24 | valid: wikitext103 25 | cache_dir: data 26 | 27 | graph: 28 | type: absorb 29 | file: data 30 | report_all: False 31 | 32 | noise: 33 | type: loglinear 34 | sigma_min: 1e-4 35 | sigma_max: 20 36 | 37 | sampling: 38 | predictor: euler 39 | steps: 128 40 | noise_removal: True 41 | 42 | eval: 43 | batch_size: 32 44 | perplexity: False 45 | perplexity_batch_size: 32 46 | 47 | optim: 48 | weight_decay: 0 49 | optimizer: AdamW 50 | lr: 1e-4 51 | beta1: 0.9 52 | beta2: 0.999 53 | eps: 1e-8 54 | warmup: 2500 55 | grad_clip: 1. 
56 | 57 | 58 | hydra: 59 | run: 60 | dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S} -------------------------------------------------------------------------------- /line-construction/diffusion/configs/model/medium.yaml: -------------------------------------------------------------------------------- 1 | name: medium 2 | type: ddit 3 | hidden_size: 1024 4 | cond_dim: 128 5 | length: 29 6 | n_blocks: 24 7 | n_heads: 16 8 | scale_by_sigma: True 9 | dropout: 0.1 -------------------------------------------------------------------------------- /line-construction/diffusion/configs/model/small.yaml: -------------------------------------------------------------------------------- 1 | name: small 2 | type: ddit 3 | hidden_size: 768 4 | cond_dim: 128 5 | length: 29 # set to 29 for no-hash-string; 45 for hash-string 6 | n_blocks: 12 7 | n_heads: 12 8 | scale_by_sigma: True 9 | dropout: 0.1 10 | -------------------------------------------------------------------------------- /line-construction/diffusion/eval.sh: -------------------------------------------------------------------------------- 1 | # $1: dataset 2 | # $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=creativity_results/creativity_data/line.10.9.0.10000/train.json/train/checkpoint_outputs 6 | 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 8 | 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR 10 | -------------------------------------------------------------------------------- /line-construction/diffusion/load_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from model import SEDD 4 | import utils 5 | from model.ema import ExponentialMovingAverage 6 | import graph_lib 7 | import noise_lib 8 | 9 | from omegaconf import OmegaConf 10 | 11 | def load_model_hf(dir, device): 12 | score_model = 
SEDD.from_pretrained(dir).to(device) 13 | graph = graph_lib.get_graph(score_model.config, device) 14 | noise = noise_lib.get_noise(score_model.config).to(device) 15 | return score_model, graph, noise 16 | 17 | 18 | def load_model_local(root_dir, ckpt_dir, added_tokens, device): 19 | cfg = utils.load_hydra_config_from_run(root_dir) 20 | if added_tokens: 21 | cfg.tokens = cfg.tokens + len(added_tokens) 22 | graph = graph_lib.get_graph(cfg, device) 23 | noise = noise_lib.get_noise(cfg).to(device) 24 | score_model = SEDD(cfg).to(device) 25 | ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema) 26 | 27 | # ckpt_dir = os.path.join(root_dir, "checkpoints-meta", "checkpoint.pth") 28 | loaded_state = torch.load(ckpt_dir, map_location=device) 29 | 30 | score_model.load_state_dict(loaded_state['model']) 31 | ema.load_state_dict(loaded_state['ema']) 32 | 33 | ema.store(score_model.parameters()) 34 | ema.copy_to(score_model.parameters()) 35 | return score_model, graph, noise 36 | 37 | 38 | def load_model(root_dir, ckpt_dir, added_tokens, device): 39 | try: 40 | return load_model_hf(root_dir, device) 41 | except: 42 | return load_model_local(root_dir, ckpt_dir, added_tokens, device) -------------------------------------------------------------------------------- /line-construction/diffusion/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import SEDD -------------------------------------------------------------------------------- /line-construction/diffusion/model/ema.py: -------------------------------------------------------------------------------- 1 | # Modified from https://raw.githubusercontent.com/fadel/pytorch_ema/master/torch_ema/ema.py 2 | 3 | from __future__ import division 4 | from __future__ import unicode_literals 5 | 6 | import torch 7 | 8 | 9 | # Partially based on: https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/python/training/moving_averages.py 
10 | class ExponentialMovingAverage: 11 | """ 12 | Maintains (exponential) moving average of a set of parameters. 13 | """ 14 | 15 | def __init__(self, parameters, decay, use_num_updates=True): 16 | """ 17 | Args: 18 | parameters: Iterable of `torch.nn.Parameter`; usually the result of 19 | `model.parameters()`. 20 | decay: The exponential decay. 21 | use_num_updates: Whether to use number of updates when computing 22 | averages. 23 | """ 24 | if decay < 0.0 or decay > 1.0: 25 | raise ValueError('Decay must be between 0 and 1') 26 | self.decay = decay 27 | self.num_updates = 0 if use_num_updates else None 28 | self.shadow_params = [p.clone().detach() 29 | for p in parameters if p.requires_grad] 30 | self.collected_params = [] 31 | 32 | def update(self, parameters): 33 | """ 34 | Update currently maintained parameters. 35 | 36 | Call this every time the parameters are updated, such as the result of 37 | the `optimizer.step()` call. 38 | 39 | Args: 40 | parameters: Iterable of `torch.nn.Parameter`; usually the same set of 41 | parameters used to initialize this object. 42 | """ 43 | decay = self.decay 44 | if self.num_updates is not None: 45 | self.num_updates += 1 46 | decay = min(decay, (1 + self.num_updates) / 47 | (10 + self.num_updates)) 48 | one_minus_decay = 1.0 - decay 49 | with torch.no_grad(): 50 | parameters = [p for p in parameters if p.requires_grad] 51 | for s_param, param in zip(self.shadow_params, parameters): 52 | s_param.sub_(one_minus_decay * (s_param - param)) 53 | 54 | 55 | def copy_to(self, parameters): 56 | """ 57 | Copy current parameters into given collection of parameters. 58 | 59 | Args: 60 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 61 | updated with the stored moving averages. 
62 | """ 63 | parameters = [p for p in parameters if p.requires_grad] 64 | for s_param, param in zip(self.shadow_params, parameters): 65 | if param.requires_grad: 66 | param.data.copy_(s_param.data) 67 | 68 | def store(self, parameters): 69 | """ 70 | Save the current parameters for restoring later. 71 | 72 | Args: 73 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 74 | temporarily stored. 75 | """ 76 | self.collected_params = [param.clone() for param in parameters] 77 | 78 | def restore(self, parameters): 79 | """ 80 | Restore the parameters stored with the `store` method. 81 | Useful to validate the model with EMA parameters without affecting the 82 | original optimization process. Store the parameters before the 83 | `copy_to` method. After validation (or model saving), use this to 84 | restore the former parameters. 85 | 86 | Args: 87 | parameters: Iterable of `torch.nn.Parameter`; the parameters to be 88 | updated with the stored parameters. 89 | """ 90 | for c_param, param in zip(self.collected_params, parameters): 91 | param.data.copy_(c_param.data) 92 | 93 | def state_dict(self): 94 | return dict(decay=self.decay, num_updates=self.num_updates, 95 | shadow_params=self.shadow_params) 96 | 97 | def load_state_dict(self, state_dict): 98 | self.decay = state_dict['decay'] 99 | self.num_updates = state_dict['num_updates'] 100 | self.shadow_params = state_dict['shadow_params'] -------------------------------------------------------------------------------- /line-construction/diffusion/model/fused_add_dropout_scale.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from typing import Optional 4 | from torch import Tensor 5 | 6 | # flags required to enable jit fusion kernels 7 | torch._C._jit_set_profiling_mode(False) 8 | torch._C._jit_set_profiling_executor(False) 9 | torch._C._jit_override_can_fuse_on_cpu(True) 10 | 
torch._C._jit_override_can_fuse_on_gpu(True) 11 | 12 | 13 | def bias_dropout_add_scale( 14 | x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool 15 | ) -> Tensor: 16 | if bias is not None: 17 | out = scale * F.dropout(x + bias, p=prob, training=training) 18 | else: 19 | out = scale * F.dropout(x, p=prob, training=training) 20 | 21 | if residual is not None: 22 | out = residual + out 23 | return out 24 | 25 | 26 | def get_bias_dropout_add_scale(training): 27 | def _bias_dropout_add(x, bias, scale, residual, prob): 28 | return bias_dropout_add_scale(x, bias, scale, residual, prob, training) 29 | 30 | return _bias_dropout_add 31 | 32 | 33 | def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor: 34 | return x * (1 + scale) + shift 35 | 36 | 37 | @torch.jit.script 38 | def bias_dropout_add_scale_fused_train( 39 | x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float 40 | ) -> Tensor: 41 | return bias_dropout_add_scale(x, bias, scale, residual, prob, True) 42 | 43 | 44 | @torch.jit.script 45 | def bias_dropout_add_scale_fused_inference( 46 | x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float 47 | ) -> Tensor: 48 | return bias_dropout_add_scale(x, bias, scale, residual, prob, False) 49 | 50 | @torch.jit.script 51 | def modulate_fused(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor: 52 | return modulate(x, shift, scale) -------------------------------------------------------------------------------- /line-construction/diffusion/model/rotary.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | class Rotary(torch.nn.Module): 6 | def __init__(self, dim, base=10_000): 7 | super().__init__() 8 | inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) 9 | self.register_buffer("inv_freq", inv_freq) 10 | self.seq_len_cached = None 11 | 
self.cos_cached = None 12 | self.sin_cached = None 13 | 14 | def forward(self, x, seq_dim=1): 15 | seq_len = x.shape[seq_dim] 16 | if seq_len != self.seq_len_cached: 17 | self.seq_len_cached = seq_len 18 | t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq) 19 | freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone()) 20 | emb = torch.cat((freqs, freqs), dim=-1).to(x.device) 21 | # dims are: batch, seq_len, qkv, head, dim 22 | self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1) 23 | self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1) 24 | # This makes the transformation on v an identity. 25 | self.cos_cached[:,:,2,:,:].fill_(1.) 26 | self.sin_cached[:,:,2,:,:].fill_(0.) 27 | 28 | return self.cos_cached, self.sin_cached 29 | 30 | 31 | def rotate_half(x): 32 | x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] 33 | return torch.cat( 34 | (-x2, x1), dim=-1 35 | ) 36 | 37 | 38 | @torch.jit.script 39 | def _apply_rotary_pos_emb_torchscript(qkv, cos, sin): 40 | return (qkv * cos) + (rotate_half(qkv) * sin) 41 | 42 | 43 | def apply_rotary_pos_emb(qkv, cos, sin): 44 | try: 45 | import flash_attn.layers.rotary 46 | cos = cos[0,:,0,0,:cos.shape[-1]//2] 47 | sin = sin[0,:,0,0,:sin.shape[-1]//2] 48 | return flash_attn.layers.rotary.apply_rotary_emb_qkv_( 49 | qkv, cos, sin 50 | ) 51 | except: 52 | return _apply_rotary_pos_emb_torchscript(qkv, cos, sin) -------------------------------------------------------------------------------- /line-construction/diffusion/model/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | def get_model_fn(model, train=False): 6 | """Create a function to give the output of the score-based model. 7 | 8 | Args: 9 | model: The score model. 10 | train: `True` for training and `False` for evaluation. 
11 | mlm: If the input model is a mlm and models the base probability 12 | 13 | Returns: 14 | A model function. 15 | """ 16 | 17 | def model_fn(x, sigma): 18 | """Compute the output of the score-based model. 19 | 20 | Args: 21 | x: A mini-batch of input data. 22 | labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently 23 | for different models. 24 | 25 | Returns: 26 | A tuple of (model output, new mutable states) 27 | """ 28 | if train: 29 | model.train() 30 | else: 31 | model.eval() 32 | 33 | # otherwise output the raw values (we handle mlm training in losses.py) 34 | return model(x, sigma) 35 | 36 | return model_fn 37 | 38 | 39 | def get_score_fn(model, train=False, sampling=False): 40 | if sampling: 41 | assert not train, "Must sample in eval mode" 42 | model_fn = get_model_fn(model, train=train) 43 | 44 | with torch.cuda.amp.autocast(dtype=torch.bfloat16): 45 | def score_fn(x, sigma): 46 | sigma = sigma.reshape(-1) 47 | score = model_fn(x, sigma) 48 | 49 | if sampling: 50 | # when sampling return true score (not log used for training) 51 | return score.exp() 52 | 53 | return score 54 | 55 | return score_fn 56 | -------------------------------------------------------------------------------- /line-construction/diffusion/noise_lib.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | def get_noise(config): 8 | if config.noise.type == "geometric": 9 | return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max) 10 | elif config.noise.type == "loglinear": 11 | return LogLinearNoise() 12 | else: 13 | raise ValueError(f"{config.noise.type} is not a valid noise") 14 | 15 | 16 | class Noise(abc.ABC, nn.Module): 17 | """ 18 | Baseline forward method to get the total + rate of noise at a timestep 19 | """ 20 | def forward(self, t): 21 | return self.total_noise(t), self.rate_noise(t) 22 | 23 | """ 24 
class GeometricNoise(Noise, nn.Module):
    """
    Geometric noise schedule.

    Total noise is sigma_min^(1 - t) * sigma_max^t, and the rate is its
    derivative with respect to t:
    sigma_min^(1 - t) * sigma_max^t * (log(sigma_max) - log(sigma_min)).

    Args:
        sigma_min: noise level at t = 0.
        sigma_max: noise level at t = 1.
        learnable: if True, the two sigmas are stored as a trainable parameter.
    """
    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        super().__init__()
        # `1.0 *` promotes integer inputs to a floating-point tensor.
        sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
        if learnable:
            self.sigmas = nn.Parameter(sigmas)
        else:
            # A plain tensor attribute is ignored by `.to(device)` / `.cuda()`.
            # Registering it as a buffer makes it follow the module, and
            # `persistent=False` keeps the state_dict layout unchanged.
            self.register_buffer("sigmas", sigmas, persistent=False)
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Return d/dt of `total_noise` at time(s) `t`."""
        return self.total_noise(t) * (self.sigmas[1].log() - self.sigmas[0].log())

    def total_noise(self, t):
        """Return sigma_min^(1 - t) * sigma_max^t at time(s) `t`."""
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t
def main():
    """Load a model, draw unconditional samples and print the decoded text.

    Command-line options:
        --model_path: passed to `load_model`.
        --dataset: parsed but not read anywhere in this function.
        --batch_size: number of sequences sampled at once.
        --steps: step count handed to the sampler.
        --add_vocab: optional path to a JSON file of extra tokens that are
            added to the GPT-2 tokenizer.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()


    device = torch.device('cuda')
    # NOTE(review): test.py in this directory calls
    # `load_model(root_dir, checkpoint, added_tokens, device)` with four
    # positional arguments; this two-argument call looks stale and
    # presumably fails with a TypeError -- verify against load_model.py.
    model, graph, noise = load_model(args.model_path, device)
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # The sample shape is fixed at (batch_size, 1024) tokens.
    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device
    )

    samples = sampling_fn(model)

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")
def main():
    """Generate model outputs for the test set with every saved checkpoint.

    For each file under `<model_checkpoint_dir>/checkpoints`, the prompts in
    `<dataset>/test.json` are completed by conditional sampling and the
    results are written to
    `<model_checkpoint_dir>/checkpoint_outputs/<checkpoint name>/all_items.json`.
    Checkpoints whose output directory already exists are skipped.

    Command-line options:
        --model_checkpoint_dir: training run directory.
        --dataset: directory containing `test.json`.
        --steps: step count handed to the sampler.
        --add_vocab: optional path to a JSON list of extra tokenizer tokens.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_checkpoint_dir", default="", type=str)
    parser.add_argument("--dataset", default=None, type=str)
    parser.add_argument("--steps", type=int, default=128)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)
    else:
        added_tokens = []

    # Load the dataset
    with open(os.path.join(args.dataset, "test.json"), "r") as f:
        test_data = json.load(f)

    # List all files under the model checkpoint directory
    checkpoints_root = os.path.join(args.model_checkpoint_dir, "checkpoints")
    checkpoints = [os.path.join(checkpoints_root, f) for f in os.listdir(checkpoints_root)]
    print(checkpoints)

    device = torch.device('cuda')

    for checkpoint in checkpoints:
        # Create a checkpoint_dir for the current checkpoint
        checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint))
        # Decide whether to skip *before* loading weights, so checkpoints that
        # were already processed do not cost a full model load.
        if os.path.exists(checkpoint_dir):
            print(f"Skipping {checkpoint_dir} because it already exists")
            continue

        model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device)
        os.makedirs(checkpoint_dir, exist_ok=True)

        def generate_output(input_text):
            """Complete `input_text` and return the text before the first end-of-text marker."""
            prefix_ids = tokenizer(input_text).input_ids
            input_locs = list(range(len(prefix_ids)))

            # Shape (1, len(prefix_ids)): one prompt per sampled sequence.
            input_ids = torch.tensor(prefix_ids, device=device)[None]

            def proj_fun(x):
                # Clamp the prompt positions back to the prompt tokens.
                x[:, input_locs] = input_ids
                return x

            # NOTE(review): the sample length is fixed at 29 tokens, so a
            # prompt longer than that cannot be projected -- confirm this
            # matches the dataset.
            sampling_fn = sampling.get_pc_sampler(
                graph, noise, (1, 29), 'analytic', args.steps, device=device, proj_fun=proj_fun
            )

            samples = proj_fun(sampling_fn(model))

            text_samples = tokenizer.batch_decode(samples)
            assert len(text_samples) == 1
            return text_samples[0].split("<|endoftext|>")[0]

        all_items = []
        for sample in tqdm(test_data):
            item = {}
            item["input_text"] = sample["input_text"]
            item["target_text"] = sample["target_text"]
            item["type"] = sample["type"]

            output = generate_output(sample["input_text"])
            print(sample["input_text"])
            print(sample["target_text"])
            print(output)
            print()
            item["model_output"] = output
            all_items.append(item)

        # Save the results
        with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f:
            json.dump(all_items, f, indent=4)
def get_logger(logpath, package_files=(), displaying=True, saving=True, debug=False):
    """Configure and return the root logger.

    Any handlers already attached to the root logger are removed and closed,
    then a file handler and/or a console handler is installed.

    Args:
        logpath: Path of the log file (opened in append mode) when `saving`.
        package_files: Iterable of file paths; each path and its contents are
            written to the log once the handlers are set up.
        displaying: If True, also log to the console.
        saving: If True, log to `logpath`.
        debug: If True use DEBUG level, otherwise INFO.

    Returns:
        The configured root logger.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    # Close handlers as they are dropped; merely clearing the list would leave
    # a previously opened log file handle dangling.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    for f in package_files:
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger
def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
    """
    Load a .json file holding a list of records (dicts such as
    {'input_text': ..., 'target_text': ...}).

    file_name: path of the .json file.
    return_num: if True, return a tuple (data, number_of_records).
    return_json: if True, return the parsed list as is; otherwise build a
        pandas DataFrame whose columns are the keys of the first record.
    cutoff: if not None, keep only the first `cutoff` records.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if cutoff is not None:
        data = data[:cutoff]

    if return_json:
        result = data
    else:
        # Column order follows the first record. An empty list yields an empty
        # frame instead of failing on `data[0]`.
        keys = list(data[0].keys()) if data else []
        rows = [[item[key] for key in keys] for item in data]
        result = pd.DataFrame(rows, columns=keys)

    if return_num:
        return result, len(data)
    return result
def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
    """
    Load a .json file holding a list of records (dicts such as
    {'input_text': ..., 'target_text': ...}).

    file_name: path of the .json file.
    return_num: if True, return a tuple (data, number_of_records).
    return_json: if True, return the parsed list as is; otherwise build a
        pandas DataFrame whose columns are the keys of the first record.
    cutoff: if not None, keep only the first `cutoff` records.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if cutoff is not None:
        data = data[:cutoff]

    if return_json:
        result = data
    else:
        # Column order follows the first record. An empty list yields an empty
        # frame instead of failing on `data[0]`.
        keys = list(data[0].keys()) if data else []
        rows = [[item[key] for key in keys] for item in data]
        result = pd.DataFrame(rows, columns=keys)

    if return_num:
        return result, len(data)
    return result
5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
    """Draw a Gumbel-softmax sample from a tensor of category probabilities.

    Args:
        categorical_probs: Tensor of probabilities; the last dimension is the
            one `F.gumbel_softmax` normalises over by default.
        hard: Forwarded to `F.gumbel_softmax`.
        eps: Lower clamp applied to the probabilities before taking the log,
            so that zero entries do not become -inf logits.

    Returns:
        The tensor returned by `F.gumbel_softmax`, same shape as the input.
    """
    # Use the caller-supplied floor; it was previously ignored in favour of a
    # hard-coded 1e-9 (which is still the default).
    logits = categorical_probs.clamp(min=eps).log()
    return F.gumbel_softmax(logits, hard=hard)
56 | 57 | 58 | hydra: 59 | run: 60 | dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S} -------------------------------------------------------------------------------- /sibling-discovery/diffusion/configs/model/medium.yaml: -------------------------------------------------------------------------------- 1 | name: medium 2 | type: ddit 3 | hidden_size: 1024 4 | cond_dim: 128 5 | length: 32 6 | n_blocks: 24 7 | n_heads: 16 8 | scale_by_sigma: True 9 | dropout: 0.1 -------------------------------------------------------------------------------- /sibling-discovery/diffusion/configs/model/small.yaml: -------------------------------------------------------------------------------- 1 | name: small 2 | type: ddit 3 | hidden_size: 768 4 | cond_dim: 128 5 | length: 32 6 | n_blocks: 12 7 | n_heads: 12 8 | scale_by_sigma: True 9 | dropout: 0.1 -------------------------------------------------------------------------------- /sibling-discovery/diffusion/eval.sh: -------------------------------------------------------------------------------- 1 | # $1: dataset 2 | # $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=creativity_results/creativity_data/sibling.5.500.10.50000/train.json/train/checkpoint_outputs 6 | 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 8 | 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR 10 | -------------------------------------------------------------------------------- /sibling-discovery/diffusion/load_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from model import SEDD 4 | import utils 5 | from model.ema import ExponentialMovingAverage 6 | import graph_lib 7 | import noise_lib 8 | 9 | from omegaconf import OmegaConf 10 | 11 | def load_model_hf(dir, device): 12 | score_model = SEDD.from_pretrained(dir).to(device) 13 | graph = 
def load_model(root_dir, ckpt_dir, added_tokens, device):
    """Load a score model together with its graph and noise schedule.

    `load_model_hf(root_dir, device)` is tried first; if it raises, the model
    is loaded with `load_model_local` instead.

    Args:
        root_dir: Passed to `load_model_hf`, and to `load_model_local` as the
            run directory.
        ckpt_dir: Checkpoint path used by the local loader.
        added_tokens: Extra tokens used by the local loader to size the model.
        device: Device the model, graph and noise are placed on.

    Returns:
        A tuple `(score_model, graph, noise)`.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # Deliberate best-effort fallback. `Exception` (rather than a bare
        # `except`) lets KeyboardInterrupt / SystemExit propagate instead of
        # silently triggering a second, local load.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)
def bias_dropout_add_scale(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool
) -> Tensor:
    """Compute `residual + scale * dropout(x + bias)`.

    `bias` and `residual` are optional; when either is None the matching
    addition is skipped. `prob` and `training` are forwarded to `F.dropout`.
    """
    # Plain if-statements keep the Optional handling TorchScript-friendly.
    pre_dropout = x
    if bias is not None:
        pre_dropout = x + bias

    scaled = scale * F.dropout(pre_dropout, p=prob, training=training)

    if residual is None:
        return scaled
    return residual + scaled
def rotate_half(x):
    """Split the last dimension of `x` in two and return `cat(-second, first)`.

    The split point is `x.shape[-1] // 2`, so for an odd-sized last dimension
    the second part is one element longer than the first.
    """
    split_at = x.shape[-1] // 2
    first = x[..., :split_at]
    second = x[..., split_at:]
    return torch.cat((-second, first), dim=-1)
def get_noise(config):
    """Instantiate the noise schedule selected by `config.noise.type`.

    "geometric" builds a `GeometricNoise` from `config.noise.sigma_min` and
    `config.noise.sigma_max`; "loglinear" builds a `LogLinearNoise` with its
    defaults. Any other value raises `ValueError`.
    """
    noise_type = config.noise.type

    if noise_type == "loglinear":
        return LogLinearNoise()
    if noise_type == "geometric":
        return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)

    raise ValueError(f"{noise_type} is not a valid noise")
class LogLinearNoise(Noise, nn.Module):
    """
    Log-linear noise schedule, used with the absorbing graph.

    Total noise is n(t) = -log(1 - (1 - eps) * t), so 1 - exp(-n(t)) equals
    (1 - eps) * t and moves from 0 to just under 1 as t goes from 0 to 1.
    The rate is the derivative of n(t): (1 - eps) / (1 - (1 - eps) * t).
    """
    def __init__(self, eps=1e-3):
        super().__init__()
        # presumably a dummy so the module owns at least one parameter -- TODO confirm
        self.empty = nn.Parameter(torch.tensor(0.0))
        self.eps = eps

    def rate_noise(self, t):
        slope = 1 - self.eps
        return slope / (1 - slope * t)

    def total_noise(self, t):
        slope = 1 - self.eps
        return -torch.log1p(-slope * t)
def main():
    """Sample sequences from a loaded model and print each decoded sample.

    Command-line flags: ``--model_path``, ``--dataset``, ``--batch_size``,
    ``--steps`` and ``--add_vocab`` (path to a JSON list of extra tokens).
    """
    cli = argparse.ArgumentParser(description="Generate some samples")
    for flag, options in (
        ("--model_path", {"default": "louaaron/sedd-medium", "type": str}),
        ("--dataset", {"default": "wikitext103", "type": str}),
        ("--batch_size", {"type": int, "default": 1}),
        ("--steps", {"type": int, "default": 1024}),
        ("--add_vocab", {"type": str, "default": None}),
    ):
        cli.add_argument(flag, **options)
    args = cli.parse_args()

    device = torch.device('cuda')
    model, graph, noise = load_model(args.model_path, device)

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    if args.add_vocab:
        # Extend the GPT-2 vocabulary with the tokens listed in the JSON file.
        with open(args.add_vocab, 'r') as vocab_file:
            extra_tokens = json.load(vocab_file)
        tokenizer.add_tokens(extra_tokens)

    # Sequence length is fixed at 1024 tokens per sample.
    sample_shape = (args.batch_size, 1024)
    sampling_fn = sampling.get_pc_sampler(
        graph, noise, sample_shape, 'analytic', args.steps, device=device
    )

    decoded = tokenizer.batch_decode(sampling_fn(model))
    for text in decoded:
        print(text)
        print("=================================================")
name is") 17 | parser.add_argument("--suffix", type=str, default=" and that's why I'm late.") 18 | parser.add_argument("--add_vocab", type=str, default=None) 19 | args = parser.parse_args() 20 | 21 | tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') 22 | if args.add_vocab: 23 | with open(args.add_vocab, 'r') as file: 24 | added_tokens = json.load(file) 25 | tokenizer.add_tokens(added_tokens) 26 | 27 | prefix_ids = tokenizer(args.prefix).input_ids 28 | suffix_ids = tokenizer(args.suffix).input_ids 29 | input_ids = prefix_ids + suffix_ids 30 | input_locs = list(range(len(prefix_ids))) + list(range(1024-len(suffix_ids), 1024)) 31 | 32 | # more generaly commands can be defined with something like below: 33 | # input_ids = [0, 1, 512, 8080, 50256, 20000] 34 | # input_locs = [5, 6, 19, 20, 1000, 10001] 35 | 36 | 37 | input_ids = torch.tensor(input_ids, device="cuda")[None].repeat(args.batch_size, 1) 38 | 39 | def proj_fun(x): 40 | x[:, input_locs] = input_ids 41 | return x 42 | 43 | device = torch.device('cuda') 44 | model, graph, noise = load_model(args.model_path, device) 45 | 46 | 47 | sampling_fn = sampling.get_pc_sampler( 48 | graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device, proj_fun=proj_fun 49 | ) 50 | 51 | samples = proj_fun(sampling_fn(model)) 52 | 53 | text_samples = tokenizer.batch_decode(samples) 54 | for i in text_samples: 55 | print(i) 56 | print("=================================================") 57 | 58 | if __name__=="__main__": 59 | main() -------------------------------------------------------------------------------- /sibling-discovery/diffusion/run_train.sh: -------------------------------------------------------------------------------- 1 | python train.py \ 2 | noise.type=loglinear \ 3 | graph.type=absorb \ 4 | model=small \ 5 | training.accum=1 \ 6 | data.train=creativity_data/sibling.5.500.10.50000/train.json \ 7 | data.valid=creativity_data/sibling.5.500.10.50000/valid.json \ 8 | 
add_vocab=creativity_data/sibling.5.500.10.50000/vocab.json \ 9 | hydra.run.dir=/data/locus/project_data/project_data2/chenwu2/creativity_results/creativity_data/sibling.5.500.10.50000/train.json/train 10 | 11 | python test.py \ 12 | --model_checkpoint_dir creativity_results/creativity_data/sibling.5.500.10.50000/train.json/train \ 13 | --dataset creativity_data/sibling.5.500.10.50000 \ 14 | --add_vocab creativity_data/sibling.5.500.10.50000/vocab.json -------------------------------------------------------------------------------- /sibling-discovery/diffusion/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import json 4 | 5 | from load_model import load_model 6 | from transformers import GPT2TokenizerFast 7 | import sampling 8 | from tqdm import tqdm 9 | 10 | import os 11 | 12 | 13 | def main(): 14 | parser = argparse.ArgumentParser(description="Generate some samples") 15 | parser.add_argument("--model_checkpoint_dir", default="", type=str) 16 | parser.add_argument("--dataset", default=None, type=str) 17 | parser.add_argument("--steps", type=int, default=32) 18 | parser.add_argument("--add_vocab", type=str, default=None) 19 | args = parser.parse_args() 20 | 21 | tokenizer = GPT2TokenizerFast.from_pretrained('gpt2') 22 | if args.add_vocab: 23 | with open(args.add_vocab, 'r') as file: 24 | added_tokens = json.load(file) 25 | tokenizer.add_tokens(added_tokens) 26 | else: 27 | added_tokens = [] 28 | 29 | # Load the dataset 30 | with open(os.path.join(args.dataset, "test.json"), "r") as f: 31 | test_data = json.load(f) 32 | 33 | # List all files under the model checkpoint directory 34 | checkpoints = [os.path.join(args.model_checkpoint_dir, "checkpoints", f) for f in os.listdir(os.path.join(args.model_checkpoint_dir, "checkpoints"))] 35 | print(checkpoints) 36 | 37 | for checkpoint in checkpoints: 38 | device = torch.device('cuda') 39 | model, graph, noise = 
load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device) 40 | # Create a checkpoint_dir for the current checkpoint 41 | checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint)) 42 | os.makedirs(checkpoint_dir, exist_ok=True) 43 | 44 | def generate_output(input_text): 45 | prefix_ids = tokenizer(input_text).input_ids 46 | # suffix_ids = tokenizer("<|endoftext|>").input_ids 47 | input_ids = prefix_ids 48 | input_locs = list(range(len(prefix_ids))) 49 | 50 | input_ids = torch.tensor(input_ids, device="cuda")[None].repeat(1, 1) 51 | 52 | def proj_fun(x): 53 | x[:, input_locs] = input_ids 54 | return x 55 | 56 | sampling_fn = sampling.get_pc_sampler( 57 | graph, noise, (1, 20), 'analytic', args.steps, device=device, proj_fun=proj_fun 58 | ) 59 | 60 | samples = proj_fun(sampling_fn(model)) 61 | 62 | text_samples = tokenizer.batch_decode(samples) 63 | assert len(text_samples) == 1 64 | text_samples = text_samples[0].split("<|endoftext|>")[0] 65 | return text_samples 66 | 67 | all_items = [] 68 | for sample in tqdm(test_data): 69 | item = {} 70 | item["input_text"] = sample["input_text"] 71 | item["target_text"] = sample["target_text"] 72 | item["type"] = sample["type"] 73 | 74 | output = generate_output(sample["input_text"]) 75 | print(sample["input_text"]) 76 | print(sample["target_text"]) 77 | print(output) 78 | print() 79 | item["model_output"] = output 80 | all_items.append(item) 81 | 82 | # Save the results 83 | with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f: 84 | json.dump(all_items, f, indent=4) 85 | 86 | 87 | if __name__=="__main__": 88 | main() -------------------------------------------------------------------------------- /sibling-discovery/diffusion/train.py: -------------------------------------------------------------------------------- 1 | """Training and evaluation""" 2 | 3 | import hydra 4 | import os 5 | import numpy as np 6 | import run_train 7 | import utils 8 | 
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg):
    """Resolve the working directory, set up logging and spawn training workers.

    When ``cfg`` contains ``load_dir`` the stored run configuration is reloaded
    from that directory and its ``work_dir`` is reused; otherwise the working
    directory comes from the active hydra run (or sweep) settings.
    """
    ngpus = cfg.ngpus
    if "load_dir" in cfg:
        # The loaded value is overwritten by HydraConfig.get() further down
        # before it is read; the load itself is kept unchanged.
        saved_hydra_path = os.path.join(cfg.load_dir, ".hydra/hydra.yaml")
        hydra_cfg = OmegaConf.load(saved_hydra_path).hydra

        cfg = utils.load_hydra_config_from_run(cfg.load_dir)
        work_dir = cfg.work_dir
    else:
        hydra_cfg = HydraConfig.get()
        if hydra_cfg.mode == RunMode.RUN:
            work_dir = hydra_cfg.run.dir
        else:
            work_dir = os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir)
    utils.makedirs(work_dir)

    with open_dict(cfg):
        cfg.ngpus = ngpus
        cfg.work_dir = work_dir
        # The run name is the last path component of the working directory.
        cfg.wandb_name = os.path.basename(os.path.normpath(work_dir))

    # Run the training pipeline
    port = int(np.random.randint(10000, 20000))
    log_path = os.path.join(work_dir, "logs")
    logger = utils.get_logger(log_path)

    hydra_cfg = HydraConfig.get()
    if hydra_cfg.mode != RunMode.RUN:
        logger.info(f"Run id: {hydra_cfg.job.id}")

    try:
        mp.set_start_method("forkserver")
        worker_args = (ngpus, cfg, port)
        mp.spawn(run_train.run_multiprocess, args=worker_args, nprocs=ngpus, join=True)
    except Exception as e:
        # Failures are logged with traceback and not re-raised.
        logger.critical(e, exc_info=True)
def get_logger(logpath, package_files=(), displaying=True, saving=True, debug=False):
    """Reconfigure the root logger and return it.

    Any handlers already attached to the root logger are removed and closed,
    so calling this function repeatedly does not leak open log files.

    Args:
        logpath: Path of the log file, opened in append mode when ``saving``.
        package_files: Iterable of file paths whose names and contents are
            written to the log once the handlers are installed.
        displaying: Attach a console (stream) handler when true.
        saving: Attach a file handler writing to ``logpath`` when true.
        debug: Use ``logging.DEBUG`` instead of ``logging.INFO`` as the level.

    Returns:
        The configured root ``logging.Logger``.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    # Detach AND close the old handlers; merely clearing the list would keep
    # previously opened log files open for the lifetime of the process.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # Record each requested file: first its path, then its full contents.
    for f in package_files:
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger
def save_checkpoint(ckpt_dir, state):
    """Serialize the training state to ``ckpt_dir`` with ``torch.save``.

    Args:
        ckpt_dir: Destination path handed to ``torch.save``.
        state: Mapping with the keys ``'optimizer'``, ``'model'``, ``'ema'``
            and ``'step'``. The model's weights are read through its
            ``.module`` attribute.
    """
    snapshot = dict(
        optimizer=state['optimizer'].state_dict(),
        # Weights are taken from the wrapped module, not the wrapper itself.
        model=state['model'].module.state_dict(),
        ema=state['ema'].state_dict(),
        step=state['step'],
    )
    torch.save(snapshot, ckpt_dir)
def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
    """Load a JSON list of records as raw data or as a DataFrame.

    Args:
        file_name: a .json file containing a list of items, each has
            'input_text', 'target_text', as keys
        return_num: When true, return ``(result, number_of_records)``.
        return_json: When true, return the parsed list itself instead of a
            DataFrame.
        cutoff: If not ``None``, keep only the first ``cutoff`` records.

    Returns:
        The parsed list (``return_json``) or a ``pandas.DataFrame`` whose
        columns are the keys of the first record, optionally paired with the
        record count. An empty record list yields an empty DataFrame.

    Raises:
        KeyError: If a later record lacks a key present in the first record.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if cutoff is not None:
        data = data[:cutoff]

    if return_json:
        result = data
    else:
        # Column order follows the first record. With no records there are no
        # columns - previously this case crashed with IndexError on data[0].
        keys = list(data[0].keys()) if data else []
        rows = [[item[key] for key in keys] for item in data]
        result = pd.DataFrame(rows, columns=keys)

    if return_num:
        return result, len(data)
    return result
-------------------------------------------------------------------------------- 1 | # $1: dataset 2 | # $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=../creativity 6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 7 | 8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR 9 | -------------------------------------------------------------------------------- /sibling-discovery/teacherless/run_eval.sh: -------------------------------------------------------------------------------- 1 | bash eval.sh sibling_hybrid.5.500.10.100000 0.0 8 -------------------------------------------------------------------------------- /sibling-discovery/teacherless/run_train.sh: -------------------------------------------------------------------------------- 1 | bash train.sh sibling_hybrid.5.500.10.50000 0.0 8 0 -------------------------------------------------------------------------------- /sibling-discovery/teacherless/train.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH=gpt2 2 | 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/ 4 | WEIGHT_DECAY=$2 5 | N_LAYERS=$3 6 | GPU=$4 7 | 8 | EXP_DIR=../creativity 9 | 10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3 11 | 12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \ 13 | --data_dir $DATASET \ 14 | --model_name_or_path ${MODEL_PATH} \ 15 | --weight_decay $WEIGHT_DECAY \ 16 | --output_dir $OUTPUT_DIR \ 17 | --max_seq_length 128 \ 18 | --max_length 128 \ 19 | --block_size 128 \ 20 | --train_batch_size 64 \ 21 | --eval_batch_size 64 \ 22 | --learning_rate 1e-4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --save_step 40000 \ 25 | --save_step_dense 20000 \ 26 | --max_steps 1200000 \ 27 | --do_train \ 28 | --scheduler constant_schedule_with_warmup \ 29 | --fp16 \ 30 | --evaluate_during_training \ 31 | --predict_during_training \ 32 | --init_weights \ 33 | --add_tokens \ 34 | --n_layer $N_LAYERS 35 | 
def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
    """Load a JSON list of records as raw data or as a DataFrame.

    Args:
        file_name: a .json file containing a list of items, each has
            'input_text', 'target_text', as keys
        return_num: When true, return ``(result, number_of_records)``.
        return_json: When true, return the parsed list itself instead of a
            DataFrame.
        cutoff: If not ``None``, keep only the first ``cutoff`` records.

    Returns:
        The parsed list (``return_json``) or a ``pandas.DataFrame`` whose
        columns are the keys of the first record, optionally paired with the
        record count. An empty record list yields an empty DataFrame.

    Raises:
        KeyError: If a later record lacks a key present in the first record.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if cutoff is not None:
        data = data[:cutoff]

    if return_json:
        result = data
    else:
        # Column order follows the first record. With no records there are no
        # columns - previously this case crashed with IndexError on data[0].
        keys = list(data[0].keys()) if data else []
        rows = [[item[key] for key in keys] for item in data]
        result = pd.DataFrame(rows, columns=keys)

    if return_num:
        return result, len(data)
    return result
e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /simpletransformers/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. Please specify the class causing the issue. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | **Expected behavior** 17 | A clear and concise description of what you expected to happen. 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 21 | 22 | **Desktop (please complete the following information):** 23 | - OS 24 | 25 | **Additional context** 26 | Add any other context about the problem here. 27 | -------------------------------------------------------------------------------- /simpletransformers/.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /simpletransformers/.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 60 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 7 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - pinned 8 | - security 9 | # Label to use when marking an issue as stale 10 | staleLabel: stale 11 | # Comment to post when marking an issue as stale. Set to `false` to disable 12 | markComment: > 13 | This issue has been automatically marked as stale because it has not had 14 | recent activity. It will be closed if no further activity occurs. Thank you 15 | for your contributions. 16 | # Comment to post when closing a stale issue. Set to `false` to disable 17 | closeComment: false 18 | -------------------------------------------------------------------------------- /simpletransformers/.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: push 4 | 5 | jobs: 6 | deploy: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v1 10 | - name: Set up Python 11 | uses: actions/setup-python@v1 12 | with: 13 | python-version: '3.x' 14 | - name: Install dependencies 15 | run: | 16 | python -m pip install --upgrade pip 17 | pip install setuptools wheel twine 18 | python setup.py sdist bdist_wheel 19 | - name: Publish package 20 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 21 | uses: pypa/gh-action-pypi-publish@release/v1 22 | with: 23 | user: __token__ 24 | password: ${{ secrets.PYPI_PASSWORD }} 25 | -------------------------------------------------------------------------------- /simpletransformers/.gitignore: 
-------------------------------------------------------------------------------- 1 | .jekyll-cache/* 2 | *.lock 3 | 4 | # Wandb 5 | wandb/ 6 | 7 | # Outputs from examples 8 | **/cache_dir 9 | **/runs 10 | **/outputs 11 | **/data 12 | 13 | # manual_test_scripts 14 | test_scripts/ 15 | 16 | # Byte-compiled / optimized / DLL files 17 | __pycache__/ 18 | *.py[cod] 19 | *$py.class 20 | 21 | # C extensions 22 | *.so 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | pip-wheel-metadata/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # PyBuilder 90 | target/ 91 | 92 | # Jupyter Notebook 93 | .ipynb_checkpoints 94 | 95 | # IPython 96 | profile_default/ 97 | ipython_config.py 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # pipenv 103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # celery beat schedule file 110 | celerybeat-schedule 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre/ 141 | #Stale Notebooks 142 | Untitled.ipynb 143 | 144 | # IDE folders 145 | .idea 146 | .vscode/ 147 | -------------------------------------------------------------------------------- /simpletransformers/Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | pip install -e . 3 | pip install -r requirements-dev.txt 4 | pip list 5 | 6 | clean: 7 | find . -name '*.pyc' -exec rm -f {} + 8 | find . -name '*.pyo' -exec rm -f {} + 9 | find . -name '*~' -exec rm -f {} + 10 | 11 | clean-test: 12 | -rm -r .coverage* 13 | -rm -r data 14 | -rm -r runs 15 | -rm -r outputs 16 | -rm -r cache_dir 17 | -rm -r wandb 18 | -rm train.txt 19 | 20 | formatter: 21 | black --line-length 119 simpletransformers tests --exclude simpletransformers/experimental\ 22 | 23 | lint: clean 24 | flake8 simpletransformers tests --exclude=simpletransformers/experimental 25 | black --check --line-length 119 . 
simpletransformers tests --exclude simpletransformers/experimental 26 | 27 | types: 28 | pytype --keep-going simpletransformers --exclude simpletransformers/experimental 29 | 30 | test: clean 31 | pytest tests --cov simpletransformers/classification simpletransformers/ner simpletransformers/question_answering simpletransformers/language_modeling simpletransformers/t5 simpletransformers/seq2seq 32 | 33 | test-lm: clean 34 | pytest tests/language_modeling --cov simpletransformers/language_modeling 35 | 36 | # if this runs through we can be sure the readme is properly shown on pypi 37 | check-readme: 38 | python setup.py check --restructuredtext 39 | -------------------------------------------------------------------------------- /simpletransformers/bin/simple-viewer: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cat >run_simple_transformers_streamlit_app.py <<'END_SCRIPT' 3 | #!/usr/bin/env python 4 | from simpletransformers.streamlit.simple_view import streamlit_runner 5 | 6 | 7 | streamlit_runner() 8 | 9 | END_SCRIPT 10 | 11 | # Run 12 | streamlit run run_simple_transformers_streamlit_app.py 13 | 14 | rm run_simple_transformers_streamlit_app.py 15 | -------------------------------------------------------------------------------- /simpletransformers/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # test 2 | pytest-cov==2.7.1 3 | pytest-localserver==0.5.0 4 | pytest==7.1.2 5 | 6 | # lint/format/types 7 | black==24.3.0 8 | flake8==3.7.8 9 | pytype==2019.7.11 10 | -------------------------------------------------------------------------------- /simpletransformers/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | python_functions=test_ 3 | 4 | codestyle_max_line_length = 119 5 | 6 | log_cli = true 7 | log_cli_level = WARNING 8 | 9 | [metadata] 10 | description-file = README.md 11 | license_file 
from setuptools import find_packages, setup

# Read the README explicitly as UTF-8: without an encoding the platform's
# locale default is used, so non-ASCII content could fail to decode.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="simpletransformers",
    version="0.64.3",
    author="Thilina Rajapakse",
    author_email="chaturangarajapakshe@gmail.com",
    description="An easy-to-use wrapper library for the Transformers library.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/ThilinaRajapakse/simpletransformers/",
    packages=find_packages(),
    # Installs the helper script found under bin/.
    scripts=["bin/simple-viewer"],
    classifiers=[
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    python_requires=">=3.6",
    install_requires=[
        "numpy",
        "requests",
        "tqdm>=4.47.0",
        "regex",
        "transformers>=4.31.0",
        "datasets",
        "scipy",
        "scikit-learn",
        "seqeval",
        "tensorboard",
        "pandas",
        "tokenizers",
        "wandb>=0.10.32",
        "streamlit",
        "sentencepiece",
    ],
)
from simpletransformers.classification.classification_model import ClassificationModel 2 | from simpletransformers.classification.multi_label_classification_model import ( 3 | MultiLabelClassificationModel, 4 | ) 5 | from simpletransformers.classification.multi_modal_classification_model import ( 6 | MultiModalClassificationModel, 7 | ) 8 | from simpletransformers.config.model_args import ( 9 | ClassificationArgs, 10 | MultiLabelClassificationArgs, 11 | MultiModalClassificationArgs, 12 | ) 13 | -------------------------------------------------------------------------------- /simpletransformers/simpletransformers/classification/transformer_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/classification/transformer_models/__init__.py -------------------------------------------------------------------------------- /simpletransformers/simpletransformers/classification/transformer_models/camembert_model.py: -------------------------------------------------------------------------------- 1 | from transformers.models.camembert.configuration_camembert import CamembertConfig 2 | from transformers.models.camembert.modeling_camembert import ( 3 | CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST, 4 | ) 5 | 6 | from simpletransformers.classification.transformer_models.roberta_model import ( 7 | RobertaForSequenceClassification, 8 | ) 9 | 10 | 11 | class CamembertForSequenceClassification(RobertaForSequenceClassification): 12 | r""" 13 | **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: 14 | Labels for computing the sequence classification/regression loss. 15 | Indices should be in ``[0, ..., config.num_labels]``. 
16 | If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), 17 | If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). 18 | Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: 19 | **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 20 | Classification (or regression if config.num_labels==1) loss. 21 | **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` 22 | Classification (or regression if config.num_labels==1) scores (before SoftMax). 23 | **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) 24 | list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) 25 | of shape ``(batch_size, sequence_length, hidden_size)``: 26 | Hidden-states of the model at the output of each layer plus the initial embedding outputs. 27 | **attentions**: (`optional`, returned when ``config.output_attentions=True``) 28 | list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: 29 | Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
class ElectraForSequenceClassification(ElectraPreTrainedModel):
    r"""
    ELECTRA encoder followed by ``ElectraClassificationHead`` for sequence
    classification (``config.num_labels > 1``) or regression (``config.num_labels == 1``).

    Constructor arguments:
        **config**: model configuration; ``config.num_labels`` selects the loss type.
        **weight**: (`optional`) class weights passed to ``CrossEntropyLoss``. It is moved to
            the device of ``labels`` on every forward pass. Ignored for regression.

    Forward inputs:
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        Any further elements returned by the ELECTRA encoder after its first output
        (e.g. hidden states / attentions when the configuration enables them).

    Examples::
        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """  # noqa: ignore flake8"

    def __init__(self, config, weight=None):
        super(ElectraForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        # The encoder must live under ``self.electra`` because forward() calls
        # ``self.electra(...)``; storing it under another name raises
        # AttributeError on the first forward pass.
        self.electra = ElectraModel(config)
        self.classifier = ElectraClassificationHead(config)
        # forward() reads ``self.weight`` when building the cross-entropy loss,
        # so it has to be set even when no class weights are supplied.
        self.weight = weight

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and classification head; prepend the loss when ``labels`` is given."""
        # Keyword arguments keep the call independent of the encoder's
        # positional parameter order.
        discriminator_hidden_states = self.electra(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = discriminator_hidden_states[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Class weights must sit on the same device as the labels.
                weight = self.weight.to(labels.device) if self.weight is not None else None
                loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,) + discriminator_hidden_states[1:]
        return ((loss,) + output) if loss is not None else output
class LongformerForSequenceClassification(LongformerPreTrainedModel):
    """
    Longformer encoder followed by ``LongformerClassificationHead`` for sequence
    classification (``config.num_labels > 1``) or regression (``config.num_labels == 1``).

    Args:
        config: model configuration; ``config.num_labels`` selects the loss type.
        weight: optional class weights passed to ``CrossEntropyLoss``. It is moved to the
            device of ``labels`` on every forward pass. Ignored for regression.
    """

    def __init__(self, config, weight=None):
        super(LongformerForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        self.longformer = LongformerModel(config)
        self.classifier = LongformerClassificationHead(config)
        self.weight = weight

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        When ``global_attention_mask`` is omitted, a mask with global attention on the first
        token only is built from ``input_ids`` (or from ``inputs_embeds`` if ``input_ids`` is
        ``None``).

        Returns a tuple ``(loss?, logits, *encoder_outputs[2:])``; ``loss`` is present only when
        ``labels`` is given.
        """
        if global_attention_mask is None:
            if input_ids is not None:
                global_attention_mask = torch.zeros_like(input_ids)
            elif inputs_embeds is not None:
                # ``zeros_like(input_ids)`` would crash on None when only
                # embeddings are provided, so the mask is shaped from the embeddings.
                # assumes inputs_embeds is (batch, seq_len, hidden) — TODO confirm
                global_attention_mask = torch.zeros(
                    inputs_embeds.shape[:-1],
                    dtype=torch.long,
                    device=inputs_embeds.device,
                )
            # If both inputs are missing the mask stays None and the encoder
            # reports the invalid call itself.
            if global_attention_mask is not None:
                # global attention on cls token
                global_attention_mask[:, 0] = 1

        outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Class weights must sit on the same device as the labels.
                weight = self.weight.to(labels.device) if self.weight is not None else None
                loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output
super(MobileBertForSequenceClassification, self).__init__(config) 13 | self.num_labels = config.num_labels 14 | 15 | self.mobilebert = MobileBertModel(config) 16 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 17 | self.classifier = nn.Linear(config.hidden_size, self.num_labels) 18 | self.weight = weight 19 | 20 | self.init_weights() 21 | 22 | def forward( 23 | self, 24 | input_ids=None, 25 | attention_mask=None, 26 | token_type_ids=None, 27 | position_ids=None, 28 | head_mask=None, 29 | inputs_embeds=None, 30 | labels=None, 31 | output_attentions=None, 32 | output_hidden_states=None, 33 | ): 34 | r""" 35 | labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): 36 | Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., 37 | config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), 38 | If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
    """Sequence classifier for XLM-RoBERTa.

    Inherits its whole implementation from ``RobertaForSequenceClassification``;
    only the two class attributes below are overridden.
    """

    # Configuration class associated with this model.
    config_class = XLMRobertaConfig
    # NOTE(review): presumably the pretrained checkpoint identifiers for
    # XLM-RoBERTa — confirm against the installed transformers version.
    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/config/__init__.py -------------------------------------------------------------------------------- /simpletransformers/simpletransformers/config/global_args.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from multiprocessing import cpu_count 3 | 4 | global_args = { 5 | "adam_epsilon": 1e-8, 6 | "best_model_dir": "outputs/best_model", 7 | "cache_dir": "cache_dir/", 8 | "config": {}, 9 | "do_lower_case": False, 10 | "early_stopping_consider_epochs": False, 11 | "early_stopping_delta": 0, 12 | "early_stopping_metric": "eval_loss", 13 | "early_stopping_metric_minimize": True, 14 | "early_stopping_patience": 3, 15 | "encoding": None, 16 | "eval_batch_size": 8, 17 | "evaluate_during_training": False, 18 | "evaluate_during_training_silent": True, 19 | "evaluate_during_training_steps": 2000, 20 | "evaluate_during_training_verbose": False, 21 | "fp16": True, 22 | "gradient_accumulation_steps": 1, 23 | "learning_rate": 4e-5, 24 | "local_rank": -1, 25 | "logging_steps": 50, 26 | "manual_seed": None, 27 | "max_grad_norm": 1.0, 28 | "max_seq_length": 128, 29 | "multiprocessing_chunksize": 500, 30 | "n_gpu": 1, 31 | "no_cache": False, 32 | "no_save": False, 33 | "num_train_epochs": 1, 34 | "output_dir": "outputs/", 35 | "overwrite_output_dir": False, 36 | "process_count": cpu_count() - 2 if cpu_count() > 2 else 1, 37 | "reprocess_input_data": True, 38 | "save_best_model": True, 39 | "save_eval_checkpoints": True, 40 | "save_model_every_epoch": True, 41 | "save_steps": 2000, 42 | "save_optimizer_and_scheduler": True, 43 | "silent": False, 44 | "tensorboard_dir": None, 45 | "train_batch_size": 8, 46 | "use_cached_eval_features": False, 47 | "use_early_stopping": False, 48 | "use_multiprocessing": True, 
def sweep_config_to_sweep_values(sweep_config):
    """
    Copy a wandb.Config-like mapping into a plain ``dict``.

    Only ``keys()`` and item access are used on ``sweep_config``, because
    wandb.Config differs considerably between versions and those two
    operations behave consistently.
    """
    plain_values = {}
    for name in sweep_config.keys():
        plain_values[name] = sweep_config[name]
    return plain_values
class XLMForSequenceClassification(XLMPreTrainedModel):
    r"""
    XLM encoder followed by a ``SequenceSummary`` head for sequence classification
    (``config.num_labels > 1``) or regression (``config.num_labels == 1``).

    Constructor arguments:
        **config**: model configuration; ``config.num_labels`` selects the loss type.
        **weight**: (`optional`) class weights passed to ``CrossEntropyLoss``. It is moved to
            the device of ``labels`` on every forward pass. Ignored for regression.

    Forward inputs:
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).

    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        Any further elements returned by the XLM encoder after its first output
        (e.g. hidden states / attentions when the configuration enables them).

    Examples::
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """

    def __init__(self, config, weight=None):
        super(XLMForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.weight = weight

        self.transformer = XLMModel(config)
        self.sequence_summary = SequenceSummary(config)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        langs=None,
        token_type_ids=None,
        position_ids=None,
        lengths=None,
        cache=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and summary head; prepend the loss when ``labels`` is given."""
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            # Forward the embeddings too: the parameter is part of this
            # method's signature and must not be dropped silently.
            inputs_embeds=inputs_embeds,
        )

        output = transformer_outputs[0]
        logits = self.sequence_summary(output)

        # Keep whatever the encoder returned after its first element
        outputs = (logits,) + transformer_outputs[1:]

        if labels is not None:
            if self.num_labels == 1:
                # We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Class weights must sit on the same device as the labels.
                weight = self.weight.to(labels.device) if self.weight is not None else None
                loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), logits, (remaining encoder outputs)
import logging

logger = logging.getLogger(__name__)

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. """


def _prepend_padding_text(args, prompt_text):
    """Return ``prompt_text`` prefixed with ``args.padding_text``, or ``PADDING_TEXT`` if that is empty."""
    return (args.padding_text or PADDING_TEXT) + prompt_text


def prepare_ctrl_input(args, _, tokenizer, prompt_text):
    """
    Log advice for CTRL generation and return ``prompt_text`` unchanged.

    Logs when ``args.temperature`` is above 0.7 and when the first encoded token
    of the prompt is not one of ``tokenizer.control_codes``. The second
    positional argument (the model) is unused.
    """
    if args.temperature > 0.7:
        logger.info(
            "CTRL typically works better with lower temperatures (and lower top_k)."
        )

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
    # An empty prompt encodes to no tokens; indexing [0] would raise IndexError,
    # so it is treated as "does not start with a control code".
    starts_with_control_code = len(encoded_prompt) > 0 and any(
        encoded_prompt[0] == code for code in tokenizer.control_codes.values()
    )
    if not starts_with_control_code:
        logger.info(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
    return prompt_text


def prepare_xlm_input(args, model, tokenizer, prompt_text):
    """
    Select the XLM language id on ``model.config`` and return ``prompt_text`` unchanged.

    When the config uses language embeddings and has a non-empty ``lang2id``
    map, ``model.config.lang_id`` is set from ``args.xlm_language``; if that
    language is not available the user is prompted on stdin until a valid one
    is entered.
    """
    config = model.config
    # Set the language
    lang2id = getattr(config, "lang2id", None)
    # A missing/None/empty map offers no language to pick: skip instead of
    # failing on ``None.keys()`` or prompting forever.
    if getattr(config, "use_lang_emb", False) and lang2id:
        available_languages = lang2id.keys()
        language = args.xlm_language if args.xlm_language in available_languages else None
        while language not in available_languages:
            language = input(
                "Using XLM. Select language in "
                + str(list(available_languages))
                + " >>> "
            )

        config.lang_id = lang2id[language]

    return prompt_text


def prepare_xlnet_input(args, _, tokenizer, prompt_text):
    """Prefix the prompt with padding text for XLNet; the model and tokenizer arguments are unused."""
    return _prepend_padding_text(args, prompt_text)


def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
    """Prefix the prompt with padding text for Transformer-XL; the model and tokenizer arguments are unused."""
    return _prepend_padding_text(args, prompt_text)


# Maps a model-type key to the prompt preprocessing function defined above.
PREPROCESSING_FUNCTIONS = {
    "ctrl": prepare_ctrl_input,
    "xlm": prepare_xlm_input,
    "xlnet": prepare_xlnet_input,
    "transfo-xl": prepare_transfoxl_input,
}
LanguageModelingModel, 4 | ) 5 | -------------------------------------------------------------------------------- /simpletransformers/simpletransformers/language_representation/__init__.py: -------------------------------------------------------------------------------- 1 | from simpletransformers.language_representation.representation_model import ( 2 | RepresentationModel, 3 | ) 4 | -------------------------------------------------------------------------------- /simpletransformers/simpletransformers/language_representation/transformer_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/language_representation/transformer_models/__init__.py -------------------------------------------------------------------------------- /simpletransformers/simpletransformers/language_representation/transformer_models/bert_model.py: -------------------------------------------------------------------------------- 1 | from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel 2 | 3 | 4 | # supports both BERT & ROBERTA BASED MODELS 5 | class BertForTextRepresentation(BertPreTrainedModel): 6 | r""" 7 | Outputs: `List` of token vectors, 1 list of max_seq token vectors per sentence given 8 | """ # noqa: ignore flake8" 9 | 10 | def __init__(self, config, weight=None): 11 | super(BertForTextRepresentation, self).__init__(config) 12 | self.bert = BertModel(config) 13 | self.weight = weight 14 | self.init_weights() 15 | 16 | def forward( 17 | self, 18 | input_ids=None, 19 | attention_mask=None, 20 | token_type_ids=None, 21 | position_ids=None, 22 | head_mask=None, 23 | ): 24 | outputs = self.bert( 25 | input_ids, 26 | attention_mask=attention_mask, 27 | token_type_ids=token_type_ids, 28 | position_ids=position_ids, 29 | head_mask=head_mask, 30 | output_hidden_states=True, 31 | 
class GPT2ForTextRepresentation(GPT2PreTrainedModel):
    r"""
    GPT-2 wrapper that returns token representations instead of logits.

    Outputs: the last entry of the hidden states returned by the wrapped
    ``GPT2Model`` when called with ``output_hidden_states=True``.
    """  # noqa: ignore flake8"

    def __init__(self, config, weight=None):
        super().__init__(config)
        # ``weight`` is only stored here; forward() does not read it.
        self.weight = weight
        self.gpt2 = GPT2Model(config)
        self.init_weights()

    def resize_token_embeddings(self, new_len):
        """Delegate embedding resizing to the wrapped ``GPT2Model``."""
        resized = self.gpt2.resize_token_embeddings(new_len)
        return resized

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
    ):
        """Run the wrapped model with hidden states enabled and return the final one."""
        model_kwargs = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            output_hidden_states=True,
        )
        encoder_outputs = self.gpt2(input_ids, **model_kwargs)
        # Element 2 of the model output holds the hidden states; keep the last one.
        all_hidden_states = encoder_outputs[2]
        return all_hidden_states[-1]
def init_loss(weight, device, args):
    """
    Build the loss function selected by ``weight`` and ``args.loss_type``.

    Class weights take precedence: when ``weight`` is truthy a weighted
    cross-entropy loss is returned and ``args.loss_type`` is ignored (a
    warning is emitted if both were supplied).

    Args:
        weight: Per-class weights handed to ``torch.Tensor``. A falsy value
            (``None`` or an empty sequence) disables weighting.
        device: Device the weight tensor is moved to.
        args: Object exposing ``loss_type`` and, when a loss type is set,
            ``loss_args`` (keyword arguments for the loss constructor).

    Returns:
        The constructed loss module, or ``None`` when neither ``weight`` nor
        ``args.loss_type`` is set.

    Raises:
        NotImplementedError: If ``args.loss_type`` names an unsupported loss.
    """
    if weight and args.loss_type:
        # The two literals are concatenated, so the separator must be explicit.
        warnings.warn(
            "weight and args.loss_type parameters are set at the same time; "
            f"will use weighted cross entropy loss. To use {args.loss_type} set weight to None"
        )

    if weight:
        return CrossEntropyLoss(weight=torch.Tensor(weight).to(device))

    if not args.loss_type:
        return None

    if args.loss_type == "focal":
        return FocalLoss(**args.loss_args)
    if args.loss_type == "dice":
        return DiceLoss(**args.loss_args)
    if args.loss_type == "tversky":
        return TverskyLoss(**args.loss_args)

    raise NotImplementedError(f"unknown {args.loss_type} loss function")
Copyright 2020 HuggingFace Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Lint as: python3 17 | """Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition""" 18 | 19 | import logging 20 | 21 | import datasets 22 | 23 | 24 | """ 25 | Adapted from the Huggingface code at https://github.com/huggingface/datasets/blob/master/datasets/conll2003/conll2003.py 26 | """ 27 | 28 | 29 | class NERConfig(datasets.BuilderConfig): 30 | """BuilderConfig for NER""" 31 | 32 | def __init__(self, **kwargs): 33 | """BuilderConfig for NER. 34 | Args: 35 | **kwargs: keyword arguments forwarded to super. 
class NER(datasets.GeneratorBasedBuilder):
    """Dataset builder for NER files with one space-separated token record per line."""

    BUILDER_CONFIG_CLASS = NERConfig

    def _info(self):
        """Describe the example schema: a sentence id plus word and label sequences."""
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "sentence_id": datasets.Value("string"),
                    "words": datasets.Sequence(datasets.Value("string")),
                    "labels": datasets.Sequence(datasets.Value("string")),
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Return a single TRAIN split reading from ``self.config.data_files``."""

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": self.config.data_files},
            ),
        ]

    def _generate_examples(self, filepath):
        """
        Yield ``(guid, example)`` pairs, one per sentence in ``filepath``.

        A sentence ends at a blank line or a ``-DOCSTART-`` line. On every
        other line the first space-separated field is the word and the last
        field (right-stripped) is the label.
        """
        logging.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            words = []
            labels = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if words:
                        yield guid, {
                            "sentence_id": str(guid),
                            "words": words,
                            "labels": labels,
                        }
                        guid += 1
                        words = []
                        labels = []
                else:
                    # conll2003 words are space separated
                    splits = line.split(" ")
                    words.append(splits[0])
                    labels.append(splits[-1].rstrip())
            # Last example: only emit it when tokens remain, otherwise a file
            # ending with a separator line would produce an empty sentence.
            if words:
                yield guid, {
                    "sentence_id": str(guid),
                    "words": words,
                    "labels": labels,
                }
class QA(datasets.GeneratorBasedBuilder):
    """Dataset builder that turns a JSON list of paragraphs into per-question examples."""

    BUILDER_CONFIG_CLASS = QAConfig

    def _info(self):
        """Describe the flat per-question example schema."""
        answer_schema = {
            "text": datasets.Value("string"),
            "answer_start": datasets.Value("int32"),
        }
        example_schema = {
            "qas_id": datasets.Value("string"),
            "question_text": datasets.Value("string"),
            "context_text": datasets.Value("string"),
            "answer_text": datasets.Value("string"),
            "start_position_character": datasets.Value("int32"),
            "is_impossible": datasets.Value("bool"),
            "answers": datasets.features.Sequence(answer_schema),
        }
        return datasets.DatasetInfo(
            features=datasets.Features(example_schema),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        train_split = datasets.SplitGenerator(
            name=datasets.Split.TRAIN,
            gen_kwargs={"filepath": self.config.data_files},
        )
        return [train_split]

    def _generate_examples(self, filepath):
        """
        Yield ``(qas_id, example)`` pairs for every question in the JSON file.

        For answerable questions, training mode keeps only the first answer
        (its text and start offset); otherwise the full answer list is kept.
        """
        with open(filepath, encoding="utf-8") as handle:
            paragraphs = json.load(handle)
            for entry in paragraphs:
                context_text = entry["context"].strip()
                for qa in entry["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    # A missing flag means the question is answerable.
                    is_impossible = qa.get("is_impossible", False)

                    answer_text = ""
                    start_position_character = -1
                    answers = []
                    if not is_impossible:
                        if self.config.is_training:
                            first_answer = qa["answers"][0]
                            answer_text = first_answer["text"]
                            start_position_character = first_answer["answer_start"]
                        else:
                            answers = qa["answers"]

                    example = {
                        "qas_id": qas_id,
                        "question_text": question_text,
                        "context_text": context_text,
                        "answer_text": answer_text,
                        "start_position_character": start_position_character,
                        "is_impossible": is_impossible,
                        "answers": answers,
                    }
                    yield qas_id, example
def format_word(word, entity, entity_checkboxes, entity_color_map):
    """Return ``word`` wrapped in entity markup when its entity is enabled, else unchanged."""
    if not entity_checkboxes[entity]:
        return word

    color = entity_color_map[entity]
    label_markup = ENTITY_LABEL_WRAPPER.format(entity)
    return ENTITY_WRAPPER.format(color, word, label_markup)
@pytest.mark.parametrize(
    "model_type, model_name",
    [
        ("bert", "bert-base-uncased"),
        ("longformer", "allenai/longformer-base-4096"),
        ("bert", None),
        ("electra", None),
        ("longformer", None),
        # ("xlnet", "xlnet-base-cased"),
        # ("xlm", "xlm-mlm-17-1280"),
        ("roberta", "roberta-base"),
        # ("distilbert", "distilbert-base-uncased"),
        # ("albert", "albert-base-v1"),
        # ("camembert", "camembert-base"),
        # ("xlmroberta", "xlm-roberta-base"),
        # ("flaubert", "flaubert-base-cased"),
    ],
)
def test_language_modeling(model_type, model_name):
    """Smoke test: train a language model for one epoch on a generated corpus."""
    # 100 identical lines form the whole training corpus.
    corpus_line = "Hello world with Simple Transformers! \n"
    with open("train.txt", "w") as corpus:
        corpus.write(corpus_line * 100)

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "num_train_epochs": 1,
        "no_save": True,
    }
    if model_type == "electra":
        model_args["vocab_size"] = 100
        model_args["generator_config"] = {
            "embedding_size": 512,
            "hidden_size": 256,
            "num_hidden_layers": 1,
        }
        model_args["discriminator_config"] = {
            "embedding_size": 512,
            "hidden_size": 256,
            "num_hidden_layers": 2,
        }
    elif model_name is None:
        # No model name was given, so a vocabulary size is set explicitly.
        model_args["vocab_size"] = 100

    model = LanguageModelingModel(
        model_type,
        model_name,
        args=model_args,
        train_files="train.txt",
        use_cuda=False,
    )

    # Train the model
    model.train_model("train.txt")
@pytest.mark.parametrize(
    "model_type, model_name",
    [
        ("bert", "bert-base-uncased"),
        ("roberta", "roberta-base"),
        ("gpt2", "distilgpt2"),
    ],
)
@pytest.mark.parametrize("combine_strategy", ["mean", "concat", None])
def test_shapes(model_type, model_name, combine_strategy):
    """Check the shape of encoded sentences for each combine strategy."""
    sentence_list = ["Example sentence 1", "Example sentence 2"]
    # Create a RepresentationModel
    model = RepresentationModel(
        model_type,
        model_name,
        use_cuda=False,
        args={
            "no_save": True,
            "reprocess_input_data": True,
            "overwrite_output_dir": True,
        },
    )
    encoded_sentences = model.encode_sentences(
        sentence_list, combine_strategy=combine_strategy
    )
    longest_seq = (
        3  # RepresentationModel truncates sentences to the longest sentence in the list
    )
    if model_type in ("bert", "roberta"):
        longest_seq += 2  # add [CLS] & [SEP] tokens added by BERT & ROBERTA Models
    # last dimension is the embedding dimension, it depends on the model
    if combine_strategy is None:
        assert encoded_sentences.shape == (len(sentence_list), longest_seq, 768)
    elif combine_strategy == "concat":
        assert encoded_sentences.shape == (len(sentence_list), longest_seq * 768)
    elif combine_strategy == "mean":
        assert encoded_sentences.shape == (len(sentence_list), 768)
"O"], 36 | [1, "perform", "O"], 37 | [1, "NER", "B-MISC"], 38 | ] 39 | train_df = pd.DataFrame(train_data, columns=["sentence_id", "words", "labels"]) 40 | 41 | eval_data = [ 42 | [0, "Simple", "B-MISC"], 43 | [0, "Transformers", "I-MISC"], 44 | [0, "was", "O"], 45 | [1, "built", "O"], 46 | [1, "for", "O"], 47 | [0, "text", "O"], 48 | [0, "classification", "B-MISC"], 49 | [1, "Simple", "B-MISC"], 50 | [1, "Transformers", "I-MISC"], 51 | [1, "then", "O"], 52 | [1, "expanded", "O"], 53 | [1, "to", "O"], 54 | [1, "perform", "O"], 55 | [1, "NER", "B-MISC"], 56 | ] 57 | eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"]) 58 | 59 | # Create a NERModel 60 | model = NERModel( 61 | model_type, 62 | model_name, 63 | args={ 64 | "no_save": True, 65 | "overwrite_output_dir": True, 66 | "reprocess_input_data": False, 67 | }, 68 | use_cuda=False, 69 | ) 70 | 71 | # Train the model 72 | model.train_model(train_df) 73 | 74 | # Evaluate the model 75 | result, model_outputs, predictions = model.eval_model(eval_df) 76 | 77 | # Predictions on arbitary text strings 78 | predictions, raw_outputs = model.predict(["Some arbitary sentence"]) 79 | -------------------------------------------------------------------------------- /simpletransformers/tests/test_question_answering.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | 5 | import pytest 6 | 7 | from simpletransformers.question_answering import QuestionAnsweringModel 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "model_type, model_name", 12 | [ 13 | ("bert", "bert-base-uncased"), 14 | ("longformer", "allenai/longformer-base-4096"), 15 | # ("reformer", "google/reformer-crime-and-punishment"), 16 | # ("xlnet", "xlnet-base-cased"), 17 | # ("xlm", "xlm-mlm-17-1280"), 18 | # ("roberta", "roberta-base"), 19 | # ("distilbert", "distilbert-base-uncased"), 20 | # ("albert", "albert-base-v1"), 21 | # ("camembert", "camembert-base"), 22 
| # ("xlmroberta", "xlm-roberta-base"), 23 | # ("flaubert", "flaubert-base-cased"), 24 | ], 25 | ) 26 | def test_question_answering(model_type, model_name): 27 | # Create dummy data to use for training. 28 | train_data = [ 29 | { 30 | "context": "This is the first context", 31 | "qas": [ 32 | { 33 | "id": "00001", 34 | "is_impossible": False, 35 | "question": "Which context is this?", 36 | "answers": [{"text": "the first", "answer_start": 8}], 37 | } 38 | ], 39 | }, 40 | { 41 | "context": "Other legislation followed, including the Migratory Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting of right and gray whales,\ 42 | and the Bald Eagle Protection Act of 1940. These later laws had a low cost to society—the species were relatively rare—and little opposition was raised", 43 | "qas": [ 44 | { 45 | "id": "00002", 46 | "is_impossible": False, 47 | "question": "What was the cost to society?", 48 | "answers": [{"text": "low cost", "answer_start": 225}], 49 | }, 50 | { 51 | "id": "00003", 52 | "is_impossible": False, 53 | "question": "What was the name of the 1937 treaty?", 54 | "answers": [ 55 | {"text": "Bald Eagle Protection Act", "answer_start": 167} 56 | ], 57 | }, 58 | { 59 | "id": "00004", 60 | "is_impossible": True, 61 | "question": "How did Alexandar Hamilton die?", 62 | "answers": [], 63 | }, 64 | ], 65 | }, 66 | ] # noqa 67 | 68 | for i in range(4): 69 | train_data.extend(train_data) 70 | 71 | # Save as a JSON file 72 | os.makedirs("data", exist_ok=True) 73 | with open("data/train.json", "w") as f: 74 | json.dump(train_data, f) 75 | 76 | logging.basicConfig(level=logging.WARNING) 77 | transformers_logger = logging.getLogger("transformers") 78 | transformers_logger.setLevel(logging.ERROR) 79 | 80 | # Create the QuestionAnsweringModel 81 | model = QuestionAnsweringModel( 82 | model_type, 83 | model_name, 84 | args={ 85 | "no_save": True, 86 | "reprocess_input_data": True, 87 | "overwrite_output_dir": True, 88 | }, 89 | use_cuda=False, 90 | ) 
@pytest.mark.parametrize(
    "encoder_decoder_type, encoder_decoder_name, encoder_type, use_hf_datasets",
    [
        ("bart", "facebook/bart-base", "bart", True),
        ("bart", "facebook/bart-base", "bart", False),
        ("roberta-base", "bert-base-cased", "roberta", True),
        ("roberta-base", "bert-base-cased", "roberta", False),
    ],
)
def test_seq2seq(
    encoder_decoder_type, encoder_decoder_name, encoder_type, use_hf_datasets
):
    """Smoke test: train, evaluate and predict with a Seq2SeqModel on toy data."""
    train_df = pd.DataFrame(
        [["one", "1"], ["two", "2"]],
        columns=["input_text", "target_text"],
    )
    eval_df = pd.DataFrame(
        [["three", "3"], ["four", "4"]],
        columns=["input_text", "target_text"],
    )

    model_args = {
        "reprocess_input_data": True,
        "overwrite_output_dir": True,
        "max_seq_length": 128,
        "train_batch_size": 2,
        "num_train_epochs": 2,
        "use_multiprocessing": False,
        "max_length": 15,
        "manual_seed": 4,
        "do_sample": False,
        "num_return_sequences": 1,
        "use_hf_datasets": use_hf_datasets,
    }

    # For non-BART rows the first two parameters carry the encoder and decoder names.
    if encoder_type == "bart":
        architecture_kwargs = {
            "encoder_decoder_type": encoder_decoder_type,
            "encoder_decoder_name": encoder_decoder_name,
        }
    else:
        architecture_kwargs = {
            "encoder_type": encoder_type,
            "encoder_name": encoder_decoder_type,
            "decoder_name": encoder_decoder_name,
        }
    model = Seq2SeqModel(args=model_args, use_cuda=False, **architecture_kwargs)

    model.train_model(train_df)

    model.eval_model(eval_df)

    # Indexing checks that at least one prediction came back.
    first_prediction = model.predict(["five"])[0]
| 53 | # Predict with trained T5 model 54 | model.predict(["convert: four", "convert: five"]) 55 | -------------------------------------------------------------------------------- /triangle-discovery/diffusion/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | build/ 3 | dist/ 4 | *.egg-info/ 5 | __pycache__ 6 | .ipynb_checkpoints/ 7 | .DS_Store 8 | **.pyc 9 | *.png 10 | *.txt 11 | **/outputs/ 12 | **/wandb/ 13 | **/exp/ 14 | **/exp_local/ 15 | data/ 16 | eval/ 17 | assets/ 18 | **.pth 19 | **.npz 20 | core 21 | **.log 22 | *.jsonl -------------------------------------------------------------------------------- /triangle-discovery/diffusion/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Aaron Lou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
    """
    Draw a Gumbel-softmax sample from a tensor of categorical probabilities.

    Args:
        categorical_probs: Tensor of probabilities. It is clamped from below
            and passed through ``log`` to obtain logits.
        hard: Forwarded unchanged to ``torch.nn.functional.gumbel_softmax``.
        eps: Lower bound applied to the probabilities before taking the log,
            so zero entries do not produce ``-inf`` logits.

    Returns:
        The tensor returned by ``torch.nn.functional.gumbel_softmax`` for the
        clamped log-probabilities.
    """
    # Honour the caller-supplied floor; it was previously ignored in favour of
    # a hard-coded 1e-9 (the default value is unchanged).
    logits = categorical_probs.clamp(min=eps).log()
    return F.gumbel_softmax(logits, hard=hard)
def load_model_hf(dir, device):
    """Load a score model through ``SEDD.from_pretrained``.

    Args:
        dir: Identifier or path forwarded to ``SEDD.from_pretrained``.
        device: Device the model and noise schedule are moved to; also
            passed to ``graph_lib.get_graph``.

    Returns:
        Tuple ``(score_model, graph, noise)``; the graph and noise are built
        from ``score_model.config``.
    """
    score_model = SEDD.from_pretrained(dir).to(device)
    graph = graph_lib.get_graph(score_model.config, device)
    noise = noise_lib.get_noise(score_model.config).to(device)
    return score_model, graph, noise


def load_model_local(root_dir, ckpt_dir, added_tokens, device):
    """Rebuild a model from a local training run and load one checkpoint.

    Args:
        root_dir: Run directory containing the saved hydra config
            (read via ``utils.load_hydra_config_from_run``).
        ckpt_dir: Path of the checkpoint file given to ``torch.load``
            (e.g. ``<root_dir>/checkpoints-meta/checkpoint.pth``).
        added_tokens: Extra vocabulary entries; when non-empty, ``cfg.tokens``
            is increased by their count before the model is built.
        device: Device for the model, graph, noise and ``map_location``.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    cfg = utils.load_hydra_config_from_run(root_dir)
    if added_tokens:
        # The vocabulary size must match the one used at training time.
        cfg.tokens = cfg.tokens + len(added_tokens)

    graph = graph_lib.get_graph(cfg, device)
    noise = noise_lib.get_noise(cfg).to(device)
    score_model = SEDD(cfg).to(device)
    ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema)

    loaded_state = torch.load(ckpt_dir, map_location=device)
    score_model.load_state_dict(loaded_state['model'])
    ema.load_state_dict(loaded_state['ema'])

    # NOTE(review): presumably ``store`` snapshots the raw weights and
    # ``copy_to`` overwrites the model parameters with the EMA averages --
    # confirm against model/ema.py.
    ema.store(score_model.parameters())
    ema.copy_to(score_model.parameters())
    return score_model, graph, noise


def load_model(root_dir, ckpt_dir, added_tokens, device):
    """Load a model, trying ``load_model_hf`` first and the local loader second.

    Args:
        root_dir: Passed to ``load_model_hf`` as the model identifier and, on
            fallback, to ``load_model_local`` as the run directory.
        ckpt_dir: Checkpoint file path; only used by the local fallback.
        added_tokens: Extra vocabulary entries; only used by the local fallback.
        device: Target device.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # Any failure of the first loader triggers the local fallback, but
        # KeyboardInterrupt / SystemExit are no longer swallowed as they were
        # with the previous bare ``except:``.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)
class Rotary(torch.nn.Module):
    """Builds and caches the cos/sin tables used for rotary position embeddings.

    The tables are laid out as (batch, seq_len, qkv, head, dim) with size-1
    batch and head axes, so they broadcast against a packed qkv tensor.
    """

    def __init__(self, dim, base=10_000):
        """
        Args:
            dim: Size of the last axis of the returned tables.
            base: Base of the geometric progression of inverse frequencies.
        """
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x, seq_dim=1):
        """Return ``(cos, sin)`` tables for the sequence length of ``x``.

        Args:
            x: Tensor whose ``seq_dim`` axis gives the sequence length and
                whose device determines where the tables live.
            seq_dim: Index of the sequence axis in ``x``.

        Returns:
            Two tensors of shape ``(1, seq_len, 3, 1, dim)``. The slice at
            index 2 of the qkv axis is cos=1 / sin=0, so v is left unchanged.
        """
        seq_len = x.shape[seq_dim]
        # Rebuild when nothing is cached, the length changed, or the input
        # lives on another device than the cached tables (the cache used to
        # be keyed on the sequence length only).
        cache_stale = (
            self.cos_cached is None
            or seq_len != self.seq_len_cached
            or self.cos_cached.device != x.device
        )
        if cache_stale:
            self.seq_len_cached = seq_len
            t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
            freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            # dims are: batch, seq_len, qkv, head, dim
            self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1, 1, 3, 1, 1)
            self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1, 1, 3, 1, 1)
            # This makes the transformation on v an identity.
            self.cos_cached[:, :, 2, :, :].fill_(1.)
            self.sin_cached[:, :, 2, :, :].fill_(0.)

        return self.cos_cached, self.sin_cached


def rotate_half(x):
    """Split the last axis in two halves ``(x1, x2)`` and return ``(-x2, x1)``."""
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat(
        (-x2, x1), dim=-1
    )


@torch.jit.script
def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
    """Pure-PyTorch rotary embedding: ``qkv * cos + rotate_half(qkv) * sin``."""
    return (qkv * cos) + (rotate_half(qkv) * sin)


def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary position embeddings to a packed qkv tensor.

    Uses ``flash_attn.layers.rotary.apply_rotary_emb_qkv_`` when it can be
    imported and runs successfully; otherwise falls back to the TorchScript
    implementation.

    Args:
        qkv: Packed tensor laid out as (batch, seq_len, qkv, head, dim).
        cos: Cosine table as returned by ``Rotary.forward``.
        sin: Sine table as returned by ``Rotary.forward``.

    Returns:
        The rotated qkv tensor.
    """
    try:
        import flash_attn.layers.rotary
        # The flash-attn kernel takes (seq_len, dim // 2) tables. Keep them
        # under separate names: the fallback below needs the full tables, and
        # rebinding ``cos``/``sin`` here would hand it the sliced ones if the
        # kernel call fails.
        cos_half = cos[0, :, 0, 0, :cos.shape[-1] // 2]
        sin_half = sin[0, :, 0, 0, :sin.shape[-1] // 2]
        return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
            qkv, cos_half, sin_half
        )
    except Exception:
        # Best-effort fallback (missing package, unsupported device, ...),
        # without swallowing KeyboardInterrupt / SystemExit.
        return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)
def get_noise(config):
    """Build the noise schedule selected by ``config.noise.type``.

    Args:
        config: Object exposing ``noise.type`` and, for the geometric
            schedule, ``noise.sigma_min`` / ``noise.sigma_max``.

    Returns:
        A ``GeometricNoise`` or ``LogLinearNoise`` instance. The log-linear
        schedule is constructed with its default ``eps``.

    Raises:
        ValueError: If ``config.noise.type`` is not a known schedule.
    """
    if config.noise.type == "geometric":
        return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)
    elif config.noise.type == "loglinear":
        return LogLinearNoise()
    else:
        raise ValueError(f"{config.noise.type} is not a valid noise")


class Noise(abc.ABC, nn.Module):
    """Abstract noise schedule; time is assumed to go from 0 to 1.

    Subclasses provide ``rate_noise`` and ``total_noise``; ``forward``
    returns both for a given time.
    """

    def forward(self, t):
        """Return the tuple ``(total_noise(t), rate_noise(t))``."""
        return self.total_noise(t), self.rate_noise(t)

    @abc.abstractmethod
    def rate_noise(self, t):
        """Rate of change of noise, i.e. g(t)."""
        pass

    @abc.abstractmethod
    def total_noise(self, t):
        # Raw docstring: "\i" is an invalid escape sequence in a normal string.
        r"""Total noise, i.e. \int_0^t g(t) dt + g(0)."""
        pass


class GeometricNoise(Noise, nn.Module):
    """Geometric interpolation between ``sigma_min`` (t=0) and ``sigma_max`` (t=1)."""

    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        """
        Args:
            sigma_min: Total noise at ``t = 0``.
            sigma_max: Total noise at ``t = 1``.
            learnable: If ``True`` the two sigmas become an ``nn.Parameter``.
        """
        super().__init__()
        # NOTE: unless ``learnable`` is set this is a plain tensor attribute,
        # not a registered buffer, so ``Module.to`` does not move it.
        self.sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
        if learnable:
            self.sigmas = nn.Parameter(self.sigmas)
        # Placeholder parameter (always present, initialized to zero).
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Time derivative of ``total_noise``: total_noise(t) * log(sigma_max / sigma_min)."""
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t * (self.sigmas[1].log() - self.sigmas[0].log())

    def total_noise(self, t):
        """``sigma_min ** (1 - t) * sigma_max ** t``."""
        return self.sigmas[0] ** (1 - t) * self.sigmas[1] ** t


class LogLinearNoise(Noise, nn.Module):
    """
    Log Linear noise schedule built so that 1 - 1/e^(n(t)) interpolates between 0 and ~1
    when t goes from 0 to 1. Used for absorbing

    Total noise is -log(1 - (1 - eps) * t), so the sigma will be (1 - eps) * t
    """

    def __init__(self, eps=1e-3):
        """
        Args:
            eps: Keeps the total noise finite at ``t = 1``.
        """
        super().__init__()
        self.eps = eps
        # Placeholder parameter (always present, initialized to zero).
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """``(1 - eps) / (1 - (1 - eps) * t)``, the derivative of ``total_noise``."""
        return (1 - self.eps) / (1 - (1 - self.eps) * t)

    def total_noise(self, t):
        """``-log(1 - (1 - eps) * t)``, computed with ``log1p``."""
        return -torch.log1p(-(1 - self.eps) * t)
def main():
    """Sample text conditioned on a fixed prefix and suffix and print it.

    The prefix tokens are pinned to the first positions and the suffix tokens
    to the last positions of a 1024-token sequence; the sampler fills in the
    rest.

    Command-line options:
        --model_path: Model identifier or local run directory for ``load_model``.
        --checkpoint: Checkpoint file, forwarded as ``ckpt_dir`` to
            ``load_model`` (only needed by its local fallback).
        --batch_size, --steps: Sampler batch size and number of steps.
        --prefix, --suffix: Conditioning text.
        --add_vocab: JSON file with a list of tokens to add to the tokenizer.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--prefix", type=str, default="Hi, my name is")
    parser.add_argument("--suffix", type=str, default=" and that's why I'm late.")
    parser.add_argument("--add_vocab", type=str, default=None)
    parser.add_argument("--checkpoint", type=str, default=None)
    args = parser.parse_args()

    device = torch.device('cuda')

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    prefix_ids = tokenizer(args.prefix).input_ids
    suffix_ids = tokenizer(args.suffix).input_ids
    input_ids = prefix_ids + suffix_ids
    input_locs = list(range(len(prefix_ids))) + list(range(1024 - len(suffix_ids), 1024))

    # More generally, conditioning can be defined with something like below:
    # input_ids = [0, 1, 512, 8080, 50256, 20000]
    # input_locs = [5, 6, 19, 20, 1000, 1001]

    input_ids = torch.tensor(input_ids, device=device)[None].repeat(args.batch_size, 1)

    def proj_fun(x):
        # Overwrite the conditioned positions with the fixed tokens.
        x[:, input_locs] = input_ids
        return x

    # ``load_model`` takes (root_dir, ckpt_dir, added_tokens, device); the
    # previous two-argument call raised a TypeError.
    model, graph, noise = load_model(args.model_path, args.checkpoint, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device, proj_fun=proj_fun
    )

    samples = proj_fun(sampling_fn(model))

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")
def main():
    """Generate a completion for every test prompt with every saved checkpoint.

    For each file under ``<model_checkpoint_dir>/checkpoints`` the model is
    loaded, each ``input_text`` of ``<dataset>/test.json`` is used as a fixed
    prefix for the sampler, and the results are written to
    ``<model_checkpoint_dir>/checkpoint_outputs/<checkpoint name>/all_items.json``.
    Checkpoints whose output directory already exists are skipped.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_checkpoint_dir", default="", type=str)
    parser.add_argument("--dataset", default=None, type=str)
    parser.add_argument("--steps", type=int, default=128)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # Load the dataset
    with open(os.path.join(args.dataset, "test.json"), "r") as f:
        test_data = json.load(f)

    device = torch.device('cuda')

    # List all files under the model checkpoint directory (sorted so the
    # processing order is reproducible).
    checkpoints_root = os.path.join(args.model_checkpoint_dir, "checkpoints")
    checkpoints = [os.path.join(checkpoints_root, name) for name in sorted(os.listdir(checkpoints_root))]
    print(checkpoints)

    for checkpoint in checkpoints:
        checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint))
        # Check before loading: loading a model only to skip it is wasted work.
        if os.path.exists(checkpoint_dir):
            print(f"Skipping {checkpoint_dir} because it already exists")
            continue

        model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device)
        # Create the output directory only once the checkpoint loaded, so a
        # failed load does not mark the checkpoint as done.
        os.makedirs(checkpoint_dir, exist_ok=True)

        def generate_output(input_text):
            """Sample one 32-token sequence whose first tokens are ``input_text``."""
            prefix_ids = tokenizer(input_text).input_ids
            input_locs = list(range(len(prefix_ids)))
            input_ids = torch.tensor(prefix_ids, device=device)[None]

            def proj_fun(x):
                # Pin the prompt tokens at the start of the sequence.
                x[:, input_locs] = input_ids
                return x

            sampling_fn = sampling.get_pc_sampler(
                graph, noise, (1, 32), 'analytic', args.steps, device=device, proj_fun=proj_fun
            )

            samples = proj_fun(sampling_fn(model))

            text_samples = tokenizer.batch_decode(samples)
            assert len(text_samples) == 1
            # Keep only the text before the first end-of-text marker.
            return text_samples[0].split("<|endoftext|>")[0]

        all_items = []
        for sample in tqdm(test_data):
            output = generate_output(sample["input_text"])
            print(sample["input_text"])
            print(sample["target_text"])
            print(output)
            print()
            all_items.append({
                "input_text": sample["input_text"],
                "target_text": sample["target_text"],
                "type": sample["type"],
                "model_output": output,
            })

        # Save the results
        with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f:
            json.dump(all_items, f, indent=4)
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg):
    """Resolve the working directory, finalize the config and spawn training.

    Args:
        cfg: Hydra config composed from ``configs/config.yaml``. If it
            contains ``load_dir``, the config saved in that run directory
            replaces it and that run's ``work_dir`` is reused.
    """
    ngpus = cfg.ngpus
    if "load_dir" in cfg:
        hydra_cfg_path = os.path.join(cfg.load_dir, ".hydra/hydra.yaml")
        # NOTE(review): this value is reassigned from HydraConfig.get() below
        # before it is ever read; the load only verifies the file is readable.
        hydra_cfg = OmegaConf.load(hydra_cfg_path).hydra

        cfg = utils.load_hydra_config_from_run(cfg.load_dir)

        work_dir = cfg.work_dir
        utils.makedirs(work_dir)
    else:
        hydra_cfg = HydraConfig.get()
        # Single runs use run.dir; any other mode uses sweep.dir/sweep.subdir.
        work_dir = hydra_cfg.run.dir if hydra_cfg.mode == RunMode.RUN else os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir)
        utils.makedirs(work_dir)

    # open_dict allows adding keys that are not part of the composed config.
    with open_dict(cfg):
        cfg.ngpus = ngpus
        cfg.work_dir = work_dir
        cfg.wandb_name = os.path.basename(os.path.normpath(work_dir))

    # Run the training pipeline
    # Random port in [10000, 20000) handed to every spawned worker.
    port = int(np.random.randint(10000, 20000))
    logger = utils.get_logger(os.path.join(work_dir, "logs"))

    hydra_cfg = HydraConfig.get()
    if hydra_cfg.mode != RunMode.RUN:
        logger.info(f"Run id: {hydra_cfg.job.id}")

    try:
        mp.set_start_method("forkserver")
        mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
    except Exception as e:
        # NOTE(review): the failure is logged with its traceback but not
        # re-raised, so main() still returns normally after a failed run.
        logger.critical(e, exc_info=True)
def get_logger(logpath, package_files=(), displaying=True, saving=True, debug=False):
    """Reconfigure the root logger and return it.

    All handlers currently attached to the root logger are removed and
    closed, then a file handler and/or console handler is installed.

    Args:
        logpath: File the log is appended to when ``saving`` is true.
        package_files: Iterable of file paths; each path and its content is
            logged once after the handlers are set up.
        displaying: Attach a ``StreamHandler`` for console output.
        saving: Attach a ``FileHandler`` writing to ``logpath`` in append mode.
        debug: Use level ``DEBUG`` instead of ``INFO``.

    Returns:
        The configured root ``logging.Logger``.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    # Remove *and close* the previous handlers: merely clearing the list
    # leaves the file descriptors of earlier FileHandlers open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    for f in package_files:
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger


def restore_checkpoint(ckpt_dir, state, device):
    """Load a checkpoint file into ``state`` if it exists.

    Args:
        ckpt_dir: Path of the checkpoint file.
        state: Dict with ``'optimizer'``, ``'model'`` (wrapped, accessed via
            ``.module``), ``'ema'`` and ``'step'`` entries; updated in place.
        device: ``map_location`` for ``torch.load``.

    Returns:
        ``state``. When no file exists at ``ckpt_dir`` the parent directory
        is created, a warning is logged and ``state`` is returned unchanged.
    """
    if not os.path.exists(ckpt_dir):
        makedirs(os.path.dirname(ckpt_dir))
        logging.warning(f"No checkpoint found at {ckpt_dir}. Returned the same state as input")
        return state

    loaded_state = torch.load(ckpt_dir, map_location=device)
    state['optimizer'].load_state_dict(loaded_state['optimizer'])
    # strict=False tolerates missing / unexpected keys in the model weights.
    state['model'].module.load_state_dict(loaded_state['model'], strict=False)
    state['ema'].load_state_dict(loaded_state['ema'])
    state['step'] = loaded_state['step']
    return state


def save_checkpoint(ckpt_dir, state):
    """Write optimizer, model, EMA state dicts and the step to ``ckpt_dir``.

    Args:
        ckpt_dir: Destination file path for ``torch.save``.
        state: Dict with ``'optimizer'``, ``'model'`` (wrapped, accessed via
            ``.module``), ``'ema'`` and ``'step'`` entries.
    """
    saved_state = {
        'optimizer': state['optimizer'].state_dict(),
        'model': state['model'].module.state_dict(),
        'ema': state['ema'].state_dict(),
        'step': state['step']
    }
    torch.save(saved_state, ckpt_dir)
def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
    """Read a JSON list of records as raw data or as a DataFrame.

    Args:
        file_name: A .json file containing a list of items, each a dict with
            keys such as 'input_text' and 'target_text'.
        return_num: If true, also return the number of items read.
        return_json: If true, return the parsed list instead of a DataFrame.
        cutoff: If not ``None``, keep only the first ``cutoff`` items.

    Returns:
        The parsed list (``return_json``) or a ``pandas.DataFrame`` whose
        columns are the keys of the first item, in that item's order. With
        ``return_num`` the result is a tuple ``(data_or_frame, count)``.
        An empty input yields an empty DataFrame.

    Raises:
        KeyError: If a later item lacks one of the first item's keys.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if cutoff is not None:
        data = data[:cutoff]

    if return_json:
        result = data
    elif not data:
        # No first item to take the column names from; previously this case
        # crashed with an IndexError on ``data[0]``.
        result = pd.DataFrame()
    else:
        keys = list(data[0].keys())
        rows = [[item[key] for key in keys] for item in data]
        result = pd.DataFrame(rows, columns=keys)

    if return_num:
        return result, len(data)
    return result
# $2: weight_decay 3 | # $3: n_layers 4 | 5 | EXP_DIR=../creativity 6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data 7 | 8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR 9 | 10 | -------------------------------------------------------------------------------- /triangle-discovery/teacherless/run_eval.sh: -------------------------------------------------------------------------------- 1 | bash eval.sh triangle_hybrid.10 0.0 12 2 | -------------------------------------------------------------------------------- /triangle-discovery/teacherless/run_train.sh: -------------------------------------------------------------------------------- 1 | bash train.sh triangle_hybrid.10 0.0 12 0 2 | -------------------------------------------------------------------------------- /triangle-discovery/teacherless/train.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH=gpt2 2 | 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/ 4 | WEIGHT_DECAY=$2 5 | N_LAYERS=$3 6 | GPU=$4 7 | 8 | EXP_DIR=../creativity 9 | 10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3 11 | 12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \ 13 | --data_dir $DATASET \ 14 | --model_name_or_path ${MODEL_PATH} \ 15 | --weight_decay $WEIGHT_DECAY \ 16 | --output_dir $OUTPUT_DIR \ 17 | --max_seq_length 128 \ 18 | --max_length 128 \ 19 | --block_size 128 \ 20 | --train_batch_size 64 \ 21 | --eval_batch_size 64 \ 22 | --learning_rate 1e-4 \ 23 | --gradient_accumulation_steps 1 \ 24 | --save_step 40000 \ 25 | --save_step_dense 20000 \ 26 | --max_steps 1200000 \ 27 | --do_train \ 28 | --scheduler constant_schedule_with_warmup \ 29 | --fp16 \ 30 | --evaluate_during_training \ 31 | --predict_during_training \ 32 | --add_tokens \ 33 | --n_layer $N_LAYERS 34 | -------------------------------------------------------------------------------- /triangle-discovery/teacherless/utils.py: 
def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
    """Load a JSON file holding a list of records.

    Args:
        file_name: A .json file containing a list of items, each a dict with
            keys such as 'input_text' and 'target_text'.
        return_num: When true, the item count is returned alongside the data.
        return_json: When true, the parsed list is returned as-is rather than
            being converted to a DataFrame.
        cutoff: Optional maximum number of leading items to keep.

    Returns:
        Either the parsed list or a ``pandas.DataFrame`` with one row per
        item and one column per key of the first item (in that item's key
        order); wrapped in ``(result, count)`` when ``return_num`` is true.
        If the list is empty, the DataFrame is empty as well.

    Raises:
        KeyError: If an item is missing a key that the first item has.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if cutoff is not None:
        data = data[:cutoff]

    num_items = len(data)

    if return_json:
        return (data, num_items) if return_num else data

    if num_items == 0:
        # ``data[0]`` below would raise IndexError on an empty list; return
        # an empty frame instead.
        frame = pd.DataFrame()
    else:
        keys = list(data[0].keys())
        frame = pd.DataFrame(
            [[item[key] for key in keys] for item in data],
            columns=keys,
        )

    return (frame, num_items) if return_num else frame