├── .gitignore
├── README.md
├── circle-construction
    ├── diffusion
    │   ├── .gitignore
    │   ├── LICENSE
    │   ├── catsample.py
    │   ├── configs
    │   │   ├── config.yaml
    │   │   └── model
    │   │   │   ├── medium.yaml
    │   │   │   └── small.yaml
    │   ├── data.py
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── graph_lib.py
    │   ├── load_model.py
    │   ├── losses.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── ema.py
    │   │   ├── fused_add_dropout_scale.py
    │   │   ├── rotary.py
    │   │   ├── transformer.py
    │   │   └── utils.py
    │   ├── noise_lib.py
    │   ├── run_eval.sh
    │   ├── run_sample.py
    │   ├── run_sample_cond.py
    │   ├── run_train.py
    │   ├── run_train.sh
    │   ├── sampling.py
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── ntp
    │   ├── circle.ipynb
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── main.py
    │   ├── run_eval.sh
    │   ├── run_train.sh
    │   ├── train.sh
    │   └── utils.py
    └── teacherless
    │   ├── circle_hybrid.ipynb
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── main.py
    │   ├── run_eval.sh
    │   ├── run_train.sh
    │   ├── train.sh
    │   └── utils.py
├── docs
    └── teaser.png
├── line-construction
    ├── diffusion
    │   ├── .gitignore
    │   ├── LICENSE
    │   ├── catsample.py
    │   ├── configs
    │   │   ├── config.yaml
    │   │   └── model
    │   │   │   ├── medium.yaml
    │   │   │   └── small.yaml
    │   ├── data.py
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── graph_lib.py
    │   ├── load_model.py
    │   ├── losses.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── ema.py
    │   │   ├── fused_add_dropout_scale.py
    │   │   ├── rotary.py
    │   │   ├── transformer.py
    │   │   └── utils.py
    │   ├── noise_lib.py
    │   ├── run_eval.sh
    │   ├── run_sample.py
    │   ├── run_sample_cond.py
    │   ├── run_train.py
    │   ├── run_train.sh
    │   ├── sampling.py
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── ntp
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── line.ipynb
    │   ├── main.py
    │   ├── run_eval.sh
    │   ├── run_train.sh
    │   ├── train.sh
    │   └── utils.py
    └── teacherless
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── line_hybrid.ipynb
    │   ├── main.py
    │   ├── run_eval.sh
    │   ├── run_train.sh
    │   ├── train.sh
    │   └── utils.py
├── sibling-discovery
    ├── diffusion
    │   ├── .gitignore
    │   ├── LICENSE
    │   ├── catsample.py
    │   ├── configs
    │   │   ├── config.yaml
    │   │   └── model
    │   │   │   ├── medium.yaml
    │   │   │   └── small.yaml
    │   ├── data.py
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── graph_lib.py
    │   ├── load_model.py
    │   ├── losses.py
    │   ├── model
    │   │   ├── __init__.py
    │   │   ├── ema.py
    │   │   ├── fused_add_dropout_scale.py
    │   │   ├── rotary.py
    │   │   ├── transformer.py
    │   │   └── utils.py
    │   ├── noise_lib.py
    │   ├── run_eval.sh
    │   ├── run_sample.py
    │   ├── run_sample_cond.py
    │   ├── run_train.py
    │   ├── run_train.sh
    │   ├── sampling.py
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── ntp
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── main.py
    │   ├── run_eval.sh
    │   ├── run_train.sh
    │   ├── sibling.ipynb
    │   ├── sibling_no_hash.ipynb
    │   ├── train.sh
    │   └── utils.py
    └── teacherless
    │   ├── eval.sh
    │   ├── eval_qa.py
    │   ├── main.py
    │   ├── run_eval.sh
    │   ├── run_train.sh
    │   ├── sibling_hybrid.ipynb
    │   ├── train.sh
    │   └── utils.py
├── simpletransformers
    ├── .all-contributorsrc
    ├── .github
    │   ├── FUNDING.yml
    │   ├── ISSUE_TEMPLATE
    │   │   ├── bug_report.md
    │   │   └── feature_request.md
    │   ├── stale.yml
    │   └── workflows
    │   │   └── pythonpublish.yml
    ├── .gitignore
    ├── CHANGELOG.md
    ├── LICENSE
    ├── Makefile
    ├── README.md
    ├── bin
    │   └── simple-viewer
    ├── requirements-dev.txt
    ├── setup.cfg
    ├── setup.py
    ├── simpletransformers
    │   ├── __init__.py
    │   ├── classification
    │   │   ├── __init__.py
    │   │   ├── classification_model.py
    │   │   ├── classification_utils.py
    │   │   ├── multi_label_classification_model.py
    │   │   ├── multi_modal_classification_model.py
    │   │   └── transformer_models
    │   │   │   ├── __init__.py
    │   │   │   ├── albert_model.py
    │   │   │   ├── bert_model.py
    │   │   │   ├── camembert_model.py
    │   │   │   ├── distilbert_model.py
    │   │   │   ├── electra_model.py
    │   │   │   ├── flaubert_model.py
    │   │   │   ├── layoutlm_model.py
    │   │   │   ├── longformer_model.py
    │   │   │   ├── mmbt_model.py
    │   │   │   ├── mobilebert_model.py
    │   │   │   ├── roberta_model.py
    │   │   │   ├── xlm_model.py
    │   │   │   ├── xlm_roberta_model.py
    │   │   │   └── xlnet_model.py
    │   ├── config
    │   │   ├── __init__.py
    │   │   ├── global_args.py
    │   │   ├── model_args.py
    │   │   └── utils.py
    │   ├── conv_ai
    │   │   ├── __init__.py
    │   │   ├── conv_ai_model.py
    │   │   └── conv_ai_utils.py
    │   ├── custom_models
    │   │   ├── __init__.py
    │   │   └── models.py
    │   ├── experimental
    │   │   ├── __init__.py
    │   │   └── classification
    │   │   │   ├── __init__.py
    │   │   │   ├── classification_model.py
    │   │   │   ├── classification_utils.py
    │   │   │   ├── multi_label_classification_model.py
    │   │   │   └── transformer_models
    │   │   │       ├── __init__.py
    │   │   │       ├── albert_model.py
    │   │   │       ├── bert_model.py
    │   │   │       ├── camembert_model.py
    │   │   │       ├── distilbert_model.py
    │   │   │       ├── roberta_model.py
    │   │   │       ├── xlm_model.py
    │   │   │       └── xlnet_model.py
    │   ├── language_generation
    │   │   ├── __init__.py
    │   │   ├── language_generation_model.py
    │   │   └── language_generation_utils.py
    │   ├── language_modeling
    │   │   ├── __init__.py
    │   │   ├── language_modeling_model.py
    │   │   └── language_modeling_utils.py
    │   ├── language_representation
    │   │   ├── __init__.py
    │   │   ├── representation_model.py
    │   │   └── transformer_models
    │   │   │   ├── __init__.py
    │   │   │   ├── bert_model.py
    │   │   │   └── gpt2_model.py
    │   ├── losses
    │   │   ├── __init__.py
    │   │   ├── dice_loss.py
    │   │   ├── focal_loss.py
    │   │   ├── loss_utils.py
    │   │   └── tversky_loss.py
    │   ├── model.py
    │   ├── ner
    │   │   ├── __init__.py
    │   │   ├── ner_dataset_loading_script
    │   │   │   └── ner_dataset_loading_script.py
    │   │   ├── ner_model.py
    │   │   └── ner_utils.py
    │   ├── question_answering
    │   │   ├── __init__.py
    │   │   ├── qa_dataset_loading_script
    │   │   │   └── qa_dataset_loading_script.py
    │   │   ├── question_answering_model.py
    │   │   └── question_answering_utils.py
    │   ├── retrieval
    │   │   ├── __init__.py
    │   │   ├── retrieval_dataset_loading_script
    │   │   │   └── retrieval_dataset_loading_script.py
    │   │   ├── retrieval_model.py
    │   │   └── retrieval_utils.py
    │   ├── seq2seq
    │   │   ├── __init__.py
    │   │   ├── seq2seq_model.py
    │   │   └── seq2seq_utils.py
    │   ├── streamlit
    │   │   ├── __init__.py
    │   │   ├── classification_view.py
    │   │   ├── ner_view.py
    │   │   ├── qa_view.py
    │   │   ├── simple_view.py
    │   │   ├── streamlit_utils.py
    │   │   └── t5_view.py
    │   └── t5
    │   │   ├── __init__.py
    │   │   ├── t5_model.py
    │   │   └── t5_utils.py
    ├── tests
    │   ├── language_modeling
    │   │   └── test_language_modeling_only.py
    │   ├── test_classification.py
    │   ├── test_language_modeling.py
    │   ├── test_language_representation.py
    │   ├── test_named_entity_recognition.py
    │   ├── test_question_answering.py
    │   ├── test_seq2seq.py
    │   ├── test_t5.py
    │   └── train.txt
    └── train.txt
└── triangle-discovery
    ├── diffusion
        ├── .gitignore
        ├── LICENSE
        ├── catsample.py
        ├── configs
        │   ├── config.yaml
        │   └── model
        │   │   ├── medium.yaml
        │   │   └── small.yaml
        ├── data.py
        ├── eval.sh
        ├── eval_qa.py
        ├── graph_lib.py
        ├── load_model.py
        ├── losses.py
        ├── model
        │   ├── __init__.py
        │   ├── ema.py
        │   ├── fused_add_dropout_scale.py
        │   ├── rotary.py
        │   ├── transformer.py
        │   └── utils.py
        ├── noise_lib.py
        ├── run_eval.sh
        ├── run_sample.py
        ├── run_sample_cond.py
        ├── run_train.py
        ├── run_train.sh
        ├── sampling.py
        ├── test.py
        ├── train.py
        └── utils.py
    ├── ntp
        ├── eval.sh
        ├── eval_qa.py
        ├── main.py
        ├── run_eval.sh
        ├── run_train.sh
        ├── train.sh
        ├── triangle.ipynb
        ├── triangle_no_hash.ipynb
        └── utils.py
    └── teacherless
        ├── eval.sh
        ├── eval_qa.py
        ├── main.py
        ├── run_eval.sh
        ├── run_train.sh
        ├── train.sh
        ├── triangle_hybrid.ipynb
        └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Initially taken from Github's Python gitignore file
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # tests and logs
 12 | tests/fixtures/cached_*_text.txt
 13 | logs/
 14 | lightning_logs/
 15 | lang_code_data/
 16 | 
 17 | # Distribution / packaging
 18 | .Python
 19 | build/
 20 | develop-eggs/
 21 | dist/
 22 | downloads/
 23 | eggs/
 24 | .eggs/
 25 | lib/
 26 | lib64/
 27 | parts/
 28 | sdist/
 29 | var/
 30 | wheels/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | MANIFEST
 35 | 
 36 | # PyInstaller
 37 | #  Usually these files are written by a python script from a template
 38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 39 | *.manifest
 40 | *.spec
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | .hypothesis/
 57 | .pytest_cache/
 58 | 
 59 | # Translations
 60 | *.mo
 61 | *.pot
 62 | 
 63 | # Django stuff:
 64 | *.log
 65 | local_settings.py
 66 | db.sqlite3
 67 | 
 68 | # Flask stuff:
 69 | instance/
 70 | .webassets-cache
 71 | 
 72 | # Scrapy stuff:
 73 | .scrapy
 74 | 
 75 | # Sphinx documentation
 76 | docs/_build/
 77 | 
 78 | # PyBuilder
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | .python-version
 90 | 
 91 | # celery beat schedule file
 92 | celerybeat-schedule
 93 | 
 94 | # SageMath parsed files
 95 | *.sage.py
 96 | 
 97 | # Environments
 98 | .env
 99 | .venv
100 | env/
101 | venv/
102 | ENV/
103 | env.bak/
104 | venv.bak/
105 | 
106 | # Spyder project settings
107 | .spyderproject
108 | .spyproject
109 | 
110 | # Rope project settings
111 | .ropeproject
112 | 
113 | # mkdocs documentation
114 | /site
115 | 
116 | # mypy
117 | .mypy_cache/
118 | .dmypy.json
119 | dmypy.json
120 | 
121 | # Pyre type checker
122 | .pyre/
123 | 
124 | # vscode
125 | .vs
126 | .vscode
127 | 
128 | # Pycharm
129 | .idea
130 | 
131 | # TF code
132 | tensorflow_code
133 | 
134 | # Models
135 | proc_data
136 | 
137 | # examples
138 | runs
139 | /runs_old
140 | /wandb
141 | /examples/runs
142 | /examples/**/*.args
143 | /examples/rag/sweep
144 | 
145 | # data
146 | serialization_dir
147 | 
148 | # emacs
149 | *.*~
150 | debug.env
151 | 
152 | # vim
153 | .*.swp
154 | 
155 | #ctags
156 | tags
157 | 
158 | # .lock
159 | *.lock
160 | 
161 | # DS_Store (MacOS)
162 | .DS_Store
163 | # RL pipelines may produce mp4 outputs
164 | *.mp4
165 | 
166 | # dependencies
167 | /transformers
168 | 
169 | # ruff
170 | .ruff_cache
171 | 
172 | wandb
173 | 
174 | # checkpoints
175 | */checkpoint-*
176 | 
177 | # temporarily ignore
178 | default-outputs/*
179 | outputs/*
180 | log_files/*
181 | step_test/*
182 | 
183 | **/render*
184 | **/*.zip
185 | 
186 | exp_data
187 | 
188 | # data
189 | 
190 | # results
191 | 
192 | **/*.pt


--------------------------------------------------------------------------------
/circle-construction/diffusion/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | build/
 3 | dist/
 4 | *.egg-info/
 5 | __pycache__
 6 | .ipynb_checkpoints/
 7 | .DS_Store
 8 | **.pyc
 9 | *.png
10 | *.txt
11 | **/outputs/
12 | **/wandb/
13 | **/exp/
14 | **/exp_local/
15 | data/
16 | eval/
17 | assets/
18 | **.pth
19 | **.npz
20 | core
21 | **.log
22 | *.jsonl


--------------------------------------------------------------------------------
/circle-construction/diffusion/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Aaron Lou
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/circle-construction/diffusion/catsample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
 6 |     logits = categorical_probs.clamp(min=1e-9).log()
 7 |     return F.gumbel_softmax(logits, hard=hard)
 8 | 
 9 | 
def sample_categorical(categorical_probs, method="hard"):
    """Sample category indices along the last dimension of a probability tensor.

    Args:
        categorical_probs: Tensor whose last dimension indexes categories.
        method: Sampling strategy; only ``"hard"`` is implemented.

    Returns:
        Tensor of argmax indices with the last dimension removed.

    Raises:
        ValueError: If ``method`` is anything other than ``"hard"``.
    """
    if method != "hard":
        raise ValueError(f"Method {method} for sampling categorical variables is not valid.")

    # Divide the probabilities by positive noise -log(u) (with small offsets
    # for numerical safety) and take the argmax of the perturbed values.
    uniform = torch.rand_like(categorical_probs)
    gumbel_norm = 1e-10 - (uniform + 1e-10).log()
    perturbed = categorical_probs / gumbel_norm
    return perturbed.argmax(dim=-1)
16 |     


--------------------------------------------------------------------------------
/circle-construction/diffusion/configs/config.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - _self_
 3 |   - model: small
 4 |   - override hydra/launcher: submitit_slurm
 5 | 
 6 | ngpus: 1
 7 | tokens: 50257
 8 | add_vocab: ""
 9 | 
10 | training:
11 |   batch_size: 64  # 512
12 |   accum: 1
13 |   n_iters: 400000
14 |   snapshot_freq: 10000
15 |   log_freq: 50
16 |   eval_freq: 500
17 |   snapshot_freq_for_preemption: 10000
18 |   weight: standard
19 |   snapshot_sampling: True
20 |   ema: 0.9999
21 | 
22 | data:
23 |   train: openwebtext
24 |   valid: wikitext103
25 |   cache_dir: data
26 | 
27 | graph:
28 |   type: absorb
29 |   file: data
30 |   report_all: False
31 | 
32 | noise:
33 |   type: loglinear
34 |   sigma_min: 1e-4
35 |   sigma_max: 20
36 | 
37 | sampling:
38 |   predictor: euler
39 |   steps: 128
40 |   noise_removal: True
41 | 
42 | eval:
43 |   batch_size: 32
44 |   perplexity: False
45 |   perplexity_batch_size: 32
46 | 
47 | optim:
48 |   weight_decay: 0
49 |   optimizer: AdamW
50 |   lr: 1e-4
51 |   beta1: 0.9
52 |   beta2: 0.999
53 |   eps: 1e-8
54 |   warmup: 2500
55 |   grad_clip: 1.
56 | 
57 | 
58 | hydra:
59 |   run:
60 |     dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}


--------------------------------------------------------------------------------
/circle-construction/diffusion/configs/model/medium.yaml:
--------------------------------------------------------------------------------
1 | name: medium
2 | type: ddit
3 | hidden_size: 1024
4 | cond_dim: 128
5 | length: 29
6 | n_blocks: 24
7 | n_heads: 16
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/circle-construction/diffusion/configs/model/small.yaml:
--------------------------------------------------------------------------------
1 | name: small
2 | type: ddit
3 | hidden_size: 768
4 | cond_dim: 128
5 | length: 29
6 | n_blocks: 12
7 | n_heads: 12
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/circle-construction/diffusion/eval.sh:
--------------------------------------------------------------------------------
 1 | # $1: dataset
 2 | # $2: weight_decay
 3 | # $3: n_layers
 4 | 
 5 | EXP_DIR=creativity_results/creativity_data/circle.10.9.0.10000/train.json/train/checkpoint_outputs
 6 | 
 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
 8 | 
 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR
10 | 


--------------------------------------------------------------------------------
/circle-construction/diffusion/load_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from model import SEDD
 4 | import utils
 5 | from model.ema import ExponentialMovingAverage
 6 | import graph_lib
 7 | import noise_lib
 8 | 
 9 | from omegaconf import OmegaConf
10 | 
def load_model_hf(dir, device):
    """Load a pretrained SEDD score model plus its matching graph and noise.

    Args:
        dir: Value handed to ``SEDD.from_pretrained``.
        device: Device the model and noise module are moved to; also passed
            to ``graph_lib.get_graph``.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    pretrained = SEDD.from_pretrained(dir)
    score_model = pretrained.to(device)
    # Graph and noise are both derived from the config stored on the model.
    model_config = score_model.config
    graph = graph_lib.get_graph(model_config, device)
    noise = noise_lib.get_noise(model_config).to(device)
    return score_model, graph, noise
16 | 
17 | 
def load_model_local(root_dir, ckpt_dir, added_tokens, device):
    """Rebuild a SEDD model from a local run and load its EMA weights.

    Args:
        root_dir: Run directory passed to ``utils.load_hydra_config_from_run``.
        ckpt_dir: Path given to ``torch.load``. Despite the name it is loaded
            as a checkpoint file (presumably something like
            ``<root_dir>/checkpoints-meta/checkpoint.pth`` — confirm with callers).
        added_tokens: Extra vocabulary entries; when non-empty, its length is
            added to ``cfg.tokens`` before the model is built.
        device: Target device for the model, graph, noise and loaded tensors.

    Returns:
        Tuple ``(score_model, graph, noise)`` with the EMA weights copied
        into ``score_model``.
    """
    cfg = utils.load_hydra_config_from_run(root_dir)
    if added_tokens:
        cfg.tokens += len(added_tokens)

    graph = graph_lib.get_graph(cfg, device)
    noise = noise_lib.get_noise(cfg).to(device)
    score_model = SEDD(cfg).to(device)
    ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema)

    # The checkpoint is expected to hold both the raw model weights and the
    # EMA state under the keys read below.
    checkpoint = torch.load(ckpt_dir, map_location=device)
    score_model.load_state_dict(checkpoint['model'])
    ema.load_state_dict(checkpoint['ema'])

    # Keep a copy of the raw weights inside the EMA helper, then overwrite
    # the model parameters with the averaged ones.
    ema.store(score_model.parameters())
    ema.copy_to(score_model.parameters())
    return score_model, graph, noise
36 | 
37 | 
def load_model(root_dir, ckpt_dir, added_tokens, device):
    """Load a score model, trying the pretrained path first.

    ``load_model_hf(root_dir, device)`` is attempted first; if it raises, the
    model is rebuilt from a local run via ``load_model_local``.

    Args:
        root_dir: Pretrained identifier/path, or a local run directory.
        ckpt_dir: Checkpoint path used only by the local fallback.
        added_tokens: Extra vocabulary entries used only by the local fallback.
        device: Device to place the loaded objects on.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # Deliberate best-effort fallback. Catch ``Exception`` rather than
        # using a bare ``except`` so KeyboardInterrupt/SystemExit still
        # propagate instead of triggering a second, local load.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)


--------------------------------------------------------------------------------
/circle-construction/diffusion/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformer import SEDD


--------------------------------------------------------------------------------
/circle-construction/diffusion/model/ema.py:
--------------------------------------------------------------------------------
  1 | # Modified from https://raw.githubusercontent.com/fadel/pytorch_ema/master/torch_ema/ema.py
  2 | 
  3 | from __future__ import division
  4 | from __future__ import unicode_literals
  5 | 
  6 | import torch
  7 | 
  8 | 
  9 | # Partially based on: https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/python/training/moving_averages.py
 10 | class ExponentialMovingAverage:
 11 |     """
 12 |     Maintains (exponential) moving average of a set of parameters.
 13 |     """
 14 | 
 15 |     def __init__(self, parameters, decay, use_num_updates=True):
 16 |         """
 17 |         Args:
 18 |             parameters: Iterable of `torch.nn.Parameter`; usually the result of
 19 |                 `model.parameters()`.
 20 |             decay: The exponential decay.
 21 |             use_num_updates: Whether to use number of updates when computing
 22 |                 averages.
 23 |         """
 24 |         if decay < 0.0 or decay > 1.0:
 25 |             raise ValueError('Decay must be between 0 and 1')
 26 |         self.decay = decay
 27 |         self.num_updates = 0 if use_num_updates else None
 28 |         self.shadow_params = [p.clone().detach()
 29 |                               for p in parameters if p.requires_grad]
 30 |         self.collected_params = []
 31 | 
 32 |     def update(self, parameters):
 33 |         """
 34 |         Update currently maintained parameters.
 35 | 
 36 |         Call this every time the parameters are updated, such as the result of
 37 |         the `optimizer.step()` call.
 38 | 
 39 |         Args:
 40 |             parameters: Iterable of `torch.nn.Parameter`; usually the same set of
 41 |                 parameters used to initialize this object.
 42 |         """
 43 |         decay = self.decay
 44 |         if self.num_updates is not None:
 45 |             self.num_updates += 1
 46 |             decay = min(decay, (1 + self.num_updates) /
 47 |                         (10 + self.num_updates))
 48 |         one_minus_decay = 1.0 - decay
 49 |         with torch.no_grad():
 50 |             parameters = [p for p in parameters if p.requires_grad]
 51 |             for s_param, param in zip(self.shadow_params, parameters):
 52 |                 s_param.sub_(one_minus_decay * (s_param - param))
 53 |                 
 54 | 
 55 |     def copy_to(self, parameters):
 56 |         """
 57 |         Copy current parameters into given collection of parameters.
 58 | 
 59 |         Args:
 60 |             parameters: Iterable of `torch.nn.Parameter`; the parameters to be
 61 |                 updated with the stored moving averages.
 62 |         """
 63 |         parameters = [p for p in parameters if p.requires_grad]
 64 |         for s_param, param in zip(self.shadow_params, parameters):
 65 |             if param.requires_grad:
 66 |                 param.data.copy_(s_param.data)
 67 | 
 68 |     def store(self, parameters):
 69 |         """
 70 |         Save the current parameters for restoring later.
 71 | 
 72 |         Args:
 73 |             parameters: Iterable of `torch.nn.Parameter`; the parameters to be
 74 |                 temporarily stored.
 75 |         """
 76 |         self.collected_params = [param.clone() for param in parameters]
 77 | 
 78 |     def restore(self, parameters):
 79 |         """
 80 |         Restore the parameters stored with the `store` method.
 81 |         Useful to validate the model with EMA parameters without affecting the
 82 |         original optimization process. Store the parameters before the
 83 |         `copy_to` method. After validation (or model saving), use this to
 84 |         restore the former parameters.
 85 | 
 86 |         Args:
 87 |             parameters: Iterable of `torch.nn.Parameter`; the parameters to be
 88 |                 updated with the stored parameters.
 89 |         """
 90 |         for c_param, param in zip(self.collected_params, parameters):
 91 |             param.data.copy_(c_param.data)
 92 | 
 93 |     def state_dict(self):
 94 |         return dict(decay=self.decay, num_updates=self.num_updates,
 95 |                     shadow_params=self.shadow_params)
 96 | 
 97 |     def load_state_dict(self, state_dict):
 98 |         self.decay = state_dict['decay']
 99 |         self.num_updates = state_dict['num_updates']
100 |         self.shadow_params = state_dict['shadow_params']


--------------------------------------------------------------------------------
/circle-construction/diffusion/model/fused_add_dropout_scale.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from typing import Optional
 4 | from torch import Tensor
 5 | 
# flags required to enable jit fusion kernels
# These calls run once, as a side effect of importing this module.
# NOTE(review): they go through private ``torch._C`` hooks, whose availability
# may differ between PyTorch releases — confirm against the pinned version.
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
11 | 
12 | 
def bias_dropout_add_scale(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool
) -> Tensor:
    """Compute ``residual + scale * dropout(x + bias)`` with optional terms.

    Args:
        x: Input tensor.
        bias: Optional bias added to ``x`` before dropout.
        scale: Multiplier applied to the dropout output.
        residual: Optional tensor added to the scaled result.
        prob: Dropout probability.
        training: Whether dropout is active.

    Returns:
        The combined tensor.
    """
    pre_dropout = x
    if bias is not None:
        pre_dropout = x + bias
    out = scale * F.dropout(pre_dropout, p=prob, training=training)

    if residual is not None:
        out = residual + out
    return out
24 | 
25 | 
def get_bias_dropout_add_scale(training):
    """Return ``bias_dropout_add_scale`` with its ``training`` flag pre-bound.

    Args:
        training: Value forwarded as the ``training`` argument on every call.

    Returns:
        Callable taking ``(x, bias, scale, residual, prob)``.
    """
    def _bias_dropout_add(x, bias, scale, residual, prob):
        return bias_dropout_add_scale(
            x, bias, scale, residual, prob, training=training
        )

    return _bias_dropout_add
31 | 
32 | 
def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """Apply the affine modulation ``x * (1 + scale) + shift``."""
    gain = 1 + scale
    return x * gain + shift
35 | 
36 | 
@torch.jit.script
def bias_dropout_add_scale_fused_train(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """Scripted ``bias_dropout_add_scale`` with dropout enabled (``training=True``)."""
    out = bias_dropout_add_scale(x, bias, scale, residual, prob, training=True)
    return out
42 | 
43 | 
@torch.jit.script
def bias_dropout_add_scale_fused_inference(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """Scripted ``bias_dropout_add_scale`` with dropout disabled (``training=False``)."""
    out = bias_dropout_add_scale(x, bias, scale, residual, prob, training=False)
    return out
49 | 
@torch.jit.script
def modulate_fused(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """Scripted wrapper around ``modulate``."""
    modulated = modulate(x, shift, scale)
    return modulated


--------------------------------------------------------------------------------
/circle-construction/diffusion/model/rotary.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn
 3 | 
 4 | 
 5 | class Rotary(torch.nn.Module):
 6 |     def __init__(self, dim, base=10_000):
 7 |         super().__init__()
 8 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
 9 |         self.register_buffer("inv_freq", inv_freq)
10 |         self.seq_len_cached = None
11 |         self.cos_cached = None
12 |         self.sin_cached = None
13 | 
14 |     def forward(self, x, seq_dim=1):
15 |         seq_len = x.shape[seq_dim]
16 |         if seq_len != self.seq_len_cached:
17 |             self.seq_len_cached = seq_len
18 |             t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
19 |             freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
20 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
21 |             # dims are: batch, seq_len, qkv, head, dim
22 |             self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
23 |             self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
24 |             # This makes the transformation on v an identity.
25 |             self.cos_cached[:,:,2,:,:].fill_(1.)
26 |             self.sin_cached[:,:,2,:,:].fill_(0.)
27 | 
28 |         return self.cos_cached, self.sin_cached
29 | 
30 | 
def rotate_half(x):
    """Split the last dimension at its midpoint and return ``cat(-second, first)``."""
    half = x.shape[-1] // 2
    first = x[..., :half]
    second = x[..., half:]
    return torch.cat((-second, first), dim=-1)
36 | 
37 | 
@torch.jit.script
def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
    """Scripted rotary application: ``qkv * cos + rotate_half(qkv) * sin``."""
    cos_part = qkv * cos
    sin_part = rotate_half(qkv) * sin
    return cos_part + sin_part
41 | 
42 | 
def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary position embeddings to a packed qkv tensor.

    The ``flash_attn`` kernel is tried first; if importing or running it
    raises, the scripted implementation defined in this module is used.

    Args:
        qkv: Packed query/key/value tensor.
        cos: Cosine table as returned by ``Rotary.forward``.
        sin: Sine table as returned by ``Rotary.forward``.

    Returns:
        The tensor with rotary embeddings applied.
    """
    try:
        import flash_attn.layers.rotary
        # The flash-attn path receives 2-D tables: batch/qkv/head axes are
        # indexed away and only the first half of the last axis is kept.
        cos = cos[0,:,0,0,:cos.shape[-1]//2]
        sin = sin[0,:,0,0,:sin.shape[-1]//2]
        return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
            qkv, cos, sin
        )
    except Exception:
        # Best-effort fallback (missing package, unsupported device/dtype, ...).
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate.
        return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)


--------------------------------------------------------------------------------
/circle-construction/diffusion/model/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def get_model_fn(model, train=False):
 6 |     """Create a function to give the output of the score-based model.
 7 | 
 8 |     Args:
 9 |         model: The score model.
10 |         train: `True` for training and `False` for evaluation.
11 |         mlm: If the input model is a mlm and models the base probability 
12 | 
13 |     Returns:
14 |         A model function.
15 |     """
16 | 
17 |     def model_fn(x, sigma):
18 |         """Compute the output of the score-based model.
19 | 
20 |         Args:
21 |             x: A mini-batch of input data.
22 |             labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
23 |               for different models.
24 | 
25 |         Returns:
26 |             A tuple of (model output, new mutable states)
27 |         """
28 |         if train:
29 |             model.train()
30 |         else:
31 |             model.eval()
32 |         
33 |             # otherwise output the raw values (we handle mlm training in losses.py)
34 |         return model(x, sigma)
35 | 
36 |     return model_fn
37 | 
38 | 
def get_score_fn(model, train=False, sampling=False):
    """Build a function that evaluates the score model on ``(x, sigma)``.

    Args:
        model: The score model, wrapped through ``get_model_fn``.
        train: Whether the model is put in train mode on each call.
        sampling: If True, the model output is exponentiated before being
            returned. Requires ``train`` to be False.

    Returns:
        A function ``score_fn(x, sigma)``; ``sigma`` is flattened with
        ``reshape(-1)`` before being passed to the model.

    Raises:
        AssertionError: If ``sampling`` and ``train`` are both True.
    """
    if sampling:
        assert not train, "Must sample in eval mode"
    model_fn = get_model_fn(model, train=train)

    def score_fn(x, sigma):
        sigma = sigma.reshape(-1)
        # The autocast context must be active while the forward pass runs.
        # Previously it wrapped only the ``def`` statement, so it had already
        # exited by the time ``score_fn`` was called and had no effect.
        with torch.cuda.amp.autocast(dtype=torch.bfloat16):
            score = model_fn(x, sigma)

        if sampling:
            # when sampling return true score (not log used for training)
            return score.exp()

        return score

    return score_fn
56 | 


--------------------------------------------------------------------------------
/circle-construction/diffusion/noise_lib.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import torch
 3 | import torch.nn as nn
 4 | import numpy as np
 5 | 
 6 | 
 7 | def get_noise(config):
 8 |     if config.noise.type == "geometric":
 9 |         return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)
10 |     elif config.noise.type == "loglinear":
11 |         return LogLinearNoise()
12 |     else:
13 |         raise ValueError(f"{config.noise.type} is not a valid noise")
14 | 
15 | 
class Noise(abc.ABC, nn.Module):
    """
    Abstract base for noise schedules. Time is assumed to go from 0 to 1.

    Calling the module returns the total noise and the rate of noise at a
    timestep, in that order.
    """
    def forward(self, t):
        """Return ``(total_noise(t), rate_noise(t))``."""
        return self.total_noise(t), self.rate_noise(t)

    @abc.abstractmethod
    def rate_noise(self, t):
        """
        Rate of change of noise ie g(t)
        """
        pass

    # Raw docstring: ``\int`` in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    @abc.abstractmethod
    def total_noise(self, t):
        r"""
        Total noise ie \int_0^t g(t) dt + g(0)
        """
        pass
39 | 
40 | 
class GeometricNoise(Noise, nn.Module):
    """
    Geometric schedule: total noise is sigma_min^(1 - t) * sigma_max^t.

    The two endpoints are stored together in `self.sigmas` (index 0 is
    sigma_min, index 1 is sigma_max) and become an `nn.Parameter` only when
    `learnable` is True.
    """

    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        super().__init__()
        endpoints = 1.0 * torch.tensor([sigma_min, sigma_max])
        self.sigmas = nn.Parameter(endpoints) if learnable else endpoints
        # presumably a placeholder so the module always owns a parameter -- TODO confirm
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        lo = self.sigmas[0]
        hi = self.sigmas[1]
        log_ratio = hi.log() - lo.log()
        return lo ** (1 - t) * hi ** t * log_ratio

    def total_noise(self, t):
        lo = self.sigmas[0]
        hi = self.sigmas[1]
        return lo ** (1 - t) * hi ** t
54 | 
55 | 
class LogLinearNoise(Noise, nn.Module):
    """
    Log-linear schedule, built so that 1 - 1/e^(n(t)) interpolates between
    0 and ~1 as t goes from 0 to 1. Used for absorbing.

    Total noise is -log(1 - (1 - eps) * t), so the sigma will be (1 - eps) * t.
    """

    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps
        # presumably a placeholder so the module always owns a parameter -- TODO confirm
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        scale = 1 - self.eps
        return scale / (1 - scale * t)

    def total_noise(self, t):
        scale = 1 - self.eps
        return -torch.log1p(-scale * t)
73 | 
74 | 


--------------------------------------------------------------------------------
/circle-construction/diffusion/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh circle.10.9.0.10000 0.0 8


--------------------------------------------------------------------------------
/circle-construction/diffusion/run_sample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import torch.nn.functional as F
 8 | import sampling
 9 | 
10 | 
def main():
    """Sample a batch of sequences from a loaded model and print the decoded text."""
    parser = argparse.ArgumentParser(description="Generate some samples")
    cli_options = (
        ("--model_path", dict(default="louaaron/sedd-medium", type=str)),
        ("--dataset", dict(default="wikitext103", type=str)),
        ("--batch_size", dict(type=int, default=1)),
        ("--steps", dict(type=int, default=1024)),
        ("--add_vocab", dict(type=str, default=None)),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    device = torch.device('cuda')
    # NOTE(review): test.py in this directory calls load_model with four
    # arguments (root dir, checkpoint, added tokens, device); confirm that
    # this two-argument call matches the load_model signature.
    model, graph, noise = load_model(args.model_path, device)

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    if args.add_vocab:
        # optional JSON file holding extra tokens for the GPT-2 tokenizer
        with open(args.add_vocab, 'r') as vocab_file:
            extra_tokens = json.load(vocab_file)
        tokenizer.add_tokens(extra_tokens)

    sample_shape = (args.batch_size, 1024)
    sampling_fn = sampling.get_pc_sampler(
        graph, noise, sample_shape, 'analytic', args.steps, device=device
    )

    samples = sampling_fn(model)

    for decoded in tokenizer.batch_decode(samples):
        print(decoded)
        print("=================================================")
39 | 
40 | if __name__=="__main__":
41 |     main()


--------------------------------------------------------------------------------
/circle-construction/diffusion/run_sample_cond.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | 
 9 | 
def main():
    """Sample sequences whose first and last tokens are pinned to a prefix and suffix."""
    parser = argparse.ArgumentParser(description="Generate some samples")
    cli_options = (
        ("--model_path", dict(default="louaaron/sedd-medium", type=str)),
        ("--dataset", dict(default="wikitext103", type=str)),
        ("--batch_size", dict(type=int, default=1)),
        ("--steps", dict(type=int, default=1024)),
        ("--prefix", dict(type=str, default="Hi, my name is")),
        ("--suffix", dict(type=str, default=" and that's why I'm late.")),
        ("--add_vocab", dict(type=str, default=None)),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    if args.add_vocab:
        # optional JSON file holding extra tokens for the GPT-2 tokenizer
        with open(args.add_vocab, 'r') as vocab_file:
            extra_tokens = json.load(vocab_file)
        tokenizer.add_tokens(extra_tokens)

    prefix_ids = tokenizer(args.prefix).input_ids
    suffix_ids = tokenizer(args.suffix).input_ids
    input_ids = prefix_ids + suffix_ids
    # prefix occupies the first positions, suffix the last ones of the 1024-long sequence
    prefix_locs = list(range(len(prefix_ids)))
    suffix_locs = list(range(1024 - len(suffix_ids), 1024))
    input_locs = prefix_locs + suffix_locs

    # More generally, the constraints can be defined with something like:
    # input_ids = [0, 1, 512, 8080, 50256, 20000]
    # input_locs = [5, 6, 19, 20, 1000, 1001]

    input_ids = torch.tensor(input_ids, device="cuda")[None].repeat(args.batch_size, 1)

    def proj_fun(x):
        # overwrite the constrained positions in place with the fixed tokens
        x[:, input_locs] = input_ids
        return x

    device = torch.device('cuda')
    # NOTE(review): test.py in this directory calls load_model with four
    # arguments (root dir, checkpoint, added tokens, device); confirm that
    # this two-argument call matches the load_model signature.
    model, graph, noise = load_model(args.model_path, device)

    sample_shape = (args.batch_size, 1024)
    sampling_fn = sampling.get_pc_sampler(
        graph, noise, sample_shape, 'analytic', args.steps, device=device, proj_fun=proj_fun
    )

    # project once more so the returned samples carry the fixed tokens
    samples = proj_fun(sampling_fn(model))

    for decoded in tokenizer.batch_decode(samples):
        print(decoded)
        print("=================================================")
57 | 
58 | if __name__=="__main__":
59 |     main()


--------------------------------------------------------------------------------
/circle-construction/diffusion/run_train.sh:
--------------------------------------------------------------------------------
 1 | python train.py \
 2 |     noise.type=loglinear \
 3 |     graph.type=absorb \
 4 |     model=small \
 5 |     training.accum=1 \
 6 |     data.train=creativity_data/circle.10.9.0.10000/train.json \
 7 |     data.valid=creativity_data/circle.10.9.0.10000/valid.json \
 8 |     add_vocab=creativity_data/circle.10.9.0.10000/vocab.json \
 9 |     hydra.run.dir=/data/locus/project_data/project_data2/chenwu2/creativity_results/creativity_data/circle.10.9.0.10000/train.json/train
10 | 
11 | python test.py \
12 |     --model_checkpoint_dir creativity_results/creativity_data/circle.10.9.0.10000/train.json/train \
13 |     --dataset creativity_data/circle.10.9.0.10000 \
14 |     --add_vocab creativity_data/circle.10.9.0.10000/vocab.json


--------------------------------------------------------------------------------
/circle-construction/diffusion/test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | from tqdm import tqdm
 9 | 
10 | import os
11 | 
12 | 
def main():
    """
    Generate model outputs for every test example with every saved checkpoint.

    For each file under `<model_checkpoint_dir>/checkpoints`, the test inputs
    from `<dataset>/test.json` are completed by conditional sampling and the
    results are written to
    `<model_checkpoint_dir>/checkpoint_outputs/<checkpoint name>/all_items.json`.
    Checkpoints whose output directory already exists are skipped.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_checkpoint_dir", default="", type=str)
    parser.add_argument("--dataset", default=None, type=str)
    parser.add_argument("--steps", type=int, default=128)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # Load the dataset
    with open(os.path.join(args.dataset, "test.json"), "r") as f:
        test_data = json.load(f)

    # List all files under the model checkpoint directory; sorted so the
    # processing order does not depend on the filesystem's listing order.
    checkpoint_root = os.path.join(args.model_checkpoint_dir, "checkpoints")
    checkpoints = [os.path.join(checkpoint_root, f) for f in sorted(os.listdir(checkpoint_root))]
    print(checkpoints)

    device = torch.device('cuda')
    for checkpoint in checkpoints:
        # Decide whether to skip *before* loading, so an already-evaluated
        # checkpoint is not loaded onto the GPU only to be discarded.
        checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint))
        if os.path.exists(checkpoint_dir):
            print(f"Skipping {checkpoint_dir} because it already exists")
            continue

        model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device)
        os.makedirs(checkpoint_dir, exist_ok=True)

        def generate_output(input_text):
            """Sample one completion with the tokens of `input_text` pinned at the start."""
            prefix_ids = tokenizer(input_text).input_ids
            input_locs = list(range(len(prefix_ids)))
            input_ids = torch.tensor(prefix_ids, device=device)[None]

            def proj_fun(x):
                # overwrite the prompt positions in place with the prompt tokens
                x[:, input_locs] = input_ids
                return x

            # NOTE(review): the sequence length 29 is hard-coded; presumably it
            # must match `length` in the model config -- confirm.
            sampling_fn = sampling.get_pc_sampler(
                graph, noise, (1, 29), 'analytic', args.steps, device=device, proj_fun=proj_fun
            )

            samples = proj_fun(sampling_fn(model))

            text_samples = tokenizer.batch_decode(samples)
            assert len(text_samples) == 1
            # keep only the text before the first end-of-text marker
            return text_samples[0].split("<|endoftext|>")[0]

        all_items = []
        for sample in tqdm(test_data):
            output = generate_output(sample["input_text"])
            print(sample["input_text"])
            print(sample["target_text"])
            print(output)
            print()
            all_items.append({
                "input_text": sample["input_text"],
                "target_text": sample["target_text"],
                "type": sample["type"],
                "model_output": output,
            })

        # Save the results
        with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f:
            json.dump(all_items, f, indent=4)
88 | 
89 | 
90 | if __name__=="__main__":
91 |     main()


--------------------------------------------------------------------------------
/circle-construction/diffusion/train.py:
--------------------------------------------------------------------------------
 1 | """Training and evaluation"""
 2 | 
 3 | import hydra
 4 | import os
 5 | import numpy as np
 6 | import run_train
 7 | import utils
 8 | import torch.multiprocessing as mp
 9 | from hydra.core.hydra_config import HydraConfig
10 | from hydra.types import RunMode
11 | from omegaconf import OmegaConf, open_dict
12 | 
13 | 
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg):
    """
    Resolve the working directory, then spawn one training process per GPU.

    When `load_dir` is present in `cfg`, the config saved under that run is
    loaded in place of `cfg` and its `work_dir` is reused; otherwise the work
    directory comes from the active Hydra run (or sweep) directory.
    """
    ngpus = cfg.ngpus

    if "load_dir" in cfg:
        saved_hydra_path = os.path.join(cfg.load_dir, ".hydra/hydra.yaml")
        # the result is not used below; it is reassigned before the next read
        hydra_cfg = OmegaConf.load(saved_hydra_path).hydra

        cfg = utils.load_hydra_config_from_run(cfg.load_dir)
        work_dir = cfg.work_dir
    else:
        hydra_cfg = HydraConfig.get()
        if hydra_cfg.mode == RunMode.RUN:
            work_dir = hydra_cfg.run.dir
        else:
            work_dir = os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir)
    utils.makedirs(work_dir)

    with open_dict(cfg):
        cfg.ngpus = ngpus
        cfg.work_dir = work_dir
        cfg.wandb_name = os.path.basename(os.path.normpath(work_dir))

    # Run the training pipeline
    port = int(np.random.randint(10000, 20000))
    log_path = os.path.join(work_dir, "logs")
    logger = utils.get_logger(log_path)

    hydra_cfg = HydraConfig.get()
    if hydra_cfg.mode != RunMode.RUN:
        logger.info(f"Run id: {hydra_cfg.job.id}")

    try:
        mp.set_start_method("forkserver")
        mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
    except Exception as e:
        # failures are logged with their traceback and not re-raised
        logger.critical(e, exc_info=True)
49 | 
50 | if __name__ == "__main__":
51 |     main()


--------------------------------------------------------------------------------
/circle-construction/diffusion/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import os
 4 | import logging
 5 | from omegaconf import OmegaConf, open_dict
 6 | 
 7 | 
 8 | def load_hydra_config_from_run(load_dir):
 9 |     cfg_path = os.path.join(load_dir, ".hydra/config.yaml")
10 |     cfg = OmegaConf.load(cfg_path)
11 |     return cfg
12 | 
13 | 
def makedirs(dirname):
    """Create `dirname` (and any missing parents); do nothing if it already exists."""
    os.makedirs(name=dirname, exist_ok=True)
16 | 
17 | 
def get_logger(logpath, package_files=(), displaying=True, saving=True, debug=False):
    """
    Reconfigure and return the root logger.

    Any handlers already attached to the root logger are removed and closed,
    then replaced according to the flags below.

    Args:
        logpath: File that log records are appended to when `saving` is True.
        package_files: Paths of files whose name and full contents are written
            to the log once the handlers are set up.
        displaying: Attach a console (stream) handler.
        saving: Attach a file handler on `logpath`.
        debug: Use DEBUG level instead of INFO.

    Returns:
        The root `logging.Logger`.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    # Close replaced handlers as well as detaching them; otherwise every call
    # would leave the previous log file handle open.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    for f in package_files:
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger
47 | 
48 | 
def restore_checkpoint(ckpt_dir, state, device):
    """
    Load a checkpoint file into `state`, if the file exists.

    Args:
        ckpt_dir: Path of the checkpoint file (despite the name, a file path).
        state: Dict with 'optimizer', 'model', 'ema' and 'step' entries. The
            model entry is expected to expose the real module as `.module`.
        device: `map_location` passed to `torch.load`.

    Returns:
        `state` itself. It is updated in place when a checkpoint was found;
        otherwise it is returned untouched after the checkpoint's parent
        directory has been created and a warning logged.
    """
    if not os.path.exists(ckpt_dir):
        parent_dir = os.path.dirname(ckpt_dir)
        # A bare file name has an empty dirname, and os.makedirs("") raises.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        logging.warning(f"No checkpoint found at {ckpt_dir}. Returned the same state as input")
        return state

    loaded_state = torch.load(ckpt_dir, map_location=device)
    state['optimizer'].load_state_dict(loaded_state['optimizer'])
    # strict=False: missing or unexpected model keys are tolerated
    state['model'].module.load_state_dict(loaded_state['model'], strict=False)
    state['ema'].load_state_dict(loaded_state['ema'])
    state['step'] = loaded_state['step']
    return state
61 | 
62 | 
def save_checkpoint(ckpt_dir, state):
    """
    Serialize the training state to `ckpt_dir` with `torch.save`.

    The saved dict holds the state dicts of the optimizer, of the module
    wrapped by `state['model']` (accessed via `.module`) and of the EMA,
    plus the current step.
    """
    optimizer_state = state['optimizer'].state_dict()
    model_state = state['model'].module.state_dict()
    ema_state = state['ema'].state_dict()

    torch.save(
        dict(
            optimizer=optimizer_state,
            model=model_state,
            ema=ema_state,
            step=state['step'],
        ),
        ckpt_dir,
    )


--------------------------------------------------------------------------------
/circle-construction/ntp/eval.sh:
--------------------------------------------------------------------------------
1 | # $1: dataset
2 | # $2: weight_decay
3 | # $3: n_layers
4 | 
5 | EXP_DIR=../creativity
6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
7 | 
8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR
9 | 


--------------------------------------------------------------------------------
/circle-construction/ntp/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh circle.10.9.10.10000 0.0 12
2 | 


--------------------------------------------------------------------------------
/circle-construction/ntp/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh circle.10.9.10.10000 0.0 12 0


--------------------------------------------------------------------------------
/circle-construction/ntp/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 5000 \
25 |     --save_step_dense 1000 \
26 |     --max_steps 40000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --add_tokens \
33 |     --n_layer $N_LAYERS
34 | 


--------------------------------------------------------------------------------
/circle-construction/ntp/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/circle-construction/teacherless/eval.sh:
--------------------------------------------------------------------------------
1 | # $1: dataset
2 | # $2: weight_decay
3 | # $3: n_layers
4 | 
5 | EXP_DIR=../creativity
6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
7 | 
8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR
9 | 


--------------------------------------------------------------------------------
/circle-construction/teacherless/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh circle_hybrid.10.9.10.10000 0.0 12


--------------------------------------------------------------------------------
/circle-construction/teacherless/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh circle_hybrid.10.9.10.10000 0.0 12 0


--------------------------------------------------------------------------------
/circle-construction/teacherless/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 5000 \
25 |     --save_step_dense 1000 \
26 |     --max_steps 40000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --add_tokens \
33 |     --n_layer $N_LAYERS
34 | 


--------------------------------------------------------------------------------
/circle-construction/teacherless/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/docs/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/docs/teaser.png


--------------------------------------------------------------------------------
/line-construction/diffusion/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | build/
 3 | dist/
 4 | *.egg-info/
 5 | __pycache__
 6 | .ipynb_checkpoints/
 7 | .DS_Store
 8 | **.pyc
 9 | *.png
10 | *.txt
11 | **/outputs/
12 | **/wandb/
13 | **/exp/
14 | **/exp_local/
15 | data/
16 | eval/
17 | assets/
18 | **.pth
19 | **.npz
20 | core
21 | **.log
22 | *.jsonl


--------------------------------------------------------------------------------
/line-construction/diffusion/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Aaron Lou
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/line-construction/diffusion/catsample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
 6 |     logits = categorical_probs.clamp(min=1e-9).log()
 7 |     return F.gumbel_softmax(logits, hard=hard)
 8 | 
 9 | 
def sample_categorical(categorical_probs, method="hard"):
    """
    Draw one category index per row of `categorical_probs`.

    Only `method="hard"` is supported: the probabilities are divided by
    noise built from uniform samples and the argmax over the last dimension
    is returned.

    Raises:
        ValueError: For any other `method`.
    """
    if method != "hard":
        raise ValueError(f"Method {method} for sampling categorical variables is not valid.")

    uniform = torch.rand_like(categorical_probs)
    # the 1e-10 terms keep the log finite and the divisor strictly positive
    gumbel_norm = 1e-10 - (uniform + 1e-10).log()
    perturbed = categorical_probs / gumbel_norm
    return perturbed.argmax(dim=-1)
16 |     


--------------------------------------------------------------------------------
/line-construction/diffusion/configs/config.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - _self_
 3 |   - model: small
 4 |   - override hydra/launcher: submitit_slurm
 5 | 
 6 | ngpus: 1
 7 | tokens: 50257
 8 | add_vocab: ""
 9 | 
10 | training:
11 |   batch_size: 64  # 512
12 |   accum: 1
13 |   n_iters: 400000
14 |   snapshot_freq: 10000
15 |   log_freq: 50
16 |   eval_freq: 500
17 |   snapshot_freq_for_preemption: 10000
18 |   weight: standard
19 |   snapshot_sampling: True
20 |   ema: 0.9999
21 | 
22 | data:
23 |   train: openwebtext
24 |   valid: wikitext103
25 |   cache_dir: data
26 | 
27 | graph:
28 |   type: absorb
29 |   file: data
30 |   report_all: False
31 | 
32 | noise:
33 |   type: loglinear
34 |   sigma_min: 1e-4
35 |   sigma_max: 20
36 | 
37 | sampling:
38 |   predictor: euler
39 |   steps: 128
40 |   noise_removal: True
41 | 
42 | eval:
43 |   batch_size: 32
44 |   perplexity: False
45 |   perplexity_batch_size: 32
46 | 
47 | optim:
48 |   weight_decay: 0
49 |   optimizer: AdamW
50 |   lr: 1e-4
51 |   beta1: 0.9
52 |   beta2: 0.999
53 |   eps: 1e-8
54 |   warmup: 2500
55 |   grad_clip: 1.
56 | 
57 | 
58 | hydra:
59 |   run:
60 |     dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}


--------------------------------------------------------------------------------
/line-construction/diffusion/configs/model/medium.yaml:
--------------------------------------------------------------------------------
1 | name: medium
2 | type: ddit
3 | hidden_size: 1024
4 | cond_dim: 128
5 | length: 29
6 | n_blocks: 24
7 | n_heads: 16
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/line-construction/diffusion/configs/model/small.yaml:
--------------------------------------------------------------------------------
 1 | name: small
 2 | type: ddit
 3 | hidden_size: 768
 4 | cond_dim: 128
 5 | length: 29  # set to 29 for no-hash-string; 45 for hash-string
 6 | n_blocks: 12
 7 | n_heads: 12
 8 | scale_by_sigma: True
 9 | dropout: 0.1
10 | 


--------------------------------------------------------------------------------
/line-construction/diffusion/eval.sh:
--------------------------------------------------------------------------------
 1 | # $1: dataset
 2 | # $2: weight_decay
 3 | # $3: n_layers
 4 | 
 5 | EXP_DIR=creativity_results/creativity_data/line.10.9.0.10000/train.json/train/checkpoint_outputs
 6 | 
 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
 8 | 
 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR
10 | 


--------------------------------------------------------------------------------
/line-construction/diffusion/load_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from model import SEDD
 4 | import utils
 5 | from model.ema import ExponentialMovingAverage
 6 | import graph_lib
 7 | import noise_lib
 8 | 
 9 | from omegaconf import OmegaConf
10 | 
def load_model_hf(dir, device):
    """
    Load a pretrained SEDD model via `SEDD.from_pretrained`.

    The graph and noise schedule are built from the loaded model's own
    config.

    Returns:
        A `(score_model, graph, noise)` tuple, with the model and the noise
        moved to `device`.
    """
    score_model = SEDD.from_pretrained(dir).to(device)
    model_cfg = score_model.config
    graph = graph_lib.get_graph(model_cfg, device)
    noise = noise_lib.get_noise(model_cfg).to(device)
    return score_model, graph, noise
16 | 
17 | 
def load_model_local(root_dir, ckpt_dir, added_tokens, device):
    """
    Rebuild a model from a local training run and load one checkpoint into it.

    Args:
        root_dir: Run directory whose saved Hydra config describes the model.
        ckpt_dir: Path of the checkpoint file passed to `torch.load`.
        added_tokens: Extra vocabulary entries; their count is added to
            `cfg.tokens` before the graph and model are built.
        device: Device for the model, noise and loaded tensors.

    Returns:
        A `(score_model, graph, noise)` tuple. The model's parameters are
        overwritten with the EMA weights from the checkpoint.
    """
    cfg = utils.load_hydra_config_from_run(root_dir)
    if added_tokens:
        cfg.tokens += len(added_tokens)

    graph = graph_lib.get_graph(cfg, device)
    noise = noise_lib.get_noise(cfg).to(device)
    score_model = SEDD(cfg).to(device)
    ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema)

    checkpoint = torch.load(ckpt_dir, map_location=device)
    score_model.load_state_dict(checkpoint['model'])
    ema.load_state_dict(checkpoint['ema'])

    # stash the raw weights inside the EMA object, then swap in the averaged ones
    ema.store(score_model.parameters())
    ema.copy_to(score_model.parameters())
    return score_model, graph, noise
36 | 
37 | 
def load_model(root_dir, ckpt_dir, added_tokens, device):
    """
    Load a model, trying the pretrained route first and a local run second.

    `load_model_hf(root_dir, device)` is attempted first; if it raises any
    ordinary exception, `load_model_local` is used with all four arguments.

    Returns:
        A `(score_model, graph, noise)` tuple.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit propagate instead of triggering the fallback.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)


--------------------------------------------------------------------------------
/line-construction/diffusion/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformer import SEDD


--------------------------------------------------------------------------------
/line-construction/diffusion/model/ema.py:
--------------------------------------------------------------------------------
  1 | # Modified from https://raw.githubusercontent.com/fadel/pytorch_ema/master/torch_ema/ema.py
  2 | 
  3 | from __future__ import division
  4 | from __future__ import unicode_literals
  5 | 
  6 | import torch
  7 | 
  8 | 
  9 | # Partially based on: https://github.com/tensorflow/tensorflow/blob/r1.13/tensorflow/python/training/moving_averages.py
 10 | class ExponentialMovingAverage:
 11 |     """
 12 |     Maintains (exponential) moving average of a set of parameters.
 13 |     """
 14 | 
 15 |     def __init__(self, parameters, decay, use_num_updates=True):
 16 |         """
 17 |         Args:
 18 |             parameters: Iterable of `torch.nn.Parameter`; usually the result of
 19 |                 `model.parameters()`.
 20 |             decay: The exponential decay.
 21 |             use_num_updates: Whether to use number of updates when computing
 22 |                 averages.
 23 |         """
 24 |         if decay < 0.0 or decay > 1.0:
 25 |             raise ValueError('Decay must be between 0 and 1')
 26 |         self.decay = decay
 27 |         self.num_updates = 0 if use_num_updates else None
 28 |         self.shadow_params = [p.clone().detach()
 29 |                               for p in parameters if p.requires_grad]
 30 |         self.collected_params = []
 31 | 
 32 |     def update(self, parameters):
 33 |         """
 34 |         Update currently maintained parameters.
 35 | 
 36 |         Call this every time the parameters are updated, such as the result of
 37 |         the `optimizer.step()` call.
 38 | 
 39 |         Args:
 40 |             parameters: Iterable of `torch.nn.Parameter`; usually the same set of
 41 |                 parameters used to initialize this object.
 42 |         """
 43 |         decay = self.decay
 44 |         if self.num_updates is not None:
 45 |             self.num_updates += 1
 46 |             decay = min(decay, (1 + self.num_updates) /
 47 |                         (10 + self.num_updates))
 48 |         one_minus_decay = 1.0 - decay
 49 |         with torch.no_grad():
 50 |             parameters = [p for p in parameters if p.requires_grad]
 51 |             for s_param, param in zip(self.shadow_params, parameters):
 52 |                 s_param.sub_(one_minus_decay * (s_param - param))
 53 |                 
 54 | 
 55 |     def copy_to(self, parameters):
 56 |         """
 57 |         Copy current parameters into given collection of parameters.
 58 | 
 59 |         Args:
 60 |             parameters: Iterable of `torch.nn.Parameter`; the parameters to be
 61 |                 updated with the stored moving averages.
 62 |         """
 63 |         parameters = [p for p in parameters if p.requires_grad]
 64 |         for s_param, param in zip(self.shadow_params, parameters):
 65 |             if param.requires_grad:
 66 |                 param.data.copy_(s_param.data)
 67 | 
 68 |     def store(self, parameters):
 69 |         """
 70 |         Save the current parameters for restoring later.
 71 | 
 72 |         Args:
 73 |             parameters: Iterable of `torch.nn.Parameter`; the parameters to be
 74 |                 temporarily stored.
 75 |         """
 76 |         self.collected_params = [param.clone() for param in parameters]
 77 | 
 78 |     def restore(self, parameters):
 79 |         """
 80 |         Restore the parameters stored with the `store` method.
 81 |         Useful to validate the model with EMA parameters without affecting the
 82 |         original optimization process. Store the parameters before the
 83 |         `copy_to` method. After validation (or model saving), use this to
 84 |         restore the former parameters.
 85 | 
 86 |         Args:
 87 |             parameters: Iterable of `torch.nn.Parameter`; the parameters to be
 88 |                 updated with the stored parameters.
 89 |         """
 90 |         for c_param, param in zip(self.collected_params, parameters):
 91 |             param.data.copy_(c_param.data)
 92 | 
 93 |     def state_dict(self):
 94 |         return dict(decay=self.decay, num_updates=self.num_updates,
 95 |                     shadow_params=self.shadow_params)
 96 | 
 97 |     def load_state_dict(self, state_dict):
 98 |         self.decay = state_dict['decay']
 99 |         self.num_updates = state_dict['num_updates']
100 |         self.shadow_params = state_dict['shadow_params']


--------------------------------------------------------------------------------
/line-construction/diffusion/model/fused_add_dropout_scale.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from typing import Optional
 4 | from torch import Tensor
 5 | 
# Flags required to enable jit fusion kernels.
# These run once at import time and go through private ``torch._C`` hooks, so
# they may change or disappear between PyTorch releases.
# NOTE(review): presumably the first two switch off the profiling executor and
# the last two allow fusion on CPU and GPU -- confirm against the PyTorch
# version in use.
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
11 | 
12 | 
def bias_dropout_add_scale(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool
) -> Tensor:
    """Compute ``residual + scale * dropout(x + bias)``.

    ``bias`` and ``residual`` are optional; when either is ``None`` the
    corresponding addition is skipped.

    Args:
        x: Input tensor.
        bias: Optional tensor added to ``x`` before dropout.
        scale: Multiplier applied to the dropout output.
        residual: Optional tensor added to the scaled result.
        prob: Dropout probability.
        training: Forwarded to ``F.dropout``; dropout is inactive when false.

    Returns:
        The combined tensor.
    """
    pre_dropout = x
    if bias is not None:
        pre_dropout = x + bias
    out = scale * F.dropout(pre_dropout, p=prob, training=training)

    if residual is not None:
        out = residual + out
    return out
24 | 
25 | 
def get_bias_dropout_add_scale(training):
    """Return a ``bias_dropout_add_scale`` wrapper with ``training`` fixed.

    Args:
        training: Value forwarded as the ``training`` argument on every call.

    Returns:
        A callable ``(x, bias, scale, residual, prob) -> Tensor``.
    """
    def _bias_dropout_add(x, bias, scale, residual, prob):
        # `training` is captured from the enclosing call.
        return bias_dropout_add_scale(
            x, bias, scale, residual, prob, training
        )

    return _bias_dropout_add
31 | 
32 | 
def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """Apply an affine modulation: ``x * (1 + scale) + shift``."""
    gain = 1 + scale
    return x * gain + shift
35 | 
36 | 
@torch.jit.script
def bias_dropout_add_scale_fused_train(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """TorchScript-compiled ``bias_dropout_add_scale`` with dropout enabled."""
    return bias_dropout_add_scale(
        x, bias, scale, residual, prob, True
    )
42 | 
43 | 
@torch.jit.script
def bias_dropout_add_scale_fused_inference(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """TorchScript-compiled ``bias_dropout_add_scale`` with dropout disabled."""
    return bias_dropout_add_scale(
        x, bias, scale, residual, prob, False
    )
49 | 
@torch.jit.script
def modulate_fused(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """TorchScript-compiled wrapper around ``modulate``."""
    out = modulate(x, shift, scale)
    return out


--------------------------------------------------------------------------------
/line-construction/diffusion/model/rotary.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn
 3 | 
 4 | 
 5 | class Rotary(torch.nn.Module):
 6 |     def __init__(self, dim, base=10_000):
 7 |         super().__init__()
 8 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
 9 |         self.register_buffer("inv_freq", inv_freq)
10 |         self.seq_len_cached = None
11 |         self.cos_cached = None
12 |         self.sin_cached = None
13 | 
14 |     def forward(self, x, seq_dim=1):
15 |         seq_len = x.shape[seq_dim]
16 |         if seq_len != self.seq_len_cached:
17 |             self.seq_len_cached = seq_len
18 |             t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
19 |             freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
20 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
21 |             # dims are: batch, seq_len, qkv, head, dim
22 |             self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
23 |             self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
24 |             # This makes the transformation on v an identity.
25 |             self.cos_cached[:,:,2,:,:].fill_(1.)
26 |             self.sin_cached[:,:,2,:,:].fill_(0.)
27 | 
28 |         return self.cos_cached, self.sin_cached
29 | 
30 | 
def rotate_half(x):
    """Split the last axis at its midpoint and return ``cat(-second, first)``."""
    half = x.shape[-1] // 2
    first = x[..., :half]
    second = x[..., half:]
    return torch.cat((-second, first), dim=-1)
36 | 
37 | 
@torch.jit.script
def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
    """Pure-PyTorch rotary embedding: ``qkv * cos + rotate_half(qkv) * sin``."""
    rotated = rotate_half(qkv)
    return (qkv * cos) + (rotated * sin)
41 | 
42 | 
def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary position embeddings to a packed qkv tensor.

    Uses the flash-attn kernel when it can be imported and runs successfully;
    otherwise falls back to the TorchScript implementation in this module.

    Args:
        qkv: Packed query/key/value tensor.
        cos: Cosine table as produced by ``Rotary.forward``.
        sin: Sine table as produced by ``Rotary.forward``.

    Returns:
        The rotated qkv tensor.
    """
    try:
        import flash_attn.layers.rotary
        # The flash-attn call gets 2-D slices (first half of the last axis).
        # They are bound to NEW names: previously `cos`/`sin` were
        # overwritten here, so if the kernel call raised, the fallback below
        # received the sliced tables instead of the full 5-D ones.
        flash_cos = cos[0, :, 0, 0, :cos.shape[-1] // 2]
        flash_sin = sin[0, :, 0, 0, :sin.shape[-1] // 2]
        return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
            qkv, flash_cos, flash_sin
        )
    except Exception:
        # Best-effort fallback (missing package, unsupported dtype/device...).
        # NOTE(review): the trailing underscore suggests the flash-attn call
        # is in-place; if it can fail after partially modifying `qkv`, the
        # fallback would start from altered data -- confirm.
        return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)


--------------------------------------------------------------------------------
/line-construction/diffusion/model/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def get_model_fn(model, train=False):
 6 |     """Create a function to give the output of the score-based model.
 7 | 
 8 |     Args:
 9 |         model: The score model.
10 |         train: `True` for training and `False` for evaluation.
11 |         mlm: If the input model is a mlm and models the base probability 
12 | 
13 |     Returns:
14 |         A model function.
15 |     """
16 | 
17 |     def model_fn(x, sigma):
18 |         """Compute the output of the score-based model.
19 | 
20 |         Args:
21 |             x: A mini-batch of input data.
22 |             labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
23 |               for different models.
24 | 
25 |         Returns:
26 |             A tuple of (model output, new mutable states)
27 |         """
28 |         if train:
29 |             model.train()
30 |         else:
31 |             model.eval()
32 |         
33 |             # otherwise output the raw values (we handle mlm training in losses.py)
34 |         return model(x, sigma)
35 | 
36 |     return model_fn
37 | 
38 | 
def get_score_fn(model, train=False, sampling=False):
    """Build ``score_fn(x, sigma)`` on top of ``get_model_fn``.

    Args:
        model: The score model.
        train: Forwarded to ``get_model_fn`` to select train/eval mode.
        sampling: When true, the returned function exponentiates the model
            output; must not be combined with ``train=True``.

    Returns:
        A function ``score_fn(x, sigma)``; ``sigma`` is flattened with
        ``reshape(-1)`` before being handed to the model.

    Raises:
        AssertionError: If both ``sampling`` and ``train`` are true.
    """
    assert not (sampling and train), "Must sample in eval mode"
    model_fn = get_model_fn(model, train=train)

    # NOTE(review): this autocast context is entered and left while
    # `score_fn` is being *defined*; it is not active when `score_fn` is
    # later called. Kept as-is to preserve behaviour -- confirm whether
    # callers are expected to provide their own autocast scope.
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        def score_fn(x, sigma):
            log_score = model_fn(x, sigma.reshape(-1))
            # when sampling return true score (not log used for training)
            return log_score.exp() if sampling else log_score

    return score_fn
56 | 


--------------------------------------------------------------------------------
/line-construction/diffusion/noise_lib.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import torch
 3 | import torch.nn as nn
 4 | import numpy as np
 5 | 
 6 | 
 7 | def get_noise(config):
 8 |     if config.noise.type == "geometric":
 9 |         return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)
10 |     elif config.noise.type == "loglinear":
11 |         return LogLinearNoise()
12 |     else:
13 |         raise ValueError(f"{config.noise.type} is not a valid noise")
14 | 
15 | 
class Noise(abc.ABC, nn.Module):
    """
    Abstract noise schedule.

    ``forward(t)`` returns the pair ``(total_noise(t), rate_noise(t))``.
    Subclasses implement both methods. Time is assumed to go from 0 to 1.
    """

    def forward(self, t):
        """Return ``(total_noise(t), rate_noise(t))`` for timestep(s) ``t``."""
        return self.total_noise(t), self.rate_noise(t)

    @abc.abstractmethod
    def rate_noise(self, t):
        """
        Rate of change of noise ie g(t)
        """
        pass

    @abc.abstractmethod
    def total_noise(self, t):
        # Raw docstring: "\i" is not a valid escape sequence in a normal
        # string literal and triggers a warning on recent Python versions.
        r"""
        Total noise ie \int_0^t g(t) dt + g(0)
        """
        pass
39 | 
40 | 
class GeometricNoise(Noise, nn.Module):
    """Geometric schedule: ``sigma(t) = sigma_min^(1 - t) * sigma_max^t``."""

    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        """
        Args:
            sigma_min: Value of the total noise at ``t = 0``.
            sigma_max: Value of the total noise at ``t = 1``.
            learnable: When true, the two bounds are stored as an
                ``nn.Parameter``; otherwise as a plain tensor attribute.
        """
        super().__init__()
        bounds = 1.0 * torch.tensor([sigma_min, sigma_max])
        self.sigmas = nn.Parameter(bounds) if learnable else bounds
        # Scalar parameter that is not used by the computations below.
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Time derivative of ``total_noise``: ``sigma(t) * (log max - log min)``."""
        lo, hi = self.sigmas[0], self.sigmas[1]
        return lo ** (1 - t) * hi ** t * (hi.log() - lo.log())

    def total_noise(self, t):
        """Geometric interpolation between the two bounds at time ``t``."""
        lo, hi = self.sigmas[0], self.sigmas[1]
        return lo ** (1 - t) * hi ** t
54 | 
55 | 
class LogLinearNoise(Noise, nn.Module):
    """
    Log-linear schedule, built so that 1 - 1/e^(n(t)) interpolates between 0
    and ~1 as t goes from 0 to 1. Used for absorbing.

    Total noise is -log(1 - (1 - eps) * t), so the sigma will be (1 - eps) * t
    """

    def __init__(self, eps=1e-3):
        """
        Args:
            eps: Small offset; the schedule uses the factor ``1 - eps``.
        """
        super().__init__()
        self.eps = eps
        # Scalar parameter that is not used by the computations below.
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Return ``(1 - eps) / (1 - (1 - eps) * t)``."""
        keep = 1 - self.eps
        return keep / (1 - keep * t)

    def total_noise(self, t):
        """Return ``-log1p(-(1 - eps) * t)``."""
        keep = 1 - self.eps
        return -torch.log1p(-keep * t)
73 | 
74 | 


--------------------------------------------------------------------------------
/line-construction/diffusion/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh line.10.9.0.10000 0.0 8


--------------------------------------------------------------------------------
/line-construction/diffusion/run_sample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import torch.nn.functional as F
 8 | import sampling
 9 | 
10 | 
def main():
    """Draw unconditional samples from a model and print the decoded text.

    Command-line options:
        --model_path: Forwarded to ``load_model`` as ``root_dir``.
        --ckpt_path: Optional checkpoint, forwarded as ``ckpt_dir`` (only
            relevant when ``load_model`` falls back to the local loader).
        --dataset: Parsed but not used by this script.
        --batch_size: Number of sequences to sample.
        --steps: Number of sampler steps.
        --add_vocab: Optional JSON file with extra tokens for the tokenizer.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--add_vocab", type=str, default=None)
    parser.add_argument("--ckpt_path", type=str, default=None)
    args = parser.parse_args()

    # The extra vocabulary must be known before the model is loaded, because
    # load_model takes it as an argument.
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    device = torch.device('cuda')
    # load_model's signature is (root_dir, ckpt_dir, added_tokens, device);
    # the previous two-argument call raised TypeError.
    model, graph, noise = load_model(args.model_path, args.ckpt_path, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device
    )

    samples = sampling_fn(model)

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/line-construction/diffusion/run_sample_cond.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | 
 9 | 
def main():
    """Sample text conditioned on a fixed prefix and suffix, then print it.

    The prefix tokens are pinned to the first positions and the suffix tokens
    to the last positions of a 1024-token sequence; the sampler fills in the
    rest.

    Command-line options:
        --model_path: Forwarded to ``load_model`` as ``root_dir``.
        --ckpt_path: Optional checkpoint, forwarded as ``ckpt_dir`` (only
            relevant when ``load_model`` falls back to the local loader).
        --dataset: Parsed but not used by this script.
        --batch_size: Number of sequences to sample.
        --steps: Number of sampler steps.
        --prefix / --suffix: Conditioning text.
        --add_vocab: Optional JSON file with extra tokens for the tokenizer.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--prefix", type=str, default="Hi, my name is")
    parser.add_argument("--suffix", type=str, default=" and that's why I'm late.")
    parser.add_argument("--add_vocab", type=str, default=None)
    parser.add_argument("--ckpt_path", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    prefix_ids = tokenizer(args.prefix).input_ids
    suffix_ids = tokenizer(args.suffix).input_ids
    input_ids = prefix_ids + suffix_ids
    input_locs = list(range(len(prefix_ids))) + list(range(1024-len(suffix_ids), 1024))

    # more generally, constraints can be defined with something like below:
    # input_ids = [0, 1, 512, 8080, 50256, 20000]
    # input_locs = [5, 6, 19, 20, 1000, 10001]

    input_ids = torch.tensor(input_ids, device="cuda")[None].repeat(args.batch_size, 1)

    def proj_fun(x):
        # Overwrite the conditioned positions with the fixed token ids.
        x[:, input_locs] = input_ids
        return x

    device = torch.device('cuda')
    # load_model's signature is (root_dir, ckpt_dir, added_tokens, device);
    # the previous two-argument call raised TypeError.
    model, graph, noise = load_model(args.model_path, args.ckpt_path, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device, proj_fun=proj_fun
    )

    samples = proj_fun(sampling_fn(model))

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/line-construction/diffusion/run_train.sh:
--------------------------------------------------------------------------------
 1 | python train.py \
 2 |     noise.type=loglinear \
 3 |     graph.type=absorb \
 4 |     model=small \
 5 |     training.accum=1 \
 6 |     data.train=creativity_data/line.10.9.0.10000/train.json \
 7 |     data.valid=creativity_data/line.10.9.0.10000/valid.json \
 8 |     add_vocab=creativity_data/line.10.9.0.10000/vocab.json \
 9 |     hydra.run.dir=/data/locus/project_data/project_data2/chenwu2/creativity_results/creativity_data/line.10.9.0.10000/train.json/train
10 | 
11 | python test.py \
12 |     --model_checkpoint_dir creativity_results/creativity_data/line.10.9.0.10000/train.json/train \
13 |     --dataset creativity_data/line.10.9.0.10000 \
14 |     --add_vocab creativity_data/line.10.9.0.10000/vocab.json


--------------------------------------------------------------------------------
/line-construction/diffusion/test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | from tqdm import tqdm
 9 | 
10 | import os
11 | 
12 | 
def main():
    """Generate model outputs for the test set with every saved checkpoint.

    For each file in ``<model_checkpoint_dir>/checkpoints`` the model is
    loaded, every ``input_text`` of ``<dataset>/test.json`` is used as a fixed
    prefix for conditional sampling, and the results are written to
    ``<model_checkpoint_dir>/checkpoint_outputs/<checkpoint name>/all_items.json``.
    Checkpoints whose output directory already exists are skipped.

    Command-line options:
        --model_checkpoint_dir: Run directory containing ``checkpoints/``.
        --dataset: Directory containing ``test.json``.
        --steps: Number of sampler steps.
        --add_vocab: Optional JSON file with extra tokens for the tokenizer.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_checkpoint_dir", default="", type=str)
    parser.add_argument("--dataset", default=None, type=str)
    parser.add_argument("--steps", type=int, default=128)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # Load the dataset
    with open(os.path.join(args.dataset, "test.json"), "r") as f:
        test_data = json.load(f)

    # List all files under the model checkpoint directory. Sorted so that the
    # processing order is reproducible (os.listdir order is arbitrary).
    ckpt_root = os.path.join(args.model_checkpoint_dir, "checkpoints")
    checkpoints = [os.path.join(ckpt_root, name) for name in sorted(os.listdir(ckpt_root))]
    print(checkpoints)

    device = torch.device('cuda')
    # Length of every sampled sequence (prefix included). Hard-coded; a
    # tokenized prefix longer than this makes the projection below fail.
    sample_len = 29

    for checkpoint in checkpoints:
        # Decide whether to skip BEFORE loading, so finished checkpoints do
        # not pay the cost of a model load.
        checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint))
        if os.path.exists(checkpoint_dir):
            print(f"Skipping {checkpoint_dir} because it already exists")
            continue

        model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device)
        os.makedirs(checkpoint_dir, exist_ok=True)

        def generate_output(input_text):
            """Sample one sequence whose first tokens are pinned to ``input_text``."""
            prefix_ids = tokenizer(input_text).input_ids
            input_locs = list(range(len(prefix_ids)))
            input_ids = torch.tensor(prefix_ids, device=device)[None]

            def proj_fun(x):
                # Overwrite the prefix positions with the fixed token ids.
                x[:, input_locs] = input_ids
                return x

            sampling_fn = sampling.get_pc_sampler(
                graph, noise, (1, sample_len), 'analytic', args.steps, device=device, proj_fun=proj_fun
            )

            samples = proj_fun(sampling_fn(model))

            text_samples = tokenizer.batch_decode(samples)
            assert len(text_samples) == 1
            # Keep only the text before the first end-of-text marker.
            return text_samples[0].split("<|endoftext|>")[0]

        all_items = []
        for sample in tqdm(test_data):
            output = generate_output(sample["input_text"])
            print(sample["input_text"])
            print(sample["target_text"])
            print(output)
            print()
            all_items.append({
                "input_text": sample["input_text"],
                "target_text": sample["target_text"],
                "type": sample["type"],
                "model_output": output,
            })

        # Save the results
        with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f:
            json.dump(all_items, f, indent=4)


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/line-construction/diffusion/train.py:
--------------------------------------------------------------------------------
 1 | """Training and evaluation"""
 2 | 
 3 | import hydra
 4 | import os
 5 | import numpy as np
 6 | import run_train
 7 | import utils
 8 | import torch.multiprocessing as mp
 9 | from hydra.core.hydra_config import HydraConfig
10 | from hydra.types import RunMode
11 | from omegaconf import OmegaConf, open_dict
12 | 
13 | 
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg):
    """Resolve the working directory, then spawn one training process per GPU.

    When ``load_dir`` is present in ``cfg`` the configuration saved in that
    run is loaded and its ``work_dir`` reused; otherwise the Hydra run (or
    sweep) directory becomes the working directory. Exceptions raised while
    spawning are logged at CRITICAL level and not re-raised.
    """
    ngpus = cfg.ngpus
    resuming = "load_dir" in cfg
    if resuming:
        # NOTE(review): the value loaded here is never read afterwards
        # (hydra_cfg is reassigned further down); the load is kept so that a
        # missing .hydra/hydra.yaml still fails as before.
        hydra_cfg = OmegaConf.load(os.path.join(cfg.load_dir, ".hydra/hydra.yaml")).hydra
        cfg = utils.load_hydra_config_from_run(cfg.load_dir)
        work_dir = cfg.work_dir
    else:
        hydra_cfg = HydraConfig.get()
        if hydra_cfg.mode == RunMode.RUN:
            work_dir = hydra_cfg.run.dir
        else:
            work_dir = os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir)
    utils.makedirs(work_dir)

    with open_dict(cfg):
        cfg.ngpus = ngpus
        cfg.work_dir = work_dir
        cfg.wandb_name = os.path.basename(os.path.normpath(work_dir))

    # Run the training pipeline
    port = int(np.random.randint(10000, 20000))
    logger = utils.get_logger(os.path.join(work_dir, "logs"))

    hydra_cfg = HydraConfig.get()
    in_single_run = hydra_cfg.mode == RunMode.RUN
    if not in_single_run:
        logger.info(f"Run id: {hydra_cfg.job.id}")

    try:
        mp.set_start_method("forkserver")
        mp.spawn(
            run_train.run_multiprocess,
            args=(ngpus, cfg, port),
            nprocs=ngpus,
            join=True,
        )
    except Exception as e:
        logger.critical(e, exc_info=True)


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/line-construction/diffusion/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import os
 4 | import logging
 5 | from omegaconf import OmegaConf, open_dict
 6 | 
 7 | 
 8 | def load_hydra_config_from_run(load_dir):
 9 |     cfg_path = os.path.join(load_dir, ".hydra/config.yaml")
10 |     cfg = OmegaConf.load(cfg_path)
11 |     return cfg
12 | 
13 | 
def makedirs(dirname):
    """Create ``dirname`` (and missing parents); do nothing if it exists."""
    os.makedirs(name=dirname, exist_ok=True)
16 | 
17 | 
def get_logger(logpath, package_files=(), displaying=True, saving=True, debug=False):
    """Reconfigure the root logger and return it.

    Every handler already attached to the root logger is removed (and closed)
    before the new ones are installed.

    Args:
        logpath: File that receives log records when ``saving`` is true
            (opened in append mode).
        package_files: Paths whose names and contents are written to the log
            once the handlers are set up.
        displaying: Attach a console (stream) handler.
        saving: Attach a file handler writing to ``logpath``.
        debug: Use ``logging.DEBUG`` instead of ``logging.INFO``.

    Returns:
        The root ``logging.Logger``.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    # Detach AND close the old handlers; clearing the list alone would leave
    # previously opened log files open.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    for f in package_files:
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger
47 | 
48 | 
def restore_checkpoint(ckpt_dir, state, device):
    """Load a checkpoint file into ``state`` if it exists.

    Args:
        ckpt_dir: Path of the checkpoint file (despite the name, it is passed
            directly to ``torch.load``).
        state: Dict with ``'optimizer'``, ``'model'`` (exposing ``.module``),
            ``'ema'`` and ``'step'`` entries; updated in place.
        device: ``map_location`` for ``torch.load``.

    Returns:
        ``state``. When no file exists at ``ckpt_dir`` the parent directory is
        created, a warning is logged and ``state`` is returned unchanged.
    """
    if os.path.exists(ckpt_dir):
        loaded_state = torch.load(ckpt_dir, map_location=device)
        state['optimizer'].load_state_dict(loaded_state['optimizer'])
        # strict=False: missing or unexpected keys in the model weights are
        # tolerated.
        state['model'].module.load_state_dict(loaded_state['model'], strict=False)
        state['ema'].load_state_dict(loaded_state['ema'])
        state['step'] = loaded_state['step']
        return state

    makedirs(os.path.dirname(ckpt_dir))
    logging.warning(f"No checkpoint found at {ckpt_dir}. Returned the same state as input")
    return state
61 | 
62 | 
def save_checkpoint(ckpt_dir, state):
    """Serialize optimizer, model, EMA and step counter with ``torch.save``.

    Args:
        ckpt_dir: Destination file path.
        state: Dict with ``'optimizer'``, ``'model'`` (exposing ``.module``),
            ``'ema'`` and ``'step'`` entries.
    """
    saved_state = {}
    saved_state['optimizer'] = state['optimizer'].state_dict()
    # The weights are taken from the wrapped module, not from the wrapper.
    saved_state['model'] = state['model'].module.state_dict()
    saved_state['ema'] = state['ema'].state_dict()
    saved_state['step'] = state['step']
    torch.save(saved_state, ckpt_dir)


--------------------------------------------------------------------------------
/line-construction/ntp/eval.sh:
--------------------------------------------------------------------------------
# $1: dataset
# $2: weight_decay
# $3: n_layers
#
# Evaluates the experiment stored in $EXP_DIR/<dataset>_<weight_decay>_<n_layers>.

EXP_DIR=../creativity
DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data

# Expansions are quoted so values containing spaces or glob characters are
# passed through as single arguments.
python eval_qa.py --dir "$EXP_DIR/${1}_${2}_${3}" --dataset "$1" --data_dir "$DATA_DIR"
9 | 


--------------------------------------------------------------------------------
/line-construction/ntp/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh line.10.9.10.10000 0.0 12
2 | 


--------------------------------------------------------------------------------
/line-construction/ntp/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh line.10.9.10.10000 0.0 12 0
2 | 


--------------------------------------------------------------------------------
/line-construction/ntp/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 5000 \
25 |     --save_step_dense 1000 \
26 |     --max_steps 40000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --add_tokens \
33 |     --n_layer $N_LAYERS
34 | 


--------------------------------------------------------------------------------
/line-construction/ntp/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/line-construction/teacherless/eval.sh:
--------------------------------------------------------------------------------
# $1: dataset
# $2: weight_decay
# $3: n_layers
#
# Evaluates the experiment stored in $EXP_DIR/<dataset>_<weight_decay>_<n_layers>.

EXP_DIR=../creativity
DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data

# Expansions are quoted so values containing spaces or glob characters are
# passed through as single arguments.
python eval_qa.py --dir "$EXP_DIR/${1}_${2}_${3}" --dataset "$1" --data_dir "$DATA_DIR"
9 | 


--------------------------------------------------------------------------------
/line-construction/teacherless/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh line_hybrid.10.9.10.10000 0.0 12
2 | 


--------------------------------------------------------------------------------
/line-construction/teacherless/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh line_hybrid.10.9.10.10000 0.0 12 0
2 | 


--------------------------------------------------------------------------------
/line-construction/teacherless/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 5000 \
25 |     --save_step_dense 1000 \
26 |     --max_steps 40000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --add_tokens \
33 |     --n_layer $N_LAYERS
34 | 


--------------------------------------------------------------------------------
/line-construction/teacherless/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | build/
 3 | dist/
 4 | *.egg-info/
 5 | __pycache__
 6 | .ipynb_checkpoints/
 7 | .DS_Store
 8 | **.pyc
 9 | *.png
10 | *.txt
11 | **/outputs/
12 | **/wandb/
13 | **/exp/
14 | **/exp_local/
15 | data/
16 | eval/
17 | assets/
18 | **.pth
19 | **.npz
20 | core
21 | **.log
22 | *.jsonl


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Aaron Lou
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/catsample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
 6 |     logits = categorical_probs.clamp(min=1e-9).log()
 7 |     return F.gumbel_softmax(logits, hard=hard)
 8 | 
 9 | 
def sample_categorical(categorical_probs, method="hard"):
    """Sample one index along the last dimension of ``categorical_probs``.

    Only ``method="hard"`` is supported: each probability is divided by
    ``1e-10 - log(u + 1e-10)`` with ``u`` uniform noise, and the argmax of the
    result is returned. Any other method raises ``ValueError``.
    """
    if method != "hard":
        raise ValueError(f"Method {method} for sampling categorical variables is not valid.")

    uniform = torch.rand_like(categorical_probs)
    gumbel_norm = 1e-10 - (uniform + 1e-10).log()
    perturbed = categorical_probs / gumbel_norm
    return perturbed.argmax(dim=-1)
16 |     


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/configs/config.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - _self_
 3 |   - model: small
 4 |   - override hydra/launcher: submitit_slurm
 5 | 
 6 | ngpus: 1
 7 | tokens: 50257
 8 | add_vocab: ""
 9 | 
10 | training:
11 |   batch_size: 64  # 512
12 |   accum: 1
13 |   n_iters: 800000
14 |   snapshot_freq: 40000
15 |   log_freq: 50
16 |   eval_freq: 500
17 |   snapshot_freq_for_preemption: 40000
18 |   weight: standard
19 |   snapshot_sampling: True
20 |   ema: 0.9999
21 | 
22 | data:
23 |   train: openwebtext
24 |   valid: wikitext103
25 |   cache_dir: data
26 | 
27 | graph:
28 |   type: absorb
29 |   file: data
30 |   report_all: False
31 | 
32 | noise:
33 |   type: loglinear
34 |   sigma_min: 1e-4
35 |   sigma_max: 20
36 | 
37 | sampling:
38 |   predictor: euler
39 |   steps: 32
40 |   noise_removal: True
41 | 
42 | eval:
43 |   batch_size: 32
44 |   perplexity: False
45 |   perplexity_batch_size: 32
46 | 
47 | optim:
48 |   weight_decay: 0
49 |   optimizer: AdamW
50 |   lr: 1e-4
51 |   beta1: 0.9
52 |   beta2: 0.999
53 |   eps: 1e-8
54 |   warmup: 2500
55 |   grad_clip: 1.
56 | 
57 | 
58 | hydra:
59 |   run:
60 |     dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/configs/model/medium.yaml:
--------------------------------------------------------------------------------
1 | name: medium
2 | type: ddit
3 | hidden_size: 1024
4 | cond_dim: 128
5 | length: 32
6 | n_blocks: 24
7 | n_heads: 16
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/configs/model/small.yaml:
--------------------------------------------------------------------------------
1 | name: small
2 | type: ddit
3 | hidden_size: 768
4 | cond_dim: 128
5 | length: 32
6 | n_blocks: 12
7 | n_heads: 12
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/eval.sh:
--------------------------------------------------------------------------------
 1 | # $1: dataset
 2 | # $2: weight_decay
 3 | # $3: n_layers
 4 | 
 5 | EXP_DIR=creativity_results/creativity_data/sibling.5.500.10.50000/train.json/train/checkpoint_outputs
 6 | 
 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
 8 | 
 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR
10 | 


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/load_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from model import SEDD
 4 | import utils
 5 | from model.ema import ExponentialMovingAverage
 6 | import graph_lib
 7 | import noise_lib
 8 | 
 9 | from omegaconf import OmegaConf
10 | 
def load_model_hf(dir, device):
    """Load a pretrained SEDD model via ``SEDD.from_pretrained`` and build its graph and noise.

    Returns:
        ``(score_model, graph, noise)``; the model and the noise module are
        moved to ``device`` and the graph is built from the model's config.
    """
    pretrained = SEDD.from_pretrained(dir)
    score_model = pretrained.to(device)
    model_cfg = score_model.config
    graph = graph_lib.get_graph(model_cfg, device)
    noise = noise_lib.get_noise(model_cfg).to(device)
    return score_model, graph, noise
16 | 
17 | 
def load_model_local(root_dir, ckpt_dir, added_tokens, device):
    """Rebuild a SEDD model from a hydra run directory and a checkpoint file.

    Args:
        root_dir: run directory whose hydra config is loaded.
        ckpt_dir: path of the checkpoint file handed to ``torch.load``.
        added_tokens: extra vocabulary entries; their count is added to
            ``cfg.tokens`` before the model is built.
        device: device for the model, the noise module and the loaded state.

    Returns:
        ``(score_model, graph, noise)`` with the EMA weights copied into the model.
    """
    cfg = utils.load_hydra_config_from_run(root_dir)
    if added_tokens:
        # Grow the vocabulary size to cover the extra tokens.
        cfg.tokens = cfg.tokens + len(added_tokens)

    graph = graph_lib.get_graph(cfg, device)
    noise = noise_lib.get_noise(cfg).to(device)
    score_model = SEDD(cfg).to(device)
    ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema)

    checkpoint = torch.load(ckpt_dir, map_location=device)
    score_model.load_state_dict(checkpoint['model'])
    ema.load_state_dict(checkpoint['ema'])

    # Keep a copy of the raw weights inside the EMA object, then overwrite the
    # model parameters with the averaged ones.
    ema.store(score_model.parameters())
    ema.copy_to(score_model.parameters())
    return score_model, graph, noise
36 | 
37 | 
def load_model(root_dir, ckpt_dir, added_tokens, device):
    """Load a model, trying the ``from_pretrained`` path first and a local run second.

    Args:
        root_dir: pretrained model identifier/directory, or a local run directory.
        ckpt_dir: checkpoint file used only by the local fallback.
        added_tokens: extra vocabulary entries used only by the local fallback.
        device: target device.

    Returns:
        ``(score_model, graph, noise)``.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # Best-effort fallback: any ordinary failure of the pretrained path
        # means "treat root_dir as a local run". KeyboardInterrupt/SystemExit
        # are deliberately not swallowed.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformer import SEDD


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/model/fused_add_dropout_scale.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from typing import Optional
 4 | from torch import Tensor
 5 | 
# flags required to enable jit fusion kernels
# These run at import time and go through private ``torch._C`` entry points:
# profiling mode and the profiling executor are switched off, and fusion is
# allowed on both CPU and GPU.
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
11 | 
12 | 
def bias_dropout_add_scale(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool
) -> Tensor:
    """Compute ``residual + scale * dropout(x + bias)``.

    ``bias`` and ``residual`` are optional; when one is ``None`` the
    corresponding addition is skipped.
    """
    pre_dropout = x
    if bias is not None:
        pre_dropout = x + bias

    out = scale * F.dropout(pre_dropout, p=prob, training=training)

    if residual is None:
        return out
    return residual + out
24 | 
25 | 
def get_bias_dropout_add_scale(training):
    """Return a five-argument ``bias_dropout_add_scale`` with ``training`` fixed."""
    def _bias_dropout_add(x, bias, scale, residual, prob):
        result = bias_dropout_add_scale(x, bias, scale, residual, prob, training)
        return result

    return _bias_dropout_add
31 | 
32 | 
def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """Apply the affine modulation ``x * (1 + scale) + shift``."""
    scaled = x * (1 + scale)
    return scaled + shift
35 | 
36 | 
@torch.jit.script
def bias_dropout_add_scale_fused_train(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """Scripted ``bias_dropout_add_scale`` with ``training`` fixed to ``True``."""
    return bias_dropout_add_scale(x, bias, scale, residual, prob, True)
42 | 
43 | 
@torch.jit.script
def bias_dropout_add_scale_fused_inference(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """Scripted ``bias_dropout_add_scale`` with ``training`` fixed to ``False``."""
    return bias_dropout_add_scale(x, bias, scale, residual, prob, False)
49 | 
@torch.jit.script
def modulate_fused(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """Scripted wrapper around ``modulate``."""
    return modulate(x, shift, scale)


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/model/rotary.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn
 3 | 
 4 | 
 5 | class Rotary(torch.nn.Module):
 6 |     def __init__(self, dim, base=10_000):
 7 |         super().__init__()
 8 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
 9 |         self.register_buffer("inv_freq", inv_freq)
10 |         self.seq_len_cached = None
11 |         self.cos_cached = None
12 |         self.sin_cached = None
13 | 
14 |     def forward(self, x, seq_dim=1):
15 |         seq_len = x.shape[seq_dim]
16 |         if seq_len != self.seq_len_cached:
17 |             self.seq_len_cached = seq_len
18 |             t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
19 |             freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
20 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
21 |             # dims are: batch, seq_len, qkv, head, dim
22 |             self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
23 |             self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
24 |             # This makes the transformation on v an identity.
25 |             self.cos_cached[:,:,2,:,:].fill_(1.)
26 |             self.sin_cached[:,:,2,:,:].fill_(0.)
27 | 
28 |         return self.cos_cached, self.sin_cached
29 | 
30 | 
def rotate_half(x):
    """Split the last dimension in two halves ``(a, b)`` and return ``(-b, a)`` concatenated."""
    half = x.shape[-1] // 2
    front = x[..., :half]
    back = x[..., half:]
    return torch.cat((-back, front), dim=-1)
36 | 
37 | 
@torch.jit.script
def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
    """Scripted rotary embedding: ``qkv * cos + rotate_half(qkv) * sin``."""
    return (qkv * cos) + (rotate_half(qkv) * sin)
41 | 
42 | 
def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary position embeddings to a packed qkv tensor.

    The flash-attn kernel is tried first; if it cannot be imported or fails,
    the TorchScript implementation is used instead.

    Args:
        qkv: packed query/key/value tensor.
        cos, sin: tables as returned by ``Rotary.forward``.
    """
    try:
        import flash_attn.layers.rotary
        # The flash-attn kernel takes 2-D tables holding only the first half
        # of the last dimension. Use separate names so the fallback below
        # still receives the original, unsliced tables.
        half_cos = cos[0, :, 0, 0, :cos.shape[-1] // 2]
        half_sin = sin[0, :, 0, 0, :sin.shape[-1] // 2]
        return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
            qkv, half_cos, half_sin
        )
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit propagate.
        return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/model/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def get_model_fn(model, train=False):
 6 |     """Create a function to give the output of the score-based model.
 7 | 
 8 |     Args:
 9 |         model: The score model.
10 |         train: `True` for training and `False` for evaluation.
11 |         mlm: If the input model is a mlm and models the base probability 
12 | 
13 |     Returns:
14 |         A model function.
15 |     """
16 | 
17 |     def model_fn(x, sigma):
18 |         """Compute the output of the score-based model.
19 | 
20 |         Args:
21 |             x: A mini-batch of input data.
22 |             labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
23 |               for different models.
24 | 
25 |         Returns:
26 |             A tuple of (model output, new mutable states)
27 |         """
28 |         if train:
29 |             model.train()
30 |         else:
31 |             model.eval()
32 |         
33 |             # otherwise output the raw values (we handle mlm training in losses.py)
34 |         return model(x, sigma)
35 | 
36 |     return model_fn
37 | 
38 | 
def get_score_fn(model, train=False, sampling=False):
    """Build ``score_fn(x, sigma)`` on top of ``get_model_fn``.

    Args:
        model: The score model.
        train: Whether the model runs in training mode.
        sampling: When true, the exponentiated model output is returned;
            requesting sampling together with ``train`` fails an assertion.

    Returns:
        A function taking ``(x, sigma)``; ``sigma`` is flattened before the
        model is called.
    """
    assert not (sampling and train), "Must sample in eval mode"
    model_fn = get_model_fn(model, train=train)

    # NOTE(review): this autocast context is entered and exited while
    # ``score_fn`` is being defined; it is not active when ``score_fn`` is
    # later called. Confirm whether calls were meant to run under autocast.
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        def score_fn(x, sigma):
            flat_sigma = sigma.reshape(-1)
            score = model_fn(x, flat_sigma)
            # when sampling return true score (not log used for training)
            return score.exp() if sampling else score

    return score_fn
56 | 


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/noise_lib.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import torch
 3 | import torch.nn as nn
 4 | import numpy as np
 5 | 
 6 | 
 7 | def get_noise(config):
 8 |     if config.noise.type == "geometric":
 9 |         return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)
10 |     elif config.noise.type == "loglinear":
11 |         return LogLinearNoise()
12 |     else:
13 |         raise ValueError(f"{config.noise.type} is not a valid noise")
14 | 
15 | 
class Noise(abc.ABC, nn.Module):
    """Abstract noise schedule; time ``t`` is assumed to go from 0 to 1.

    Calling the module returns the pair ``(total_noise(t), rate_noise(t))``.
    Subclasses must implement both methods.
    """

    def forward(self, t):
        """Baseline forward method to get the total + rate of noise at a timestep."""
        return self.total_noise(t), self.rate_noise(t)

    @abc.abstractmethod
    def rate_noise(self, t):
        """
        Rate of change of noise ie g(t)
        """

    @abc.abstractmethod
    def total_noise(self, t):
        # Raw docstring: "\i" is an invalid escape in a normal string literal.
        r"""
        Total noise ie \int_0^t g(t) dt + g(0)
        """
39 | 
40 | 
class GeometricNoise(Noise, nn.Module):
    """Noise schedule with total noise ``sigma_min ** (1 - t) * sigma_max ** t``."""

    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        super().__init__()
        sigmas = 1.0 * torch.tensor([sigma_min, sigma_max])
        # With ``learnable`` the two sigmas become a trainable parameter;
        # otherwise they stay a plain tensor attribute.
        self.sigmas = nn.Parameter(sigmas) if learnable else sigmas
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Total noise multiplied by ``log(sigma_max) - log(sigma_min)``."""
        sigma_min, sigma_max = self.sigmas[0], self.sigmas[1]
        log_ratio = sigma_max.log() - sigma_min.log()
        return sigma_min ** (1 - t) * sigma_max ** t * log_ratio

    def total_noise(self, t):
        """Interpolate between the two sigmas as ``sigma_min ** (1 - t) * sigma_max ** t``."""
        sigma_min, sigma_max = self.sigmas[0], self.sigmas[1]
        return sigma_min ** (1 - t) * sigma_max ** t
54 | 
55 | 
class LogLinearNoise(Noise, nn.Module):
    """
    Log-linear schedule: 1 - 1/e^(n(t)) interpolates between 0 and ~1 as t goes
    from 0 to 1. Used for absorbing.

    Total noise is -log(1 - (1 - eps) * t), so the sigma will be (1 - eps) * t
    """

    def __init__(self, eps=1e-3):
        super().__init__()
        self.eps = eps
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Return ``(1 - eps) / (1 - (1 - eps) * t)``."""
        keep = 1 - self.eps
        return keep / (1 - keep * t)

    def total_noise(self, t):
        """Return ``-log1p(-(1 - eps) * t)``."""
        keep = 1 - self.eps
        return -torch.log1p(-keep * t)
73 | 
74 | 


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh sibling.5.500.10.50000 0.0 8


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/run_sample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import torch.nn.functional as F
 8 | import sampling
 9 | 
10 | 
def main():
    """Draw unconditional samples from a model and print the decoded text."""
    # Local import: only needed to build the fallback checkpoint path below.
    import os

    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    # The tokenizer (and any extra vocabulary) is prepared first because
    # load_model needs the list of added tokens.
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    device = torch.device('cuda')
    # load_model takes (root_dir, ckpt_dir, added_tokens, device). The
    # checkpoint path is only used when the pretrained load fails and the
    # model is read from a local run directory.
    # NOTE(review): "checkpoints-meta/checkpoint.pth" follows the path left
    # commented out in load_model_local; confirm it matches the run layout.
    ckpt_path = os.path.join(args.model_path, "checkpoints-meta", "checkpoint.pth")
    model, graph, noise = load_model(args.model_path, ckpt_path, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device
    )

    samples = sampling_fn(model)

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")
39 | 
40 | if __name__=="__main__":
41 |     main()


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/run_sample_cond.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | 
 9 | 
def main():
    """Sample text constrained to start with ``--prefix`` and end with ``--suffix``."""
    # Local import: only needed to build the fallback checkpoint path below.
    import os

    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--prefix", type=str, default="Hi, my name is")
    parser.add_argument("--suffix", type=str, default=" and that's why I'm late.")
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # Pin the prefix to the first positions and the suffix to the last
    # positions of the 1024-token sample.
    prefix_ids = tokenizer(args.prefix).input_ids
    suffix_ids = tokenizer(args.suffix).input_ids
    input_ids = prefix_ids + suffix_ids
    input_locs = list(range(len(prefix_ids))) + list(range(1024-len(suffix_ids), 1024))

    # More generally, constraints can be given as parallel lists like below:
    # input_ids = [0, 1, 512, 8080, 50256, 20000]
    # input_locs = [5, 6, 19, 20, 1000, 1001]

    device = torch.device('cuda')
    input_ids = torch.tensor(input_ids, device=device)[None].repeat(args.batch_size, 1)

    def proj_fun(x):
        # Overwrite the constrained positions with the fixed tokens.
        x[:, input_locs] = input_ids
        return x

    # load_model takes (root_dir, ckpt_dir, added_tokens, device). The
    # checkpoint path is only used when the pretrained load fails and the
    # model is read from a local run directory.
    # NOTE(review): "checkpoints-meta/checkpoint.pth" follows the path left
    # commented out in load_model_local; confirm it matches the run layout.
    ckpt_path = os.path.join(args.model_path, "checkpoints-meta", "checkpoint.pth")
    model, graph, noise = load_model(args.model_path, ckpt_path, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device, proj_fun=proj_fun
    )

    samples = proj_fun(sampling_fn(model))

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")
57 | 
58 | if __name__=="__main__":
59 |     main()


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/run_train.sh:
--------------------------------------------------------------------------------
 1 | python train.py \
 2 |     noise.type=loglinear \
 3 |     graph.type=absorb \
 4 |     model=small \
 5 |     training.accum=1 \
 6 |     data.train=creativity_data/sibling.5.500.10.50000/train.json \
 7 |     data.valid=creativity_data/sibling.5.500.10.50000/valid.json \
 8 |     add_vocab=creativity_data/sibling.5.500.10.50000/vocab.json \
 9 |     hydra.run.dir=/data/locus/project_data/project_data2/chenwu2/creativity_results/creativity_data/sibling.5.500.10.50000/train.json/train
10 | 
11 | python test.py \
12 |     --model_checkpoint_dir creativity_results/creativity_data/sibling.5.500.10.50000/train.json/train \
13 |     --dataset creativity_data/sibling.5.500.10.50000 \
14 |     --add_vocab creativity_data/sibling.5.500.10.50000/vocab.json


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | from tqdm import tqdm
 9 | 
10 | import os
11 | 
12 | 
def main():
    """Generate outputs for the test set with every checkpoint of a run.

    For each file under ``<model_checkpoint_dir>/checkpoints`` the model is
    loaded, every test item's ``input_text`` is used as a fixed prefix for
    sampling, and the results are written to
    ``<model_checkpoint_dir>/checkpoint_outputs/<checkpoint name>/all_items.json``.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_checkpoint_dir", default="", type=str)
    parser.add_argument("--dataset", default=None, type=str)
    parser.add_argument("--steps", type=int, default=32)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()
    if args.dataset is None:
        # Fail with a usage message instead of a TypeError from os.path.join.
        parser.error("--dataset is required (directory containing test.json)")

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # Load the dataset
    with open(os.path.join(args.dataset, "test.json"), "r") as f:
        test_data = json.load(f)

    # List all files under the model checkpoint directory. os.listdir returns
    # entries in arbitrary order, so sort for a reproducible processing order.
    checkpoint_root = os.path.join(args.model_checkpoint_dir, "checkpoints")
    checkpoints = [os.path.join(checkpoint_root, f) for f in sorted(os.listdir(checkpoint_root))]
    print(checkpoints)

    device = torch.device('cuda')
    sample_shape = (1, 20)  # one sequence of 20 tokens per test item

    for checkpoint in checkpoints:
        model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device)
        # Create a checkpoint_dir for the current checkpoint
        checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint))
        os.makedirs(checkpoint_dir, exist_ok=True)

        def generate_output(input_text):
            """Sample one continuation with ``input_text`` pinned at the start."""
            prefix_ids = tokenizer(input_text).input_ids
            input_locs = list(range(len(prefix_ids)))
            input_ids = torch.tensor(prefix_ids, device=device)[None]

            def proj_fun(x):
                # Overwrite the prefix positions with the prompt tokens.
                x[:, input_locs] = input_ids
                return x

            sampling_fn = sampling.get_pc_sampler(
                graph, noise, sample_shape, 'analytic', args.steps, device=device, proj_fun=proj_fun
            )

            samples = proj_fun(sampling_fn(model))

            text_samples = tokenizer.batch_decode(samples)
            assert len(text_samples) == 1
            # Keep only the text before the first end-of-text marker.
            return text_samples[0].split("<|endoftext|>")[0]

        all_items = []
        for sample in tqdm(test_data):
            item = {
                "input_text": sample["input_text"],
                "target_text": sample["target_text"],
                "type": sample["type"],
            }

            output = generate_output(sample["input_text"])
            print(sample["input_text"])
            print(sample["target_text"])
            print(output)
            print()
            item["model_output"] = output
            all_items.append(item)

        # Save the results
        with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f:
            json.dump(all_items, f, indent=4)
85 | 
86 | 
87 | if __name__=="__main__":
88 |     main()


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/train.py:
--------------------------------------------------------------------------------
 1 | """Training and evaluation"""
 2 | 
 3 | import hydra
 4 | import os
 5 | import numpy as np
 6 | import run_train
 7 | import utils
 8 | import torch.multiprocessing as mp
 9 | from hydra.core.hydra_config import HydraConfig
10 | from hydra.types import RunMode
11 | from omegaconf import OmegaConf, open_dict
12 | 
13 | 
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg):
    """Hydra entry point: resolve the work directory, then spawn one process per GPU.

    When ``load_dir`` is present in the config, the stored run config replaces
    ``cfg`` and its ``work_dir`` is reused; otherwise the directory comes from
    the active hydra run or sweep settings.
    """
    ngpus = cfg.ngpus
    resuming = "load_dir" in cfg
    if resuming:
        hydra_cfg_path = os.path.join(cfg.load_dir, ".hydra/hydra.yaml")
        hydra_cfg = OmegaConf.load(hydra_cfg_path).hydra

        cfg = utils.load_hydra_config_from_run(cfg.load_dir)
        work_dir = cfg.work_dir
    else:
        hydra_cfg = HydraConfig.get()
        if hydra_cfg.mode == RunMode.RUN:
            work_dir = hydra_cfg.run.dir
        else:
            work_dir = os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir)
    utils.makedirs(work_dir)

    with open_dict(cfg):
        cfg.ngpus = ngpus
        cfg.work_dir = work_dir
        cfg.wandb_name = os.path.basename(os.path.normpath(work_dir))

    # Run the training pipeline
    port = int(np.random.randint(10000, 20000))
    log_path = os.path.join(work_dir, "logs")
    logger = utils.get_logger(log_path)

    hydra_cfg = HydraConfig.get()
    if hydra_cfg.mode != RunMode.RUN:
        logger.info(f"Run id: {hydra_cfg.job.id}")

    try:
        mp.set_start_method("forkserver")
        mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
    except Exception as e:
        # Failures are logged rather than re-raised.
        logger.critical(e, exc_info=True)
49 | 
50 | if __name__ == "__main__":
51 |     main()


--------------------------------------------------------------------------------
/sibling-discovery/diffusion/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import os
 4 | import logging
 5 | from omegaconf import OmegaConf, open_dict
 6 | 
 7 | 
 8 | def load_hydra_config_from_run(load_dir):
 9 |     cfg_path = os.path.join(load_dir, ".hydra/config.yaml")
10 |     cfg = OmegaConf.load(cfg_path)
11 |     return cfg
12 | 
13 | 
def makedirs(dirname):
    """Create ``dirname`` (including parents); do nothing if it already exists."""
    os.makedirs(dirname, exist_ok=True)
16 | 
17 | 
def get_logger(logpath, package_files=None, displaying=True, saving=True, debug=False):
    """Reconfigure and return the root logger.

    Existing handlers on the root logger are removed (and closed) before the
    new ones are attached.

    Args:
        logpath: file that receives log records when ``saving`` is true
            (opened in append mode).
        package_files: optional iterable of file paths; each path and its
            contents are logged once at setup.
        displaying: attach a console ``StreamHandler``.
        saving: attach a ``FileHandler`` on ``logpath``.
        debug: use ``logging.DEBUG`` instead of ``logging.INFO``.

    Returns:
        The configured root logger.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    # Close handlers as they are dropped so repeated calls do not leak open
    # log files.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # ``None`` stands in for "no files" to avoid a shared mutable default.
    for f in package_files or ():
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger
47 | 
48 | 
def restore_checkpoint(ckpt_dir, state, device):
    """Load a checkpoint file into ``state`` if it exists.

    Args:
        ckpt_dir: path of the checkpoint file.
        state: dict with ``optimizer``, ``model`` (wrapped, exposing
            ``.module``), ``ema`` and ``step`` entries; updated in place.
        device: ``map_location`` for ``torch.load``.

    Returns:
        ``state``, untouched when no checkpoint exists (its parent directory
        is created so a later save can succeed), otherwise with the loaded
        optimizer, model and EMA states plus the stored step.
    """
    if not os.path.exists(ckpt_dir):
        parent = os.path.dirname(ckpt_dir)
        # A bare file name has no directory part; os.makedirs("") would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)
        logging.warning(f"No checkpoint found at {ckpt_dir}. Returned the same state as input")
        return state

    loaded_state = torch.load(ckpt_dir, map_location=device)
    state['optimizer'].load_state_dict(loaded_state['optimizer'])
    # strict=False: missing or unexpected keys in the checkpoint are tolerated.
    state['model'].module.load_state_dict(loaded_state['model'], strict=False)
    state['ema'].load_state_dict(loaded_state['ema'])
    state['step'] = loaded_state['step']
    return state
61 | 
62 | 
def save_checkpoint(ckpt_dir, state):
    """Write the optimizer, model, EMA state dicts and the step to ``ckpt_dir``.

    The model entry is taken from ``state['model'].module``.
    """
    optimizer_state = state['optimizer'].state_dict()
    model_state = state['model'].module.state_dict()
    ema_state = state['ema'].state_dict()
    torch.save(
        dict(optimizer=optimizer_state, model=model_state, ema=ema_state, step=state['step']),
        ckpt_dir,
    )


--------------------------------------------------------------------------------
/sibling-discovery/ntp/eval.sh:
--------------------------------------------------------------------------------
# $1: dataset
# $2: weight_decay
# $3: n_layers
#
# The three arguments together select the run directory
# ../creativity/<dataset>_<weight_decay>_<n_layers>.

EXP_DIR=../creativity
DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data

# Quote the expansions so values with spaces or glob characters stay intact.
python eval_qa.py --dir "${EXP_DIR}/${1}_${2}_${3}" --dataset "$1" --data_dir "$DATA_DIR"
9 | 


--------------------------------------------------------------------------------
/sibling-discovery/ntp/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh sibling.5.500.10.50000 0.0 8
2 | 


--------------------------------------------------------------------------------
/sibling-discovery/ntp/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh sibling.5.500.10.50000 0.0 8 0
2 | 


--------------------------------------------------------------------------------
/sibling-discovery/ntp/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 40000 \
25 |     --save_step_dense 20000 \
26 |     --max_steps 800000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --init_weights \
33 |     --add_tokens \
34 |     --n_layer $N_LAYERS
35 | 


--------------------------------------------------------------------------------
/sibling-discovery/ntp/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/sibling-discovery/teacherless/eval.sh:
--------------------------------------------------------------------------------
# Run eval_qa.py on one experiment directory.
# $1: dataset
# $2: weight_decay
# $3: n_layers

EXP_DIR=../creativity
DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data

# The experiment directory is named <dataset>_<weight_decay>_<n_layers>.
# Every expansion is quoted so an argument containing spaces or glob
# characters reaches eval_qa.py as a single word.
python eval_qa.py --dir "$EXP_DIR/${1}_${2}_${3}" --dataset "$1" --data_dir "$DATA_DIR"
9 | 


--------------------------------------------------------------------------------
/sibling-discovery/teacherless/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh sibling_hybrid.5.500.10.100000 0.0 8


--------------------------------------------------------------------------------
/sibling-discovery/teacherless/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh sibling_hybrid.5.500.10.50000 0.0 8 0


--------------------------------------------------------------------------------
/sibling-discovery/teacherless/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 40000 \
25 |     --save_step_dense 20000 \
26 |     --max_steps 1200000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --init_weights \
33 |     --add_tokens \
34 |     --n_layer $N_LAYERS
35 | 


--------------------------------------------------------------------------------
/sibling-discovery/teacherless/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/simpletransformers/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: thilinarajapakse
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/simpletransformers/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is. Please specify the class causing the issue.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 
16 | **Expected behavior**
17 | A clear and concise description of what you expected to happen.
18 | 
19 | **Screenshots**
20 | If applicable, add screenshots to help explain your problem.
21 | 
22 | **Desktop (please complete the following information):**
23 |  - OS
24 | 
25 | **Additional context**
26 | Add any other context about the problem here.
27 | 


--------------------------------------------------------------------------------
/simpletransformers/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/simpletransformers/.github/stale.yml:
--------------------------------------------------------------------------------
 1 | # Number of days of inactivity before an issue becomes stale
 2 | daysUntilStale: 60
 3 | # Number of days of inactivity before a stale issue is closed
 4 | daysUntilClose: 7
 5 | # Issues with these labels will never be considered stale
 6 | exemptLabels:
 7 |   - pinned
 8 |   - security
 9 | # Label to use when marking an issue as stale
10 | staleLabel: stale
11 | # Comment to post when marking an issue as stale. Set to `false` to disable
12 | markComment: >
13 |   This issue has been automatically marked as stale because it has not had
14 |   recent activity. It will be closed if no further activity occurs. Thank you
15 |   for your contributions.
16 | # Comment to post when closing a stale issue. Set to `false` to disable
17 | closeComment: false
18 | 


--------------------------------------------------------------------------------
/simpletransformers/.github/workflows/pythonpublish.yml:
--------------------------------------------------------------------------------
 1 | name: Upload Python Package
 2 | 
 3 | on: push
 4 | 
 5 | jobs:
 6 |   deploy:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |     - uses: actions/checkout@v1
10 |     - name: Set up Python
11 |       uses: actions/setup-python@v1
12 |       with:
13 |         python-version: '3.x'
14 |     - name: Install dependencies
15 |       run: |
16 |         python -m pip install --upgrade pip
17 |         pip install setuptools wheel twine
18 |         python setup.py sdist bdist_wheel
19 |     - name: Publish package
20 |       if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
21 |       uses: pypa/gh-action-pypi-publish@release/v1
22 |       with:
23 |         user: __token__
24 |         password: ${{ secrets.PYPI_PASSWORD }}
25 | 


--------------------------------------------------------------------------------
/simpletransformers/.gitignore:
--------------------------------------------------------------------------------
  1 | .jekyll-cache/*
  2 | *.lock
  3 | 
  4 | # Wandb
  5 | wandb/
  6 | 
  7 | # Outputs from examples
  8 | **/cache_dir
  9 | **/runs
 10 | **/outputs
 11 | **/data
 12 | 
 13 | # manual_test_scripts
 14 | test_scripts/
 15 | 
 16 | # Byte-compiled / optimized / DLL files
 17 | __pycache__/
 18 | *.py[cod]
 19 | *$py.class
 20 | 
 21 | # C extensions
 22 | *.so
 23 | 
 24 | # Distribution / packaging
 25 | .Python
 26 | build/
 27 | develop-eggs/
 28 | dist/
 29 | downloads/
 30 | eggs/
 31 | .eggs/
 32 | lib/
 33 | lib64/
 34 | parts/
 35 | sdist/
 36 | var/
 37 | wheels/
 38 | pip-wheel-metadata/
 39 | share/python-wheels/
 40 | *.egg-info/
 41 | .installed.cfg
 42 | *.egg
 43 | MANIFEST
 44 | 
 45 | # PyInstaller
 46 | #  Usually these files are written by a python script from a template
 47 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 48 | *.manifest
 49 | *.spec
 50 | 
 51 | # Installer logs
 52 | pip-log.txt
 53 | pip-delete-this-directory.txt
 54 | 
 55 | # Unit test / coverage reports
 56 | htmlcov/
 57 | .tox/
 58 | .nox/
 59 | .coverage
 60 | .coverage.*
 61 | .cache
 62 | nosetests.xml
 63 | coverage.xml
 64 | *.cover
 65 | *.py,cover
 66 | .hypothesis/
 67 | .pytest_cache/
 68 | 
 69 | # Translations
 70 | *.mo
 71 | *.pot
 72 | 
 73 | # Django stuff:
 74 | *.log
 75 | local_settings.py
 76 | db.sqlite3
 77 | db.sqlite3-journal
 78 | 
 79 | # Flask stuff:
 80 | instance/
 81 | .webassets-cache
 82 | 
 83 | # Scrapy stuff:
 84 | .scrapy
 85 | 
 86 | # Sphinx documentation
 87 | docs/_build/
 88 | 
 89 | # PyBuilder
 90 | target/
 91 | 
 92 | # Jupyter Notebook
 93 | .ipynb_checkpoints
 94 | 
 95 | # IPython
 96 | profile_default/
 97 | ipython_config.py
 98 | 
 99 | # pyenv
100 | .python-version
101 | 
102 | # pipenv
103 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | #   install all needed dependencies.
107 | #Pipfile.lock
108 | 
109 | # celery beat schedule file
110 | celerybeat-schedule
111 | 
112 | # SageMath parsed files
113 | *.sage.py
114 | 
115 | # Environments
116 | .env
117 | .venv
118 | env/
119 | venv/
120 | ENV/
121 | env.bak/
122 | venv.bak/
123 | 
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 | 
128 | # Rope project settings
129 | .ropeproject
130 | 
131 | # mkdocs documentation
132 | /site
133 | 
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 | 
139 | # Pyre type checker
140 | .pyre/
# Stale Notebooks
142 | Untitled.ipynb
143 | 
144 | # IDE folders
145 | .idea
146 | .vscode/
147 | 


--------------------------------------------------------------------------------
/simpletransformers/Makefile:
--------------------------------------------------------------------------------
 1 | install:
 2 | 	pip install -e .
 3 | 	pip install -r requirements-dev.txt
 4 | 	pip list
 5 | 
 6 | clean:
 7 | 	find . -name '*.pyc' -exec rm -f {} +
 8 | 	find . -name '*.pyo' -exec rm -f {} +
 9 | 	find . -name '*~' -exec rm -f  {} +
10 | 
11 | clean-test:
12 | 	-rm -r .coverage*
13 | 	-rm -r data
14 | 	-rm -r runs
15 | 	-rm -r outputs
16 | 	-rm -r cache_dir
17 | 	-rm -r wandb
18 | 	-rm train.txt
19 | 
# Reformat the library and tests with black (experimental code excluded).
# No trailing backslash: a line continuation here would splice whatever
# follows the recipe into the black command.
formatter:
	black --line-length 119 simpletransformers tests --exclude simpletransformers/experimental
22 | 
23 | lint: clean
24 | 	flake8 simpletransformers tests --exclude=simpletransformers/experimental
25 | 	black --check --line-length 119 . simpletransformers tests --exclude simpletransformers/experimental
26 | 
27 | types:
28 | 	pytype --keep-going simpletransformers --exclude simpletransformers/experimental
29 | 
30 | test: clean
31 | 	pytest tests --cov simpletransformers/classification simpletransformers/ner simpletransformers/question_answering simpletransformers/language_modeling simpletransformers/t5 simpletransformers/seq2seq
32 | 
33 | test-lm: clean
34 | 	pytest tests/language_modeling --cov simpletransformers/language_modeling
35 | 
36 | # if this runs through we can be sure the readme is properly shown on pypi
37 | check-readme:
38 | 	python setup.py check --restructuredtext
39 | 


--------------------------------------------------------------------------------
/simpletransformers/bin/simple-viewer:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cat >run_simple_transformers_streamlit_app.py <<'END_SCRIPT'
 3 | #!/usr/bin/env python
 4 | from simpletransformers.streamlit.simple_view import streamlit_runner
 5 | 
 6 | 
 7 | streamlit_runner()
 8 | 
 9 | END_SCRIPT
10 | 
11 | # Run
12 | streamlit run run_simple_transformers_streamlit_app.py
13 | 
14 | rm run_simple_transformers_streamlit_app.py
15 | 


--------------------------------------------------------------------------------
/simpletransformers/requirements-dev.txt:
--------------------------------------------------------------------------------
 1 | # test
 2 | pytest-cov==2.7.1
 3 | pytest-localserver==0.5.0
 4 | pytest==7.1.2
 5 | 
 6 | # lint/format/types
 7 | black==24.3.0
 8 | flake8==3.7.8
 9 | pytype==2019.7.11
10 | 


--------------------------------------------------------------------------------
/simpletransformers/setup.cfg:
--------------------------------------------------------------------------------
 1 | [tool:pytest]
 2 | python_functions=test_
 3 | 
 4 | codestyle_max_line_length = 119
 5 | 
 6 | log_cli = true
 7 | log_cli_level = WARNING
 8 | 
 9 | [metadata]
10 | description-file = README.md
11 | license_file = LICENSE
12 | 
13 | [pycodestyle]
14 | max-line-length = 119
15 | 
16 | [flake8]
17 | max-line-length = 119
18 | ignore = E203 , W503, F401
19 | 


--------------------------------------------------------------------------------
/simpletransformers/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setup(
 7 |     name="simpletransformers",
 8 |     version="0.64.3",
 9 |     author="Thilina Rajapakse",
10 |     author_email="chaturangarajapakshe@gmail.com",
11 |     description="An easy-to-use wrapper library for the Transformers library.",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/ThilinaRajapakse/simpletransformers/",
15 |     packages=find_packages(),
16 |     scripts=["bin/simple-viewer"],
17 |     classifiers=[
18 |         "Intended Audience :: Science/Research",
19 |         "License :: OSI Approved :: Apache Software License",
20 |         "Programming Language :: Python :: 3",
21 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
22 |     ],
23 |     python_requires=">=3.6",
24 |     install_requires=[
25 |         "numpy",
26 |         "requests",
27 |         "tqdm>=4.47.0",
28 |         "regex",
29 |         "transformers>=4.31.0",
30 |         "datasets",
31 |         "scipy",
32 |         "scikit-learn",
33 |         "seqeval",
34 |         "tensorboard",
35 |         "pandas",
36 |         "tokenizers",
37 |         "wandb>=0.10.32",
38 |         "streamlit",
39 |         "sentencepiece",
40 |     ],
41 | )
42 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/__init__.py:
--------------------------------------------------------------------------------
# Package name, exposed as a module-level attribute.
name = "simpletransformers"
2 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/__init__.py:
--------------------------------------------------------------------------------
 1 | from simpletransformers.classification.classification_model import ClassificationModel
 2 | from simpletransformers.classification.multi_label_classification_model import (
 3 |     MultiLabelClassificationModel,
 4 | )
 5 | from simpletransformers.classification.multi_modal_classification_model import (
 6 |     MultiModalClassificationModel,
 7 | )
 8 | from simpletransformers.config.model_args import (
 9 |     ClassificationArgs,
10 |     MultiLabelClassificationArgs,
11 |     MultiModalClassificationArgs,
12 | )
13 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/classification/transformer_models/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/camembert_model.py:
--------------------------------------------------------------------------------
 1 | from transformers.models.camembert.configuration_camembert import CamembertConfig
 2 | from transformers.models.camembert.modeling_camembert import (
 3 |     CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
 4 | )
 5 | 
 6 | from simpletransformers.classification.transformer_models.roberta_model import (
 7 |     RobertaForSequenceClassification,
 8 | )
 9 | 
10 | 
class CamembertForSequenceClassification(RobertaForSequenceClassification):
    r"""
    CamemBERT variant of ``RobertaForSequenceClassification``: the class body only swaps in
    the CamemBERT config class and pretrained archive list; everything else is inherited.

        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
        model = CamembertForSequenceClassification.from_pretrained('camembert-base')
        input_ids = torch.tensor(tokenizer.encode("J'aime le camembert !")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """  # noqa: ignore flake8"
    config_class = CamembertConfig
    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST
40 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/electra_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.nn import CrossEntropyLoss, MSELoss
 4 | from transformers.models.electra.modeling_electra import (
 5 |     ElectraModel,
 6 |     ElectraPreTrainedModel,
 7 |     ElectraClassificationHead,
 8 | )
 9 | 
10 | 
class ElectraForSequenceClassification(ElectraPreTrainedModel):
    r"""
    ELECTRA encoder followed by ``ElectraClassificationHead``, with an optional
    per-class weight applied to the cross-entropy loss.

        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
        model = ElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """  # noqa: ignore flake8"

    def __init__(self, config, weight=None):
        """
        config: model configuration; ``config.num_labels`` selects regression (1) or classification (>1).
        weight: optional per-class weight tensor handed to ``CrossEntropyLoss``; ``None`` means unweighted.
        """
        super(ElectraForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels

        # The encoder must be stored as ``self.electra``: forward() calls it under that
        # name (storing it as ``self.bert`` made every forward pass raise AttributeError).
        # NOTE(review): ``electra`` is also expected to be the prefix under which
        # transformers keys pretrained ELECTRA encoder weights -- confirm against the library.
        self.electra = ElectraModel(config)
        self.classifier = ElectraClassificationHead(config)
        # Kept on the instance because forward() reads ``self.weight`` when building the loss.
        self.weight = weight

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """
        Run the encoder and classification head; when ``labels`` is given, also compute the loss.

        Returns ``(loss, logits, *extra)`` if ``labels`` is provided, otherwise ``(logits, *extra)``,
        where ``extra`` is everything the encoder returned after its first element.
        """
        discriminator_hidden_states = self.electra(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = discriminator_hidden_states[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Move the class weights next to the labels so the loss runs on one device.
                if self.weight is not None:
                    weight = self.weight.to(labels.device)
                else:
                    weight = None
                loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,) + discriminator_hidden_states[1:]
        return ((loss,) + output) if loss is not None else output
87 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/layoutlm_model.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from torch.nn import CrossEntropyLoss, MSELoss
 3 | from transformers.models.bert.modeling_bert import BertPreTrainedModel
 4 | from transformers.models.layoutlm.modeling_layoutlm import LayoutLMModel
 5 | 
 6 | 
 7 | class LayoutLMForSequenceClassification(BertPreTrainedModel):
 8 |     def __init__(self, config, weight=None):
 9 |         super(LayoutLMForSequenceClassification, self).__init__(config)
10 |         self.num_labels = config.num_labels
11 | 
12 |         self.bert = LayoutLMModel(config)
13 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
14 |         self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
15 |         self.weight = weight
16 | 
17 |         self.init_weights()
18 | 
19 |     def forward(
20 |         self,
21 |         input_ids,
22 |         bbox,
23 |         attention_mask=None,
24 |         token_type_ids=None,
25 |         position_ids=None,
26 |         head_mask=None,
27 |         inputs_embeds=None,
28 |         labels=None,
29 |     ):
30 | 
31 |         outputs = self.bert(
32 |             input_ids=input_ids,
33 |             bbox=bbox,
34 |             attention_mask=attention_mask,
35 |             token_type_ids=token_type_ids,
36 |             position_ids=position_ids,
37 |             head_mask=head_mask,
38 |         )
39 | 
40 |         pooled_output = outputs[1]
41 | 
42 |         pooled_output = self.dropout(pooled_output)
43 |         logits = self.classifier(pooled_output)
44 | 
45 |         outputs = (logits,) + outputs[
46 |             2:
47 |         ]  # add hidden states and attention if they are here
48 | 
49 |         if labels is not None:
50 |             if self.num_labels == 1:
51 |                 #  We are doing regression
52 |                 loss_fct = MSELoss()
53 |                 loss = loss_fct(logits.view(-1), labels.view(-1))
54 |             else:
55 |                 if self.weight is not None:
56 |                     weight = self.weight.to(labels.device)
57 |                 else:
58 |                     weight = None
59 |                 loss_fct = CrossEntropyLoss(weight=weight)
60 |                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
61 |             outputs = (loss,) + outputs
62 | 
63 |         return outputs  # (loss), logits, (hidden_states), (attentions)
64 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/longformer_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.nn import CrossEntropyLoss, MSELoss
 4 | from transformers.models.longformer.modeling_longformer import (
 5 |     LongformerModel,
 6 |     LongformerPreTrainedModel,
 7 |     LongformerClassificationHead,
 8 | )
 9 | 
10 | 
class LongformerForSequenceClassification(LongformerPreTrainedModel):
    """
    Longformer encoder followed by ``LongformerClassificationHead``, with an optional
    per-class weight applied to the cross-entropy loss.
    """

    def __init__(self, config, weight=None):
        """
        config: model configuration; ``config.num_labels`` selects regression (1) or classification (>1).
        weight: optional per-class weight tensor for ``CrossEntropyLoss``; ``None`` means unweighted.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        self.longformer = LongformerModel(config)
        self.classifier = LongformerClassificationHead(config)
        self.weight = weight

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        global_attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns ``(loss, logits, *extra)`` when ``labels`` is given, else ``(logits, *extra)``;
        ``extra`` is whatever the encoder returned from index 2 onward.
        """
        if global_attention_mask is None:
            # Default: global attention on the first (cls) token only, local attention elsewhere.
            global_attention_mask = torch.zeros_like(input_ids)
            global_attention_mask[:, 0] = 1

        encoder_outputs = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        logits = self.classifier(encoder_outputs[0])
        result = (logits,) + encoder_outputs[2:]

        if labels is None:
            return result

        if self.num_labels == 1:
            #  We are doing regression
            loss = MSELoss()(logits.view(-1), labels.view(-1))
        else:
            class_weight = None if self.weight is None else self.weight.to(labels.device)
            criterion = CrossEntropyLoss(weight=class_weight)
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result
74 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/mobilebert_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.nn import CrossEntropyLoss, MSELoss
 4 | from transformers.models.mobilebert.modeling_mobilebert import (
 5 |     MobileBertModel,
 6 |     MobileBertPreTrainedModel,
 7 | )
 8 | 
 9 | 
class MobileBertForSequenceClassification(MobileBertPreTrainedModel):
    """MobileBERT encoder followed by dropout and a linear classification head.

    Args:
        config: Model configuration; ``config.num_labels`` sets the head width.
        weight: Optional class-weight tensor used by the cross-entropy loss.
    """

    def __init__(self, config, weight=None):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.mobilebert = MobileBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.weight = weight

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns ``(loss, logits, *extras)`` when ``labels`` is given, else ``(logits, *extras)``;
        ``extras`` is the encoder output from index 2 onwards.
        """
        encoder_outputs = self.mobilebert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        # Index 1 of the encoder output is fed through dropout into the head.
        logits = self.classifier(self.dropout(encoder_outputs[1]))
        extras = encoder_outputs[2:]

        # Without labels there is no loss to prepend.
        if labels is None:
            return (logits,) + extras

        if self.num_labels == 1:
            # Single output unit: regression with mean-squared error.
            loss = MSELoss()(logits.view(-1), labels.view(-1))
        else:
            class_weight = (
                None if self.weight is None else self.weight.to(labels.device)
            )
            criterion = CrossEntropyLoss(weight=class_weight)
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss, logits) + extras
70 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/classification/transformer_models/xlm_roberta_model.py:
--------------------------------------------------------------------------------
 1 | from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
 2 | from transformers.models.xlm_roberta.modeling_xlm_roberta import (
 3 |     XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
 4 | )
 5 | 
 6 | from simpletransformers.classification.transformer_models.roberta_model import (
 7 |     RobertaForSequenceClassification,
 8 | )
 9 | 
10 | 
class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
    """RobertaForSequenceClassification bound to the XLM-RoBERTa configuration.

    No behavior is overridden; only the two class attributes below differ.
    """

    # Configuration class used for this model.
    config_class = XLMRobertaConfig
    # Pretrained archive listing imported from the XLM-RoBERTa modeling module.
    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
14 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/config/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/config/global_args.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from multiprocessing import cpu_count
 3 | 
 4 | global_args = {
 5 |     "adam_epsilon": 1e-8,
 6 |     "best_model_dir": "outputs/best_model",
 7 |     "cache_dir": "cache_dir/",
 8 |     "config": {},
 9 |     "do_lower_case": False,
10 |     "early_stopping_consider_epochs": False,
11 |     "early_stopping_delta": 0,
12 |     "early_stopping_metric": "eval_loss",
13 |     "early_stopping_metric_minimize": True,
14 |     "early_stopping_patience": 3,
15 |     "encoding": None,
16 |     "eval_batch_size": 8,
17 |     "evaluate_during_training": False,
18 |     "evaluate_during_training_silent": True,
19 |     "evaluate_during_training_steps": 2000,
20 |     "evaluate_during_training_verbose": False,
21 |     "fp16": True,
22 |     "gradient_accumulation_steps": 1,
23 |     "learning_rate": 4e-5,
24 |     "local_rank": -1,
25 |     "logging_steps": 50,
26 |     "manual_seed": None,
27 |     "max_grad_norm": 1.0,
28 |     "max_seq_length": 128,
29 |     "multiprocessing_chunksize": 500,
30 |     "n_gpu": 1,
31 |     "no_cache": False,
32 |     "no_save": False,
33 |     "num_train_epochs": 1,
34 |     "output_dir": "outputs/",
35 |     "overwrite_output_dir": False,
36 |     "process_count": cpu_count() - 2 if cpu_count() > 2 else 1,
37 |     "reprocess_input_data": True,
38 |     "save_best_model": True,
39 |     "save_eval_checkpoints": True,
40 |     "save_model_every_epoch": True,
41 |     "save_steps": 2000,
42 |     "save_optimizer_and_scheduler": True,
43 |     "silent": False,
44 |     "tensorboard_dir": None,
45 |     "train_batch_size": 8,
46 |     "use_cached_eval_features": False,
47 |     "use_early_stopping": False,
48 |     "use_multiprocessing": True,
49 |     "wandb_kwargs": {},
50 |     "wandb_project": None,
51 |     "warmup_ratio": 0.06,
52 |     "warmup_steps": 0,
53 |     "weight_decay": 0,
54 | }
55 | 
56 | if sys.platform == "win32":
57 |     global_args["process_count"] = min(global_args["process_count"], 61)
58 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/config/utils.py:
--------------------------------------------------------------------------------
 1 | def sweep_config_to_sweep_values(sweep_config):
 2 |     """
 3 |     Converts an instance of wandb.Config to plain values map.
 4 | 
 5 |     wandb.Config varies across versions quite significantly,
 6 |     so we use the `keys` method that works consistently.
 7 |     """
 8 | 
 9 |     return {key: sweep_config[key] for key in sweep_config.keys()}
10 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/conv_ai/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import ConvAIArgs
2 | from simpletransformers.conv_ai.conv_ai_model import ConvAIModel
3 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/custom_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/custom_models/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/experimental/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/experimental/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/experimental/classification/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.experimental.classification.classification_model import (
2 |     ClassificationModel,
3 | )
4 | from simpletransformers.experimental.classification.multi_label_classification_model import (
5 |     MultiLabelClassificationModel,
6 | )
7 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/experimental/classification/transformer_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/experimental/classification/transformer_models/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/experimental/classification/transformer_models/xlm_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from torch.nn import CrossEntropyLoss, MSELoss
 4 | from transformers.models.xlm.modeling_xlm import (
 5 |     SequenceSummary,
 6 |     XLMModel,
 7 |     XLMPreTrainedModel,
 8 | )
 9 | 
10 | 
class XLMForSequenceClassification(XLMPreTrainedModel):
    r"""
        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
            Labels for computing the sequence classification/regression loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Classification (or regression if config.num_labels==1) loss.
        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
            of shape ``(batch_size, sequence_length, hidden_size)``:
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
    Examples::
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=labels)
        loss, logits = outputs[:2]
    """

    def __init__(self, config, weight=None):
        """Build the XLM encoder and summary head.

        Args:
            config: Model configuration; ``config.num_labels`` selects
                regression (``1``) or classification (``> 1``).
            weight: Optional class-weight tensor for the cross-entropy loss.
        """
        super(XLMForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.weight = weight

        self.transformer = XLMModel(config)
        self.sequence_summary = SequenceSummary(config)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        langs=None,
        token_type_ids=None,
        position_ids=None,
        lengths=None,
        cache=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and head; prepend the loss when ``labels`` is given."""
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            langs=langs,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            lengths=lengths,
            cache=cache,
            head_mask=head_mask,
            # Previously accepted by this method but silently dropped.
            inputs_embeds=inputs_embeds,
        )

        output = transformer_outputs[0]
        logits = self.sequence_summary(output)

        outputs = (logits,) + transformer_outputs[
            1:
        ]  # Keep new_mems and attention/hidden states if they are here

        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Keep the class weights on the same device as the labels so
                # the loss does not fail when the model runs on an accelerator.
                if self.weight is not None:
                    weight = self.weight.to(labels.device)
                else:
                    weight = None
                loss_fct = CrossEntropyLoss(weight=weight)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs
91 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_generation/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import LanguageGenerationArgs
2 | from simpletransformers.language_generation.language_generation_model import (
3 |     LanguageGenerationModel,
4 | )
5 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_generation/language_generation_utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
# in https://github.com/rusiaaman/XLNet-gen#methodology
# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
# The XLNet and Transformer-XL helpers in this module prepend it to the prompt
# when args.padding_text is empty.
PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
(except for Alexei and Maria) are discovered.
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
remainder of the story. 1883 Western Siberia,
a young Grigori Rasputin is asked by his father and a group of men to perform magic.
Rasputin has a vision and denounces one of the men as a horse thief. Although his
father initially slaps him for making such an accusation, Rasputin watches as the
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
18 | 
19 | 
def prepare_ctrl_input(args, _, tokenizer, prompt_text):
    """Check a CTRL prompt and return it unchanged.

    Logs a hint when ``args.temperature`` is above 0.7, and logs a warning
    when the encoded prompt does not begin with one of the tokenizer's
    control codes. An empty prompt counts as not starting with a control code.

    Args:
        args: Object exposing ``temperature``.
        _: Unused model argument, kept for the shared preprocessing signature.
        tokenizer: Object exposing ``encode`` and ``control_codes``.
        prompt_text: The raw prompt.

    Returns:
        ``prompt_text``, unmodified.
    """
    if args.temperature > 0.7:
        logger.info(
            "CTRL typically works better with lower temperatures (and lower top_k)."
        )

    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False)
    # Guard the index: an empty prompt encodes to no tokens, and indexing
    # position 0 would raise IndexError.
    starts_with_control_code = bool(encoded_prompt) and any(
        encoded_prompt[0] == code for code in tokenizer.control_codes.values()
    )
    if not starts_with_control_code:
        logger.info(
            "WARNING! You are not starting your generation from a control code so you won't get good results"
        )
    return prompt_text
32 | 
33 | 
def prepare_xlm_input(args, model, tokenizer, prompt_text):
    """Select the XLM language id on ``model.config`` and return the prompt.

    When the config has ``lang2id`` and a truthy ``use_lang_emb``, the language
    is taken from ``args.xlm_language`` if it is a known key; otherwise the
    user is asked interactively until a known language is entered. The prompt
    text itself is never modified.
    """
    config = model.config
    uses_lang_emb = hasattr(config, "use_lang_emb") and config.use_lang_emb
    if not (hasattr(config, "lang2id") and uses_lang_emb):
        # No language embeddings to configure.
        return prompt_text

    known_languages = config.lang2id.keys()
    language = args.xlm_language if args.xlm_language in known_languages else None
    while language not in known_languages:
        language = input(
            f"Using XLM. Select language in {list(known_languages)} >>> "
        )

    config.lang_id = config.lang2id[language]
    return prompt_text
53 | 
54 | 
def prepare_xlnet_input(args, _, tokenizer, prompt_text):
    """Prefix the prompt with ``args.padding_text``, or ``PADDING_TEXT`` if that is empty."""
    prefix = args.padding_text or PADDING_TEXT
    return prefix + prompt_text
60 | 
61 | 
def prepare_transfoxl_input(args, _, tokenizer, prompt_text):
    """Prefix the prompt with ``args.padding_text``, or ``PADDING_TEXT`` if that is empty."""
    prefix = args.padding_text or PADDING_TEXT
    return prefix + prompt_text
67 | 
68 | 
# Prompt-preparation function per model-type key. Every function takes
# (args, model, tokenizer, prompt_text) and returns the prompt text to use.
PREPROCESSING_FUNCTIONS = {
    "ctrl": prepare_ctrl_input,
    "xlm": prepare_xlm_input,
    "xlnet": prepare_xlnet_input,
    "transfo-xl": prepare_transfoxl_input,
}
75 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_modeling/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import LanguageModelingArgs
2 | from simpletransformers.language_modeling.language_modeling_model import (
3 |     LanguageModelingModel,
4 | )
5 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_representation/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.language_representation.representation_model import (
2 |     RepresentationModel,
3 | )
4 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_representation/transformer_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/language_representation/transformer_models/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_representation/transformer_models/bert_model.py:
--------------------------------------------------------------------------------
 1 | from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
 2 | 
 3 | 
 4 | # supports both BERT & ROBERTA BASED MODELS
 5 | class BertForTextRepresentation(BertPreTrainedModel):
 6 |     r"""
 7 |     Outputs: `List` of token vectors, 1 list of max_seq token vectors per sentence given
 8 |     """  # noqa: ignore flake8"
 9 | 
10 |     def __init__(self, config, weight=None):
11 |         super(BertForTextRepresentation, self).__init__(config)
12 |         self.bert = BertModel(config)
13 |         self.weight = weight
14 |         self.init_weights()
15 | 
16 |     def forward(
17 |         self,
18 |         input_ids=None,
19 |         attention_mask=None,
20 |         token_type_ids=None,
21 |         position_ids=None,
22 |         head_mask=None,
23 |     ):
24 |         outputs = self.bert(
25 |             input_ids,
26 |             attention_mask=attention_mask,
27 |             token_type_ids=token_type_ids,
28 |             position_ids=position_ids,
29 |             head_mask=head_mask,
30 |             output_hidden_states=True,
31 |         )
32 |         hidden_states = outputs[2]
33 |         return hidden_states[-1]
34 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/language_representation/transformer_models/gpt2_model.py:
--------------------------------------------------------------------------------
 1 | from transformers.models.gpt2.modeling_gpt2 import GPT2Model, GPT2PreTrainedModel
 2 | 
 3 | 
 4 | # supports both BERT & ROBERTA BASED MODELS
 5 | class GPT2ForTextRepresentation(GPT2PreTrainedModel):
 6 |     r"""
 7 |     Outputs: `List` of token vectors, 1 list of max_seq token vectors per sentence given
 8 |     """  # noqa: ignore flake8"
 9 | 
10 |     def __init__(self, config, weight=None):
11 |         super(GPT2ForTextRepresentation, self).__init__(config)
12 |         self.gpt2 = GPT2Model(config)
13 |         self.weight = weight
14 |         self.init_weights()
15 | 
16 |     def resize_token_embeddings(self, new_len):
17 |         return self.gpt2.resize_token_embeddings(new_len)
18 | 
19 |     def forward(
20 |         self,
21 |         input_ids=None,
22 |         attention_mask=None,
23 |         token_type_ids=None,
24 |         position_ids=None,
25 |         head_mask=None,
26 |     ):
27 |         outputs = self.gpt2(
28 |             input_ids,
29 |             attention_mask=attention_mask,
30 |             token_type_ids=token_type_ids,
31 |             position_ids=position_ids,
32 |             head_mask=head_mask,
33 |             output_hidden_states=True,
34 |         )
35 |         hidden_states = outputs[2]
36 |         return hidden_states[-1]
37 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.losses.focal_loss import FocalLoss
2 | from simpletransformers.losses.dice_loss import DiceLoss
3 | from simpletransformers.losses.tversky_loss import TverskyLoss
4 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/losses/loss_utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import warnings
 3 | from torch.nn import CrossEntropyLoss
 4 | from simpletransformers.losses import FocalLoss, DiceLoss, TverskyLoss
 5 | 
 6 | 
 7 | def init_loss(weight, device, args):
 8 |     if weight and args.loss_type:
 9 |         warnings.warn(
10 |             f"weight and args.loss_type parametters are set at the same time"
11 |             f"will use weighted cross entropy loss. To use {args.loss_type} set weight to None"
12 |         )
13 |     if weight:
14 |         loss_fct = CrossEntropyLoss(weight=torch.Tensor(weight).to(device))
15 |     elif args.loss_type:
16 |         if args.loss_type == "focal":
17 |             loss_fct = FocalLoss(**args.loss_args)
18 |         elif args.loss_type == "dice":
19 |             loss_fct = DiceLoss(**args.loss_args)
20 |         elif args.loss_type == "tversky":
21 |             loss_fct = TverskyLoss(**args.loss_args)
22 |         else:
23 |             raise NotImplementedError(f"unknown {args.loss_type} loss function")
24 |     else:
25 |         loss_fct = None
26 | 
27 |     return loss_fct
28 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/model.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.classification.classification_model import ClassificationModel
2 | 
# Expose ClassificationModel under the name TransformerModel as well.
TransformerModel = ClassificationModel
4 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/ner/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import NERArgs
2 | from simpletransformers.ner.ner_model import NERModel
3 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/ner/ner_dataset_loading_script/ner_dataset_loading_script.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 HuggingFace Datasets Authors.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Lint as: python3
17 | """Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition"""
18 | 
19 | import logging
20 | 
21 | import datasets
22 | 
23 | 
24 | """
25 | Adapted from the Huggingface code at https://github.com/huggingface/datasets/blob/master/datasets/conll2003/conll2003.py
26 | """
27 | 
28 | 
class NERConfig(datasets.BuilderConfig):
    """Builder configuration for the NER loading script."""

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
          **kwargs: keyword arguments passed straight through to
            ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs)
38 | 
39 | 
class NER(datasets.GeneratorBasedBuilder):
    """NER dataset builder for CoNLL-style, space-separated token files."""

    BUILDER_CONFIG_CLASS = NERConfig

    def _info(self):
        """Describe the features: a sentence id plus parallel word/label sequences."""
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "sentence_id": datasets.Value("string"),
                    "words": datasets.Sequence(datasets.Value("string")),
                    "labels": datasets.Sequence(datasets.Value("string")),
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        # Only a TRAIN split is produced, fed by the configured data files.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": self.config.data_files},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(guid, example)`` pairs, one per sentence in ``filepath``.

        Sentences are separated by blank lines or ``-DOCSTART-`` lines. On
        every other line the first space-separated field is the word and the
        last one is the label.
        """
        logging.info("⏳ Generating examples from = %s", filepath)
        with open(filepath, encoding="utf-8") as f:
            guid = 0
            words = []
            labels = []
            for line in f:
                if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                    if words:
                        yield guid, {
                            "sentence_id": str(guid),
                            "words": words,
                            "labels": labels,
                        }
                        guid += 1
                        words = []
                        labels = []
                else:
                    # conll2003 words are space separated
                    splits = line.split(" ")
                    words.append(splits[0])
                    labels.append(splits[-1].rstrip())
            # last example: only emit it when something is pending, so a file
            # that ends with a blank line (or is empty) does not produce an
            # empty sentence.
            if words:
                yield guid, {
                    "sentence_id": str(guid),
                    "words": words,
                    "labels": labels,
                }
95 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/question_answering/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import QuestionAnsweringArgs
2 | from simpletransformers.question_answering.question_answering_model import (
3 |     QuestionAnsweringModel,
4 | )
5 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/question_answering/qa_dataset_loading_script/qa_dataset_loading_script.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import, division, print_function
 2 | 
 3 | import json
 4 | 
 5 | import datasets
 6 | 
 7 | 
 8 | """
 9 | Adapted from the Huggingface code at https://github.com/huggingface/datasets/blob/master/datasets/squad_v2/squad_v2.py
10 | """
11 | 
12 | 
class QAConfig(datasets.BuilderConfig):
    """Builder configuration for the question-answering loading script."""

    def __init__(self, is_training, **kwargs):
        """Create the configuration.

        Args:
          is_training: flag stored on the config as ``is_training``.
          **kwargs: keyword arguments passed straight through to
            ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs)
        self.is_training = is_training
23 | 
24 | 
class QA(datasets.GeneratorBasedBuilder):
    """Dataset builder that flattens SQuAD-style JSON into one example per question."""

    BUILDER_CONFIG_CLASS = QAConfig

    def _info(self):
        """Return the DatasetInfo describing the fields of each yielded example."""
        return datasets.DatasetInfo(
            features=datasets.Features(
                {
                    "qas_id": datasets.Value("string"),
                    "question_text": datasets.Value("string"),
                    "context_text": datasets.Value("string"),
                    "answer_text": datasets.Value("string"),
                    "start_position_character": datasets.Value("int32"),
                    "is_impossible": datasets.Value("bool"),
                    "answers": datasets.features.Sequence(
                        {
                            "text": datasets.Value("string"),
                            "answer_start": datasets.Value("int32"),
                        }
                    ),
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        A single TRAIN split is produced; its ``filepath`` is taken directly
        from ``self.config.data_files``. ``dl_manager`` is not used.
        """

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": self.config.data_files},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(qas_id, example)`` pairs read from the JSON file at ``filepath``.

        The file must contain a list of paragraphs, each with a ``context``
        string and a ``qas`` list. For answerable questions, training mode
        (``self.config.is_training``) keeps only the first answer's text and
        start offset, while evaluation mode keeps the full ``answers`` list.
        Questions flagged ``is_impossible`` keep the empty defaults.
        """
        # Parse everything first so the handle is closed before the first
        # yield; a suspended generator would otherwise hold the file open.
        with open(filepath, encoding="utf-8") as f:
            examples_to_process = json.load(f)

        for paragraph in examples_to_process:
            context_text = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position_character = -1
                answer_text = ""
                answers = []

                # A missing flag means the question is answerable.
                is_impossible = qa.get("is_impossible", False)

                if not is_impossible:
                    if self.config.is_training:
                        answer = qa["answers"][0]
                        answer_text = answer["text"]
                        start_position_character = answer["answer_start"]
                    else:
                        answers = qa["answers"]

                yield qas_id, {
                    "qas_id": qas_id,
                    "question_text": question_text,
                    "context_text": context_text,
                    "answer_text": answer_text,
                    "start_position_character": start_position_character,
                    "is_impossible": is_impossible,
                    "answers": answers,
                }
95 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/retrieval/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import RetrievalArgs
2 | from simpletransformers.retrieval.retrieval_model import RetrievalModel
3 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import Seq2SeqArgs
2 | from simpletransformers.seq2seq.seq2seq_model import Seq2SeqModel
3 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/streamlit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenWu98/algorithmic-creativity/7176dc7d35ca17605a76fa873ec97ad62a8537ac/simpletransformers/simpletransformers/streamlit/__init__.py


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/streamlit/ner_view.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import pandas as pd
 3 | 
 4 | from simpletransformers.ner import NERModel
 5 | from simpletransformers.streamlit.streamlit_utils import (
 6 |     get,
 7 |     simple_transformers_model,
 8 |     get_color,
 9 | )
10 | 
11 | 
12 | ENTITY_WRAPPER = """<mark style="background: rgba{}; font-weight: 450; border-radius: 0.5rem; margin: 0.1em; padding: 0.25rem; display: inline-block">{} {}</mark>"""  # noqa
13 | ENTITY_LABEL_WRAPPER = """<span style="background: #fff; font-size: 0.56em; font-weight: bold; padding: 0.3em 0.3em; vertical-align: middle; margin: 0 0 0.15rem 0.5rem; line-height: 1; display: inline-block">{}</span>"""  # noqa
14 | 
15 | 
def format_word(word, entity, entity_checkboxes, entity_color_map):
    """Return ``word`` as-is, or wrapped in highlight HTML when its entity is enabled.

    Args:
        word: Text of the token to render.
        entity: Entity label predicted for the token.
        entity_checkboxes: Mapping from entity label to a truthy/falsy
            "show this entity" flag.
        entity_color_map: Mapping from entity label to the value substituted
            into the ``rgba{}`` slot of the wrapper template.

    Returns:
        The bare word when the entity is switched off, otherwise the word and
        its label embedded in the ``ENTITY_WRAPPER`` markup.
    """
    # Disabled entities are rendered as plain text.
    if not entity_checkboxes[entity]:
        return word

    label_markup = ENTITY_LABEL_WRAPPER.format(entity)
    return ENTITY_WRAPPER.format(entity_color_map[entity], word, label_markup)
23 | 
24 | 
@st.cache(hash_funcs={NERModel: simple_transformers_model})
def get_prediction(model, input_text):
    """Run ``model.predict`` on a single text and return only the predictions.

    ``model.predict`` is expected to return a 2-tuple; the second element is
    discarded. The result is cached by Streamlit via ``st.cache``.
    """
    predicted_tags, _unused_outputs = model.predict([input_text])
    return predicted_tags
30 | 
31 | 
def ner_viewer(model):
    """Draw the Streamlit NER page for ``model``.

    The sidebar gets one checkbox per label in ``model.args.labels_list`` and
    a slider that writes back to ``model.args.max_seq_length``. The main area
    shows a text box and the prediction for its content, with enabled
    entities highlighted through ``format_word``.
    """
    session_state = get(
        max_seq_length=model.args.max_seq_length,
    )
    model.args.max_seq_length = session_state.max_seq_length

    labels = model.args.labels_list
    sidebar = st.sidebar

    # One toggle per entity label, all switched on initially.
    sidebar.subheader("Entities")
    entity_checkboxes = {}
    for label in labels:
        entity_checkboxes[label] = sidebar.checkbox(label, value=True)

    entity_color_map = {}
    for position, label in enumerate(labels):
        entity_color_map[label] = get_color(position)

    sidebar.subheader("Parameters")
    model.args.max_seq_length = sidebar.slider(
        "Max Seq Length",
        min_value=1,
        max_value=512,
        value=model.args.max_seq_length,
    )

    st.subheader("Enter text: ")
    input_text = st.text_area("")

    # Only one text is submitted, so only the first prediction is rendered.
    tagged_tokens = get_prediction(model, input_text)[0]

    rendered_words = []
    for token in tagged_tokens:
        for word, entity in token.items():
            rendered_words.append(
                format_word(word, entity, entity_checkboxes, entity_color_map)
            )
    to_write = " ".join(rendered_words)

    st.subheader("Predictions")
    st.write(to_write, unsafe_allow_html=True)
66 | 


--------------------------------------------------------------------------------
/simpletransformers/simpletransformers/t5/__init__.py:
--------------------------------------------------------------------------------
1 | from simpletransformers.config.model_args import T5Args
2 | from simpletransformers.t5.t5_model import T5Model
3 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/language_modeling/test_language_modeling_only.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pandas as pd
 4 | import pytest
 5 | 
 6 | from simpletransformers.language_modeling import LanguageModelingModel
 7 | 
 8 | 
 9 | @pytest.mark.parametrize(
10 |     "model_type, model_name",
11 |     [
12 |         ("bert", "bert-base-uncased"),
13 |         ("longformer", "allenai/longformer-base-4096"),
14 |         ("bert", None),
15 |         ("electra", None),
16 |         ("longformer", None),
17 |         # ("xlnet", "xlnet-base-cased"),
18 |         # ("xlm", "xlm-mlm-17-1280"),
19 |         ("roberta", "roberta-base"),
20 |         # ("distilbert", "distilbert-base-uncased"),
21 |         # ("albert", "albert-base-v1"),
22 |         # ("camembert", "camembert-base"),
23 |         # ("xlmroberta", "xlm-roberta-base"),
24 |         # ("flaubert", "flaubert-base-cased"),
25 |     ],
26 | )
27 | def test_language_modeling(model_type, model_name):
28 |     with open("train.txt", "w") as f:
29 |         for i in range(100):
30 |             f.writelines("Hello world with Simple Transformers! \n")
31 | 
32 |     if model_type == "electra":
33 |         model_args = {
34 |             "reprocess_input_data": True,
35 |             "overwrite_output_dir": True,
36 |             "num_train_epochs": 1,
37 |             "no_save": True,
38 |             "vocab_size": 100,
39 |             "generator_config": {
40 |                 "embedding_size": 512,
41 |                 "hidden_size": 256,
42 |                 "num_hidden_layers": 1,
43 |             },
44 |             "discriminator_config": {
45 |                 "embedding_size": 512,
46 |                 "hidden_size": 256,
47 |                 "num_hidden_layers": 2,
48 |             },
49 |         }
50 |     else:
51 |         model_args = {
52 |             "reprocess_input_data": True,
53 |             "overwrite_output_dir": True,
54 |             "num_train_epochs": 1,
55 |             "no_save": True,
56 |         }
57 |         if model_name is None:
58 |             model_args["vocab_size"] = 100
59 | 
60 |     model = LanguageModelingModel(
61 |         model_type,
62 |         model_name,
63 |         args=model_args,
64 |         train_files="train.txt",
65 |         use_cuda=False,
66 |     )
67 | 
68 |     # Train the model
69 |     model.train_model("train.txt")
70 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/test_language_modeling.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pandas as pd
 4 | import pytest
 5 | 
 6 | from simpletransformers.language_modeling import LanguageModelingModel
 7 | 
 8 | 
 9 | @pytest.mark.parametrize(
10 |     "model_type, model_name",
11 |     [
12 |         ("bert", "bert-base-uncased"),
13 |         ("longformer", "allenai/longformer-base-4096"),
14 |         ("bert", None),
15 |         ("electra", None),
16 |         ("longformer", None),
17 |         # ("xlnet", "xlnet-base-cased"),
18 |         # ("xlm", "xlm-mlm-17-1280"),
19 |         ("roberta", "roberta-base"),
20 |         # ("distilbert", "distilbert-base-uncased"),
21 |         # ("albert", "albert-base-v1"),
22 |         # ("camembert", "camembert-base"),
23 |         # ("xlmroberta", "xlm-roberta-base"),
24 |         # ("flaubert", "flaubert-base-cased"),
25 |     ],
26 | )
27 | def test_language_modeling(model_type, model_name):
28 |     with open("train.txt", "w") as f:
29 |         for i in range(100):
30 |             f.writelines("Hello world with Simple Transformers! \n")
31 | 
32 |     if model_type == "electra":
33 |         model_args = {
34 |             "reprocess_input_data": True,
35 |             "overwrite_output_dir": True,
36 |             "num_train_epochs": 1,
37 |             "no_save": True,
38 |             "vocab_size": 100,
39 |             "generator_config": {
40 |                 "embedding_size": 512,
41 |                 "hidden_size": 256,
42 |                 "num_hidden_layers": 1,
43 |             },
44 |             "discriminator_config": {
45 |                 "embedding_size": 512,
46 |                 "hidden_size": 256,
47 |                 "num_hidden_layers": 2,
48 |             },
49 |         }
50 |     else:
51 |         model_args = {
52 |             "reprocess_input_data": True,
53 |             "overwrite_output_dir": True,
54 |             "num_train_epochs": 1,
55 |             "no_save": True,
56 |         }
57 |         if model_name is None:
58 |             model_args["vocab_size"] = 100
59 | 
60 |     if model_name is None:
61 |         model_args["vocab_size"] = 100
62 | 
63 |     model = LanguageModelingModel(
64 |         model_type,
65 |         model_name,
66 |         args=model_args,
67 |         train_files="train.txt",
68 |         use_cuda=False,
69 |     )
70 | 
71 |     # Train the model
72 |     model.train_model("train.txt")
73 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/test_language_representation.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from simpletransformers.language_representation import RepresentationModel
 4 | 
 5 | 
 6 | @pytest.mark.parametrize(
 7 |     "model_type, model_name",
 8 |     [
 9 |         ("bert", "bert-base-uncased"),
10 |         ("roberta", "roberta-base"),
11 |         ("gpt2", "distilgpt2"),
12 |     ],
13 | )
14 | @pytest.mark.parametrize("combine_strategy", ["mean", "concat", None])
15 | def test_shapes(model_type, model_name, combine_strategy):
16 |     sentence_list = ["Example sentence 1", "Example sentence 2"]
17 |     # Create a ClassificationModel
18 |     model = RepresentationModel(
19 |         model_type,
20 |         model_name,
21 |         use_cuda=False,
22 |         args={
23 |             "no_save": True,
24 |             "reprocess_input_data": True,
25 |             "overwrite_output_dir": True,
26 |         },
27 |     )
28 |     encoded_sentences = model.encode_sentences(
29 |         sentence_list, combine_strategy=combine_strategy
30 |     )
31 |     longest_seq = (
32 |         3  # RepresentationModel truncates sentences to the longest sentence in the list
33 |     )
34 |     if model_type == "bert" or model_type == "roberta":
35 |         longest_seq += 2  # add [CLS] & [SEP] tokens added by BERT & ROBERTA Models
36 |     # last dimention is the embedding dimension, it depends on the model
37 |     if combine_strategy == None:
38 |         assert encoded_sentences.shape == (len(sentence_list), longest_seq, 768)
39 |     if combine_strategy == "concat":
40 |         assert encoded_sentences.shape == (len(sentence_list), longest_seq * 768)
41 |     if combine_strategy == "mean":
42 |         assert encoded_sentences.shape == (len(sentence_list), 768)
43 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/test_named_entity_recognition.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pytest
 3 | 
 4 | from simpletransformers.ner import NERModel
 5 | 
 6 | 
 7 | @pytest.mark.parametrize(
 8 |     "model_type, model_name",
 9 |     [
10 |         ("bert", "bert-base-uncased"),
11 |         ("bigbird", "google/bigbird-roberta-base"),
12 |         ("longformer", "allenai/longformer-base-4096"),
13 |         # ("xlnet", "xlnet-base-cased"),
14 |         # ("xlm", "xlm-mlm-17-1280"),
15 |         ("roberta", "roberta-base"),
16 |         # ("distilbert", "distilbert-base-uncased"),
17 |         # ("albert", "albert-base-v1"),
18 |         # ("camembert", "camembert-base"),
19 |         # ("xlmroberta", "xlm-roberta-base"),
20 |         # ("flaubert", "flaubert-base-cased"),
21 |     ],
22 | )
23 | def test_named_entity_recognition(model_type, model_name):
24 |     # Creating train_df  and eval_df for demonstration
25 |     train_data = [
26 |         [0, "Simple", "B-MISC"],
27 |         [0, "Transformers", "I-MISC"],
28 |         [0, "started", "O"],
29 |         [1, "with", "O"],
30 |         [0, "text", "O"],
31 |         [0, "classification", "B-MISC"],
32 |         [1, "Simple", "B-MISC"],
33 |         [1, "Transformers", "I-MISC"],
34 |         [1, "can", "O"],
35 |         [1, "now", "O"],
36 |         [1, "perform", "O"],
37 |         [1, "NER", "B-MISC"],
38 |     ]
39 |     train_df = pd.DataFrame(train_data, columns=["sentence_id", "words", "labels"])
40 | 
41 |     eval_data = [
42 |         [0, "Simple", "B-MISC"],
43 |         [0, "Transformers", "I-MISC"],
44 |         [0, "was", "O"],
45 |         [1, "built", "O"],
46 |         [1, "for", "O"],
47 |         [0, "text", "O"],
48 |         [0, "classification", "B-MISC"],
49 |         [1, "Simple", "B-MISC"],
50 |         [1, "Transformers", "I-MISC"],
51 |         [1, "then", "O"],
52 |         [1, "expanded", "O"],
53 |         [1, "to", "O"],
54 |         [1, "perform", "O"],
55 |         [1, "NER", "B-MISC"],
56 |     ]
57 |     eval_df = pd.DataFrame(eval_data, columns=["sentence_id", "words", "labels"])
58 | 
59 |     # Create a NERModel
60 |     model = NERModel(
61 |         model_type,
62 |         model_name,
63 |         args={
64 |             "no_save": True,
65 |             "overwrite_output_dir": True,
66 |             "reprocess_input_data": False,
67 |         },
68 |         use_cuda=False,
69 |     )
70 | 
71 |     # Train the model
72 |     model.train_model(train_df)
73 | 
74 |     # Evaluate the model
75 |     result, model_outputs, predictions = model.eval_model(eval_df)
76 | 
77 |     # Predictions on arbitary text strings
78 |     predictions, raw_outputs = model.predict(["Some arbitary sentence"])
79 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/test_question_answering.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | 
  5 | import pytest
  6 | 
  7 | from simpletransformers.question_answering import QuestionAnsweringModel
  8 | 
  9 | 
 10 | @pytest.mark.parametrize(
 11 |     "model_type, model_name",
 12 |     [
 13 |         ("bert", "bert-base-uncased"),
 14 |         ("longformer", "allenai/longformer-base-4096"),
 15 |         # ("reformer", "google/reformer-crime-and-punishment"),
 16 |         # ("xlnet", "xlnet-base-cased"),
 17 |         # ("xlm", "xlm-mlm-17-1280"),
 18 |         # ("roberta", "roberta-base"),
 19 |         # ("distilbert", "distilbert-base-uncased"),
 20 |         # ("albert", "albert-base-v1"),
 21 |         # ("camembert", "camembert-base"),
 22 |         # ("xlmroberta", "xlm-roberta-base"),
 23 |         # ("flaubert", "flaubert-base-cased"),
 24 |     ],
 25 | )
 26 | def test_question_answering(model_type, model_name):
 27 |     # Create dummy data to use for training.
 28 |     train_data = [
 29 |         {
 30 |             "context": "This is the first context",
 31 |             "qas": [
 32 |                 {
 33 |                     "id": "00001",
 34 |                     "is_impossible": False,
 35 |                     "question": "Which context is this?",
 36 |                     "answers": [{"text": "the first", "answer_start": 8}],
 37 |                 }
 38 |             ],
 39 |         },
 40 |         {
 41 |             "context": "Other legislation followed, including the Migratory Bird Conservation Act of 1929, a 1937 treaty prohibiting the hunting of right and gray whales,\
 42 |                 and the Bald Eagle Protection Act of 1940. These later laws had a low cost to society—the species were relatively rare—and little opposition was raised",
 43 |             "qas": [
 44 |                 {
 45 |                     "id": "00002",
 46 |                     "is_impossible": False,
 47 |                     "question": "What was the cost to society?",
 48 |                     "answers": [{"text": "low cost", "answer_start": 225}],
 49 |                 },
 50 |                 {
 51 |                     "id": "00003",
 52 |                     "is_impossible": False,
 53 |                     "question": "What was the name of the 1937 treaty?",
 54 |                     "answers": [
 55 |                         {"text": "Bald Eagle Protection Act", "answer_start": 167}
 56 |                     ],
 57 |                 },
 58 |                 {
 59 |                     "id": "00004",
 60 |                     "is_impossible": True,
 61 |                     "question": "How did Alexandar Hamilton die?",
 62 |                     "answers": [],
 63 |                 },
 64 |             ],
 65 |         },
 66 |     ]  # noqa
 67 | 
 68 |     for i in range(4):
 69 |         train_data.extend(train_data)
 70 | 
 71 |     # Save as a JSON file
 72 |     os.makedirs("data", exist_ok=True)
 73 |     with open("data/train.json", "w") as f:
 74 |         json.dump(train_data, f)
 75 | 
 76 |     logging.basicConfig(level=logging.WARNING)
 77 |     transformers_logger = logging.getLogger("transformers")
 78 |     transformers_logger.setLevel(logging.ERROR)
 79 | 
 80 |     # Create the QuestionAnsweringModel
 81 |     model = QuestionAnsweringModel(
 82 |         model_type,
 83 |         model_name,
 84 |         args={
 85 |             "no_save": True,
 86 |             "reprocess_input_data": True,
 87 |             "overwrite_output_dir": True,
 88 |         },
 89 |         use_cuda=False,
 90 |     )
 91 | 
 92 |     # Train the model
 93 |     model.train_model("data/train.json")
 94 | 
 95 |     # Evaluate the model. (Being lazy and evaluating on the train data itself)
 96 |     result, text = model.eval_model("data/train.json")
 97 | 
 98 |     # Making predictions using the model.
 99 |     to_predict = [
100 |         {
101 |             "context": "This is the context used for demonstrating predictions.",
102 |             "qas": [{"question": "What is this context?", "id": "0"}],
103 |         }
104 |     ]
105 | 
106 |     model.predict(to_predict)
107 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/test_seq2seq.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pandas as pd
 4 | import pytest
 5 | 
 6 | from simpletransformers.seq2seq import Seq2SeqArgs, Seq2SeqModel
 7 | 
 8 | 
 9 | @pytest.mark.parametrize(
10 |     "encoder_decoder_type, encoder_decoder_name, encoder_type, use_hf_datasets",
11 |     [
12 |         ("bart", "facebook/bart-base", "bart", True),
13 |         ("bart", "facebook/bart-base", "bart", False),
14 |         ("roberta-base", "bert-base-cased", "roberta", True),
15 |         ("roberta-base", "bert-base-cased", "roberta", False),
16 |     ],
17 | )
18 | def test_seq2seq(
19 |     encoder_decoder_type, encoder_decoder_name, encoder_type, use_hf_datasets
20 | ):
21 |     train_data = [
22 |         ["one", "1"],
23 |         ["two", "2"],
24 |     ]
25 | 
26 |     train_df = pd.DataFrame(train_data, columns=["input_text", "target_text"])
27 | 
28 |     eval_data = [
29 |         ["three", "3"],
30 |         ["four", "4"],
31 |     ]
32 | 
33 |     eval_df = pd.DataFrame(eval_data, columns=["input_text", "target_text"])
34 | 
35 |     model_args = {
36 |         "reprocess_input_data": True,
37 |         "overwrite_output_dir": True,
38 |         "max_seq_length": 128,
39 |         "train_batch_size": 2,
40 |         "num_train_epochs": 2,
41 |         "use_multiprocessing": False,
42 |         "max_length": 15,
43 |         "manual_seed": 4,
44 |         "do_sample": False,
45 |         "num_return_sequences": 1,
46 |         "use_hf_datasets": use_hf_datasets,
47 |     }
48 | 
49 |     if encoder_type == "bart":
50 |         model = Seq2SeqModel(
51 |             encoder_decoder_type=encoder_decoder_type,
52 |             encoder_decoder_name=encoder_decoder_name,
53 |             args=model_args,
54 |             use_cuda=False,
55 |         )
56 |     else:
57 |         model = Seq2SeqModel(
58 |             encoder_type=encoder_type,
59 |             encoder_name=encoder_decoder_type,
60 |             decoder_name=encoder_decoder_name,
61 |             args=model_args,
62 |             use_cuda=False,
63 |         )
64 | 
65 |     model.train_model(train_df)
66 | 
67 |     model.eval_model(eval_df)
68 | 
69 |     a = model.predict(["five"])[0]
70 | 


--------------------------------------------------------------------------------
/simpletransformers/tests/test_t5.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import pytest
 3 | 
 4 | from simpletransformers.t5 import T5Model
 5 | 
 6 | 
 7 | def test_t5():
 8 |     train_data = [
 9 |         ["convert", "one", "1"],
10 |         ["convert", "two", "2"],
11 |     ]
12 | 
13 |     train_df = pd.DataFrame(train_data, columns=["prefix", "input_text", "target_text"])
14 | 
15 |     eval_data = [
16 |         ["convert", "three", "3"],
17 |         ["convert", "four", "4"],
18 |     ]
19 | 
20 |     eval_df = pd.DataFrame(eval_data, columns=["prefix", "input_text", "target_text"])
21 | 
22 |     eval_df = train_df.copy()
23 | 
24 |     model_args = {
25 |         "reprocess_input_data": True,
26 |         "overwrite_output_dir": True,
27 |         "max_seq_length": 10,
28 |         "train_batch_size": 2,
29 |         "num_train_epochs": 2,
30 |         "save_model_every_epoch": False,
31 |         "max_length": 20,
32 |         "num_beams": 1,
33 |     }
34 | 
35 |     # Create T5 Model
36 |     model = T5Model("t5", "t5-base", args=model_args, use_cuda=False)
37 | 
38 |     # Train T5 Model on new task
39 |     model.train_model(train_df)
40 | 
41 |     # Evaluate T5 Model on new task
42 |     model.eval_model(eval_df)
43 | 
44 |     # Predict with trained T5 model
45 |     model.predict(["convert: four", "convert: five"])
46 | 
47 |     # Load test
48 |     model = T5Model("t5", "outputs", args=model_args, use_cuda=False)
49 | 
50 |     # Evaluate T5 Model on new task
51 |     model.eval_model(eval_df)
52 | 
53 |     # Predict with trained T5 model
54 |     model.predict(["convert: four", "convert: five"])
55 | 


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | build/
 3 | dist/
 4 | *.egg-info/
 5 | __pycache__
 6 | .ipynb_checkpoints/
 7 | .DS_Store
 8 | **.pyc
 9 | *.png
10 | *.txt
11 | **/outputs/
12 | **/wandb/
13 | **/exp/
14 | **/exp_local/
15 | data/
16 | eval/
17 | assets/
18 | **.pth
19 | **.npz
20 | core
21 | **.log
22 | *.jsonl


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Aaron Lou
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/catsample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def gumbel_softmax(categorical_probs, hard=False, eps=1e-9):
 6 |     logits = categorical_probs.clamp(min=1e-9).log()
 7 |     return F.gumbel_softmax(logits, hard=hard)
 8 | 
 9 | 
def sample_categorical(categorical_probs, method="hard"):
    """Sample category indices along the last dimension of ``categorical_probs``.

    Each probability is divided by the negative log of fresh uniform noise
    and the arg-max over the last dimension is returned.

    Args:
        categorical_probs: Tensor whose last dimension holds category weights.
        method: Only ``"hard"`` is supported.

    Returns:
        Tensor of sampled indices with the last dimension reduced.

    Raises:
        ValueError: If ``method`` is anything other than ``"hard"``.
    """
    if method != "hard":
        raise ValueError(f"Method {method} for sampling categorical variables is not valid.")

    uniform_noise = torch.rand_like(categorical_probs)
    # The 1e-10 offsets keep both the logarithm and the division finite.
    gumbel_norm = 1e-10 - (uniform_noise + 1e-10).log()
    perturbed = categorical_probs / gumbel_norm
    return perturbed.argmax(dim=-1)
16 |     


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/configs/config.yaml:
--------------------------------------------------------------------------------
 1 | defaults:
 2 |   - _self_
 3 |   - model: small
 4 |   - override hydra/launcher: submitit_slurm
 5 | 
 6 | ngpus: 1
 7 | tokens: 50257
 8 | add_vocab: ""
 9 | 
10 | training:
11 |   batch_size: 64  # 512
12 |   accum: 1
13 |   n_iters: 100000
14 |   snapshot_freq: 5000
15 |   log_freq: 50
16 |   eval_freq: 500
17 |   snapshot_freq_for_preemption: 5000
18 |   weight: standard
19 |   snapshot_sampling: True
20 |   ema: 0.9999
21 | 
22 | data:
23 |   train: openwebtext
24 |   valid: wikitext103
25 |   cache_dir: data
26 | 
27 | graph:
28 |   type: absorb
29 |   file: data
30 |   report_all: False
31 | 
32 | noise:
33 |   type: loglinear
34 |   sigma_min: 1e-4
35 |   sigma_max: 20
36 | 
37 | sampling:
38 |   predictor: euler
39 |   steps: 128
40 |   noise_removal: True
41 | 
42 | eval:
43 |   batch_size: 32
44 |   perplexity: False
45 |   perplexity_batch_size: 32
46 | 
47 | optim:
48 |   weight_decay: 0
49 |   optimizer: AdamW
50 |   lr: 1e-4
51 |   beta1: 0.9
52 |   beta2: 0.999
53 |   eps: 1e-8
54 |   warmup: 2500
55 |   grad_clip: 1.
56 | 
57 | 
58 | hydra:
59 |   run:
60 |     dir: /data/locus/project_data/project_data2/chenwu2/creativity_results/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/configs/model/medium.yaml:
--------------------------------------------------------------------------------
1 | name: medium
2 | type: ddit
3 | hidden_size: 1024
4 | cond_dim: 128
5 | length: 32
6 | n_blocks: 24
7 | n_heads: 16
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/configs/model/small.yaml:
--------------------------------------------------------------------------------
1 | name: small
2 | type: ddit
3 | hidden_size: 768
4 | cond_dim: 128
5 | length: 32
6 | n_blocks: 12
7 | n_heads: 12
8 | scale_by_sigma: True
9 | dropout: 0.1


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/eval.sh:
--------------------------------------------------------------------------------
 1 | # $1: dataset
 2 | # $2: weight_decay
 3 | # $3: n_layers
 4 | 
 5 | EXP_DIR=creativity_results/creativity_data/triangle.0/train.json/train/checkpoint_outputs
 6 | 
 7 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
 8 | 
 9 | python eval_qa.py --dir $EXP_DIR --dataset $1 --data_dir $DATA_DIR
10 | 


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/load_model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from model import SEDD
 4 | import utils
 5 | from model.ema import ExponentialMovingAverage
 6 | import graph_lib
 7 | import noise_lib
 8 | 
 9 | from omegaconf import OmegaConf
10 | 
def load_model_hf(dir, device):
    """Load a SEDD model with ``SEDD.from_pretrained`` and build its graph and noise.

    Args:
        dir: Identifier handed to ``SEDD.from_pretrained``.
        device: Device the model and noise schedule are moved to; also passed
            to ``graph_lib.get_graph``.

    Returns:
        Tuple ``(score_model, graph, noise)``.
    """
    score_model = SEDD.from_pretrained(dir).to(device)
    # Graph and noise are both derived from the config carried by the model.
    model_config = score_model.config
    graph = graph_lib.get_graph(model_config, device)
    noise = noise_lib.get_noise(model_config).to(device)
    return score_model, graph, noise
16 | 
17 | 
def load_model_local(root_dir, ckpt_dir, added_tokens, device):
    """Rebuild a SEDD model from a local run directory and checkpoint file.

    Args:
        root_dir: Run directory given to ``utils.load_hydra_config_from_run``.
        ckpt_dir: Path passed to ``torch.load``; the loaded object must have
            ``'model'`` and ``'ema'`` entries.
        added_tokens: When truthy, its length is added to ``cfg.tokens``
            before the graph and model are built.
        device: Device for the model, the noise schedule and ``map_location``.

    Returns:
        Tuple ``(score_model, graph, noise)``; the model's parameters have
        been overwritten through ``ema.copy_to``.
    """
    cfg = utils.load_hydra_config_from_run(root_dir)
    if added_tokens:
        cfg.tokens += len(added_tokens)

    graph = graph_lib.get_graph(cfg, device)
    noise = noise_lib.get_noise(cfg).to(device)
    score_model = SEDD(cfg).to(device)
    ema = ExponentialMovingAverage(score_model.parameters(), decay=cfg.training.ema)

    # The checkpoint path is supplied by the caller (an earlier version
    # derived it as <root_dir>/checkpoints-meta/checkpoint.pth).
    checkpoint = torch.load(ckpt_dir, map_location=device)
    score_model.load_state_dict(checkpoint['model'])
    ema.load_state_dict(checkpoint['ema'])

    # NOTE(review): presumably store() backs up the raw weights and copy_to()
    # swaps in the averaged ones — confirm against model/ema.py.
    ema.store(score_model.parameters())
    ema.copy_to(score_model.parameters())
    return score_model, graph, noise
36 | 
37 | 
def load_model(root_dir, ckpt_dir, added_tokens, device):
    """Load a model, trying ``load_model_hf`` first and a local run second.

    Args:
        root_dir: Passed to ``load_model_hf``; on failure it is reused as the
            run directory for ``load_model_local``.
        ckpt_dir: Checkpoint location, used only by the local fallback.
        added_tokens: Extra vocabulary entries, used only by the local fallback.
        device: Device to place the loaded objects on.

    Returns:
        The tuple ``(score_model, graph, noise)`` from whichever loader succeeded.
    """
    try:
        return load_model_hf(root_dir, device)
    except Exception:
        # Any ordinary failure of the first loader triggers the local fallback.
        # KeyboardInterrupt and SystemExit are deliberately not intercepted.
        return load_model_local(root_dir, ckpt_dir, added_tokens, device)


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .transformer import SEDD


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/model/fused_add_dropout_scale.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from typing import Optional
 4 | from torch import Tensor
 5 | 
# flags required to enable jit fusion kernels
# These calls run as a side effect of importing this module and go through
# underscore-prefixed ``torch._C`` hooks.
torch._C._jit_set_profiling_mode(False)
torch._C._jit_set_profiling_executor(False)
torch._C._jit_override_can_fuse_on_cpu(True)
torch._C._jit_override_can_fuse_on_gpu(True)
11 | 
12 | 
def bias_dropout_add_scale(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float, training: bool
) -> Tensor:
    """Compute ``residual + scale * dropout(x + bias)`` with optional terms.

    Args:
        x: Input tensor.
        bias: Added to ``x`` before dropout when not ``None``.
        scale: Multiplier applied to the dropout output.
        residual: Added to the scaled result when not ``None``.
        prob: Dropout probability.
        training: Forwarded to ``F.dropout``.

    Returns:
        The combined tensor.
    """
    pre_dropout = x
    if bias is not None:
        pre_dropout = x + bias

    result = scale * F.dropout(pre_dropout, p=prob, training=training)

    if residual is not None:
        result = residual + result
    return result
24 | 
25 | 
def get_bias_dropout_add_scale(training):
    """Return a five-argument ``bias_dropout_add_scale`` with ``training`` bound.

    Args:
        training: Value forwarded as the ``training`` argument on every call.

    Returns:
        A callable ``(x, bias, scale, residual, prob)``.
    """
    def _bias_dropout_add(x, bias, scale, residual, prob):
        return bias_dropout_add_scale(
            x, bias, scale, residual, prob, training
        )

    return _bias_dropout_add
31 | 
32 | 
def modulate(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """Scale ``x`` by ``1 + scale`` and then add ``shift``."""
    scaled = x * (1 + scale)
    return scaled + shift
35 | 
36 | 
@torch.jit.script
def bias_dropout_add_scale_fused_train(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """TorchScript-compiled ``bias_dropout_add_scale`` with ``training`` fixed to ``True``."""
    return bias_dropout_add_scale(x, bias, scale, residual, prob, True)
42 | 
43 | 
@torch.jit.script
def bias_dropout_add_scale_fused_inference(
    x: Tensor, bias: Optional[Tensor], scale: Tensor, residual: Optional[Tensor], prob: float
) -> Tensor:
    """TorchScript-compiled ``bias_dropout_add_scale`` with ``training`` fixed to ``False``."""
    return bias_dropout_add_scale(x, bias, scale, residual, prob, False)
49 | 
@torch.jit.script
def modulate_fused(x: Tensor, shift: Tensor, scale: Tensor) -> Tensor:
    """TorchScript-compiled version of ``modulate``."""
    return modulate(x, shift, scale)


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/model/rotary.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch import nn
 3 | 
 4 | 
 5 | class Rotary(torch.nn.Module):
 6 |     def __init__(self, dim, base=10_000):
 7 |         super().__init__()
 8 |         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
 9 |         self.register_buffer("inv_freq", inv_freq)
10 |         self.seq_len_cached = None
11 |         self.cos_cached = None
12 |         self.sin_cached = None
13 | 
14 |     def forward(self, x, seq_dim=1):
15 |         seq_len = x.shape[seq_dim]
16 |         if seq_len != self.seq_len_cached:
17 |             self.seq_len_cached = seq_len
18 |             t = torch.arange(x.shape[seq_dim], device=x.device).type_as(self.inv_freq)
19 |             freqs = torch.einsum("i,j->ij", t, self.inv_freq.clone())
20 |             emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
21 |             # dims are: batch, seq_len, qkv, head, dim
22 |             self.cos_cached = emb.cos()[None, :, None, None, :].repeat(1,1,3,1,1)
23 |             self.sin_cached = emb.sin()[None, :, None, None, :].repeat(1,1,3,1,1)
24 |             # This makes the transformation on v an identity.
25 |             self.cos_cached[:,:,2,:,:].fill_(1.)
26 |             self.sin_cached[:,:,2,:,:].fill_(0.)
27 | 
28 |         return self.cos_cached, self.sin_cached
29 | 
30 | 
def rotate_half(x):
    """Split the last axis at its midpoint and return ``cat(-second, first)``."""
    midpoint = x.shape[-1] // 2
    first = x[..., :midpoint]
    second = x[..., midpoint:]
    return torch.cat((-second, first), dim=-1)
36 | 
37 | 
@torch.jit.script
def _apply_rotary_pos_emb_torchscript(qkv, cos, sin):
    """TorchScript rotary application: ``qkv * cos + rotate_half(qkv) * sin``."""
    return (qkv * cos) + (rotate_half(qkv) * sin)
41 | 
42 | 
def apply_rotary_pos_emb(qkv, cos, sin):
    """Apply rotary embeddings, using flash-attn when it works.

    Args:
        qkv: Packed tensor to rotate.
        cos: Cosine table, indexed here as ``[0, :, 0, 0, :]`` for flash-attn.
        sin: Sine table with the same layout as ``cos``.

    Returns:
        The rotated tensor, from ``flash_attn.layers.rotary.apply_rotary_emb_qkv_``
        when that import and call succeed, otherwise from the TorchScript
        fallback.
    """
    try:
        import flash_attn.layers.rotary
        # Use separate names for the sliced tables. Rebinding ``cos``/``sin``
        # here would make the fallback below receive the truncated tables if
        # the flash-attn call raises after the slicing.
        flash_cos = cos[0, :, 0, 0, :cos.shape[-1] // 2]
        flash_sin = sin[0, :, 0, 0, :sin.shape[-1] // 2]
        return flash_attn.layers.rotary.apply_rotary_emb_qkv_(
            qkv, flash_cos, flash_sin
        )
    except Exception:
        # Best-effort fallback for a missing package or a failing kernel.
        # KeyboardInterrupt and SystemExit are not intercepted.
        return _apply_rotary_pos_emb_torchscript(qkv, cos, sin)


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/model/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | 
 4 | 
 5 | def get_model_fn(model, train=False):
 6 |     """Create a function to give the output of the score-based model.
 7 | 
 8 |     Args:
 9 |         model: The score model.
10 |         train: `True` for training and `False` for evaluation.
11 |         mlm: If the input model is a mlm and models the base probability 
12 | 
13 |     Returns:
14 |         A model function.
15 |     """
16 | 
17 |     def model_fn(x, sigma):
18 |         """Compute the output of the score-based model.
19 | 
20 |         Args:
21 |             x: A mini-batch of input data.
22 |             labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
23 |               for different models.
24 | 
25 |         Returns:
26 |             A tuple of (model output, new mutable states)
27 |         """
28 |         if train:
29 |             model.train()
30 |         else:
31 |             model.eval()
32 |         
33 |             # otherwise output the raw values (we handle mlm training in losses.py)
34 |         return model(x, sigma)
35 | 
36 |     return model_fn
37 | 
38 | 
def get_score_fn(model, train=False, sampling=False):
    """Build a function that evaluates the model on ``(x, sigma)``.

    Args:
        model: The score model, wrapped through ``get_model_fn``.
        train: Forwarded to ``get_model_fn``; must be falsy when ``sampling``.
        sampling: When truthy the returned function exponentiates the model
            output before returning it.

    Returns:
        A function ``score_fn(x, sigma)``. ``sigma`` is flattened with
        ``reshape(-1)`` before the model is called.
    """
    if sampling:
        assert not train, "Must sample in eval mode"
    model_fn = get_model_fn(model, train=train)

    # NOTE(review): this autocast context encloses only the *definition* of
    # score_fn and has exited by the time the returned function is called, so
    # it does not appear to affect those calls. Confirm whether that is intended.
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        def score_fn(x, sigma):
            sigma = sigma.reshape(-1)
            score = model_fn(x, sigma)

            if sampling:
                # when sampling return true score (not log used for training)
                return score.exp()

            return score

    return score_fn
56 | 


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/noise_lib.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | import torch
 3 | import torch.nn as nn
 4 | import numpy as np
 5 | 
 6 | 
 7 | def get_noise(config):
 8 |     if config.noise.type == "geometric":
 9 |         return GeometricNoise(config.noise.sigma_min, config.noise.sigma_max)
10 |     elif config.noise.type == "loglinear":
11 |         return LogLinearNoise()
12 |     else:
13 |         raise ValueError(f"{config.noise.type} is not a valid noise")
14 | 
15 | 
class Noise(abc.ABC, nn.Module):
    """Abstract base class for noise schedules.

    Time ``t`` is assumed to go from 0 to 1. Subclasses implement
    ``rate_noise`` and ``total_noise``; calling the module returns both.
    """

    def forward(self, t):
        """Return the pair ``(total_noise(t), rate_noise(t))``."""
        return self.total_noise(t), self.rate_noise(t)

    @abc.abstractmethod
    def rate_noise(self, t):
        """
        Rate of change of noise ie g(t)
        """
        pass

    @abc.abstractmethod
    def total_noise(self, t):
        # Raw string: ``\i`` is not a valid escape sequence in a normal literal.
        r"""
        Total noise ie \int_0^t g(t) dt + g(0)
        """
        pass
39 | 
40 | 
class GeometricNoise(Noise, nn.Module):
    """Noise schedule interpolating geometrically between two sigma values.

    ``total_noise(t)`` is ``sigma_min ** (1 - t) * sigma_max ** t`` and
    ``rate_noise(t)`` is that value times ``log(sigma_max) - log(sigma_min)``.
    """

    def __init__(self, sigma_min=1e-3, sigma_max=1, learnable=False):
        """
        Args:
            sigma_min: Stored as ``self.sigmas[0]``.
            sigma_max: Stored as ``self.sigmas[1]``.
            learnable: When truthy, ``self.sigmas`` is wrapped in ``nn.Parameter``.
        """
        super().__init__()
        sigma_pair = 1.0 * torch.tensor([sigma_min, sigma_max])
        self.sigmas = nn.Parameter(sigma_pair) if learnable else sigma_pair
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Return the geometric interpolation at ``t`` times the log-ratio of the sigmas."""
        low = self.sigmas[0]
        high = self.sigmas[1]
        return low ** (1 - t) * high ** t * (high.log() - low.log())

    def total_noise(self, t):
        """Return ``sigmas[0] ** (1 - t) * sigmas[1] ** t``."""
        low = self.sigmas[0]
        high = self.sigmas[1]
        return low ** (1 - t) * high ** t
54 | 
55 | 
class LogLinearNoise(Noise, nn.Module):
    """
    Log-linear noise schedule, built so that 1 - 1/e^(n(t)) interpolates
    between 0 and ~1 as t goes from 0 to 1. Used for absorbing.

    Total noise is -log(1 - (1 - eps) * t), so the sigma will be (1 - eps) * t
    """

    def __init__(self, eps=1e-3):
        """
        Args:
            eps: Offset stored as ``self.eps``; both methods use ``1 - eps``.
        """
        super().__init__()
        self.eps = eps
        self.empty = nn.Parameter(torch.tensor(0.0))

    def rate_noise(self, t):
        """Return ``(1 - eps) / (1 - (1 - eps) * t)``."""
        keep = 1 - self.eps
        return keep / (1 - keep * t)

    def total_noise(self, t):
        """Return ``-log1p(-(1 - eps) * t)``."""
        keep = 1 - self.eps
        return -torch.log1p(-keep * t)
73 | 
74 | 


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh triangle.0 0.0 8
2 | 


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/run_sample.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import torch.nn.functional as F
 8 | import sampling
 9 | 
10 | 
def main():
    """Sample unconditionally from a SEDD model and print the decoded text.

    Command-line options:
        --model_path: Hub id or local run directory given to ``load_model``.
        --dataset: Parsed but not used in this function.
        --batch_size: Number of sequences to sample.
        --steps: Number of sampling steps.
        --add_vocab: Optional JSON file with extra tokens for the tokenizer.
        --checkpoint: Optional checkpoint file for the local-run fallback of
            ``load_model``; defaults to
            ``<model_path>/checkpoints-meta/checkpoint.pth``.
    """
    import os  # local import: only needed to build the default checkpoint path

    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--add_vocab", type=str, default=None)
    parser.add_argument("--checkpoint", type=str, default=None)
    args = parser.parse_args()

    # The tokenizer and extra vocabulary are prepared first because
    # load_model needs the list of added tokens.
    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    device = torch.device('cuda')
    # load_model takes (root_dir, ckpt_dir, added_tokens, device). The earlier
    # two-argument call raised TypeError before any sampling could happen.
    checkpoint = args.checkpoint or os.path.join(
        args.model_path, "checkpoints-meta", "checkpoint.pth"
    )
    model, graph, noise = load_model(args.model_path, checkpoint, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device
    )

    samples = sampling_fn(model)

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")
39 | 
40 | if __name__=="__main__":
41 |     main()


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/run_sample_cond.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | 
 9 | 
def main():
    """Sample from a SEDD model with a fixed prefix and suffix, then print the text.

    The prefix tokens are pinned to the first positions and the suffix tokens
    to the last positions of a 1024-token sequence. ``proj_fun`` rewrites
    those positions each time it is called.

    Command-line options:
        --model_path: Hub id or local run directory given to ``load_model``.
        --dataset: Parsed but not used in this function.
        --batch_size: Number of sequences to sample.
        --steps: Number of sampling steps.
        --prefix / --suffix: Text pinned at the start / end of each sample.
        --add_vocab: Optional JSON file with extra tokens for the tokenizer.
        --checkpoint: Optional checkpoint file for the local-run fallback of
            ``load_model``; defaults to
            ``<model_path>/checkpoints-meta/checkpoint.pth``.
    """
    import os  # local import: only needed to build the default checkpoint path

    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_path", default="louaaron/sedd-medium", type=str)
    parser.add_argument("--dataset", default="wikitext103", type=str)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--steps", type=int, default=1024)
    parser.add_argument("--prefix", type=str, default="Hi, my name is")
    parser.add_argument("--suffix", type=str, default=" and that's why I'm late.")
    parser.add_argument("--add_vocab", type=str, default=None)
    parser.add_argument("--checkpoint", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    prefix_ids = tokenizer(args.prefix).input_ids
    suffix_ids = tokenizer(args.suffix).input_ids
    input_ids = prefix_ids + suffix_ids
    input_locs = list(range(len(prefix_ids))) + list(range(1024-len(suffix_ids), 1024))

    # More generally, constraints can be defined with something like:
    # input_ids = [0, 1, 512, 8080, 50256, 20000]
    # input_locs = [5, 6, 19, 20, 1000, 1001]

    device = torch.device('cuda')
    input_ids = torch.tensor(input_ids, device=device)[None].repeat(args.batch_size, 1)

    def proj_fun(x):
        # Overwrite the constrained positions with the fixed token ids.
        x[:, input_locs] = input_ids
        return x

    # load_model takes (root_dir, ckpt_dir, added_tokens, device). The earlier
    # two-argument call raised TypeError before any sampling could happen.
    checkpoint = args.checkpoint or os.path.join(
        args.model_path, "checkpoints-meta", "checkpoint.pth"
    )
    model, graph, noise = load_model(args.model_path, checkpoint, added_tokens, device)

    sampling_fn = sampling.get_pc_sampler(
        graph, noise, (args.batch_size, 1024), 'analytic', args.steps, device=device, proj_fun=proj_fun
    )

    samples = proj_fun(sampling_fn(model))

    text_samples = tokenizer.batch_decode(samples)
    for i in text_samples:
        print(i)
        print("=================================================")
57 | 
58 | if __name__=="__main__":
59 |     main()


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/run_train.sh:
--------------------------------------------------------------------------------
 1 | python train.py \
 2 |     noise.type=loglinear \
 3 |     graph.type=absorb \
 4 |     model=small \
 5 |     training.accum=1 \
 6 |     data.train=creativity_data/triangle.0/train.json \
 7 |     data.valid=creativity_data/triangle.0/valid.json \
 8 |     add_vocab=creativity_data/triangle.0/vocab.json \
 9 |     hydra.run.dir=/data/locus/project_data/project_data2/chenwu2/creativity_results/creativity_data/triangle.0/train.json/train
10 | 
11 | python test.py \
12 |     --model_checkpoint_dir creativity_results/creativity_data/triangle.0/train.json/train \
13 |     --dataset creativity_data/triangle.0 \
14 |     --add_vocab creativity_data/triangle.0/vocab.json


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/test.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import argparse
 3 | import json
 4 | 
 5 | from load_model import load_model
 6 | from transformers import GPT2TokenizerFast
 7 | import sampling
 8 | from tqdm import tqdm
 9 | 
10 | import os
11 | 
12 | 
def main():
    """Generate model outputs for every checkpoint of a run and save them as JSON.

    For each file in ``<model_checkpoint_dir>/checkpoints`` the model is
    loaded, every item of ``<dataset>/test.json`` is completed with its
    ``input_text`` pinned as a prefix, and the results are written to
    ``<model_checkpoint_dir>/checkpoint_outputs/<checkpoint name>/all_items.json``.
    Checkpoints whose output directory already exists are skipped.

    Command-line options:
        --model_checkpoint_dir: Run directory containing ``checkpoints``.
        --dataset: Directory containing ``test.json``.
        --steps: Number of sampling steps.
        --add_vocab: Optional JSON file with extra tokens for the tokenizer.
    """
    parser = argparse.ArgumentParser(description="Generate some samples")
    parser.add_argument("--model_checkpoint_dir", default="", type=str)
    parser.add_argument("--dataset", default=None, type=str)
    parser.add_argument("--steps", type=int, default=128)
    parser.add_argument("--add_vocab", type=str, default=None)
    args = parser.parse_args()

    tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
    added_tokens = []
    if args.add_vocab:
        with open(args.add_vocab, 'r') as file:
            added_tokens = json.load(file)
        tokenizer.add_tokens(added_tokens)

    # Load the dataset
    with open(os.path.join(args.dataset, "test.json"), "r") as f:
        test_data = json.load(f)

    # List all files under the model checkpoint directory
    checkpoints_root = os.path.join(args.model_checkpoint_dir, "checkpoints")
    checkpoints = [os.path.join(checkpoints_root, f) for f in os.listdir(checkpoints_root)]
    print(checkpoints)

    device = torch.device('cuda')

    for checkpoint in checkpoints:
        # Decide whether to skip BEFORE loading the model, so checkpoints that
        # are already processed do not cost a full model load.
        checkpoint_dir = os.path.join(args.model_checkpoint_dir, "checkpoint_outputs", os.path.basename(checkpoint))
        if os.path.exists(checkpoint_dir):
            print(f"Skipping {checkpoint_dir} because it already exists")
            continue

        model, graph, noise = load_model(args.model_checkpoint_dir, checkpoint, added_tokens, device)
        os.makedirs(checkpoint_dir, exist_ok=True)

        def generate_output(input_text):
            """Sample one 32-token sequence with ``input_text`` pinned as its prefix."""
            prefix_ids = tokenizer(input_text).input_ids
            input_locs = list(range(len(prefix_ids)))

            # Shape (1, prefix_len), matching the batch of one sampled below.
            input_ids = torch.tensor(prefix_ids, device=device)[None]

            def proj_fun(x):
                # Overwrite the prefix positions with the prompt tokens.
                x[:, input_locs] = input_ids
                return x

            sampling_fn = sampling.get_pc_sampler(
                graph, noise, (1, 32), 'analytic', args.steps, device=device, proj_fun=proj_fun
            )

            samples = proj_fun(sampling_fn(model))

            text_samples = tokenizer.batch_decode(samples)
            assert len(text_samples) == 1
            # Keep only the text before the first end-of-text marker.
            return text_samples[0].split("<|endoftext|>")[0]

        all_items = []
        for sample in tqdm(test_data):
            item = {
                "input_text": sample["input_text"],
                "target_text": sample["target_text"],
                "type": sample["type"],
            }

            output = generate_output(sample["input_text"])
            print(sample["input_text"])
            print(sample["target_text"])
            print(output)
            print()
            item["model_output"] = output
            all_items.append(item)

        # Save the results
        with open(os.path.join(checkpoint_dir, "all_items.json"), "w") as f:
            json.dump(all_items, f, indent=4)
88 | 
89 | 
90 | if __name__=="__main__":
91 |     main()


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/train.py:
--------------------------------------------------------------------------------
 1 | """Training and evaluation"""
 2 | 
 3 | import hydra
 4 | import os
 5 | import numpy as np
 6 | import run_train
 7 | import utils
 8 | import torch.multiprocessing as mp
 9 | from hydra.core.hydra_config import HydraConfig
10 | from hydra.types import RunMode
11 | from omegaconf import OmegaConf, open_dict
12 | 
13 | 
@hydra.main(version_base=None, config_path="configs", config_name="config")
def main(cfg):
    """Resolve the working directory, set up logging and spawn the training processes.

    Args:
        cfg: Hydra config. ``cfg.ngpus`` is read first. If the config contains
            ``load_dir``, the config is replaced by the one stored in that run
            and its ``work_dir`` is reused.
    """
    ngpus = cfg.ngpus
    if "load_dir" in cfg:
        # Resuming: take the config (and work_dir) saved by the earlier run.
        hydra_cfg_path = os.path.join(cfg.load_dir, ".hydra/hydra.yaml")
        hydra_cfg = OmegaConf.load(hydra_cfg_path).hydra

        cfg = utils.load_hydra_config_from_run(cfg.load_dir)

        work_dir = cfg.work_dir
        utils.makedirs(work_dir)
    else:
        # Fresh run: use Hydra's run dir, or sweep dir/subdir outside RUN mode.
        hydra_cfg = HydraConfig.get()
        work_dir = hydra_cfg.run.dir if hydra_cfg.mode == RunMode.RUN else os.path.join(hydra_cfg.sweep.dir, hydra_cfg.sweep.subdir)
        utils.makedirs(work_dir)

    # open_dict is used so that these keys can be assigned on cfg.
    with open_dict(cfg):
        cfg.ngpus = ngpus
        cfg.work_dir = work_dir
        cfg.wandb_name = os.path.basename(os.path.normpath(work_dir))

    # Run the training pipeline
    # A random port in [10000, 20000) is handed to every spawned process.
    port = int(np.random.randint(10000, 20000))
    logger = utils.get_logger(os.path.join(work_dir, "logs"))

    # hydra_cfg is re-read here, replacing whatever either branch above assigned.
    hydra_cfg = HydraConfig.get()
    if hydra_cfg.mode != RunMode.RUN:
        logger.info(f"Run id: {hydra_cfg.job.id}")

    try:
        mp.set_start_method("forkserver")
        mp.spawn(run_train.run_multiprocess, args=(ngpus, cfg, port), nprocs=ngpus, join=True)
    except Exception as e:
        # Failures are logged with a traceback and not re-raised.
        logger.critical(e, exc_info=True)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()


--------------------------------------------------------------------------------
/triangle-discovery/diffusion/utils.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import os
 4 | import logging
 5 | from omegaconf import OmegaConf, open_dict
 6 | 
 7 | 
 8 | def load_hydra_config_from_run(load_dir):
 9 |     cfg_path = os.path.join(load_dir, ".hydra/config.yaml")
10 |     cfg = OmegaConf.load(cfg_path)
11 |     return cfg
12 | 
13 | 
def makedirs(dirname):
    """Create ``dirname`` and any missing parents; an existing directory is not an error."""
    os.makedirs(name=dirname, exist_ok=True)
16 | 
17 | 
def get_logger(logpath, package_files=(), displaying=True, saving=True, debug=False):
    """Configure and return the root logger.

    Existing handlers on the root logger are removed first, so repeated calls
    do not stack handlers.

    Args:
        logpath: File that log records are appended to when ``saving`` is true.
        package_files: Iterable of file paths; each path and then its contents
            are logged at INFO level once the handlers are installed. The
            default is an immutable empty tuple rather than a shared list.
        displaying: When true, attach a ``StreamHandler``.
        saving: When true, attach a ``FileHandler`` on ``logpath`` in append mode.
        debug: When true, use DEBUG level, otherwise INFO.

    Returns:
        The configured root logger.
    """
    logger = logging.getLogger()
    level = logging.DEBUG if debug else logging.INFO

    if logger.hasHandlers():
        logger.handlers.clear()

    logger.setLevel(level)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    if saving:
        info_file_handler = logging.FileHandler(logpath, mode="a")
        info_file_handler.setLevel(level)
        info_file_handler.setFormatter(formatter)
        logger.addHandler(info_file_handler)
    if displaying:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    for f in package_files:
        logger.info(f)
        with open(f, "r") as package_f:
            logger.info(package_f.read())

    return logger
47 | 
48 | 
def restore_checkpoint(ckpt_dir, state, device):
    """Load a checkpoint into ``state`` if the file exists.

    Args:
        ckpt_dir: Checkpoint file path (it is passed straight to ``torch.load``).
        state: Dict with ``'optimizer'``, ``'model'`` (an object exposing
            ``.module``), ``'ema'`` and ``'step'`` entries; updated in place.
        device: ``map_location`` for ``torch.load``.

    Returns:
        ``state``. It is unchanged when no checkpoint file exists; in that
        case the parent directory is created and a warning is logged.
    """
    if not os.path.exists(ckpt_dir):
        parent = os.path.dirname(ckpt_dir)
        # A bare file name has an empty dirname, and creating "" raises.
        if parent:
            makedirs(parent)
        logging.warning(f"No checkpoint found at {ckpt_dir}. Returned the same state as input")
        return state

    # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
    loaded_state = torch.load(ckpt_dir, map_location=device)
    state['optimizer'].load_state_dict(loaded_state['optimizer'])
    # strict=False: key mismatches between checkpoint and model are tolerated.
    state['model'].module.load_state_dict(loaded_state['model'], strict=False)
    state['ema'].load_state_dict(loaded_state['ema'])
    state['step'] = loaded_state['step']
    return state
61 | 
62 | 
def save_checkpoint(ckpt_dir, state):
    """Serialize optimizer, model, EMA and step to ``ckpt_dir`` with ``torch.save``.

    Args:
        ckpt_dir: Destination passed to ``torch.save``.
        state: Dict with ``'optimizer'``, ``'model'`` (an object exposing
            ``.module``), ``'ema'`` and ``'step'`` entries.
    """
    optimizer_state = state['optimizer'].state_dict()
    # The weights are taken from the wrapped ``.module``, not from the wrapper.
    model_state = state['model'].module.state_dict()
    ema_state = state['ema'].state_dict()
    torch.save(
        {
            'optimizer': optimizer_state,
            'model': model_state,
            'ema': ema_state,
            'step': state['step'],
        },
        ckpt_dir,
    )


--------------------------------------------------------------------------------
/triangle-discovery/ntp/eval.sh:
--------------------------------------------------------------------------------
# Usage: bash eval.sh <dataset> <weight_decay> <n_layers>
# $1: dataset
# $2: weight_decay
# $3: n_layers
# The three arguments together select the experiment directory
# $EXP_DIR/<dataset>_<weight_decay>_<n_layers>.

# Fail early with a usage message instead of building a malformed path.
: "${1:?usage: bash eval.sh <dataset> <weight_decay> <n_layers>}"
: "${2:?usage: bash eval.sh <dataset> <weight_decay> <n_layers>}"
: "${3:?usage: bash eval.sh <dataset> <weight_decay> <n_layers>}"

EXP_DIR=../creativity
DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data

# Quote every expansion so paths containing spaces are not word-split.
python eval_qa.py --dir "$EXP_DIR/${1}_${2}_${3}" --dataset "$1" --data_dir "$DATA_DIR"
9 | 


--------------------------------------------------------------------------------
/triangle-discovery/ntp/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh triangle.10 0.0 12


--------------------------------------------------------------------------------
/triangle-discovery/ntp/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh triangle.10 0.0 12 0


--------------------------------------------------------------------------------
/triangle-discovery/ntp/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 40000 \
25 |     --save_step_dense 20000 \
26 |     --max_steps 800000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --add_tokens \
33 |     --n_layer $N_LAYERS
34 | 


--------------------------------------------------------------------------------
/triangle-discovery/ntp/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------
/triangle-discovery/teacherless/eval.sh:
--------------------------------------------------------------------------------
 1 | # $1: dataset
 2 | # $2: weight_decay
 3 | # $3: n_layers
 4 | 
 5 | EXP_DIR=../creativity
 6 | DATA_DIR=/data/locus/project_data/project_data2/chenwu2/creativity_data
 7 | 
 8 | python eval_qa.py --dir $EXP_DIR/$1_$2_$3 --dataset $1 --data_dir $DATA_DIR
 9 | 
10 | 


--------------------------------------------------------------------------------
/triangle-discovery/teacherless/run_eval.sh:
--------------------------------------------------------------------------------
1 | bash eval.sh triangle_hybrid.10 0.0 12
2 | 


--------------------------------------------------------------------------------
/triangle-discovery/teacherless/run_train.sh:
--------------------------------------------------------------------------------
1 | bash train.sh triangle_hybrid.10 0.0 12 0
2 | 


--------------------------------------------------------------------------------
/triangle-discovery/teacherless/train.sh:
--------------------------------------------------------------------------------
 1 | MODEL_PATH=gpt2
 2 | 
 3 | DATASET=/data/locus/project_data/project_data2/chenwu2/creativity_data/$1/
 4 | WEIGHT_DECAY=$2
 5 | N_LAYERS=$3
 6 | GPU=$4
 7 | 
 8 | EXP_DIR=../creativity
 9 | 
10 | OUTPUT_DIR=$EXP_DIR/$1_$2_$3
11 | 
12 | CUDA_VISIBLE_DEVICES=$GPU python main.py \
13 |     --data_dir $DATASET \
14 |     --model_name_or_path ${MODEL_PATH} \
15 |     --weight_decay $WEIGHT_DECAY \
16 |     --output_dir $OUTPUT_DIR \
17 |     --max_seq_length 128 \
18 |     --max_length 128 \
19 |     --block_size 128 \
20 |     --train_batch_size 64 \
21 |     --eval_batch_size 64 \
22 |     --learning_rate 1e-4 \
23 |     --gradient_accumulation_steps 1 \
24 |     --save_step 40000 \
25 |     --save_step_dense 20000 \
26 |     --max_steps 1200000 \
27 |     --do_train \
28 |     --scheduler constant_schedule_with_warmup \
29 |     --fp16 \
30 |     --evaluate_during_training \
31 |     --predict_during_training \
32 |     --add_tokens \
33 |     --n_layer $N_LAYERS
34 | 


--------------------------------------------------------------------------------
/triangle-discovery/teacherless/utils.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import json
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | def read_data_source_target(file_name, return_num=False, return_json=False, cutoff=None):
 9 |     """
10 |     file_name: a .json file containing a list of items, each has 'input_text', 'target_text', as keys
11 |     """
12 |     with open(file_name, 'r', encoding='utf-8') as f:
13 |         data = json.load(f)
14 |     
15 |     if cutoff is not None:
16 |         data = data[:cutoff]
17 | 
18 |     if return_json:
19 |         if return_num:
20 |             return data, len(data)
21 |         return data
22 | 
23 |     keys = list(data[0].keys())
24 |     source_target_pair = []
25 |     for item in data:
26 |         source_target_pair.append([item[key] for key in keys])
27 | 
28 |     if return_num:
29 |         return pd.DataFrame(source_target_pair, columns=keys), len(data)
30 |     return pd.DataFrame(source_target_pair, columns=keys)
31 | 


--------------------------------------------------------------------------------