├── .gitignore ├── LICENSE ├── README.md ├── configs ├── __init__.py ├── config.py ├── datamodule │ ├── __init__.py │ └── longcrawl64.py ├── experiment │ └── longcrawl64 │ │ ├── delta_net │ │ ├── 125m_2b.yaml │ │ ├── 360m_7b.yaml │ │ ├── 760m_16b.yaml │ │ └── 760m_48b.yaml │ │ ├── forgetting_transformer │ │ ├── llama_125m_2b.yaml │ │ ├── llama_360m_7b.yaml │ │ ├── llama_760m_16b.yaml │ │ ├── llama_760m_48b.yaml │ │ ├── pro_125m_2b.yaml │ │ ├── pro_360m_7b.yaml │ │ ├── pro_760m_16b.yaml │ │ └── pro_760m_48b.yaml │ │ ├── hgrn2 │ │ ├── 125m_2b.yaml │ │ ├── 360m_7b.yaml │ │ ├── 760m_16b.yaml │ │ └── 760m_48b.yaml │ │ ├── mamba2 │ │ ├── 125m_2b.yaml │ │ ├── 360m_7b.yaml │ │ ├── 760m_16b.yaml │ │ └── 760m_48b.yaml │ │ ├── samba │ │ └── 760m_16b.yaml │ │ ├── transformer │ │ ├── llama_125m_2b.yaml │ │ ├── llama_360m_7b.yaml │ │ ├── llama_760m_16b.yaml │ │ ├── llama_760m_48b.yaml │ │ ├── pro_125m_2b.yaml │ │ ├── pro_360m_7b.yaml │ │ ├── pro_760m_16b.yaml │ │ └── pro_760m_48b.yaml │ │ └── transformer_swa │ │ └── 760m_16b.yaml ├── model │ ├── __init__.py │ ├── delta_net.py │ ├── forgetting_transformer.py │ ├── hgrn2.py │ ├── mamba2.py │ ├── samba.py │ └── transformer.py ├── optimizer │ ├── __init__.py │ └── adamw.py ├── schedule │ ├── __init__.py │ ├── constant.py │ ├── warmup_cosine.py │ ├── warmup_linear.py │ └── warmup_one_minus_sqrt.py ├── strategy │ ├── __init__.py │ ├── ddp.py │ └── fsdp.py └── utils.py ├── eval ├── lm_eval_harness │ ├── README.md │ ├── run_lm_eval.py │ └── table_lm_eval.py ├── longbench │ ├── LICENSE │ ├── README.md │ ├── config │ │ ├── dataset2maxlen.json │ │ └── dataset2prompt.json │ ├── eval.py │ ├── llama_flash_attn_monkey_patch.py │ ├── metrics.py │ ├── pred.py │ ├── refs │ │ └── ref.bib │ ├── requirements.txt │ ├── table_longbench.py │ └── task.md ├── niah │ ├── LICENSE │ ├── PaulGrahamEssays │ │ ├── addiction.txt │ │ ├── aord.txt │ │ ├── apple.txt │ │ ├── avg.txt │ │ ├── before.txt │ │ ├── bias.txt │ │ ├── boss.txt │ │ ├── copy.txt │ │ ├── corpdev.txt │ │ ├── desres.txt │ │ ├── diff.txt │ │ ├── ecw.txt │ │ ├── founders.txt │ │ ├── foundervisa.txt │ │ ├── gap.txt │ │ ├── gba.txt │ │ ├── gh.txt │ │ ├── goodtaste.txt │ │ ├── hubs.txt │ │ ├── iflisp.txt │ │ ├── island.txt │ │ ├── know.txt │ │ ├── langdes.txt │ │ ├── laundry.txt │ │ ├── love.txt │ │ ├── mod.txt │ │ ├── newideas.txt │ │ ├── nft.txt │ │ ├── philosophy.txt │ │ ├── popular.txt │ │ ├── pow.txt │ │ ├── rootsoflisp.txt │ │ ├── rss.txt │ │ ├── siliconvalley.txt │ │ ├── startuplessons.txt │ │ ├── submarine.txt │ │ ├── superangels.txt │ │ ├── todo.txt │ │ ├── unions.txt │ │ ├── useful.txt │ │ ├── vb.txt │ │ ├── vcsqueeze.txt │ │ ├── vw.txt │ │ ├── want.txt │ │ ├── web20.txt │ │ ├── weird.txt │ │ └── wisdom.txt │ ├── README.md │ ├── config-eval.yaml │ ├── config-pred.yaml │ ├── config-prompt-debug.yaml │ ├── config-prompt-easy.yaml │ ├── config-prompt-standard.yaml │ ├── eval.py │ ├── llama_flash_attn_monkey_patch.py │ ├── plot_niah.py │ ├── pred.py │ └── prompt.py └── per_token_loss │ ├── README.md │ ├── plot_per_token_loss.py │ └── run_per_token_loss.py ├── pyproject.toml ├── requirements-dev.txt ├── save_model.py ├── src └── forgetting_transformer │ ├── __init__.py │ ├── checkpoint.py │ ├── datamodule │ ├── __init__.py │ ├── common.py │ └── longcrawl64.py │ ├── logger.py │ ├── model │ ├── README.md │ ├── __init__.py │ ├── common.py │ ├── delta_net │ │ ├── __init__.py │ │ ├── configuration_delta_net.py │ │ ├── delta_net_layer.py │ │ └── modeling_delta_net.py │ ├── forgetting_transformer │ │ ├── __init__.py │ │ ├── 
configuration_forgetting_transformer.py │ │ ├── fgate_cache.py │ │ ├── glu_linear.py │ │ ├── modeling_forgetting_transformer.py │ │ └── token_shift.py │ ├── hgrn2 │ │ ├── __init__.py │ │ ├── configuration_hgrn2.py │ │ ├── hgrn2_attention.py │ │ └── modeling_hgrn2.py │ ├── mamba2 │ │ ├── __init__.py │ │ ├── configuration_mamba2.py │ │ └── modeling_mamba2.py │ ├── samba │ │ ├── __init__.py │ │ ├── configuration_samba.py │ │ └── modeling_samba.py │ └── transformer │ │ ├── __init__.py │ │ ├── configuration_transformer.py │ │ └── modeling_transformer.py │ ├── ops │ ├── __init__.py │ └── forgetting_attention.py │ ├── schedule │ ├── __init__.py │ └── schedule.py │ ├── tokenizer.py │ └── utils.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | cache/ 3 | /data/ 4 | output/ 5 | results/ 6 | debug/ 7 | logs/ 8 | pred/ 9 | notebooks/ 10 | figures/ 11 | .psync 12 | # hyena_S5 specific stuff 13 | wandb/ 14 | cache_dir/ 15 | raw_datasets/ 16 | local_scripts/ 17 | 18 | .DS_Store 19 | 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | cover/ 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | local_settings.py 80 | db.sqlite3 81 | db.sqlite3-journal 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | .pybuilder/ 95 | target/ 96 | 97 | # Jupyter Notebook 98 | .ipynb_checkpoints 99 | 100 | # IPython 101 | profile_default/ 102 | ipython_config.py 103 | 104 | # pyenv 105 | # For a library or package, you might want to ignore these files since the code is 106 | # intended to run in multiple environments; otherwise, check them in: 107 | # .python-version 108 | 109 | # pipenv 110 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 111 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 112 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 113 | # install all needed dependencies. 114 | #Pipfile.lock 115 | 116 | # poetry 117 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 118 | # This is especially recommended for binary packages to ensure reproducibility, and is more 119 | # commonly ignored for libraries. 
120 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 121 | #poetry.lock 122 | 123 | # pdm 124 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 125 | #pdm.lock 126 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 127 | # in version control. 128 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 129 | .pdm.toml 130 | .pdm-python 131 | .pdm-build/ 132 | 133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 181 | #.idea/ 182 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Zhixuan Lin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- 1 | from configs.config import Config 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /configs/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Literal, Optional, Union 4 | from pathlib import Path 5 | from hydra.core.config_store import ConfigStore 6 | from configs.optimizer import OptimizerConfig 7 | from configs.schedule import ScheduleConfig 8 | from configs.model import ModelConfig 9 | from configs.datamodule import DataModuleConfig 10 | from configs.utils import auto_register 11 | from configs.strategy import StrategyConfig 12 | 13 | @dataclass 14 | class WandbConfig: 15 | project: str = "forgetting-transformer" 16 | mode: str = "offline" 17 | log_dir: str = MISSING 18 | 19 | @dataclass 20 | class FabricConfig: 21 | devices: Union[int, str] = "auto" 22 | precision: str = 'bf16-mixed' 23 | 24 | 25 | @dataclass 26 | class TrainConfig: 27 | max_tokens: int = MISSING 28 | grad_acc_tokens: int = MISSING 29 | max_grad_norm: float = MISSING 30 | gradient_checkpointing: bool = False 31 | 32 | bias_weight_decay: bool = False 33 | normalization_weight_decay: bool = False 34 | conv_weight_decay: bool = True 35 | 36 | @dataclass 37 | class EvalConfig: 38 | min_val_length: int = 512 39 | 40 | 41 | @dataclass 42 | class Config: 43 | defaults: List[Any] = field( 44 | default_factory=lambda: [ 45 | {"model": "???"}, 46 | {"optimizer": "???"}, 47 | {"schedule": "???"}, 48 | {"datamodule": "???"}, 49 | {"strategy": "???"}, 50 | # If we don't do these hydra will mess up python logging 51 | # Also must none. `disabled` mess up other libraries. 52 | {"override hydra/job_logging": "none"}, 53 | {"override hydra/hydra_logging": "none"}, 54 | "_self_", 55 | ] 56 | ) 57 | 58 | # https://github.com/facebookresearch/hydra/issues/2049 59 | # If we don't do this hydra will create an annoying directory 60 | hydra: Any = field(default_factory=lambda: {"run": {"dir": "${output_dir}"}}) 61 | 62 | exp: str = "debug" 63 | tag: str = "debug" 64 | seed: int = 0 65 | 66 | # Only used for saving HF model 67 | hf_load_dir: Optional[str] = None 68 | hf_save_dir: Optional[str] = None 69 | hf_load_step: Optional[int] = None 70 | 71 | # Everything (config, metrics, checkpoints etc) except for wandb log will be saved here 72 | output_dir: str = MISSING 73 | # Any dataset should reside here 74 | data_dir: str = MISSING 75 | # Don't forget to set wandb.log_dir as well 76 | 77 | # When resuming, we first try to load the latest checkpoint from output_dir / 'checkpoints'. If nothing 78 | # found, we try to start from fork_step from fork_dir if it is not None. 79 | resume: bool = MISSING 80 | fork_dir: Optional[str] = None 81 | fork_step: Optional[int] = None 82 | 83 | log_interval: int = MISSING 84 | eval_interval: int = MISSING 85 | final_eval: bool = True 86 | skip_eval: bool = True 87 | # Save checkpoints every this steps. 
We only keep the latest checkpoint 88 | checkpoint_interval: int = MISSING 89 | # Eval results with training loss 90 | train_eval_interval: int = MISSING 91 | # Besides the latest checkpoint, also keeps permanent checkpoints at these 92 | # interval 93 | checkpoint_keep_interval: int = MISSING 94 | 95 | # Regular hierarhical config 96 | fabric: FabricConfig = FabricConfig() 97 | train: TrainConfig = TrainConfig() 98 | eval: EvalConfig = EvalConfig() 99 | wandb: WandbConfig = WandbConfig() 100 | 101 | # Meant to decided by default list 102 | strategy: StrategyConfig = MISSING 103 | model: ModelConfig = MISSING 104 | schedule: ScheduleConfig = MISSING 105 | datamodule: DataModuleConfig = MISSING 106 | optimizer: OptimizerConfig = MISSING 107 | 108 | cs = ConfigStore.instance() 109 | cs.store(name='config', node=Config) 110 | config_root = Path(__file__).parent 111 | for base_class in [ 112 | OptimizerConfig, 113 | ModelConfig, 114 | DataModuleConfig, 115 | ScheduleConfig, 116 | StrategyConfig, 117 | ]: 118 | auto_register(base_class, config_root) 119 | -------------------------------------------------------------------------------- /configs/datamodule/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class DataModuleConfig: 5 | pass 6 | -------------------------------------------------------------------------------- /configs/datamodule/longcrawl64.py: -------------------------------------------------------------------------------- 1 | from . import DataModuleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | @dataclass 7 | class LongCrawl64Config(DataModuleConfig): 8 | _target_: str = 'forgetting_transformer.datamodule.longcrawl64.LongCrawl64DataModule' 9 | # This is a custom resolver. Note inside data_dir refers to the root config node 10 | data_dir: str = '${join_path:${data_dir},longcrawl64}' 11 | rank: int = MISSING # Should be provided programmatically 12 | world_size: int = MISSING # Should be provided programmatically 13 | train_seq_len: Optional[int] = None 14 | train_batch_len: int = MISSING 15 | train_batch_size: int = MISSING 16 | # train_tokens_per_stage: int = MISSING 17 | train_doc_len: Optional[int] = None 18 | train_num_workers: int = MISSING 19 | 20 | eval_tokens: int = MISSING 21 | eval_seq_len: Optional[int] = None 22 | eval_batch_len: int = MISSING 23 | eval_local_batch_size: int = MISSING 24 | eval_doc_len: Optional[int] = None 25 | eval_num_workers: int = MISSING 26 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 6 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 8 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 
4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | 44 | optimizer: 45 | lr: 0.001 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | 44 | optimizer: 45 | lr: 0.001 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true # Don't do evaluation. 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 2 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end. 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end. 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true # Don't do evaluation. 24 | eval_interval: 2684354560 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.002 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 2 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.002 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 1 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end. 
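# For reference, a rough reading of the token-denominated settings in this config
# (assuming one optimizer update per global batch of train_batch_size * train_batch_len
# tokens, i.e. 32 * 16384 = 524288 tokens per update): the intervals are token counts
# in binary units, e.g. log_interval = 33554432 = 32 * 2^20 tokens (one log every
# 64 updates), checkpoint_interval = 268435456 = 256 * 2^20 tokens (one checkpoint
# every 512 updates), and max_tokens = 16106127360 = 15 * 2^30 tokens (about
# 30720 updates in total).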
26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.001 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 1 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end. 
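# Note: 48318382080 tokens = 45 * 2^30 (about 48.3 billion), the full token budget
# of this 48B-token run. Like the interval values above, warmup_steps and decay_steps
# in the schedule block below are token counts; decay_steps is tied to
# train.max_tokens, so the cosine decay spans the entire run.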
26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.002 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 1 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | expand_ratio: 128 42 | num_hidden_layers: 12 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 2 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | expand_ratio: 128 42 | num_hidden_layers: 24 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | expand_ratio: 128 42 | num_hidden_layers: 24 43 | 44 | optimizer: 45 | lr: 0.001 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | expand_ratio: 128 42 | num_hidden_layers: 24 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 
4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 24 43 | num_hidden_layers: 24 44 | hidden_size: 768 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 2 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 32 43 | num_hidden_layers: 48 44 | hidden_size: 1024 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 48 43 | num_hidden_layers: 48 44 | hidden_size: 1536 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 48 43 | num_hidden_layers: 48 44 | hidden_size: 1536 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/samba/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: samba 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | rope_base: 500000 43 | attn: 44 | num_heads: 12 45 | window_size: 2048 46 | layers: null 47 | 48 | 49 | optimizer: 50 | lr: 0.001 51 | betas: [0.9, 0.95] 52 | weight_decay: 0.1 53 | 54 | schedule: 55 | init_value: 0.0 56 | peak_value: ${optimizer.lr} 57 | warmup_steps: 268435456 # 256Mi 58 | decay_steps: ${train.max_tokens} 59 | end_value: 0.0 60 | 61 | datamodule: 62 | train_batch_len: 16384 63 | train_batch_size: 32 64 | train_num_workers: 2 65 | 66 | eval_batch_len: 16384 67 | eval_local_batch_size: 1 68 | eval_tokens: 2147483648 # 2Bi 69 | eval_num_workers: 2 70 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.002 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 2 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.002 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 1 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.001 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 1 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 
32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 45 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.0005 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 1 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | rope_base: 500000 44 | # Pro config.
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.002 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 2 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | rope_base: 500000 44 | # Pro config. 
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.002 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 1 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | # Pro config. 
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.001 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 1 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 45 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | # Pro config.
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.001 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 1 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer_swa/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | window_size: 2048 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/model/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class ModelConfig: 5 | pass 6 | -------------------------------------------------------------------------------- /configs/model/delta_net.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . 
import ModelConfig 6 | 7 | 8 | @dataclass 9 | class DeltaNetArgConfig: 10 | _target_: str = "forgetting_transformer.model.delta_net.configuration_delta_net.DeltaNetConfig" 11 | vocab_size: int = MISSING 12 | hidden_size: int = MISSING 13 | expand_k: int = 1 14 | expand_v: int = 1 15 | use_gate: bool = False 16 | use_short_conv: bool = True 17 | conv_size: int = 4 18 | use_beta: bool = True 19 | use_output_norm: bool = True 20 | hidden_ratio: Optional[int] = 4 21 | intermediate_size: Optional[int] = None 22 | num_hidden_layers: int = MISSING 23 | num_heads: int = MISSING 24 | attn_mode: str = "chunk" 25 | qk_norm: str = 'l2' 26 | qk_activation: str = 'silu' 27 | hidden_act: str = "swish" 28 | max_position_embeddings: Optional[int] = None 29 | norm_first: bool = False 30 | norm_eps: float = 1e-6 31 | use_cache: bool = True 32 | pad_token_id: Optional[int] = None 33 | bos_token_id: Optional[int] = None 34 | eos_token_id: Optional[int] = None 35 | tie_word_embeddings: bool = False 36 | initializer_range: float = 0.02 37 | fuse_cross_entropy: bool = True 38 | 39 | 40 | 41 | 42 | @dataclass 43 | class DeltaNetConfig(ModelConfig): 44 | _target_: str = "forgetting_transformer.model.delta_net.modeling_delta_net.DeltaNetForCausalLM" 45 | config: DeltaNetArgConfig = DeltaNetArgConfig() 46 | -------------------------------------------------------------------------------- /configs/model/forgetting_transformer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . import ModelConfig 6 | 7 | 8 | @dataclass 9 | class ForgettingTransformerArgConfig: 10 | _target_: str = "forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig" 11 | vocab_size: int = MISSING # Should be provided programmatically 12 | hidden_size: int = MISSING 13 | hidden_ratio: float = 4 14 | intermediate_size: Optional[int] = None 15 | num_hidden_layers: int = MISSING 16 | num_heads: int = MISSING 17 | num_kv_heads: Optional[int] = None 18 | hidden_act: str = "swish" 19 | window_size: Optional[int] = None 20 | max_position_embeddings: Optional[int] = None 21 | initializer_range: float = 0.02 22 | elementwise_affine: Optional[bool] = True 23 | norm_eps: float = 1e-6 24 | use_cache: bool = True 25 | pad_token_id: Optional[int] = None 26 | bos_token_id: Optional[int] = None 27 | eos_token_id: Optional[int] = None 28 | tie_word_embeddings: bool = False 29 | attention_bias: bool = False 30 | fuse_norm: bool = True 31 | fuse_cross_entropy: bool = True 32 | rope_base: float = 500000 33 | use_rope: bool = False 34 | use_output_gate: bool = False 35 | ogate_act: str = "sigmoid" 36 | fgate_type: str = "full" 37 | fgate_bias_init: bool = False 38 | decay_time_min: Optional[float] = None 39 | decay_time_max: Optional[float] = None 40 | use_output_norm: bool = False 41 | qk_norm: bool = False 42 | qk_norm_share_param_across_head: bool = False 43 | 44 | use_k_shift: bool = False 45 | use_v_shift: bool = False 46 | 47 | 48 | 49 | 50 | @dataclass 51 | class ForgettingTransformerConfig(ModelConfig): 52 | _target_: str = "forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM" 53 | config: ForgettingTransformerArgConfig = ForgettingTransformerArgConfig() 54 | -------------------------------------------------------------------------------- 
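Aside on how these model config dataclasses are consumed: each one follows the Hydra structured-config pattern, with an outer `*Config` dataclass whose `_target_` points at the `*ForCausalLM` class and an inner `*ArgConfig` whose `_target_` points at the HuggingFace-style configuration class. The snippet below is a minimal, hypothetical sketch of instantiating such a node outside the training script; it is not repository code. It assumes Hydra, OmegaConf, and the `forgetting_transformer` package are installed, that it runs from the repository root, and the concrete field values are placeholders that mirror the "Pro config" block in the `pro_*.yaml` experiments above.

```python
# Hypothetical sketch (not repository code): build a model from the structured
# config above via hydra.utils.instantiate. All field values are placeholders.
from hydra.utils import instantiate
from omegaconf import OmegaConf

from configs.model.forgetting_transformer import ForgettingTransformerConfig

cfg = OmegaConf.structured(ForgettingTransformerConfig)
# Fill the MISSING fields; vocab_size is normally provided programmatically
# (see the field comment in the dataclass above).
cfg.config.vocab_size = 32768           # placeholder
cfg.config.hidden_size = 768
cfg.config.num_hidden_layers = 12
cfg.config.num_heads = 12
# Mirror the "Pro config" block from the pro_*.yaml experiments above:
cfg.config.use_rope = True
cfg.config.fgate_type = "none"
cfg.config.use_v_shift = True
cfg.config.use_k_shift = True
cfg.config.qk_norm = True
cfg.config.use_output_gate = True
cfg.config.use_output_norm = True
cfg.config.hidden_ratio = 3.5

# instantiate() is recursive: it first builds the inner HF-style config object
# (its own _target_), then passes it to ForgettingTransformerForCausalLM.
model = instantiate(cfg)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```

This is roughly what `- override /model: forgetting_transformer` plus the `model.config.*` overrides in the experiment YAMLs resolve to at training time.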
/configs/model/hgrn2.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . import ModelConfig 6 | 7 | 8 | @dataclass 9 | class HGRN2ArgConfig: 10 | _target_: str = "forgetting_transformer.model.hgrn2.configuration_hgrn2.HGRN2Config" 11 | vocab_size: int = MISSING 12 | hidden_size: int = MISSING 13 | num_hidden_layers: int = MISSING 14 | attn_mode: str = "chunk" 15 | num_heads: Optional[int] = None 16 | expand_ratio: Optional[int] = MISSING 17 | use_short_conv: bool = False 18 | conv_size: int = 4 19 | use_lower_bound: bool = True 20 | hidden_ratio: Optional[int] = 4 21 | intermediate_size: Optional[int] = None 22 | hidden_act: str = "swish" 23 | max_position_embeddings: Optional[int] = None 24 | elementwise_affine: Optional[bool] = True 25 | norm_eps: float = 1e-6 26 | use_cache: bool = True 27 | pad_token_id: Optional[int] = None 28 | bos_token_id: Optional[int] = None 29 | eos_token_id: Optional[int] = None 30 | tie_word_embeddings: bool = False 31 | initializer_range: float = 0.02 32 | fuse_cross_entropy: bool = True 33 | 34 | 35 | 36 | 37 | @dataclass 38 | class HGRN2Config(ModelConfig): 39 | _target_: str = "forgetting_transformer.model.hgrn2.modeling_hgrn2.HGRN2ForCausalLM" 40 | config: HGRN2ArgConfig = HGRN2ArgConfig() 41 | -------------------------------------------------------------------------------- /configs/model/mamba2.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . import ModelConfig 6 | 7 | 8 | @dataclass 9 | class Mamba2ArgConfig: 10 | _target_: str = "forgetting_transformer.model.mamba2.configuration_mamba2.Mamba2Config" 11 | num_heads: int = MISSING 12 | head_dim: int = MISSING 13 | vocab_size: int = MISSING 14 | hidden_size: int = MISSING 15 | state_size: int = MISSING 16 | num_hidden_layers: int = MISSING 17 | layer_norm_epsilon: float = 1e-5 18 | pad_token_id: Optional[int] = None 19 | bos_token_id: Optional[int] = None 20 | eos_token_id: Optional[int] = None 21 | expand: int = 2 22 | conv_kernel: int = 4 23 | n_groups: int = 1 24 | use_bias: bool = False 25 | use_conv_bias: bool = True 26 | hidden_act: str = "silu" 27 | initializer_range: float = 0.02 28 | residual_in_fp32: bool = True 29 | time_step_rank: str = "auto" 30 | time_step_min: float = 0.001 31 | time_step_max: float = 0.1 32 | time_step_floor: float = 1e-4 33 | time_step_limit=(0.0, float("inf")) 34 | rescale_prenorm_residual: bool = True 35 | use_cache: bool = True 36 | rms_norm: bool = True 37 | chunk_size: int = 256 38 | fuse_cross_entropy: bool = True 39 | tie_word_embeddings: bool = False 40 | 41 | 42 | 43 | 44 | @dataclass 45 | class Mamba2Config(ModelConfig): 46 | _target_: str = "forgetting_transformer.model.mamba2.modeling_mamba2.Mamba2ForCausalLM" 47 | config: Mamba2ArgConfig = Mamba2ArgConfig() 48 | -------------------------------------------------------------------------------- /configs/model/samba.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional, Dict 4 | 5 | from . 
import ModelConfig 6 | 7 | @dataclass 8 | class SambaAttnConfig: 9 | num_kv_heads: Optional[int] = None 10 | num_heads: int = MISSING 11 | window_size: Optional[int] = MISSING 12 | layers: Optional[List[int]] = MISSING 13 | 14 | @dataclass 15 | class SambaArgConfig: 16 | _target_: str = "forgetting_transformer.model.samba.configuration_samba.SambaConfig" 17 | vocab_size: int = MISSING 18 | hidden_size: int = MISSING 19 | state_size: int = 16 20 | num_hidden_layers: int = MISSING 21 | norm_eps=1e-5 22 | pad_token_id: Optional[int] = None 23 | bos_token_id: Optional[int] = None 24 | eos_token_id: Optional[int] = None 25 | expand: int = 2 26 | conv_kernel: int = 4 27 | use_bias: bool = False 28 | use_conv_bias: bool = True 29 | hidden_act: str = "silu" 30 | initializer_range: float = 0.02 31 | residual_in_fp32: bool = False 32 | time_step_rank: str = "auto" 33 | time_step_scale: float = 1.0 34 | time_step_min: float = 0.001 35 | time_step_max: float = 0.1 36 | time_step_init_scheme: str = "random" 37 | time_step_floor: float = 1e-4 38 | max_position_embeddings: Optional[int] = None 39 | attn: SambaAttnConfig = SambaAttnConfig() 40 | attn_hidden_ratio: Optional[float] = 4 41 | mamba_hidden_ratio: Optional[float] = 3 42 | use_cache: bool = True 43 | fuse_norm: bool = True 44 | fuse_cross_entropy: bool = True 45 | tie_word_embeddings: bool = False 46 | rope_base: float = MISSING 47 | rescale_prenorm_residual: bool = True # To be consistent with other impl 48 | 49 | 50 | 51 | 52 | @dataclass 53 | class SambaConfig(ModelConfig): 54 | _target_: str = "forgetting_transformer.model.samba.modeling_samba.SambaForCausalLM" 55 | config: SambaArgConfig = SambaArgConfig() 56 | -------------------------------------------------------------------------------- /configs/model/transformer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . 
import ModelConfig 6 | 7 | 8 | @dataclass 9 | class TransformerArgConfig: 10 | _target_: str = "forgetting_transformer.model.transformer.configuration_transformer.TransformerConfig" 11 | vocab_size: int = MISSING # Should be provided programmatically 12 | hidden_size: int = MISSING 13 | hidden_ratio: int = 4 14 | intermediate_size: Optional[int] = None 15 | num_hidden_layers: int = MISSING 16 | num_heads: int = MISSING 17 | num_kv_heads: Optional[int] = None 18 | hidden_act: str = "swish" 19 | window_size: Optional[int] = None 20 | max_position_embeddings: Optional[int] = None 21 | initializer_range: float = 0.02 22 | elementwise_affine: Optional[bool] = True 23 | norm_eps: float = 1e-6 24 | use_cache: bool = True 25 | pad_token_id: Optional[int] = None 26 | bos_token_id: Optional[int] = None 27 | eos_token_id: Optional[int] = None 28 | tie_word_embeddings: bool = False 29 | attention_bias: bool = False 30 | fuse_norm: bool = True 31 | fuse_cross_entropy: bool = True 32 | rope_base: float = MISSING 33 | use_rope: bool = True 34 | 35 | 36 | 37 | 38 | @dataclass 39 | class TransformerConfig(ModelConfig): 40 | _target_: str = "forgetting_transformer.model.transformer.modeling_transformer.TransformerForCausalLM" 41 | config: TransformerArgConfig = TransformerArgConfig() 42 | -------------------------------------------------------------------------------- /configs/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class OptimizerConfig: 5 | pass 6 | -------------------------------------------------------------------------------- /configs/optimizer/adamw.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any 4 | 5 | from . import OptimizerConfig 6 | 7 | 8 | @dataclass 9 | class AdamWConfig(OptimizerConfig): 10 | _target_: str = "torch.optim.AdamW" 11 | lr: float = MISSING 12 | betas: List[float] = MISSING 13 | weight_decay: float = MISSING 14 | -------------------------------------------------------------------------------- /configs/schedule/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class ScheduleConfig: 6 | pass 7 | -------------------------------------------------------------------------------- /configs/schedule/constant.py: -------------------------------------------------------------------------------- 1 | from . import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class ConstantScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.constant_schedule' 9 | value: float = MISSING 10 | -------------------------------------------------------------------------------- /configs/schedule/warmup_cosine.py: -------------------------------------------------------------------------------- 1 | from . 
import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class WarmupCosineScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.warmup_cosine_decay_schedule' 9 | init_value: float = MISSING 10 | peak_value: float = MISSING 11 | warmup_steps: int = MISSING 12 | decay_steps: int = MISSING 13 | end_value: float = MISSING 14 | -------------------------------------------------------------------------------- /configs/schedule/warmup_linear.py: -------------------------------------------------------------------------------- 1 | from . import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class WarmupLinearScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.warmup_linear_decay_schedule' 9 | init_value: float = MISSING 10 | peak_value: float = MISSING 11 | warmup_steps: int = MISSING 12 | decay_steps: int = MISSING 13 | end_value: float = MISSING 14 | -------------------------------------------------------------------------------- /configs/schedule/warmup_one_minus_sqrt.py: -------------------------------------------------------------------------------- 1 | from . import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class WarmupOneMinusSqrtScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.warmup_one_minus_sqrt_schedule' 9 | init_value: float = MISSING 10 | peak_value: float = MISSING 11 | warmup_steps: int = MISSING 12 | total_steps: int = MISSING 13 | anneal_steps: int = MISSING 14 | end_value: float = MISSING 15 | -------------------------------------------------------------------------------- /configs/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class StrategyConfig: 6 | pass 7 | -------------------------------------------------------------------------------- /configs/strategy/ddp.py: -------------------------------------------------------------------------------- 1 | from . import StrategyConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class DDPConfig(StrategyConfig): 8 | _target_: str = "lightning.fabric.strategies.DDPStrategy" 9 | -------------------------------------------------------------------------------- /configs/strategy/fsdp.py: -------------------------------------------------------------------------------- 1 | from . 
import StrategyConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class FSDPConfig(StrategyConfig): 8 | _target_: str = "lightning.fabric.strategies.FSDPStrategy" 9 | state_dict_type: str = "full" # We don't want any trouble later 10 | sharding_strategy: str = "FULL_SHARD" # We don't want any trouble later 11 | cpu_offload: bool = False 12 | -------------------------------------------------------------------------------- /configs/utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import importlib 3 | from hydra.core.config_store import ConfigStore 4 | from pathlib import Path 5 | from typing import Any, Type, Union, Optional 6 | from types import ModuleType 7 | import pkgutil 8 | from dataclasses import is_dataclass 9 | 10 | 11 | def auto_register(base_class: Type, config_root: Optional[Union[str, Path]]): 12 | """Auto register config that inherits a base class. 13 | 14 | This automatically registers all the config class defined in the same package 15 | as baseclass. Rules: 16 | - The base class must be defined in the __init__.py of the package 17 | - Subclasses must be defined in direct modules of that package 18 | - Each module file should only contain one subclass. 19 | """ 20 | config_root = config_root 21 | assert config_root.stem == "configs", "Just a sanity check, you can change this" 22 | 23 | pkg = importlib.import_module(base_class.__module__) 24 | assert hasattr(pkg, "__path__"), ( 25 | f"{base_class}'s module does not have attribute __path__. {base_class}" 26 | f" must be defined in `__init__.py` in order for auto register to work" 27 | ) 28 | pkg_path = Path(pkg.__file__).parent 29 | 30 | try: 31 | group = str(pkg_path.relative_to(config_root)) 32 | except ValueError: 33 | raise ValueError( 34 | f"Node {pkg.__name__}'s path {pkg_path} is not under config root {config_root}." 35 | ) 36 | 37 | cs = ConfigStore.instance() 38 | for loader, module_name, is_pkg in pkgutil.iter_modules(pkg.__path__): 39 | module = importlib.import_module(f"{pkg.__name__}.{module_name}") 40 | # Iterate through the attributes of the module 41 | valid_list = [] 42 | for name, obj in inspect.getmembers(module): 43 | if ( 44 | inspect.isclass(obj) 45 | and issubclass(obj, base_class) 46 | and obj is not base_class 47 | ): 48 | assert is_dataclass(obj), f"{obj} must be dataclass" 49 | valid_list.append((name, obj)) 50 | if len(valid_list) != 1: 51 | raise ValueError( 52 | f"Module {module} should define exactly one subclass of {base_class}, but got {valid_list}" 53 | ) 54 | else: 55 | name, obj = valid_list[0] 56 | cs.store(name=module_name, group=group, node=obj) 57 | -------------------------------------------------------------------------------- /eval/lm_eval_harness/README.md: -------------------------------------------------------------------------------- 1 | # Language Model Evaluation Harness 2 | 3 | This directory contains the code for evaluating trained models on several tasks from [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). 
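A note on the `auto_register` helper in `configs/utils.py` above, before the evaluation code continues: it is what makes the `defaults: - override /model: ...`, `/optimizer: ...`, `/schedule: ...`, and `/strategy: ...` entries in the experiment YAMLs resolvable, by storing each dataclass module in Hydra's `ConfigStore` under a group derived from its package path. The call below is a hypothetical usage sketch under those assumptions; the real call site is elsewhere in the config package and is not shown in this excerpt.

```python
# Hypothetical usage sketch for configs/utils.py::auto_register (assumed call site).
# With ModelConfig as the base class, this registers configs/model/forgetting_transformer.py
# as `model=forgetting_transformer`, configs/model/mamba2.py as `model=mamba2`, and so on,
# under the "model" group (the group name comes from the path relative to configs/).
from pathlib import Path

from configs.model import ModelConfig
from configs.utils import auto_register

# Assumes the working directory is the repository root, so Path("configs") resolves
# to the config package and satisfies the `config_root.stem == "configs"` check.
auto_register(ModelConfig, config_root=Path("configs").resolve())
```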
4 | 5 | ## Usage 6 | 7 | Example usage: 8 | 9 | ```bash 10 | export SAVE_DIR="./results" # You can use any other path 11 | python run_lm_eval.py \ 12 | --model "fox-pro-760m-longcrawl64-48b" \ 13 | --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" \ 14 | --device_id 0 \ 15 | --max_len 16384 \ 16 | --batch_size 16 \ 17 | --save_dir $SAVE_DIR 18 | ``` 19 | 20 | After you have the results, you can generate a LaTeX table: 21 | 22 | ```bash 23 | python table_lm_eval.py --result_dir $SAVE_DIR 24 | ``` 25 | 26 | You can change the `MODELS` list in `table_lm_eval.py` to specify which models to include in your table. 27 | 28 | Note that we have observed the evaluation results to be non-deterministic, likely due to GPU non-determinism. Therefore the results you obtain may not exactly match those reported in the paper. However, the difference should be small. 29 | 30 | ## Citation 31 | 32 | If you use this code, consider citing the Language Model Evaluation Harness: 33 | 34 | ``` 35 | @misc{eval-harness, 36 | author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, 37 | title = {A framework for few-shot language model evaluation}, 38 | month = 07, 39 | year = 2024, 40 | publisher = {Zenodo}, 41 | version = {v0.4.3}, 42 | doi = {10.5281/zenodo.12608602}, 43 | url = {https://zenodo.org/records/12608602} 44 | } 45 | ``` 46 | -------------------------------------------------------------------------------- /eval/lm_eval_harness/run_lm_eval.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, Union, Optional, Tuple, NamedTuple, Any, List 2 | import logging 3 | from pathlib import Path 4 | import rich 5 | import rich.syntax 6 | 7 | import torch 8 | import os 9 | import os.path as osp 10 | from torch import nn 11 | import colorlog 12 | from datetime import datetime 13 | import jsonlines 14 | import lm_eval 15 | from lm_eval.models.huggingface import HFLM 16 | 17 | import json 18 | import pprint 19 | import argparse 20 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast, LlamaTokenizer 21 | import forgetting_transformer.tokenizer 22 | import forgetting_transformer.model 23 | import pickle 24 | 25 | 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | # parser.add_argument('--model', type=str, required=True, choices=["mamba2-760m", "fot-760m", "hgrn2-760m", "delta_net-760m", "transformer-760m", "fot-qk-norm-760m"]) 31 | parser.add_argument('--model', type=str, required=True) 32 | parser.add_argument('--model_path', type=str, required=True) 33 | parser.add_argument('--device_id', type=int, required=True) 34 | parser.add_argument('--max_len', type=int, required=True) 35 | parser.add_argument('--batch_size', type=int, required=True) 36 | parser.add_argument('--save_dir', type=str, required=True) 37 | args = parser.parse_args() 38 | 39 | assert args.model == Path(args.model_path).name, f"Model name '{args.model}' is different from the last component of model path '{args.model_path}'. You can delete this assertion if you are sure this is correct."
40 | model_name = args.model 41 | device_id = args.device_id 42 | max_len = args.max_len 43 | batch_size = args.batch_size 44 | save_dir = Path(args.save_dir) / model_name 45 | save_dir.mkdir(parents=True, exist_ok=True) 46 | 47 | model_path = args.model_path 48 | 49 | device = torch.device(f"cuda:{device_id}") 50 | 51 | 52 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, add_bos_token=True, clean_up_tokenization_spaces=False) 53 | assert max_len == 16384, "Just in case. You can delete this." 54 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device) 55 | 56 | 57 | 58 | # TODO: note that the models are trained with BOS. Therefore, in principle in all 59 | # evaluation BOS should be added. However, for wikitext perplexity eval except for 60 | # the first rolling window, no BOS is added. This is fine in our case since our 16k 61 | # context length covers most wikitext docs. However, if you use a short training 62 | # context length with BOS, you will need to modify HFLM to implement the correct 63 | # behavior. 64 | hflm = HFLM( 65 | pretrained=model, 66 | batch_size=batch_size, 67 | tokenizer=tokenizer, 68 | max_length=max_len, 69 | add_bos_token=True, # This is basically whether to use add_special_tokens 70 | ) 71 | 72 | task_manager = lm_eval.tasks.TaskManager() 73 | 74 | # Setting `task_manager` to the one above is optional and should generally be 75 | # done 76 | # if you want to include tasks from paths other than ones in `lm_eval/tasks`. 77 | # `simple_evaluate` will instantiate its own task_manager if it is set to None 78 | # here. 79 | 80 | with torch.cuda.device(device): 81 | with torch.autocast(device_type="cuda", dtype=torch.bfloat16): 82 | with torch.no_grad(): 83 | results = lm_eval.simple_evaluate( # call simple_evaluate 84 | model=hflm, 85 | # tasks=["wikitext"], 86 | tasks=["wikitext", "lambada_openai", "piqa", "hellaswag", "winogrande", "arc_easy", "arc_challenge", "boolq", "sciq", "copa", "openbookqa"], 87 | # tasks=["winogrande"], 88 | # tasks=["scrolls_narrativeqa", "scrolls_qasper", "scrolls_quality"], 89 | # tasks=[ 90 | 91 | # "scrolls_govreport", # 10min for mamba2 92 | # "scrolls_qmsum",# 4min for mamba2 93 | # "scrolls_summscreenfd", <10min 94 | 95 | # "scrolls_qasper", 96 | 97 | # "scrolls_quality", 98 | # "scrolls_contractnli", 99 | 100 | # "scrolls_narrativeqa", 101 | # ], 102 | # tasks=["wikitext"], 103 | # tasks=["lambada_openai"], 104 | num_fewshot=0, 105 | task_manager=task_manager, 106 | device="cuda" 107 | ) 108 | pprint.pprint(results["results"]) 109 | save_path = save_dir / "results.json" 110 | with save_path.open("w") as f: 111 | json.dump(results["results"], f, indent=4) 112 | print(f"Results saved to {save_path}") 113 | # import ipdb; ipdb.set_trace() 114 | if __name__ == "__main__": 115 | main() # pylint: disable=no-value-for-parameter 116 | 117 | -------------------------------------------------------------------------------- /eval/longbench/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 THU-KEG & Zhipu AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software 
is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /eval/longbench/README.md: -------------------------------------------------------------------------------- 1 | # LongBench 2 | 3 | This directory contains the code for evaluation on LongBench. The code is adapted from the original [LongBench-v1 repository](https://github.com/THUDM/LongBench/blob/main/LongBench/README.md). 4 | 5 | ## Usage 6 | 7 | Usage example: 8 | 9 | ```bash 10 | python pred.py --model "fox-pro-760m-longcrawl64-48b" --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" --max_length 15500 11 | python eval.py --model "fox-pro-760m-longcrawl64-48b" 12 | ``` 13 | 14 | After you run these, results will be saved to `./pred`. You can create a latex table using: 15 | 16 | 17 | ```bash 18 | python table_longbench.py 19 | ``` 20 | 21 | You can change `MODELS` in `table_longbench.py` to specify which models you want to include in the table. 22 | 23 | 24 | Note that we observe the evaluation results to be non-deterministic, likely due to GPU non-determinism. Therefore the results you obtain may not exactly match those reported in the paper. However, the difference should be small. 25 | 26 | ## Citation 27 | 28 | If you use this code, consider citing LongBench: 29 | ``` 30 | @article{bai2023longbench, 31 | title={LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding}, 32 | author={Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and Tang, Jiankai and Huang, Zhidian and Du, Zhengxiao and Liu, Xiao and Zeng, Aohan and Hou, Lei and Dong, Yuxiao and Tang, Jie and Li, Juanzi}, 33 | journal={arXiv preprint arXiv:2308.14508}, 34 | year={2023} 35 | } 36 | ``` 37 | When citing LongBench, please kindly consider citing the original dataset papers. The relevant citation information is listed [here](refs/ref.bib). 
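For context on the two JSON files reproduced next: `config/dataset2prompt.json` holds the per-dataset prompt templates and `config/dataset2maxlen.json` the per-dataset generation budgets (in new tokens). The sketch below is a simplified, hypothetical outline of how a LongBench-style `pred.py` consumes them; it is not the repository's exact implementation, and the helper name `build_request` is made up for illustration.

```python
# Simplified, hypothetical outline of how a LongBench-style pred.py uses the two
# config files below; not the repository's exact code.
import json

with open("config/dataset2prompt.json") as f:
    dataset2prompt = json.load(f)
with open("config/dataset2maxlen.json") as f:
    dataset2maxlen = json.load(f)

def build_request(dataset, json_obj, tokenizer, max_length):
    # Fill the dataset-specific template with the raw example fields ({context}, {input}, ...).
    prompt = dataset2prompt[dataset].format(**json_obj)
    # LongBench-style truncation keeps the head and tail of the token sequence and
    # drops the middle so the prompt fits within max_length.
    tokens = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
    if len(tokens) > max_length:
        half = max_length // 2
        prompt = (tokenizer.decode(tokens[:half], skip_special_tokens=True)
                  + tokenizer.decode(tokens[-half:], skip_special_tokens=True))
    # The generation budget for this dataset comes from dataset2maxlen.json.
    return prompt, dataset2maxlen[dataset]
```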
38 | -------------------------------------------------------------------------------- /eval/longbench/config/dataset2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "narrativeqa": 128, 3 | "qasper": 128, 4 | "multifieldqa_en": 64, 5 | "multifieldqa_zh": 64, 6 | "hotpotqa": 32, 7 | "2wikimqa": 32, 8 | "musique": 32, 9 | "dureader": 128, 10 | "gov_report": 512, 11 | "qmsum": 512, 12 | "multi_news": 512, 13 | "vcsum": 512, 14 | "trec": 64, 15 | "triviaqa": 32, 16 | "samsum": 128, 17 | "lsht": 64, 18 | "passage_count": 32, 19 | "passage_retrieval_en": 32, 20 | "passage_retrieval_zh": 32, 21 | "lcc": 64, 22 | "repobench-p": 64 23 | } -------------------------------------------------------------------------------- /eval/longbench/config/dataset2prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", 3 | "qasper": "You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", 4 | "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 5 | "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:", 6 | "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 7 | "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 8 | "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 9 | "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:", 10 | "gov_report": "You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:", 11 | "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:", 12 | "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:", 13 | "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:", 14 | "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}", 15 | "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}", 16 | "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}", 17 | "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}", 18 | "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ", 19 | "passage_retrieval_en": "Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like \"Paragraph 1\", \"Paragraph 2\", etc.\n\nThe answer is: ", 20 | "passage_retrieval_zh": "以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是\"段落1\",\"段落2\"等格式\n\n答案是:", 21 | "lcc": "Please complete the code given below. \n{context}Next line of code:\n", 22 | "repobench-p": "Please complete the code given below. 
\n{context}{input}Next line of code:\n" 23 | } -------------------------------------------------------------------------------- /eval/longbench/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import json 4 | import argparse 5 | import numpy as np 6 | 7 | from metrics import ( 8 | qa_f1_score, 9 | rouge_zh_score, 10 | qa_f1_zh_score, 11 | rouge_score, 12 | classification_score, 13 | retrieval_score, 14 | retrieval_zh_score, 15 | count_score, 16 | code_sim_score, 17 | ) 18 | 19 | dataset2metric = { 20 | "narrativeqa": qa_f1_score, 21 | "qasper": qa_f1_score, 22 | "multifieldqa_en": qa_f1_score, 23 | "multifieldqa_zh": qa_f1_zh_score, 24 | "hotpotqa": qa_f1_score, 25 | "2wikimqa": qa_f1_score, 26 | "musique": qa_f1_score, 27 | "dureader": rouge_zh_score, 28 | "gov_report": rouge_score, 29 | "qmsum": rouge_score, 30 | "multi_news": rouge_score, 31 | "vcsum": rouge_zh_score, 32 | "trec": classification_score, 33 | "triviaqa": qa_f1_score, 34 | "samsum": rouge_score, 35 | "lsht": classification_score, 36 | "passage_retrieval_en": retrieval_score, 37 | "passage_count": count_score, 38 | "passage_retrieval_zh": retrieval_zh_score, 39 | "lcc": code_sim_score, 40 | "repobench-p": code_sim_score, 41 | } 42 | 43 | def parse_args(args=None): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--model', type=str, default=None) 46 | parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E") 47 | return parser.parse_args(args) 48 | 49 | def scorer_e(dataset, predictions, answers, lengths, all_classes): 50 | scores = {"0-4k": [], "4-8k": [], "8k+": []} 51 | for (prediction, ground_truths, length) in zip(predictions, answers, lengths): 52 | score = 0. 53 | # if dataset in ["trec", "triviaqa", "samsum", "lsht"]: 54 | prediction = prediction.lstrip('\n').split('\n')[0] 55 | for ground_truth in ground_truths: 56 | score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes)) 57 | if length < 4000: 58 | scores["0-4k"].append(score) 59 | elif length < 8000: 60 | scores["4-8k"].append(score) 61 | else: 62 | scores["8k+"].append(score) 63 | for key in scores.keys(): 64 | scores[key] = round(100 * np.mean(scores[key]), 2) 65 | return scores 66 | 67 | def scorer(dataset, predictions, answers, all_classes): 68 | total_score = 0. 69 | for (prediction, ground_truths) in zip(predictions, answers): 70 | score = 0. 
71 | # if dataset in ["trec", "triviaqa", "samsum", "lsht"]: 72 | prediction = prediction.lstrip('\n').split('\n')[0] 73 | for ground_truth in ground_truths: 74 | score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes)) 75 | total_score += score 76 | return round(100 * total_score / len(predictions), 2) 77 | 78 | if __name__ == '__main__': 79 | args = parse_args() 80 | scores = dict() 81 | if args.e: 82 | path = f"pred_e/{args.model}/" 83 | else: 84 | path = f"pred/{args.model}/" 85 | 86 | 87 | if args.e: 88 | # datasets = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", \ 89 | # "trec", "triviaqa", "samsum", "passage_count", "passage_retrieval_en", "lcc", "repobench-p"] 90 | datasets = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", \ 91 | "trec", "triviaqa", "samsum", "lcc", "repobench-p"] 92 | # datasets = ["triviaqa"] 93 | else: 94 | # datasets = ["narrativeqa", "qasper", "multifieldqa_en", "multifieldqa_zh", "hotpotqa", "2wikimqa", "musique", \ 95 | # "dureader", "gov_report", "qmsum", "multi_news", "vcsum", "trec", "triviaqa", "samsum", "lsht", \ 96 | # "passage_count", "passage_retrieval_en", "passage_retrieval_zh", "lcc", "repobench-p"] 97 | # English tasks 98 | # datasets = ["2wikimqa", "narrativeqa", "qasper", "multifieldqa_en", "hotpotqa", "musique", \ 99 | # "gov_report", "qmsum", "multi_news", "trec", "triviaqa", "samsum", \ 100 | # "passage_count", "passage_retrieval_en", "lcc", "repobench-p"] 101 | datasets = ["2wikimqa", "narrativeqa", "qasper", "multifieldqa_en", "hotpotqa", "musique", \ 102 | "gov_report", "qmsum", "multi_news", "trec", "triviaqa", "samsum", \ 103 | "lcc", "repobench-p"] 104 | # datasets = ["2wikimqa"] 105 | # datasets = ["2wikimqa", "narrativeqa", "qasper", "multifieldqa_en", "hotpotqa", "musique", \ 106 | for dataset in datasets: 107 | sentinel_path = (Path(path) / f"{dataset}.jsonl.done") 108 | result_path = (Path(path) / f"{dataset}.jsonl") 109 | assert sentinel_path.is_file(), f"{sentinel_path} is missing" 110 | assert result_path.is_file(), f"{result_path} is missing" 111 | all_files = os.listdir(path) 112 | print("Evaluating on:", [file for file in all_files if (Path(path) / f"{file}.done").is_file()]) 113 | for filename in all_files: 114 | if not filename.endswith("jsonl"): 115 | continue 116 | sentinel_path = f"{path}{filename}.done" 117 | if not Path(sentinel_path).exists(): 118 | print(f"{filename} is incomplete. 
Skipping") 119 | # We don't delete things because pred.py might be writing to it 120 | continue 121 | predictions, answers, lengths = [], [], [] 122 | dataset = filename.split('.')[0] 123 | with open(f"{path}{filename}", "r", encoding="utf-8") as f: 124 | for line in f: 125 | data = json.loads(line) 126 | predictions.append(data["pred"]) 127 | answers.append(data["answers"]) 128 | all_classes = data["all_classes"] 129 | if "length" in data: 130 | lengths.append(data["length"]) 131 | if args.e: 132 | score = scorer_e(dataset, predictions, answers, lengths, all_classes) 133 | else: 134 | score = scorer(dataset, predictions, answers, all_classes) 135 | scores[dataset] = score 136 | if args.e: 137 | out_path = f"pred_e/{args.model}/result.json" 138 | else: 139 | out_path = f"pred/{args.model}/result.json" 140 | with open(out_path, "w") as f: 141 | json.dump(scores, f, ensure_ascii=False, indent=4) 142 | print(f"Results written to {out_path}.") 143 | -------------------------------------------------------------------------------- /eval/longbench/metrics.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import jieba 5 | from fuzzywuzzy import fuzz 6 | import difflib 7 | 8 | from typing import List 9 | from collections import Counter 10 | from rouge import Rouge 11 | 12 | def normalize_answer(s): 13 | """Lower text and remove punctuation, articles and extra whitespace.""" 14 | 15 | def remove_articles(text): 16 | return re.sub(r"\b(a|an|the)\b", " ", text) 17 | 18 | def white_space_fix(text): 19 | return " ".join(text.split()) 20 | 21 | def remove_punc(text): 22 | exclude = set(string.punctuation) 23 | return "".join(ch for ch in text if ch not in exclude) 24 | 25 | def lower(text): 26 | return text.lower() 27 | 28 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 29 | 30 | 31 | def normalize_zh_answer(s): 32 | """Lower text and remove punctuation, extra whitespace.""" 33 | 34 | def white_space_fix(text): 35 | return "".join(text.split()) 36 | 37 | def remove_punc(text): 38 | cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." 
39 | all_punctuation = set(string.punctuation + cn_punctuation) 40 | return "".join(ch for ch in text if ch not in all_punctuation) 41 | 42 | def lower(text): 43 | return text.lower() 44 | 45 | return white_space_fix(remove_punc(lower(s))) 46 | 47 | def count_score(prediction, ground_truth, **kwargs): 48 | numbers = re.findall(r"\d+", prediction) 49 | right_num = 0 50 | for number in numbers: 51 | if str(number) == str(ground_truth): 52 | right_num += 1 53 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) 54 | return float(final_score) 55 | 56 | def retrieval_score(prediction, ground_truth, **kwargs): 57 | pattern = r'Paragraph (\d+)' 58 | matches = re.findall(pattern, ground_truth) 59 | ground_truth_id = matches[0] 60 | numbers = re.findall(r"\d+", prediction) 61 | right_num = 0 62 | for number in numbers: 63 | if str(number) == str(ground_truth_id): 64 | right_num += 1 65 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) 66 | return float(final_score) 67 | 68 | def retrieval_zh_score(prediction, ground_truth, **kwargs): 69 | pattern = r'段落(\d+)' 70 | matches = re.findall(pattern, ground_truth) 71 | ground_truth_id = matches[0] 72 | numbers = re.findall(r"\d+", prediction) 73 | right_num = 0 74 | for number in numbers: 75 | if str(number) == str(ground_truth_id): 76 | right_num += 1 77 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) 78 | return float(final_score) 79 | 80 | def code_sim_score(prediction, ground_truth, **kwargs): 81 | all_lines = prediction.lstrip('\n').split('\n') 82 | prediction = "" 83 | for line in all_lines: 84 | if ('`' not in line) and ('#' not in line) and ('//' not in line): 85 | prediction = line 86 | break 87 | return (fuzz.ratio(prediction, ground_truth) / 100) 88 | 89 | def classification_score(prediction, ground_truth, **kwargs): 90 | em_match_list = [] 91 | all_classes = kwargs["all_classes"] 92 | for class_name in all_classes: 93 | if class_name in prediction: 94 | em_match_list.append(class_name) 95 | for match_term in em_match_list: 96 | if match_term in ground_truth and match_term != ground_truth: 97 | em_match_list.remove(match_term) 98 | if ground_truth in em_match_list: 99 | score = (1.0 / len(em_match_list)) 100 | else: 101 | score = 0.0 102 | return score 103 | 104 | def rouge_score(prediction, ground_truth, **kwargs): 105 | rouge = Rouge() 106 | try: 107 | scores = rouge.get_scores([prediction], [ground_truth], avg=True) 108 | except: 109 | return 0.0 110 | return scores["rouge-l"]["f"] 111 | 112 | def rouge_zh_score(prediction, ground_truth, **kwargs): 113 | prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) 114 | ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) 115 | score = rouge_score(prediction, ground_truth) 116 | return score 117 | 118 | def f1_score(prediction, ground_truth, **kwargs): 119 | common = Counter(prediction) & Counter(ground_truth) 120 | num_same = sum(common.values()) 121 | if num_same == 0: 122 | return 0 123 | precision = 1.0 * num_same / len(prediction) 124 | recall = 1.0 * num_same / len(ground_truth) 125 | f1 = (2 * precision * recall) / (precision + recall) 126 | return f1 127 | 128 | def qa_f1_score(prediction, ground_truth, **kwargs): 129 | normalized_prediction = normalize_answer(prediction) 130 | normalized_ground_truth = normalize_answer(ground_truth) 131 | 132 | prediction_tokens = normalized_prediction.split() 133 | ground_truth_tokens = normalized_ground_truth.split() 134 | return f1_score(prediction_tokens, 
ground_truth_tokens) 135 | 136 | 137 | def qa_f1_zh_score(prediction, ground_truth, **kwargs): 138 | prediction_tokens = list(jieba.cut(prediction, cut_all=False)) 139 | ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) 140 | prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens] 141 | ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens] 142 | prediction_tokens = [token for token in prediction_tokens if len(token) > 0] 143 | ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0] 144 | return f1_score(prediction_tokens, ground_truth_tokens) 145 | -------------------------------------------------------------------------------- /eval/longbench/refs/ref.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{yang2018hotpotqa, 2 | title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering}, 3 | author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D}, 4 | booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, 5 | pages={2369--2380}, 6 | year={2018} 7 | } 8 | 9 | @inproceedings{ho2020constructing, 10 | title={Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, 11 | author={Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, 12 | booktitle={Proceedings of the 28th International Conference on Computational Linguistics}, 13 | pages={6609--6625}, 14 | year={2020} 15 | } 16 | 17 | @article{trivedi2022musique, 18 | title={♫ MuSiQue: Multihop Questions via Single-hop Question Composition}, 19 | author={Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish}, 20 | journal={Transactions of the Association for Computational Linguistics}, 21 | volume={10}, 22 | pages={539--554}, 23 | year={2022}, 24 | publisher={MIT Press One Broadway, 12th Floor, Cambridge, Massachusetts 02142, USA~…} 25 | } 26 | 27 | @article{he2018dureader, 28 | title={DuReader: a Chinese Machine Reading Comprehension Dataset from Real-world Applications}, 29 | author={He, Wei and Liu, Kai and Liu, Jing and Lyu, Yajuan and Zhao, Shiqi and Xiao, Xinyan and Liu, Yuan and Wang, Yizhong and Wu, Hua and She, Qiaoqiao and others}, 30 | journal={ACL 2018}, 31 | pages={37}, 32 | year={2018} 33 | } 34 | 35 | @article{kovcisky2018narrativeqa, 36 | title={The narrativeqa reading comprehension challenge}, 37 | author={Ko{\v{c}}isk{\`y}, Tom{\'a}{\v{s}} and Schwarz, Jonathan and Blunsom, Phil and Dyer, Chris and Hermann, Karl Moritz and Melis, G{\'a}bor and Grefenstette, Edward}, 38 | journal={Transactions of the Association for Computational Linguistics}, 39 | volume={6}, 40 | pages={317--328}, 41 | year={2018}, 42 | publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} 43 | } 44 | 45 | @inproceedings{dasigi2021dataset, 46 | title={A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers}, 47 | author={Dasigi, Pradeep and Lo, Kyle and Beltagy, Iz and Cohan, Arman and Smith, Noah A and Gardner, Matt}, 48 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 49 | pages={4599--4610}, 50 | year={2021} 51 | } 52 | 53 | @inproceedings{huang2021efficient, 54 | title={Efficient Attentions for Long Document 
Summarization}, 55 | author={Huang, Luyang and Cao, Shuyang and Parulian, Nikolaus and Ji, Heng and Wang, Lu}, 56 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 57 | pages={1419--1436}, 58 | year={2021} 59 | } 60 | 61 | @inproceedings{zhong2021qmsum, 62 | title={QMSum: A New Benchmark for Query-based Multi-domain Meeting Summarization}, 63 | author={Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and Mutuma, Mutethia and Jha, Rahul and Hassan, Ahmed and Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and others}, 64 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 65 | pages={5905--5921}, 66 | year={2021} 67 | } 68 | 69 | @article{wu2023vcsum, 70 | title={VCSUM: A Versatile Chinese Meeting Summarization Dataset}, 71 | author={Wu, Han and Zhan, Mingjie and Tan, Haochen and Hou, Zhaohui and Liang, Ding and Song, Linqi}, 72 | journal={arXiv preprint arXiv:2305.05280}, 73 | year={2023} 74 | } 75 | 76 | @inproceedings{joshi2017triviaqa, 77 | title={TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 78 | author={Joshi, Mandar and Choi, Eunsol and Weld, Daniel S and Zettlemoyer, Luke}, 79 | booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 80 | pages={1601--1611}, 81 | year={2017} 82 | } 83 | 84 | @article{gliwa2019samsum, 85 | title={SAMSum Corpus: A Human-annotated Dialogue Dataset for Abstractive Summarization}, 86 | author={Gliwa, Bogdan and Mochol, Iwona and Biesek, Maciej and Wawer, Aleksander}, 87 | journal={EMNLP-IJCNLP 2019}, 88 | pages={70}, 89 | year={2019} 90 | } 91 | 92 | @inproceedings{fabbri2019multi, 93 | title={Multi-News: A Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model}, 94 | author={Fabbri, Alexander Richard and Li, Irene and She, Tianwei and Li, Suyi and Radev, Dragomir}, 95 | booktitle={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 96 | pages={1074--1084}, 97 | year={2019} 98 | } 99 | 100 | @inproceedings{li2002learning, 101 | title={Learning question classifiers}, 102 | author={Li, Xin and Roth, Dan}, 103 | booktitle={COLING 2002: The 19th International Conference on Computational Linguistics}, 104 | year={2002} 105 | } 106 | 107 | @article{guo2023longcoder, 108 | title={LongCoder: A Long-Range Pre-trained Language Model for Code Completion}, 109 | author={Guo, Daya and Xu, Canwen and Duan, Nan and Yin, Jian and McAuley, Julian}, 110 | journal={arXiv preprint arXiv:2306.14893}, 111 | year={2023} 112 | } 113 | 114 | @article{liu2023repobench, 115 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems}, 116 | author={Liu, Tianyang and Xu, Canwen and McAuley, Julian}, 117 | journal={arXiv preprint arXiv:2306.03091}, 118 | year={2023} 119 | } -------------------------------------------------------------------------------- /eval/longbench/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | tqdm 3 | rouge 4 | jieba 5 | fuzzywuzzy 6 | torch 7 | transformers==4.31.0 8 | einops
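The LongBench evaluation code above fits together as follows: pred.py writes one {dataset}.jsonl file of model outputs per task, and eval.py scores each file with the metric functions defined in metrics.py through its dataset2metric mapping. The sketch below is a minimal illustration of that scoring path and is not a file in this repository; it assumes it is run from eval/longbench/ so that metrics.py is importable, and DATASET2METRIC and score_file are illustrative stand-ins for the dataset2metric dict and the scorer()/scorer_e() functions in eval.py.

import json

# These metric functions are defined in metrics.py above; this snippet assumes
# it is run from eval/longbench/ so that module is importable.
from metrics import qa_f1_score, rouge_score, classification_score, code_sim_score

# Illustrative subset of the dataset-to-metric mapping; the full mapping is the
# dataset2metric dict in eval.py.
DATASET2METRIC = {
    "hotpotqa": qa_f1_score,
    "gov_report": rouge_score,
    "trec": classification_score,
    "lcc": code_sim_score,
}


def score_file(dataset, jsonl_path):
    """Average best-over-references score for one dataset, scaled to 0-100."""
    metric = DATASET2METRIC[dataset]
    total, count = 0.0, 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            # Each line is a JSON object with at least these fields
            # (see the read loop in eval.py above).
            prediction, answers = data["pred"], data["answers"]
            all_classes = data.get("all_classes")
            # As in scorer(): keep only the first line of the prediction ...
            prediction = prediction.lstrip("\n").split("\n")[0]
            # ... and credit the best score over all reference answers.
            best = 0.0
            for ground_truth in answers:
                best = max(best, metric(prediction, ground_truth, all_classes=all_classes))
            total += best
            count += 1
    return round(100 * total / count, 2) if count else 0.0


# Example usage:
# print(score_file("hotpotqa", "pred/my_model/hotpotqa.jsonl"))

Like scorer() in eval.py, each prediction is credited with its best score across the reference answers, and the dataset score is the mean of these values reported on a 0-100 scale.

-------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/bias.txt: -------------------------------------------------------------------------------- 1 | 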
October 2015This will come as a surprise to a lot of people, but in some cases 2 | it's possible to detect bias in a selection process without knowing 3 | anything about the applicant pool. Which is exciting because among 4 | other things it means third parties can use this technique to detect 5 | bias whether those doing the selecting want them to or not.You can use this technique whenever (a) you have at least 6 | a random sample of the applicants that were selected, (b) their 7 | subsequent performance is measured, and (c) the groups of 8 | applicants you're comparing have roughly equal distribution of ability.How does it work? Think about what it means to be biased. What 9 | it means for a selection process to be biased against applicants 10 | of type x is that it's harder for them to make it through. Which 11 | means applicants of type x have to be better to get selected than 12 | applicants not of type x. 13 | [1] 14 | Which means applicants of type x 15 | who do make it through the selection process will outperform other 16 | successful applicants. And if the performance of all the successful 17 | applicants is measured, you'll know if they do.Of course, the test you use to measure performance must be a valid 18 | one. And in particular it must not be invalidated by the bias you're 19 | trying to measure. 20 | But there are some domains where performance can be measured, and 21 | in those detecting bias is straightforward. Want to know if the 22 | selection process was biased against some type of applicant? Check 23 | whether they outperform the others. This is not just a heuristic 24 | for detecting bias. It's what bias means.For example, many suspect that venture capital firms are biased 25 | against female founders. This would be easy to detect: among their 26 | portfolio companies, do startups with female founders outperform 27 | those without? A couple months ago, one VC firm (almost certainly 28 | unintentionally) published a study showing bias of this type. First 29 | Round Capital found that among its portfolio companies, startups 30 | with female founders outperformed 31 | those without by 63%. 32 | [2]The reason I began by saying that this technique would come as a 33 | surprise to many people is that we so rarely see analyses of this 34 | type. I'm sure it will come as a surprise to First Round that they 35 | performed one. I doubt anyone there realized that by limiting their 36 | sample to their own portfolio, they were producing a study not of 37 | startup trends but of their own biases when selecting companies.I predict we'll see this technique used more in the future. The 38 | information needed to conduct such studies is increasingly available. 39 | Data about who applies for things is usually closely guarded by the 40 | organizations selecting them, but nowadays data about who gets 41 | selected is often publicly available to anyone who takes the trouble 42 | to aggregate it. 43 | Notes[1] 44 | This technique wouldn't work if the selection process looked 45 | for different things from different types of applicants—for 46 | example, if an employer hired men based on their ability but women 47 | based on their appearance.[2] 48 | As Paul Buchheit points out, First Round excluded their most 49 | successful investment, Uber, from the study. And while it 50 | makes sense to exclude outliers from some types of studies, 51 | studies of returns from startup investing, which is all about 52 | hitting outliers, are not one of them. 
53 | Thanks to Sam Altman, Jessica Livingston, and Geoff Ralston for reading 54 | drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/copy.txt: -------------------------------------------------------------------------------- 1 | July 2006 2 | When I was in high school I spent a lot of time imitating bad 3 | writers. What we studied in English classes was mostly fiction, 4 | so I assumed that was the highest form of writing. Mistake number 5 | one. The stories that seemed to be most admired were ones in which 6 | people suffered in complicated ways. Anything funny or 7 | gripping was ipso facto suspect, unless it was old enough to be hard to 8 | understand, like Shakespeare or Chaucer. Mistake number two. The 9 | ideal medium seemed the short story, which I've since learned had 10 | quite a brief life, roughly coincident with the peak of magazine 11 | publishing. But since their size made them perfect for use in 12 | high school classes, we read a lot of them, which gave us the 13 | impression the short story was flourishing. Mistake number three. 14 | And because they were so short, nothing really had to happen; you 15 | could just show a randomly truncated slice of life, and that was 16 | considered advanced. Mistake number four. The result was that I 17 | wrote a lot of stories in which nothing happened except that someone 18 | was unhappy in a way that seemed deep.For most of college I was a philosophy major. I was very impressed 19 | by the papers published in philosophy journals. They were so 20 | beautifully typeset, and their tone was just captivating—alternately 21 | casual and buffer-overflowingly technical. A fellow would be walking 22 | along a street and suddenly modality qua modality would spring upon 23 | him. I didn't ever quite understand these papers, but I figured 24 | I'd get around to that later, when I had time to reread them more 25 | closely. In the meantime I tried my best to imitate them. This 26 | was, I can now see, a doomed undertaking, because they weren't 27 | really saying anything. No philosopher ever refuted another, for 28 | example, because no one said anything definite enough to refute. 29 | Needless to say, my imitations didn't say anything either.In grad school I was still wasting time imitating the wrong things. 30 | There was then a fashionable type of program called an expert system, 31 | at the core of which was something called an inference engine. I 32 | looked at what these things did and thought "I could write that in 33 | a thousand lines of code." And yet eminent professors were writing 34 | books about them, and startups were selling them for a year's salary 35 | a copy. What an opportunity, I thought; these impressive things 36 | seem easy to me; I must be pretty sharp. Wrong. It was simply a 37 | fad. The books the professors wrote about expert systems are now 38 | ignored. They were not even on a path to anything interesting. 39 | And the customers paying so much for them were largely the same 40 | government agencies that paid thousands for screwdrivers and toilet 41 | seats.How do you avoid copying the wrong things? Copy only what you 42 | genuinely like. That would have saved me in all three cases. I 43 | didn't enjoy the short stories we had to read in English classes; 44 | I didn't learn anything from philosophy papers; I didn't use expert 45 | systems myself. 
I believed these things were good because they 46 | were admired.It can be hard to separate the things you like from the things 47 | you're impressed with. One trick is to ignore presentation. Whenever 48 | I see a painting impressively hung in a museum, I ask myself: how 49 | much would I pay for this if I found it at a garage sale, dirty and 50 | frameless, and with no idea who painted it? If you walk around a 51 | museum trying this experiment, you'll find you get some truly 52 | startling results. Don't ignore this data point just because it's 53 | an outlier.Another way to figure out what you like is to look at what you enjoy 54 | as guilty pleasures. Many things people like, especially if they're 55 | young and ambitious, they like largely for the feeling of virtue 56 | in liking them. 99% of people reading Ulysses are thinking 57 | "I'm reading Ulysses" as they do it. A guilty pleasure is 58 | at least a pure one. What do you read when you don't feel up to being 59 | virtuous? What kind of book do you read and feel sad that there's 60 | only half of it left, instead of being impressed that you're half 61 | way through? That's what you really like.Even when you find genuinely good things to copy, there's another 62 | pitfall to be avoided. Be careful to copy what makes them good, 63 | rather than their flaws. It's easy to be drawn into imitating 64 | flaws, because they're easier to see, and of course easier to copy 65 | too. For example, most painters in the eighteenth and nineteenth 66 | centuries used brownish colors. They were imitating the great 67 | painters of the Renaissance, whose paintings by that time were brown 68 | with dirt. Those paintings have since been cleaned, revealing 69 | brilliant colors; their imitators are of course still brown.It was painting, incidentally, that cured me of copying the wrong 70 | things. Halfway through grad school I decided I wanted to try being 71 | a painter, and the art world was so manifestly corrupt that it 72 | snapped the leash of credulity. These people made philosophy 73 | professors seem as scrupulous as mathematicians. It was so clearly 74 | a choice of doing good work xor being an insider that I was forced 75 | to see the distinction. It's there to some degree in almost every 76 | field, but I had till then managed to avoid facing it.That was one of the most valuable things I learned from painting: 77 | you have to figure out for yourself what's 78 | good. You can't trust 79 | authorities. They'll lie to you on this one. 80 | 81 | Comment on this essay. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/diff.txt: -------------------------------------------------------------------------------- 1 | December 2001 (rev. May 2002) 2 | 3 | (This article came about in response to some questions on 4 | the LL1 mailing list. It is now 5 | incorporated in Revenge of the Nerds.)When McCarthy designed Lisp in the late 1950s, it was 6 | a radical departure from existing languages, 7 | the most important of which was Fortran.Lisp embodied nine new ideas: 8 | 1. Conditionals. A conditional is an if-then-else 9 | construct. We take these for granted now. They were 10 | invented 11 | by McCarthy in the course of developing Lisp. 12 | (Fortran at that time only had a conditional 13 | goto, closely based on the branch instruction in the 14 | underlying hardware.) McCarthy, who was on the Algol committee, got 15 | conditionals into Algol, whence they spread to most other 16 | languages.2. 
A function type. In Lisp, functions are first class 17 | objects-- they're a data type just like integers, strings, 18 | etc, and have a literal representation, can be stored in variables, 19 | can be passed as arguments, and so on.3. Recursion. Recursion existed as a mathematical concept 20 | before Lisp of course, but Lisp was the first programming language to support 21 | it. (It's arguably implicit in making functions first class 22 | objects.)4. A new concept of variables. In Lisp, all variables 23 | are effectively pointers. Values are what 24 | have types, not variables, and assigning or binding 25 | variables means copying pointers, not what they point to.5. Garbage-collection.6. Programs composed of expressions. Lisp programs are 26 | trees of expressions, each of which returns a value. 27 | (In some Lisps expressions 28 | can return multiple values.) This is in contrast to Fortran 29 | and most succeeding languages, which distinguish between 30 | expressions and statements.It was natural to have this 31 | distinction in Fortran because (not surprisingly in a language 32 | where the input format was punched cards) the language was 33 | line-oriented. You could not nest statements. And 34 | so while you needed expressions for math to work, there was 35 | no point in making anything else return a value, because 36 | there could not be anything waiting for it.This limitation 37 | went away with the arrival of block-structured languages, 38 | but by then it was too late. The distinction between 39 | expressions and statements was entrenched. It spread from 40 | Fortran into Algol and thence to both their descendants.When a language is made entirely of expressions, you can 41 | compose expressions however you want. You can say either 42 | (using Arc syntax)(if foo (= x 1) (= x 2))or(= x (if foo 1 2))7. A symbol type. Symbols differ from strings in that 43 | you can test equality by comparing a pointer.8. A notation for code using trees of symbols.9. The whole language always available. 44 | There is 45 | no real distinction between read-time, compile-time, and runtime. 46 | You can compile or run code while reading, read or run code 47 | while compiling, and read or compile code at runtime.Running code at read-time lets users reprogram Lisp's syntax; 48 | running code at compile-time is the basis of macros; compiling 49 | at runtime is the basis of Lisp's use as an extension 50 | language in programs like Emacs; and reading at runtime 51 | enables programs to communicate using s-expressions, an 52 | idea recently reinvented as XML. 53 | When Lisp was first invented, all these ideas were far 54 | removed from ordinary programming practice, which was 55 | dictated largely by the hardware available in the late 1950s.Over time, the default language, embodied 56 | in a succession of popular languages, has 57 | gradually evolved toward Lisp. 1-5 are now widespread. 58 | 6 is starting to appear in the mainstream. 59 | Python has a form of 7, though there doesn't seem to be 60 | any syntax for it. 
61 | 8, which (with 9) is what makes Lisp macros 62 | possible, is so far still unique to Lisp, 63 | perhaps because (a) it requires those parens, or something 64 | just as bad, and (b) if you add that final increment of power, 65 | you can no 66 | longer claim to have invented a new language, but only 67 | to have designed a new dialect of Lisp ; -)Though useful to present-day programmers, it's 68 | strange to describe Lisp in terms of its 69 | variation from the random expedients other languages 70 | adopted. That was not, probably, how McCarthy 71 | thought of it. Lisp wasn't designed to fix the mistakes 72 | in Fortran; it came about more as the byproduct of an 73 | attempt to axiomatize computation. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/founders.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Want to start a startup? Get funded by 4 | Y Combinator. 5 | 6 | 7 | 8 | 9 | October 2010 10 | 11 | (I wrote this for Forbes, who asked me to write something 12 | about the qualities we look for in founders. In print they had to cut 13 | the last item because they didn't have room.)1. DeterminationThis has turned out to be the most important quality in startup 14 | founders. We thought when we started Y Combinator that the most 15 | important quality would be intelligence. That's the myth in the 16 | Valley. And certainly you don't want founders to be stupid. But 17 | as long as you're over a certain threshold of intelligence, what 18 | matters most is determination. You're going to hit a lot of 19 | obstacles. You can't be the sort of person who gets demoralized 20 | easily.Bill Clerico and Rich Aberman of WePay 21 | are a good example. They're 22 | doing a finance startup, which means endless negotiations with big, 23 | bureaucratic companies. When you're starting a startup that depends 24 | on deals with big companies to exist, it often feels like they're 25 | trying to ignore you out of existence. But when Bill Clerico starts 26 | calling you, you may as well do what he asks, because he is not 27 | going away. 28 | 2. FlexibilityYou do not however want the sort of determination implied by phrases 29 | like "don't give up on your dreams." The world of startups is so 30 | unpredictable that you need to be able to modify your dreams on the 31 | fly. The best metaphor I've found for the combination of determination 32 | and flexibility you need is a running back. 33 | He's determined to get 34 | downfield, but at any given moment he may need to go sideways or 35 | even backwards to get there.The current record holder for flexibility may be Daniel Gross of 36 | Greplin. He applied to YC with 37 | some bad ecommerce idea. We told 38 | him we'd fund him if he did something else. He thought for a second, 39 | and said ok. He then went through two more ideas before settling 40 | on Greplin. He'd only been working on it for a couple days when 41 | he presented to investors at Demo Day, but he got a lot of interest. 42 | He always seems to land on his feet. 43 | 3. ImaginationIntelligence does matter a lot of course. It seems like the type 44 | that matters most is imagination. It's not so important to be able 45 | to solve predefined problems quickly as to be able to come up with 46 | surprising new ideas. In the startup world, most good ideas 47 | seem 48 | bad initially. If they were obviously good, someone would already 49 | be doing them. 
So you need the kind of intelligence that produces 50 | ideas with just the right level of craziness.Airbnb is that kind of idea. 51 | In fact, when we funded Airbnb, we 52 | thought it was too crazy. We couldn't believe large numbers of 53 | people would want to stay in other people's places. We funded them 54 | because we liked the founders so much. As soon as we heard they'd 55 | been supporting themselves by selling Obama and McCain branded 56 | breakfast cereal, they were in. And it turned out the idea was on 57 | the right side of crazy after all. 58 | 4. NaughtinessThough the most successful founders are usually good people, they 59 | tend to have a piratical gleam in their eye. They're not Goody 60 | Two-Shoes type good. Morally, they care about getting the big 61 | questions right, but not about observing proprieties. That's why 62 | I'd use the word naughty rather than evil. They delight in 63 | breaking 64 | rules, but not rules that matter. This quality may be redundant 65 | though; it may be implied by imagination.Sam Altman of Loopt 66 | is one of the most successful alumni, so we 67 | asked him what question we could put on the Y Combinator application 68 | that would help us discover more people like him. He said to ask 69 | about a time when they'd hacked something to their advantage—hacked in the sense of beating the system, not breaking into 70 | computers. It has become one of the questions we pay most attention 71 | to when judging applications. 72 | 5. FriendshipEmpirically it seems to be hard to start a startup with just 73 | one 74 | founder. Most of the big successes have two or three. And the 75 | relationship between the founders has to be strong. They must 76 | genuinely like one another, and work well together. Startups do 77 | to the relationship between the founders what a dog does to a sock: 78 | if it can be pulled apart, it will be.Emmett Shear and Justin Kan of Justin.tv 79 | are a good example of close 80 | friends who work well together. They've known each other since 81 | second grade. They can practically read one another's minds. I'm 82 | sure they argue, like all founders, but I have never once sensed 83 | any unresolved tension between them.Thanks to Jessica Livingston and Chris Steiner for reading drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/foundervisa.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | April 2009I usually avoid politics, but since we now seem to have an administration that's open to suggestions, I'm going to risk making one. The single biggest thing the government could do to increase the number of startups in this country is a policy that would cost nothing: establish a new class of visa for startup founders.The biggest constraint on the number of new startups that get created in the US is not tax policy or employment law or even Sarbanes-Oxley. It's that we won't let the people who want to start them into the country.Letting just 10,000 startup founders into the country each year could have a visible effect on the economy. If we assume 4 people per startup, which is probably an overestimate, that's 2500 new companies. Each year. They wouldn't all grow as big as Google, but out of 2500 some would come close.By definition these 10,000 founders wouldn't be taking jobs from Americans: it could be part of the terms of the visa that they couldn't work for existing companies, only new ones they'd founded. 
In fact they'd cause there to be 4 | more jobs for Americans, because the companies they started would hire more employees as they grew.The tricky part might seem to be how one defined a startup. But that could be solved quite easily: let the market decide. Startup investors work hard to find the best startups. The government could not do better than to piggyback on their expertise, and use investment by recognized startup investors as the test of whether a company was a real startup.How would the government decide who's a startup investor? The same way they decide what counts as a university for student visas. We'll establish our own accreditation procedure. We know who one another are.10,000 people is a drop in the bucket by immigration standards, but would represent a huge increase in the pool of startup founders. I think this would have such a visible effect on the economy that it would make the legislator who introduced the bill famous. The only way to know for sure would be to try it, and that would cost practically nothing. 5 | Thanks to Trevor Blackwell, Paul Buchheit, Jeff Clavier, David Hornik, Jessica Livingston, Greg Mcadoo, Aydin Senkut, and Fred Wilson for reading drafts of this.Related: -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/goodtaste.txt: -------------------------------------------------------------------------------- 1 | November 2021(This essay is derived from a talk at the Cambridge Union.)When I was a kid, I'd have said there wasn't. My father told me so. 2 | Some people like some things, and other people like other things, 3 | and who's to say who's right?It seemed so obvious that there was no such thing as good taste 4 | that it was only through indirect evidence that I realized my father 5 | was wrong. And that's what I'm going to give you here: a proof by 6 | reductio ad absurdum. If we start from the premise that there's no 7 | such thing as good taste, we end up with conclusions that are 8 | obviously false, and therefore the premise must be wrong.We'd better start by saying what good taste is. There's a narrow 9 | sense in which it refers to aesthetic judgements and a broader one 10 | in which it refers to preferences of any kind. The strongest proof 11 | would be to show that taste exists in the narrowest sense, so I'm 12 | going to talk about taste in art. You have better taste than me if 13 | the art you like is better than the art I like.If there's no such thing as good taste, then there's no such thing 14 | as good art. Because if there is such a 15 | thing as good art, it's 16 | easy to tell which of two people has better taste. Show them a lot 17 | of works by artists they've never seen before and ask them to 18 | choose the best, and whoever chooses the better art has better 19 | taste.So if you want to discard the concept of good taste, you also have 20 | to discard the concept of good art. And that means you have to 21 | discard the possibility of people being good at making it. Which 22 | means there's no way for artists to be good at their jobs. And not 23 | just visual artists, but anyone who is in any sense an artist. You 24 | can't have good actors, or novelists, or composers, or dancers 25 | either. You can have popular novelists, but not good ones.We don't realize how far we'd have to go if we discarded the concept 26 | of good taste, because we don't even debate the most obvious cases. 27 | But it doesn't just mean we can't say which of two famous painters 28 | is better. 
It means we can't say that any painter is better than a 29 | randomly chosen eight year old.That was how I realized my father was wrong. I started studying 30 | painting. And it was just like other kinds of work I'd done: you 31 | could do it well, or badly, and if you tried hard, you could get 32 | better at it. And it was obvious that Leonardo and Bellini were 33 | much better at it than me. That gap between us was not imaginary. 34 | They were so good. And if they could be good, then art could be 35 | good, and there was such a thing as good taste after all.Now that I've explained how to show there is such a thing as good 36 | taste, I should also explain why people think there isn't. There 37 | are two reasons. One is that there's always so much disagreement 38 | about taste. Most people's response to art is a tangle of unexamined 39 | impulses. Is the artist famous? Is the subject attractive? Is this 40 | the sort of art they're supposed to like? Is it hanging in a famous 41 | museum, or reproduced in a big, expensive book? In practice most 42 | people's response to art is dominated by such extraneous factors.And the people who do claim to have good taste are so often mistaken. 43 | The paintings admired by the so-called experts in one generation 44 | are often so different from those admired a few generations later. 45 | It's easy to conclude there's nothing real there at all. It's only 46 | when you isolate this force, for example by trying to paint and 47 | comparing your work to Bellini's, that you can see that it does in 48 | fact exist.The other reason people doubt that art can be good is that there 49 | doesn't seem to be any room in the art for this goodness. The 50 | argument goes like this. Imagine several people looking at a work 51 | of art and judging how good it is. If being good art really is a 52 | property of objects, it should be in the object somehow. But it 53 | doesn't seem to be; it seems to be something happening in the heads 54 | of each of the observers. And if they disagree, how do you choose 55 | between them?The solution to this puzzle is to realize that the purpose of art 56 | is to work on its human audience, and humans have a lot in common. 57 | And to the extent the things an object acts upon respond in the 58 | same way, that's arguably what it means for the object to have the 59 | corresponding property. If everything a particle interacts with 60 | behaves as if the particle had a mass of m, then it has a mass of 61 | m. So the distinction between "objective" and "subjective" is not 62 | binary, but a matter of degree, depending on how much the subjects 63 | have in common. Particles interacting with one another are at one 64 | pole, but people interacting with art are not all the way at the 65 | other; their reactions aren't random.Because people's responses to art aren't random, art can be designed 66 | to operate on people, and be good or bad depending on how effectively 67 | it does so. Much as a vaccine can be. If someone were talking about 68 | the ability of a vaccine to confer immunity, it would seem very 69 | frivolous to object that conferring immunity wasn't really a property 70 | of vaccines, because acquiring immunity is something that happens 71 | in the immune system of each individual person. Sure, people's 72 | immune systems vary, and a vaccine that worked on one might not 73 | work on another, but that doesn't make it meaningless to talk about 74 | the effectiveness of a vaccine.The situation with art is messier, of course. 
You can't measure 75 | effectiveness by simply taking a vote, as you do with vaccines. 76 | You have to imagine the responses of subjects with a deep knowledge 77 | of art, and enough clarity of mind to be able to ignore extraneous 78 | influences like the fame of the artist. And even then you'd still 79 | see some disagreement. People do vary, and judging art is hard, 80 | especially recent art. There is definitely not a total order either 81 | of works or of people's ability to judge them. But there is equally 82 | definitely a partial order of both. So while it's not possible to 83 | have perfect taste, it is possible to have good taste. 84 | Thanks to the Cambridge Union for inviting me, and to Trevor 85 | Blackwell, Jessica Livingston, and Robert Morris for reading drafts 86 | of this. 87 | -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/iflisp.txt: -------------------------------------------------------------------------------- 1 | May 2003If Lisp is so great, why don't more people use it? I was 2 | asked this question by a student in the audience at a 3 | talk I gave recently. Not for the first time, either.In languages, as in so many things, there's not much 4 | correlation between popularity and quality. Why does 5 | John Grisham (King of Torts sales rank, 44) outsell 6 | Jane Austen (Pride and Prejudice sales rank, 6191)? 7 | Would even Grisham claim that it's because he's a better 8 | writer?Here's the first sentence of Pride and Prejudice: 9 | 10 | It is a truth universally acknowledged, that a single man 11 | in possession of a good fortune must be in want of a 12 | wife. 13 | 14 | "It is a truth universally acknowledged?" Long words for 15 | the first sentence of a love story.Like Jane Austen, Lisp looks hard. Its syntax, or lack 16 | of syntax, makes it look completely unlike 17 | the languages 18 | most people are used to. Before I learned Lisp, I was afraid 19 | of it too. I recently came across a notebook from 1983 20 | in which I'd written: 21 | 22 | I suppose I should learn Lisp, but it seems so foreign. 23 | 24 | Fortunately, I was 19 at the time and not too resistant to learning 25 | new things. I was so ignorant that learning 26 | almost anything meant learning new things.People frightened by Lisp make up other reasons for not 27 | using it. The standard 28 | excuse, back when C was the default language, was that Lisp 29 | was too slow. Now that Lisp dialects are among 30 | the faster 31 | languages available, that excuse has gone away. 32 | Now the standard excuse is openly circular: that other languages 33 | are more popular.(Beware of such reasoning. It gets you Windows.)Popularity is always self-perpetuating, but it's especially 34 | so in programming languages. More libraries 35 | get written for popular languages, which makes them still 36 | more popular. Programs often have to work with existing programs, 37 | and this is easier if they're written in the same language, 38 | so languages spread from program to program like a virus. 39 | And managers prefer popular languages, because they give them 40 | more leverage over developers, who can more easily be replaced.Indeed, if programming languages were all more or less equivalent, 41 | there would be little justification for using any but the most 42 | popular. But they aren't all equivalent, not by a long 43 | shot. And that's why less popular languages, like Jane Austen's 44 | novels, continue to survive at all. 
When everyone else is reading 45 | the latest John Grisham novel, there will always be a few people 46 | reading Jane Austen instead. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/island.txt: -------------------------------------------------------------------------------- 1 | July 2006I've discovered a handy test for figuring out what you're addicted 2 | to. Imagine you were going to spend the weekend at a friend's house 3 | on a little island off the coast of Maine. There are no shops on 4 | the island and you won't be able to leave while you're there. Also, 5 | you've never been to this house before, so you can't assume it will 6 | have more than any house might.What, besides clothes and toiletries, do you make a point of packing? 7 | That's what you're addicted to. For example, if you find yourself 8 | packing a bottle of vodka (just in case), you may want to stop and 9 | think about that.For me the list is four things: books, earplugs, a notebook, and a 10 | pen.There are other things I might bring if I thought of it, like music, 11 | or tea, but I can live without them. I'm not so addicted to caffeine 12 | that I wouldn't risk the house not having any tea, just for a 13 | weekend.Quiet is another matter. I realize it seems a bit eccentric to 14 | take earplugs on a trip to an island off the coast of Maine. If 15 | anywhere should be quiet, that should. But what if the person in 16 | the next room snored? What if there was a kid playing basketball? 17 | (Thump, thump, thump... thump.) Why risk it? Earplugs are small.Sometimes I can think with noise. If I already have momentum on 18 | some project, I can work in noisy places. I can edit an essay or 19 | debug code in an airport. But airports are not so bad: most of the 20 | noise is whitish. I couldn't work with the sound of a sitcom coming 21 | through the wall, or a car in the street playing thump-thump music.And of course there's another kind of thinking, when you're starting 22 | something new, that requires complete quiet. You never 23 | know when this will strike. It's just as well to carry plugs.The notebook and pen are professional equipment, as it were. Though 24 | actually there is something druglike about them, in the sense that 25 | their main purpose is to make me feel better. I hardly ever go 26 | back and read stuff I write down in notebooks. It's just that if 27 | I can't write things down, worrying about remembering one idea gets 28 | in the way of having the next. Pen and paper wick ideas.The best notebooks I've found are made by a company called Miquelrius. 29 | I use their smallest size, which is about 2.5 x 4 in. 30 | The secret to writing on such 31 | narrow pages is to break words only when you run out of space, like 32 | a Latin inscription. I use the cheapest plastic Bic ballpoints, 33 | partly because their gluey ink doesn't seep through pages, and 34 | partly so I don't worry about losing them.I only started carrying a notebook about three years ago. Before 35 | that I used whatever scraps of paper I could find. But the problem 36 | with scraps of paper is that they're not ordered. In a notebook 37 | you can guess what a scribble means by looking at the pages 38 | around it. In the scrap era I was constantly finding notes I'd 39 | written years before that might say something I needed to remember, 40 | if I could only figure out what.As for books, I know the house would probably have something to 41 | read. 
On the average trip I bring four books and only read one of 42 | them, because I find new books to read en route. Really bringing 43 | books is insurance.I realize this dependence on books is not entirely good—that what 44 | I need them for is distraction. The books I bring on trips are 45 | often quite virtuous, the sort of stuff that might be assigned 46 | reading in a college class. But I know my motives aren't virtuous. 47 | I bring books because if the world gets boring I need to be able 48 | to slip into another distilled by some writer. It's like eating 49 | jam when you know you should be eating fruit.There is a point where I'll do without books. I was walking in 50 | some steep mountains once, and decided I'd rather just think, if I 51 | was bored, rather than carry a single unnecessary ounce. It wasn't 52 | so bad. I found I could entertain myself by having ideas instead 53 | of reading other people's. If you stop eating jam, fruit starts 54 | to taste better.So maybe I'll try not bringing books on some future trip. They're 55 | going to have to pry the plugs out of my cold, dead ears, however. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/know.txt: -------------------------------------------------------------------------------- 1 | December 2014I've read Villehardouin's chronicle of the Fourth Crusade at least 2 | two times, maybe three. And yet if I had to write down everything 3 | I remember from it, I doubt it would amount to much more than a 4 | page. Multiply this times several hundred, and I get an uneasy 5 | feeling when I look at my bookshelves. What use is it to read all 6 | these books if I remember so little from them?A few months ago, as I was reading Constance Reid's excellent 7 | biography of Hilbert, I figured out if not the answer to this 8 | question, at least something that made me feel better about it. 9 | She writes: 10 | 11 | Hilbert had no patience with mathematical lectures which filled 12 | the students with facts but did not teach them how to frame a 13 | problem and solve it. He often used to tell them that "a perfect 14 | formulation of a problem is already half its solution." 15 | 16 | That has always seemed to me an important point, and I was even 17 | more convinced of it after hearing it confirmed by Hilbert.But how had I come to believe in this idea in the first place? A 18 | combination of my own experience and other things I'd read. None 19 | of which I could at that moment remember! And eventually I'd forget 20 | that Hilbert had confirmed it too. But my increased belief in the 21 | importance of this idea would remain something I'd learned from 22 | this book, even after I'd forgotten I'd learned it.Reading and experience train your model of the world. And even if 23 | you forget the experience or what you read, its effect on your model 24 | of the world persists. Your mind is like a compiled program you've 25 | lost the source of. It works, but you don't know why.The place to look for what I learned from Villehardouin's chronicle 26 | is not what I remember from it, but my mental models of the crusades, 27 | Venice, medieval culture, siege warfare, and so on. Which doesn't 28 | mean I couldn't have read more attentively, but at least the harvest 29 | of reading is not so miserably small as it might seem.This is one of those things that seem obvious in retrospect. 
But 30 | it was a surprise to me and presumably would be to anyone else who 31 | felt uneasy about (apparently) forgetting so much they'd read.Realizing it does more than make you feel a little better about 32 | forgetting, though. There are specific implications.For example, reading and experience are usually "compiled" at the 33 | time they happen, using the state of your brain at that time. The 34 | same book would get compiled differently at different points in 35 | your life. Which means it is very much worth reading important 36 | books multiple times. I always used to feel some misgivings about 37 | rereading books. I unconsciously lumped reading together with work 38 | like carpentry, where having to do something again is a sign you 39 | did it wrong the first time. Whereas now the phrase "already read" 40 | seems almost ill-formed.Intriguingly, this implication isn't limited to books. Technology 41 | will increasingly make it possible to relive our experiences. When 42 | people do that today it's usually to enjoy them again (e.g. when 43 | looking at pictures of a trip) or to find the origin of some bug in 44 | their compiled code (e.g. when Stephen Fry succeeded in remembering 45 | the childhood trauma that prevented him from singing). But as 46 | technologies for recording and playing back your life improve, it 47 | may become common for people to relive experiences without any goal 48 | in mind, simply to learn from them again as one might when rereading 49 | a book.Eventually we may be able not just to play back experiences but 50 | also to index and even edit them. So although not knowing how you 51 | know things may seem part of being human, it may not be. 52 | Thanks to Sam Altman, Jessica Livingston, and Robert Morris for reading 53 | drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/mod.txt: -------------------------------------------------------------------------------- 1 | December 2019There are two distinct ways to be politically moderate: on purpose 2 | and by accident. Intentional moderates are trimmers, deliberately 3 | choosing a position mid-way between the extremes of right and left. 4 | Accidental moderates end up in the middle, on average, because they 5 | make up their own minds about each question, and the far right and 6 | far left are roughly equally wrong.You can distinguish intentional from accidental moderates by the 7 | distribution of their opinions. If the far left opinion on some 8 | matter is 0 and the far right opinion 100, an intentional moderate's 9 | opinion on every question will be near 50. Whereas an accidental 10 | moderate's opinions will be scattered over a broad range, but will, 11 | like those of the intentional moderate, average to about 50.Intentional moderates are similar to those on the far left and the 12 | far right in that their opinions are, in a sense, not their own. 13 | The defining quality of an ideologue, whether on the left or the 14 | right, is to acquire one's opinions in bulk. You don't get to pick 15 | and choose. Your opinions about taxation can be predicted from your 16 | opinions about sex. And although intentional moderates 17 | might seem to be the opposite of ideologues, their beliefs (though 18 | in their case the word "positions" might be more accurate) are also 19 | acquired in bulk. If the median opinion shifts to the right or left, 20 | the intentional moderate must shift with it. 
Otherwise they stop 21 | being moderate.Accidental moderates, on the other hand, not only choose their own 22 | answers, but choose their own questions. They may not care at all 23 | about questions that the left and right both think are terribly 24 | important. So you can only even measure the politics of an accidental 25 | moderate from the intersection of the questions they care about and 26 | those the left and right care about, and this can 27 | sometimes be vanishingly small.It is not merely a manipulative rhetorical trick to say "if you're 28 | not with us, you're against us," but often simply false.Moderates are sometimes derided as cowards, particularly by 29 | the extreme left. But while it may be accurate to call intentional 30 | moderates cowards, openly being an accidental moderate requires the 31 | most courage of all, because you get attacked from both right and 32 | left, and you don't have the comfort of being an orthodox member 33 | of a large group to sustain you.Nearly all the most impressive people I know are accidental moderates. 34 | If I knew a lot of professional athletes, or people in the entertainment 35 | business, that might be different. Being on the far left or far 36 | right doesn't affect how fast you run or how well you sing. But 37 | someone who works with ideas has to be independent-minded to do it 38 | well.Or more precisely, you have to be independent-minded about the ideas 39 | you work with. You could be mindlessly doctrinaire in your politics 40 | and still be a good mathematician. In the 20th century, a lot of 41 | very smart people were Marxists — just no one who was smart about 42 | the subjects Marxism involves. But if the ideas you use in your 43 | work intersect with the politics of your time, you have two choices: 44 | be an accidental moderate, or be mediocre.Notes[1] It's possible in theory for one side to be entirely right and 45 | the other to be entirely wrong. Indeed, ideologues must always 46 | believe this is the case. But historically it rarely has been.[2] For some reason the far right tend to ignore moderates rather 47 | than despise them as backsliders. I'm not sure why. Perhaps it 48 | means that the far right is less ideological than the far left. Or 49 | perhaps that they are more confident, or more resigned, or simply 50 | more disorganized. I just don't know.[3] Having heretical opinions doesn't mean you have to express 51 | them openly. It may be 52 | easier to have them if you don't. 53 | Thanks to Austen Allred, Trevor Blackwell, Patrick Collison, Jessica Livingston, 54 | Amjad Masad, Ryan Petersen, and Harj Taggar for reading drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/nft.txt: -------------------------------------------------------------------------------- 1 | May 2021Noora Health, a nonprofit I've 2 | supported for years, just launched 3 | a new NFT. It has a dramatic name, Save Thousands of Lives, 4 | because that's what the proceeds will do.Noora has been saving lives for 7 years. They run programs in 5 | hospitals in South Asia to teach new mothers how to take care of 6 | their babies once they get home. They're in 165 hospitals now. And 7 | because they know the numbers before and after they start at a new 8 | hospital, they can measure the impact they have. It is massive. 
9 | For every 1000 live births, they save 9 babies.This number comes from a study 10 | of 133,733 families at 28 different 11 | hospitals that Noora conducted in collaboration with the Better 12 | Birth team at Ariadne Labs, a joint center for health systems 13 | innovation at Brigham and Women’s Hospital and Harvard T.H. Chan 14 | School of Public Health.Noora is so effective that even if you measure their costs in the 15 | most conservative way, by dividing their entire budget by the number 16 | of lives saved, the cost of saving a life is the lowest I've seen. 17 | $1,235.For this NFT, they're going to issue a public report tracking how 18 | this specific tranche of money is spent, and estimating the number 19 | of lives saved as a result.NFTs are a new territory, and this way of using them is especially 20 | new, but I'm excited about its potential. And I'm excited to see 21 | what happens with this particular auction, because unlike an NFT 22 | representing something that has already happened, 23 | this NFT gets better as the price gets higher.The reserve price was about $2.5 million, because that's what it 24 | takes for the name to be accurate: that's what it costs to save 25 | 2000 lives. But the higher the price of this NFT goes, the more 26 | lives will be saved. What a sentence to be able to write. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/pow.txt: -------------------------------------------------------------------------------- 1 | January 2017People who are powerful but uncharismatic will tend to be disliked. 2 | Their power makes them a target for criticism that they don't have 3 | the charisma to disarm. That was Hillary Clinton's problem. It also 4 | tends to be a problem for any CEO who is more of a builder than a 5 | schmoozer. And yet the builder-type CEO is (like Hillary) probably 6 | the best person for the job.I don't think there is any solution to this problem. It's human 7 | nature. The best we can do is to recognize that it's happening, and 8 | to understand that being a magnet for criticism is sometimes a sign 9 | not that someone is the wrong person for a job, but that they're 10 | the right one. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/rootsoflisp.txt: -------------------------------------------------------------------------------- 1 | May 2001 2 | 3 | (I wrote this article to help myself understand exactly 4 | what McCarthy discovered. You don't need to know this stuff 5 | to program in Lisp, but it should be helpful to 6 | anyone who wants to 7 | understand the essence of Lisp — both in the sense of its 8 | origins and its semantic core. The fact that it has such a core 9 | is one of Lisp's distinguishing features, and the reason why, 10 | unlike other languages, Lisp has dialects.)In 1960, John 11 | McCarthy published a remarkable paper in 12 | which he did for programming something like what Euclid did for 13 | geometry. He showed how, given a handful of simple 14 | operators and a notation for functions, you can 15 | build a whole programming language. 16 | He called this language Lisp, for "List Processing," 17 | because one of his key ideas was to use a simple 18 | data structure called a list for both 19 | code and data.It's worth understanding what McCarthy discovered, not 20 | just as a landmark in the history of computers, but as 21 | a model for what programming is tending to become in 22 | our own time. 
It seems to me that there have been 23 | two really clean, consistent models of programming so 24 | far: the C model and the Lisp model. 25 | These two seem points of high ground, with swampy lowlands 26 | between them. As computers have grown more powerful, 27 | the new languages being developed have been moving 28 | steadily toward the Lisp model. A popular recipe 29 | for new programming languages in the past 20 years 30 | has been to take the C model of computing and add to 31 | it, piecemeal, parts taken from the Lisp model, 32 | like runtime typing and garbage collection.In this article I'm going to try to explain in the 33 | simplest possible terms what McCarthy discovered. 34 | The point is not just to learn about an interesting 35 | theoretical result someone figured out forty years ago, 36 | but to show where languages are heading. 37 | The unusual thing about Lisp — in fact, the defining 38 | quality of Lisp — is that it can be written in 39 | itself. To understand what McCarthy meant by this, 40 | we're going to retrace his steps, with his mathematical 41 | notation translated into running Common Lisp code. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/rss.txt: -------------------------------------------------------------------------------- 1 | Aaron Swartz created a scraped 2 | feed 3 | of the essays page. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/todo.txt: -------------------------------------------------------------------------------- 1 | April 2012A palliative care nurse called Bronnie Ware made a list of the 2 | biggest regrets 3 | of the dying. Her list seems plausible. I could see 4 | myself — can see myself — making at least 4 of these 5 | 5 mistakes.If you had to compress them into a single piece of advice, it might 6 | be: don't be a cog. The 5 regrets paint a portrait of post-industrial 7 | man, who shrinks himself into a shape that fits his circumstances, 8 | then turns dutifully till he stops.The alarming thing is, the mistakes that produce these regrets are 9 | all errors of omission. You forget your dreams, ignore your family, 10 | suppress your feelings, neglect your friends, and forget to be 11 | happy. Errors of omission are a particularly dangerous type of 12 | mistake, because you make them by default.I would like to avoid making these mistakes. But how do you avoid 13 | mistakes you make by default? Ideally you transform your life so 14 | it has other defaults. But it may not be possible to do that 15 | completely. As long as these mistakes happen by default, you probably 16 | have to be reminded not to make them. So I inverted the 5 regrets, 17 | yielding a list of 5 commands 18 | 19 | Don't ignore your dreams; don't work too much; say what you 20 | think; cultivate friendships; be happy. 21 | 22 | which I then put at the top of the file I use as a todo list. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/unions.txt: -------------------------------------------------------------------------------- 1 | May 2007People who worry about the increasing gap between rich and poor 2 | generally look back on the mid twentieth century as a golden age. 3 | In those days we had a large number of high-paying union manufacturing 4 | jobs that boosted the median income. 
I wouldn't quite call the 5 | high-paying union job a myth, but I think people who dwell on it 6 | are reading too much into it.Oddly enough, it was working with startups that made me realize 7 | where the high-paying union job came from. In a rapidly growing 8 | market, you don't worry too much about efficiency. It's more 9 | important to grow fast. If there's some mundane problem getting 10 | in your way, and there's a simple solution that's somewhat expensive, 11 | just take it and get on with more important things. EBay didn't 12 | win by paying less for servers than their competitors.Difficult though it may be to imagine now, manufacturing was a 13 | growth industry in the mid twentieth century. This was an era when 14 | small firms making everything from cars to candy were getting 15 | consolidated into a new kind of corporation with national reach and 16 | huge economies of scale. You had to grow fast or die. Workers 17 | were for these companies what servers are for an Internet startup. 18 | A reliable supply was more important than low cost.If you looked in the head of a 1950s auto executive, the attitude 19 | must have been: sure, give 'em whatever they ask for, so long as 20 | the new model isn't delayed.In other words, those workers were not paid what their work was 21 | worth. Circumstances being what they were, companies would have 22 | been stupid to insist on paying them so little.If you want a less controversial example of this phenomenon, ask 23 | anyone who worked as a consultant building web sites during the 24 | Internet Bubble. In the late nineties you could get paid huge sums 25 | of money for building the most trivial things. And yet does anyone 26 | who was there have any expectation those days will ever return? I 27 | doubt it. Surely everyone realizes that was just a temporary 28 | aberration.The era of labor unions seems to have been the same kind of aberration, 29 | just spread 30 | over a longer period, and mixed together with a lot of ideology 31 | that prevents people from viewing it with as cold an eye as they 32 | would something like consulting during the Bubble.Basically, unions were just Razorfish.People who think the labor movement was the creation of heroic union 33 | organizers have a problem to explain: why are unions shrinking now? 34 | The best they can do is fall back on the default explanation of 35 | people living in fallen civilizations. Our ancestors were giants. 36 | The workers of the early twentieth century must have had a moral 37 | courage that's lacking today.In fact there's a simpler explanation. The early twentieth century 38 | was just a fast-growing startup overpaying for infrastructure. And 39 | we in the present are not a fallen people, who have abandoned 40 | whatever mysterious high-minded principles produced the high-paying 41 | union job. We simply live in a time when the fast-growing companies 42 | overspend on different things. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/vw.txt: -------------------------------------------------------------------------------- 1 | January 2012A few hours before the Yahoo acquisition was announced in June 1998 2 | I took a snapshot of Viaweb's 3 | site. I thought it might be interesting to look at one day.The first thing one notices is is how tiny the pages are. Screens 4 | were a lot smaller in 1998. 
If I remember correctly, our frontpage 5 | used to just fit in the size window people typically used then.Browsers then (IE 6 was still 3 years in the future) had few fonts 6 | and they weren't antialiased. If you wanted to make pages that 7 | looked good, you had to render display text as images.You may notice a certain similarity between the Viaweb and Y Combinator logos. We did that 8 | as an inside joke when we started YC. Considering how basic a red 9 | circle is, it seemed surprising to me when we started Viaweb how 10 | few other companies used one as their logo. A bit later I realized 11 | why.On the Company 12 | page you'll notice a mysterious individual called John McArtyem. 13 | Robert Morris (aka Rtm) was so publicity averse after the 14 | Worm that he 15 | didn't want his name on the site. I managed to get him to agree 16 | to a compromise: we could use his bio but not his name. He has 17 | since relaxed a bit 18 | on that point.Trevor graduated at about the same time the acquisition closed, so in the 19 | course of 4 days he went from impecunious grad student to millionaire 20 | PhD. The culmination of my career as a writer of press releases 21 | was one celebrating 22 | his graduation, illustrated with a drawing I did of him during 23 | a meeting.(Trevor also appears as Trevino 24 | Bagwell in our directory of web designers merchants could hire 25 | to build stores for them. We inserted him as a ringer in case some 26 | competitor tried to spam our web designers. We assumed his logo 27 | would deter any actual customers, but it did not.)Back in the 90s, to get users you had to get mentioned in magazines 28 | and newspapers. There were not the same ways to get found online 29 | that there are today. So we used to pay a PR 30 | firm $16,000 a month to get us mentioned in the press. Fortunately 31 | reporters liked 32 | us.In our advice about 33 | getting traffic from search engines (I don't think the term SEO 34 | had been coined yet), we say there are only 7 that matter: Yahoo, 35 | AltaVista, Excite, WebCrawler, InfoSeek, Lycos, and HotBot. Notice 36 | anything missing? Google was incorporated that September.We supported online transactions via a company called 37 | Cybercash, 38 | since if we lacked that feature we'd have gotten beaten up in product 39 | comparisons. But Cybercash was so bad and most stores' order volumes 40 | were so low that it was better if merchants processed orders like phone orders. We had a page in our site trying to talk merchants 41 | out of doing real time authorizations.The whole site was organized like a funnel, directing people to the 42 | test drive. 43 | It was a novel thing to be able to try out software online. We put 44 | cgi-bin in our dynamic urls to fool competitors about how our 45 | software worked.We had some well 46 | known users. Needless to say, Frederick's of Hollywood got the 47 | most traffic. We charged a flat fee of $300/month for big stores, 48 | so it was a little alarming to have users who got lots of traffic. 49 | I once calculated how much Frederick's was costing us in bandwidth, 50 | and it was about $300/month.Since we hosted all the stores, which together were getting just 51 | over 10 million page views per month in June 1998, we consumed what 52 | at the time seemed a lot of bandwidth. We had 2 T1s (3 Mb/sec) 53 | coming into our offices. In those days there was no AWS. Even 54 | colocating servers seemed too risky, considering how often things 55 | went wrong with them. So we had our servers in our offices. 
Or 56 | more precisely, in Trevor's office. In return for the unique 57 | privilege of sharing his office with no other humans, he had to 58 | share it with 6 shrieking tower servers. His office was nicknamed 59 | the Hot Tub on account of the heat they generated. Most days his 60 | stack of window air conditioners could keep up.For describing pages, we had a template language called RTML, which 61 | supposedly stood for something, but which in fact I named after 62 | Rtm. RTML was Common Lisp augmented by some macros and libraries, 63 | and concealed under a structure editor that made it look like it 64 | had syntax.Since we did continuous releases, our software didn't actually have 65 | versions. But in those days the trade press expected versions, so 66 | we made them up. If we wanted to get lots of attention, we made 67 | the version number an 68 | integer. That "version 4.0" icon was generated by our own 69 | button generator, incidentally. The whole Viaweb site was made 70 | with our software, even though it wasn't an online store, because 71 | we wanted to experience what our users did.At the end of 1997, we released a general purpose shopping search 72 | engine called Shopfind. It 73 | was pretty advanced for the time. It had a programmable crawler 74 | that could crawl most of the different stores online and pick out 75 | the products. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/want.txt: -------------------------------------------------------------------------------- 1 | November 2022Since I was about 9 I've been puzzled by the apparent contradiction 2 | between being made of matter that behaves in a predictable way, and 3 | the feeling that I could choose to do whatever I wanted. At the 4 | time I had a self-interested motive for exploring the question. At 5 | that age (like most succeeding ages) I was always in trouble with 6 | the authorities, and it seemed to me that there might possibly be 7 | some way to get out of trouble by arguing that I wasn't responsible 8 | for my actions. I gradually lost hope of that, but the puzzle 9 | remained: How do you reconcile being a machine made of matter with 10 | the feeling that you're free to choose what you do? 11 | [1]The best way to explain the answer may be to start with a slightly 12 | wrong version, and then fix it. The wrong version is: You can do 13 | what you want, but you can't want what you want. Yes, you can control 14 | what you do, but you'll do what you want, and you can't control 15 | that.The reason this is mistaken is that people do sometimes change what 16 | they want. People who don't want to want something — drug addicts, 17 | for example — can sometimes make themselves stop wanting it. And 18 | people who want to want something — who want to like classical 19 | music, or broccoli — sometimes succeed.So we modify our initial statement: You can do what you want, but 20 | you can't want to want what you want.That's still not quite true. It's possible to change what you want 21 | to want. I can imagine someone saying "I decided to stop wanting 22 | to like classical music." But we're getting closer to the truth. 23 | It's rare for people to change what they want to want, and the more 24 | "want to"s we add, the rarer it gets.We can get arbitrarily close to a true statement by adding more "want 25 | to"s in much the same way we can get arbitrarily close to 1 by adding 26 | more 9s to a string of 9s following a decimal point. 
In practice 27 | three or four "want to"s must surely be enough. It's hard even to 28 | envision what it would mean to change what you want to want to want 29 | to want, let alone actually do it.So one way to express the correct answer is to use a regular 30 | expression. You can do what you want, but there's some statement 31 | of the form "you can't (want to)* want what you want" that's true. 32 | Ultimately you get back to a want that you don't control. 33 | [2] 34 | Notes[1] 35 | I didn't know when I was 9 that matter might behave randomly, 36 | but I don't think it affects the problem much. Randomness destroys 37 | the ghost in the machine as effectively as determinism.[2] 38 | If you don't like using an expression, you can make the same 39 | point using higher-order desires: There is some n such that you 40 | don't control your nth-order desires. 41 | Thanks to Trevor Blackwell, 42 | Jessica Livingston, Robert Morris, and 43 | Michael Nielsen for reading drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/weird.txt: -------------------------------------------------------------------------------- 1 | August 2021When people say that in their experience all programming languages 2 | are basically equivalent, they're making a statement not about 3 | languages but about the kind of programming they've done.99.5% of programming consists of gluing together calls to library 4 | functions. All popular languages are equally good at this. So one 5 | can easily spend one's whole career operating in the intersection 6 | of popular programming languages.But the other .5% of programming is disproportionately interesting. 7 | If you want to learn what it consists of, the weirdness of weird 8 | languages is a good clue to follow.Weird languages aren't weird by accident. Not the good ones, at 9 | least. The weirdness of the good ones usually implies the existence 10 | of some form of programming that's not just the usual gluing together 11 | of library calls.A concrete example: Lisp macros. Lisp macros seem weird even to 12 | many Lisp programmers. They're not only not in the intersection of 13 | popular languages, but by their nature would be hard to implement 14 | properly in a language without turning it into a dialect of 15 | Lisp. And macros are definitely evidence of techniques that go 16 | beyond glue programming. For example, solving problems by first 17 | writing a language for problems of that type, and then writing 18 | your specific application in it. Nor is this all you can do with 19 | macros; it's just one region in a space of program-manipulating 20 | techniques that even now is far from fully explored.So if you want to expand your concept of what programming can be, 21 | one way to do it is by learning weird languages. Pick a language 22 | that most programmers consider weird but whose median user is smart, 23 | and then focus on the differences between this language and the 24 | intersection of popular languages. What can you say in this language 25 | that would be impossibly inconvenient to say in others? In the 26 | process of learning how to say things you couldn't previously say, 27 | you'll probably be learning how to think things you couldn't 28 | previously think. 29 | Thanks to Trevor Blackwell, Patrick Collison, Daniel Gackle, Amjad 30 | Masad, and Robert Morris for reading drafts of this. 
31 | -------------------------------------------------------------------------------- /eval/niah/README.md: -------------------------------------------------------------------------------- 1 | # Needle-in-a-Haystack Evaluation 2 | 3 | 4 | This directory contains the code for the needle-in-a-haystack experiments in the paper. The code is adapted from [the needle test in LongAlign](https://github.com/THUDM/LongAlign/tree/main/Needle_test). 5 | 6 | ## Usage 7 | 8 | First, generate the prompts for the easy and the standard modes: 9 | 10 | ```bash 11 | python prompt.py --config config-prompt-easy.yaml --exp max_len_32k_easy 12 | python prompt.py --config config-prompt-standard.yaml --exp max_len_32k_standard 13 | ``` 14 | 15 | Then you can run the actual retrieval task. For example, this is how you can evaluate FoX (Pro): 16 | 17 | ```bash 18 | python pred.py --exp max_len_32k_easy --model "fox-pro-760m-longcrawl64-48b" --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" --device_id 0 19 | python pred.py --exp max_len_32k_standard --model "fox-pro-760m-longcrawl64-48b" --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" --device_id 0 20 | ``` 21 | 22 | The results will be saved to `./pred`. After this, we use `gpt-4o-2024-08-06` to score the retrieval results. This requires an OpenAI API key to be set in `$API_KEY`. Then you can run the following: 23 | 24 | ```bash 25 | python eval.py --exp max_len_32k_easy --model fox-pro-760m-longcrawl64-48b --api-key $API_KEY 26 | python eval.py --exp max_len_32k_standard --model fox-pro-760m-longcrawl64-48b --api-key $API_KEY 27 | ``` 28 | 29 | The scores will be saved to `./results`. After this, you can visualize the results as follows: 30 | 31 | ```bash 32 | FIGURE_DIR="./figures" # You can use any other path 33 | python plot_niah.py --figure_dir=$FIGURE_DIR 34 | ``` 35 | 36 | You can change `MODEL_LIST` in `plot_niah.py` to specify the set of models for which you want to visualize results. 37 | 38 | Note that we observe the evaluation results to be non-deterministic, likely due to GPU non-determinism. Therefore, the results you obtain may not exactly match those reported in the paper. However, the difference should be small. 39 | 40 | ## Citation 41 | 42 | If you use this code, consider citing LongAlign: 43 | 44 | ``` 45 | @inproceedings{bai2024longalign, 46 | title = "{L}ong{A}lign: A Recipe for Long Context Alignment of Large Language Models", 47 | author = "Bai, Yushi and Lv, Xin and Zhang, Jiajie and He, Yuze and Qi, Ji and Hou, Lei and Tang, Jie and Dong, Yuxiao and Li, Juanzi", 48 | booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", 49 | month = nov, 50 | year = "2024", 51 | address = "Miami, Florida, USA", 52 | publisher = "Association for Computational Linguistics", 53 | url = "https://aclanthology.org/2024.findings-emnlp.74", 54 | doi = "10.18653/v1/2024.findings-emnlp.74", 55 | pages = "1376--1395", 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /eval/niah/config-eval.yaml: -------------------------------------------------------------------------------- 1 | pred_dir: 'pred' 2 | save_dir: 'results' 3 | 4 | model: 5 | model_provider: "OpenAI" 6 | model_name: "gpt-4o-2024-08-06" 7 | 8 | prompt: 9 | needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 10 | retrieval_question: "What is the best thing to do in San Francisco?"
11 | -------------------------------------------------------------------------------- /eval/niah/config-pred.yaml: -------------------------------------------------------------------------------- 1 | prompt_dir: 'prompts' 2 | save_dir: 'pred' 3 | 4 | model: 5 | model_provider: "Huggingface" # "OpenAI", "Anthropic" or "Huggingface" 6 | -------------------------------------------------------------------------------- /eval/niah/config-prompt-debug.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | # needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 3 | needle: "\nWhat is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 4 | haystack_dir: "PaulGrahamEssays" 5 | retrieval_question: "What is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is" # We use the Anthropic's retrieval question as the default one 6 | 7 | context: 8 | min_len: 1000 9 | max_len: 32000 10 | interval: 2 11 | manually_select_list: null # null or a list of context lengths to manually select 12 | 13 | document_depth: 14 | min_percent: 0 15 | max_percent: 100 16 | interval: 2 17 | interval_type: "linear" # "linear", "sigmoid" or null 18 | manually_select_list: null # null or a list of document percents to manually select 19 | 20 | tokenizer: 21 | tokenizer_type: "Huggingface" 22 | model_name: "zhixuan-lin/longcrawl64-json-gpt2-tokenizer" # Change it to your own model name / HF model path 23 | 24 | save_dir: 'prompts' 25 | 26 | -------------------------------------------------------------------------------- /eval/niah/config-prompt-easy.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | # needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 3 | needle: "\nWhat is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 4 | haystack_dir: "PaulGrahamEssays" 5 | retrieval_question: "What is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is" # We use the Anthropic's retrieval question as the default one 6 | 7 | context: 8 | min_len: 1000 9 | max_len: 32000 10 | interval: 11 11 | manually_select_list: null # null or a list of context lengths to manually select 12 | 13 | document_depth: 14 | min_percent: 0 15 | max_percent: 100 16 | interval: 11 17 | interval_type: "linear" # "linear", "sigmoid" or null 18 | manually_select_list: null # null or a list of document percents to manually select 19 | 20 | tokenizer: 21 | tokenizer_type: "Huggingface" 22 | model_name: "zhixuan-lin/longcrawl64-json-gpt2-tokenizer" # Change it to your own model name / HF model path 23 | 24 | save_dir: 'prompts' 25 | 26 | -------------------------------------------------------------------------------- /eval/niah/config-prompt-standard.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 3 | # needle: "\nWhat is the best thing to do in San Francisco? 
Answer: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 4 | haystack_dir: "PaulGrahamEssays" 5 | retrieval_question: "What is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is" # We use the Anthropic's retrieval question as the default one 6 | 7 | context: 8 | min_len: 1000 9 | max_len: 32000 10 | interval: 11 11 | manually_select_list: null # null or a list of context lengths to manually select 12 | 13 | document_depth: 14 | min_percent: 0 15 | max_percent: 100 16 | interval: 11 17 | interval_type: "linear" # "linear", "sigmoid" or null 18 | manually_select_list: null # null or a list of document percents to manually select 19 | 20 | tokenizer: 21 | tokenizer_type: "Huggingface" 22 | model_name: "zhixuan-lin/longcrawl64-json-gpt2-tokenizer" # Change it to your own model name / HF model path 23 | 24 | save_dir: 'prompts' 25 | 26 | -------------------------------------------------------------------------------- /eval/per_token_loss/README.md: -------------------------------------------------------------------------------- 1 | # Per-token loss 2 | 3 | This directory contains code to compute and plot per-token loss using LongCrawl64. 4 | 5 | Before you run the evaluation, make sure you have downloaded the heldout set of LongCrawl64. If you haven't, run the following: 6 | 7 | ```bash 8 | DATA_DIR="./data" # You can use any other path 9 | mkdir -p ${DATA_DIR}/longcrawl64 10 | # Install gsutil 11 | curl https://sdk.cloud.google.com | bash 12 | GSUTIL_PARALLEL_THREAD_COUNT=5 GSUTIL_PARALLEL_PROCESS_COUNT=5 gsutil -m cp -r 'gs://longcrawl64/heldout.zarr' ${DATA_DIR}/longcrawl64 13 | ``` 14 | 15 | 16 | After this, you can run the evaluation as follows: 17 | 18 | ```bash 19 | DATA_DIR="./data" # Or whatever path that contains the LongCrawl64 dataset 20 | SAVE_DIR="./results" # You can use any other path 21 | fabric run run_per_token_loss.py \ 22 | --devices 1 \ 23 | --model "fox-pro-760m-longcrawl64-48b" \ 24 | --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" \ 25 | --data_path $DATA_DIR/longcrawl64 \ 26 | --save_dir $SAVE_DIR \ 27 | --resume \ 28 | --save_interval 128 29 | ``` 30 | We also support multi-GPU evaluation and resuming. However, resuming requires that you use the same number of GPUs as the run being resumed, **otherwise the results will be incorrect**. 31 | 32 | After this, you can plot the per-token loss: 33 | 34 | ```bash 35 | RESULT_DIR=$SAVE_DIR 36 | FIGURE_DIR="./figures" # You can use any other path 37 | python plot_per_token_loss.py \ 38 | --result_dir $RESULT_DIR \ 39 | --figure_dir $FIGURE_DIR 40 | ``` 41 | 42 | You can change `MODELS` in `plot_per_token_loss.py` to specify the set of models for which you want to plot the per-token loss.
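
For reference, the per-token loss here is simply the cross-entropy at each token position, averaged over sequences. Below is a minimal sketch of that computation for a single batch; it is not the actual `run_per_token_loss.py` script (which also handles multi-GPU evaluation, resuming, and periodic saving), and `model` / `input_ids` are placeholders for any HuggingFace-style causal LM and a `(batch, seq_len)` token tensor:

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def per_token_loss(model, input_ids):
    # input_ids: (batch, seq_len). Each position t predicts token t + 1.
    logits = model(input_ids=input_ids).logits           # (batch, seq_len, vocab)
    loss = F.cross_entropy(
        logits[:, :-1].transpose(1, 2),                   # (batch, vocab, seq_len - 1)
        input_ids[:, 1:],                                 # (batch, seq_len - 1)
        reduction="none",
    )                                                     # (batch, seq_len - 1)
    # Averaging over the batch dimension (and over all evaluation batches)
    # gives the loss as a function of token position, which is what gets plotted.
    return loss.mean(dim=0)
```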
43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "forgetting-transformer" 7 | dynamic = ["version"] 8 | description = "Official implementation of the Forgetting Transformer" 9 | readme = "README.md" 10 | authors = [ 11 | { name = "Zhixuan Lin", email = "zxlin.cs@gmail.com" }, 12 | ] 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 18 | ] 19 | requires-python = ">=3.10" 20 | dependencies = [ 21 | ] 22 | 23 | [project.optional-dependencies] 24 | dev = ["pytest"] 25 | 26 | [project.urls] 27 | Homepage = "https://github.com/zhixuan-lin/forgetting-transformer" 28 | 29 | [tool.setuptools.dynamic] 30 | version = {attr = "forgetting_transformer.__version__"} 31 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | einops 3 | numpy 4 | zarr 5 | colorlog 6 | rich 7 | wandb 8 | jsonlines 9 | matplotlib 10 | seaborn 11 | hydra-core==1.3.2 12 | torch==2.4.0 13 | click==8.1.7 # Needed for lightning CLI 14 | lightning==2.4.0 15 | transformers==4.44.0 16 | datasets==2.20.0 17 | lm_eval==0.4.4 18 | 19 | # Optional: if you want to run baselines 20 | flash-attn==2.6.3 # Needed for transformer LLaMA 21 | causal-conv1d==1.4.0 # For Mamba-2 and DeltaNet 22 | mamba-ssm==2.2.2 # For Mamba-2 23 | 24 | # Optional: for evaluation 25 | tqdm 26 | rouge 27 | jieba 28 | fuzzywuzzy 29 | -------------------------------------------------------------------------------- /save_model.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Callable, Dict, Union, Optional, Tuple, NamedTuple, Any, List 3 | import logging 4 | from pathlib import Path 5 | import rich 6 | import rich.syntax 7 | 8 | import hydra 9 | from omegaconf import OmegaConf, DictConfig 10 | import torch 11 | import lightning as L 12 | from lightning.fabric.utilities.rank_zero import rank_zero_only 13 | import os 14 | import os.path as osp 15 | from torch import nn 16 | import colorlog 17 | from datetime import datetime 18 | import jsonlines 19 | 20 | # from forgetting_transformer.model.common import LMOutput 21 | from transformers.modeling_outputs import ModelOutput 22 | from forgetting_transformer.datamodule.common import DataInfo, Batch 23 | from forgetting_transformer.checkpoint import Checkpointer 24 | from configs.config import Config 25 | from collections import defaultdict, OrderedDict 26 | import numpy as np 27 | import time 28 | from dataclasses import dataclass, field, asdict 29 | from torch.distributed.fsdp import FullyShardedDataParallel 30 | import torch.utils.flop_counter 31 | from transformers import AutoTokenizer 32 | from transformers import GPT2Tokenizer 33 | import json 34 | import pprint 35 | from forgetting_transformer.tokenizer import JSONGPT2Tokenizer 36 | import argparse 37 | 38 | 39 | @dataclass 40 | class ModelInfo: 41 | total_params: int 42 | trainable_params: int 43 | embedding_params: int 44 | flops_per_token: int # Note this depends how we train the model 45 | non_embedding_params: int = 
field(init=False) 46 | 47 | def __post_init__(self): 48 | self.non_embedding_params = self.total_params - self.embedding_params 49 | 50 | 51 | 52 | # @hydra.main(version_base=None, config_name="config", config_path="configs") 53 | def save_model(): 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("--hf_load_dir", type=str, required=True) 56 | parser.add_argument("--hf_save_dir", type=str, required=True) 57 | parser.add_argument("--hf_load_step", type=int, required=False) 58 | args = parser.parse_args() 59 | 60 | 61 | assert args.hf_load_dir is not None 62 | assert args.hf_save_dir is not None 63 | assert args.hf_load_step is None, "You can remove this if you know what you are doing" 64 | 65 | args.hf_load_dir = osp.realpath(args.hf_load_dir) 66 | load_config_path = Path(args.hf_load_dir) / "config.yaml" 67 | config: Config = OmegaConf.load(load_config_path) 68 | 69 | assert Path(args.hf_load_dir).exists() 70 | # with fabric.init_module(empty_init=False): 71 | assert OmegaConf.is_missing( 72 | config.model.config, "vocab_size" 73 | ), "Vocab size should be left missing" 74 | data_info_path = Path(args.hf_load_dir) / "metrics" / "jsonlines" / "train_data_info.jsonl" 75 | with jsonlines.open(data_info_path) as reader: 76 | data_info: Dict = reader.read() 77 | config.model.config.vocab_size = data_info['train_data_info/vocab_size'] 78 | model: nn.Module = hydra.utils.instantiate(config.model) 79 | 80 | if args.hf_load_step is None: 81 | resume_step, checkpoint_path = Checkpointer.get_checkpoint_path( 82 | checkpoint_dir=Path(args.hf_load_dir) / "checkpoints", 83 | step=None, 84 | ) 85 | print(f"step: {resume_step}") 86 | assert resume_step == config.train.max_tokens 87 | else: 88 | resume_step, checkpoint_path = Checkpointer.get_checkpoint_path( 89 | checkpoint_dir=Path(args.hf_load_dir) / "checkpoints", 90 | step=args.hf_load_step, 91 | ) 92 | print(f"step: {resume_step}") 93 | assert resume_step == args.hf_load_step 94 | # if input("not checking step. proceed? 
(y/n)").strip() == 'y': 95 | # pass 96 | # else: 97 | # import sys; sys.exit() 98 | checkpoint = torch.load(checkpoint_path, weights_only=False) 99 | 100 | model.load_state_dict(checkpoint["model"]) 101 | del checkpoint 102 | 103 | if "SlimPajama" in config.datamodule._target_: 104 | tokenizer = AutoTokenizer.from_pretrained("fla-hub/gla-1.3B-100B") 105 | elif "LongCrawl" in config.datamodule._target_: 106 | tokenizer = JSONGPT2Tokenizer.from_pretrained("gpt2", add_bos_token=True, clean_up_tokenization_spaces=False, add_prefix_space=False) 107 | else: 108 | raise ValueError(f"Unknow data module {config.datamodule._target_}") 109 | # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_bos_token=False, clean_up_tokenization_spaces=False, add_prefix_space=False) 110 | tokenizer.model_max_length = data_info["train_data_info/batch_len"] 111 | 112 | path = Path(args.hf_save_dir) 113 | path.mkdir(parents=True, exist_ok=True) 114 | model.save_pretrained(path,) 115 | tokenizer.save_pretrained(path) 116 | print(f"Model and tokenizer saved to {path}") 117 | 118 | # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_bos_token=False, clean_up_tokenization_spaces=False, add_prefix_space=True) 119 | 120 | # import ipdb; ipdb.set_trace() 121 | if __name__ == "__main__": 122 | save_model() # pylint: disable=no-value-for-parameter 123 | -------------------------------------------------------------------------------- /src/forgetting_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from forgetting_transformer.ops.forgetting_attention import ( 2 | forgetting_attention 3 | ) 4 | __version__ = '0.0.1' 5 | -------------------------------------------------------------------------------- /src/forgetting_transformer/datamodule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhixuan-lin/forgetting-transformer/f8ce22afe14980628534e06d9ee62baeeddf1dcf/src/forgetting_transformer/datamodule/__init__.py -------------------------------------------------------------------------------- /src/forgetting_transformer/datamodule/common.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, NamedTuple 2 | import torch 3 | from forgetting_transformer.utils import safe_divide 4 | from dataclasses import dataclass, field 5 | 6 | 7 | @dataclass 8 | class DataInfo: 9 | vocab_size: int 10 | global_tokens_per_batch: int 11 | local_tokens_per_batch: int 12 | # tokens_per_stage: int 13 | batch_len: int 14 | seq_len: Optional[int] 15 | total_tokens: int 16 | global_batch_size: int = field(init=False) 17 | local_batch_size: int = field(init=False) 18 | """General dataloader information 19 | 20 | Each local batch has shape (local_batch_size, batch_len) 21 | 22 | Arguments: 23 | - `tokens_per_stage`: the following should always be true: as long as 24 | two dataloaders 25 | - use the same data source 26 | - have the same tokens_per_stage 27 | Then within each stage, the set of tokens they emit must be the same, even 28 | though the order these tokens are emitted are different. 29 | - `seq_len`: if None, the sequences are variable length. Otherwise all 30 | sequences should have the same length. The practical implication is 31 | that resets should either all be False, or only the first timestep is 32 | True. 
33 | 34 | """ 35 | def __post_init__(self): 36 | self.global_batch_size = safe_divide(self.global_tokens_per_batch, self.batch_len) 37 | self.local_batch_size = safe_divide(self.local_tokens_per_batch, self.batch_len) 38 | 39 | 40 | class Batch(NamedTuple): 41 | input_ids: torch.LongTensor 42 | labels: torch.LongTensor 43 | resets: torch.BoolTensor 44 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/README.md: -------------------------------------------------------------------------------- 1 | Most files in this directory are adapted from the [Flash Linear Attention](https://github.com/fla-org/flash-linear-attention) repository. 2 | 3 | The original license info from the Flash Linear Attention repository: 4 | 5 | ``` 6 | MIT License 7 | 8 | Copyright (c) 2023-2025 Songlin Yang 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | ``` 28 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from .mamba2 import Mamba2ForCausalLM, Mamba2Config 2 | # from .forgetting_transformer import ForgettingTransformerForCausalLM, ForgettingTransformerConfig 3 | # from .transformer import TransformerForCausalLM, TransformerConfig 4 | # from .delta_net import DeltaNetForCausalLM, DeltaNetConfig 5 | # from .hgrn2 import HGRN2ForCausalLM, HGRN2Config 6 | # from .samba import SambaForCausalLM, SambaConfig 7 | 8 | import importlib 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.filterwarnings(action="ignore", message="Flash Attention is not installed") 12 | warnings.filterwarnings(action="ignore", message="`torch.cuda.amp") 13 | from forgetting_transformer.model.forgetting_transformer import ( 14 | ForgettingTransformerForCausalLM, 15 | ForgettingTransformerConfig, 16 | ) 17 | from forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer import ( 18 | ForgettingAttentionLayer 19 | ) 20 | 21 | for model in ["mamba2", "forgetting_transformer", "transformer", "delta_net", "hgrn2", "samba"]: 22 | # We do not want to espose the names. 
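    # Importing each submodule runs its registration side effects (the AutoConfig.register,
    # AutoModel.register, and AutoModelForCausalLM.register calls in each submodule's __init__.py).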
23 | importlib.import_module(f".{model}", __name__) 24 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/common.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, Optional, Any 2 | import torch 3 | 4 | 5 | class LMOutput(NamedTuple): 6 | loss: torch.Tensor 7 | carry: Any 8 | logits: Optional[torch.Tensor] = None 9 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/delta_net/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_delta_net import \ 6 | DeltaNetConfig 7 | from .modeling_delta_net import ( 8 | DeltaNetForCausalLM, DeltaNetModel) 9 | 10 | AutoConfig.register(DeltaNetConfig.model_type, DeltaNetConfig) 11 | AutoModel.register(DeltaNetConfig, DeltaNetModel) 12 | AutoModelForCausalLM.register(DeltaNetConfig, DeltaNetForCausalLM) 13 | 14 | __all__ = ['DeltaNetConfig', 'DeltaNetForCausalLM', 'DeltaNetModel'] 15 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/delta_net/configuration_delta_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class DeltaNetConfig(PretrainedConfig): 9 | 10 | model_type = 'delta_net-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | expand_k: int = 1, 18 | expand_v: int = 1, 19 | use_gate: bool = False, 20 | use_short_conv: bool = True, 21 | conv_size: int = 4, 22 | use_beta: bool = True, 23 | use_output_norm: bool = True, 24 | hidden_ratio: Optional[int] = 4, 25 | intermediate_size: Optional[int] = None, 26 | num_hidden_layers: int = 24, 27 | num_heads: int = 16, 28 | attn_mode: str = "chunk", 29 | qk_norm: str = 'l2', 30 | qk_activation: str = 'silu', 31 | hidden_act: str = "swish", 32 | max_position_embeddings: int = 2048, 33 | norm_first: bool = False, 34 | norm_eps: float = 1e-6, 35 | use_cache: bool = True, 36 | pad_token_id: int = None, 37 | bos_token_id: int = 1, 38 | eos_token_id: int = 2, 39 | tie_word_embeddings: bool = False, 40 | initializer_range: float = 0.02, 41 | fuse_cross_entropy: bool = True, 42 | **kwargs 43 | ): 44 | self.vocab_size = vocab_size 45 | self.max_position_embeddings = max_position_embeddings 46 | self.hidden_size = hidden_size 47 | self.expand_k = expand_k 48 | self.expand_v = expand_v 49 | self.hidden_ratio = hidden_ratio 50 | self.intermediate_size = intermediate_size 51 | self.num_hidden_layers = num_hidden_layers 52 | self.num_heads = num_heads 53 | self.attn_mode = attn_mode 54 | self.hidden_act = hidden_act 55 | self.norm_first = norm_first 56 | self.norm_eps = norm_eps 57 | self.use_cache = use_cache 58 | self.initializer_range = initializer_range 59 | self.fuse_cross_entropy = fuse_cross_entropy 60 | self.use_gate = use_gate 61 | self.use_short_conv = use_short_conv 62 | self.conv_size = conv_size 63 | self.use_beta = use_beta 64 | self.use_output_norm = use_output_norm 65 | self.qk_norm = qk_norm 66 | self.qk_activation = qk_activation 67 | 68 | super().__init__( 69 | 
pad_token_id=pad_token_id, 70 | bos_token_id=bos_token_id, 71 | eos_token_id=eos_token_id, 72 | tie_word_embeddings=tie_word_embeddings, 73 | **kwargs, 74 | ) 75 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/forgetting_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_forgetting_transformer import ForgettingTransformerConfig 6 | from .modeling_forgetting_transformer import ( 7 | ForgettingTransformerForCausalLM, ForgettingTransformerModel) 8 | 9 | AutoConfig.register(ForgettingTransformerConfig.model_type, ForgettingTransformerConfig) 10 | AutoModel.register(ForgettingTransformerConfig, ForgettingTransformerModel) 11 | AutoModelForCausalLM.register(ForgettingTransformerConfig, ForgettingTransformerForCausalLM) 12 | 13 | 14 | 15 | __all__ = ['ForgettingTransformerConfig', 'ForgettingTransformerForCausalLM', 'ForgettingTransformerModel'] 16 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/forgetting_transformer/configuration_forgetting_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class ForgettingTransformerConfig(PretrainedConfig): 9 | 10 | model_type = 'forgetting_transformer-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | hidden_ratio: Optional[float] = 4, 18 | intermediate_size: Optional[int] = None, 19 | num_hidden_layers: int = 24, 20 | num_heads: int = 32, 21 | num_kv_heads: int = None, 22 | hidden_act: str = "swish", 23 | window_size: Optional[int] = None, 24 | max_position_embeddings: int = 2048, 25 | initializer_range: float = 0.02, 26 | elementwise_affine: Optional[bool] = True, 27 | norm_eps: float = 1e-6, 28 | use_cache: bool = True, 29 | pad_token_id: int = None, 30 | bos_token_id: int = 1, 31 | eos_token_id: int = 2, 32 | tie_word_embeddings: bool = False, 33 | attention_bias: bool = False, 34 | fuse_norm: bool = True, 35 | fuse_cross_entropy: bool = True, 36 | rope_base: float = 500000.0, 37 | use_rope: bool = False, 38 | use_output_gate: bool = False, 39 | ogate_act: str = "sigmoid", 40 | fgate_type: str = "full", 41 | fgate_bias_init: bool = False, 42 | decay_time_min: Optional[float] = None, 43 | decay_time_max: Optional[float] = None, 44 | use_output_norm: bool = False, 45 | qk_norm: bool = False, 46 | qk_norm_share_param_across_head: bool = False, 47 | use_k_shift: bool = False, 48 | use_v_shift: bool = False, 49 | **kwargs, 50 | ): 51 | self.vocab_size = vocab_size 52 | self.hidden_size = hidden_size 53 | self.hidden_ratio = hidden_ratio 54 | self.intermediate_size = intermediate_size 55 | self.num_hidden_layers = num_hidden_layers 56 | self.num_heads = num_heads 57 | self.num_kv_heads = num_kv_heads 58 | self.window_size = window_size 59 | self.max_position_embeddings = max_position_embeddings 60 | 61 | self.hidden_act = hidden_act 62 | self.initializer_range = initializer_range 63 | self.elementwise_affine = elementwise_affine 64 | self.norm_eps = norm_eps 65 | self.use_cache = use_cache 66 | self.attention_bias = attention_bias 67 | 
self.fuse_cross_entropy = fuse_cross_entropy 68 | self.fuse_norm = fuse_norm 69 | self.rope_base = rope_base 70 | self.use_rope = use_rope 71 | self.use_output_gate = use_output_gate 72 | self.ogate_act = ogate_act 73 | self.fgate_type = fgate_type 74 | self.fgate_bias_init = fgate_bias_init 75 | self.decay_time_min = decay_time_min 76 | self.decay_time_max = decay_time_max 77 | self.use_output_norm = use_output_norm 78 | self.qk_norm = qk_norm 79 | self.qk_norm_share_param_across_head = qk_norm_share_param_across_head 80 | self.use_k_shift = use_k_shift 81 | self.use_v_shift = use_v_shift 82 | 83 | super().__init__( 84 | pad_token_id=pad_token_id, 85 | bos_token_id=bos_token_id, 86 | eos_token_id=eos_token_id, 87 | tie_word_embeddings=tie_word_embeddings, 88 | **kwargs, 89 | ) 90 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/forgetting_transformer/glu_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | glu_fwd_codestring = """ 6 | template T glu_fwd(T x, T y) { 7 | return float(y) / (1.0f + ::exp(-float(x))); 8 | } 9 | """ 10 | glu_bwd_codestring = """ 11 | template T glu_bwd(T x, T y, T g, T& dx, T& dy) { 12 | float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); 13 | dx = x_sigmoid * (1.0f - x_sigmoid) * float(g) * float(y); 14 | dy = x_sigmoid * float(g); 15 | } 16 | """ 17 | 18 | glu_bwd_with_output_codestring = """ 19 | template T glu_bwd_with_output(T x, T y, T g, T& dx, T& dy, T& z) { 20 | float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); 21 | dx = x_sigmoid * (1.0f - x_sigmoid) * float(g) * float(y); 22 | dy = x_sigmoid * float(g); 23 | z = x_sigmoid * float(y); 24 | } 25 | """ 26 | 27 | glu_fwd = torch.cuda.jiterator._create_jit_fn(glu_fwd_codestring) 28 | glu_bwd = torch.cuda.jiterator._create_multi_output_jit_fn(glu_bwd_codestring, num_outputs=2) 29 | glu_bwd_with_output = torch.cuda.jiterator._create_multi_output_jit_fn(glu_bwd_with_output_codestring, num_outputs=3) 30 | 31 | 32 | class GLULinearFunction(torch.autograd.Function): 33 | r""" 34 | Gated Linear Unit (GLU) function followed by a linear transformation. 35 | 36 | .. math:: 37 | \text{GLULinear}(x, y, W, b) = (sh(x) * y) W + b 38 | 39 | This simple wrap discards the intermediate results of GLU(x, y) to save memory. 
40 | """ 41 | 42 | @staticmethod 43 | def forward(ctx, x, y, weight, bias): 44 | z = glu_fwd(x, y) 45 | out = F.linear(z.to(weight.dtype), weight, bias) 46 | # We don't store z, will be recomputed in the backward pass to save memory 47 | ctx.save_for_backward(x, y, weight) 48 | ctx.linear_bias_is_none = bias is None 49 | return out 50 | 51 | @staticmethod 52 | def backward(ctx, dout, *args): 53 | x, y, weight = ctx.saved_tensors 54 | dout = dout.reshape(-1, dout.shape[-1]) 55 | dz = F.linear(dout, weight.t()).view_as(x) 56 | dx, dy, z = glu_bwd_with_output(x, y, dz) 57 | dlinear_weight = torch.einsum("bo,bi->oi", dout, z.reshape(-1, z.shape[-1])) 58 | dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) 59 | return dx, dy, dlinear_weight, dlinear_bias 60 | 61 | glu_linear = GLULinearFunction.apply 62 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/hgrn2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_hgrn2 import HGRN2Config 6 | from .modeling_hgrn2 import HGRN2ForCausalLM, HGRN2Model 7 | 8 | AutoConfig.register(HGRN2Config.model_type, HGRN2Config) 9 | AutoModel.register(HGRN2Config, HGRN2Model) 10 | AutoModelForCausalLM.register(HGRN2Config, HGRN2ForCausalLM) 11 | 12 | 13 | __all__ = ['HGRN2Config', 'HGRN2ForCausalLM', 'HGRN2Model'] 14 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/hgrn2/configuration_hgrn2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class HGRN2Config(PretrainedConfig): 9 | 10 | model_type = 'hgrn2-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | num_hidden_layers: int = 24, 18 | attn_mode: str = "chunk", 19 | num_heads: Optional[int] = None, 20 | expand_ratio: Optional[int] = 128, 21 | use_short_conv: bool = False, 22 | conv_size: int = 4, 23 | use_lower_bound: bool = True, 24 | hidden_ratio: Optional[int] = 4, 25 | intermediate_size: Optional[int] = None, 26 | hidden_act: str = "swish", 27 | max_position_embeddings: int = 2048, 28 | elementwise_affine: Optional[bool] = True, 29 | norm_eps: float = 1e-6, 30 | use_cache: bool = True, 31 | pad_token_id: int = None, 32 | bos_token_id: int = 1, 33 | eos_token_id: int = 2, 34 | tie_word_embeddings: bool = False, 35 | initializer_range: float = 0.02, 36 | fuse_cross_entropy: bool = True, 37 | **kwargs 38 | ): 39 | self.vocab_size = vocab_size 40 | self.max_position_embeddings = max_position_embeddings 41 | self.hidden_size = hidden_size 42 | self.num_hidden_layers = num_hidden_layers 43 | self.attn_mode = attn_mode 44 | self.num_heads = num_heads 45 | self.expand_ratio = expand_ratio 46 | self.use_short_conv = use_short_conv 47 | self.conv_size = conv_size 48 | self.use_lower_bound = use_lower_bound 49 | self.hidden_ratio = hidden_ratio 50 | self.intermediate_size = intermediate_size 51 | self.hidden_act = hidden_act 52 | self.elementwise_affine = elementwise_affine 53 | self.norm_eps = norm_eps 54 | self.use_cache = use_cache 55 | self.initializer_range = initializer_range 56 | self.fuse_cross_entropy = 
fuse_cross_entropy 57 | 58 | super().__init__( 59 | pad_token_id=pad_token_id, 60 | bos_token_id=bos_token_id, 61 | eos_token_id=eos_token_id, 62 | tie_word_embeddings=tie_word_embeddings, 63 | **kwargs, 64 | ) 65 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/mamba2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_mamba2 import Mamba2Config 6 | from .modeling_mamba2 import Mamba2ForCausalLM, Mamba2Model 7 | 8 | AutoConfig.register(Mamba2Config.model_type, Mamba2Config, True) 9 | AutoModel.register(Mamba2Config, Mamba2Model, True) 10 | AutoModelForCausalLM.register(Mamba2Config, Mamba2ForCausalLM, True) 11 | 12 | 13 | __all__ = ['Mamba2Config', 'Mamba2ForCausalLM', 'Mamba2Model'] 14 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/samba/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_samba import SambaConfig 6 | from .modeling_samba import (SambaBlock, SambaForCausalLM, 7 | SambaModel) 8 | 9 | AutoConfig.register(SambaConfig.model_type, SambaConfig, True) 10 | AutoModel.register(SambaConfig, SambaModel, True) 11 | AutoModelForCausalLM.register(SambaConfig, SambaForCausalLM, True) 12 | 13 | 14 | __all__ = ['SambaConfig', 'SambaForCausalLM', 'SambaModel', 'SambaBlock'] 15 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/samba/configuration_samba.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | from typing import Dict, Optional 5 | 6 | from transformers.configuration_utils import PretrainedConfig 7 | try: 8 | from omegaconf import DictConfig, OmegaConf 9 | except ImportError: 10 | DictConfig, OmegaConf = None, None 11 | 12 | 13 | class SambaConfig(PretrainedConfig): 14 | 15 | model_type = "samba-project_fox" 16 | 17 | def __init__( 18 | self, 19 | vocab_size: int = 32000, 20 | hidden_size: int = 2304, 21 | state_size: int = 16, 22 | num_hidden_layers: int = 18, 23 | norm_eps=1e-5, 24 | pad_token_id: int = 0, 25 | bos_token_id: int = 1, 26 | eos_token_id: int = 2, 27 | expand: int = 2, 28 | conv_kernel: int = 4, 29 | use_bias: bool = False, 30 | use_conv_bias: bool = True, 31 | hidden_act: str = "silu", 32 | initializer_range: str = 0.02, 33 | residual_in_fp32: bool = False, 34 | time_step_rank: str = "auto", 35 | time_step_scale: float = 1.0, 36 | time_step_min: float = 0.001, 37 | time_step_max: float = 0.1, 38 | time_step_init_scheme: str = "random", 39 | time_step_floor: float = 1e-4, 40 | max_position_embeddings: int = 2048, 41 | attn: Optional[Dict] = None, 42 | attn_hidden_ratio: Optional[float] = 4, 43 | mamba_hidden_ratio: Optional[float] = 3, 44 | rescale_prenorm_residual: bool = False, 45 | use_cache: bool = True, 46 | fuse_norm: bool = True, 47 | fuse_cross_entropy: bool = True, 48 | tie_word_embeddings: bool = False, 49 | rope_base: float = 500000.0, 50 | **kwargs, 51 | ): 52 | self.vocab_size = vocab_size 53 | self.hidden_size = hidden_size 54 | self.state_size = state_size 55 | self.num_hidden_layers = num_hidden_layers 56 | self.norm_eps = 
norm_eps 57 | self.conv_kernel = conv_kernel 58 | self.expand = expand 59 | self.intermediate_size = int(expand * self.hidden_size) 60 | self.bos_token_id = bos_token_id 61 | self.eos_token_id = eos_token_id 62 | self.pad_token_id = pad_token_id 63 | self.use_bias = use_bias 64 | self.use_conv_bias = use_conv_bias 65 | self.hidden_act = hidden_act 66 | self.initializer_range = initializer_range 67 | self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank 68 | self.time_step_scale = time_step_scale 69 | self.time_step_min = time_step_min 70 | self.time_step_max = time_step_max 71 | self.time_step_init_scheme = time_step_init_scheme 72 | self.time_step_floor = time_step_floor 73 | self.max_position_embeddings = max_position_embeddings 74 | self.attn_hidden_ratio = attn_hidden_ratio 75 | self.mamba_hidden_ratio = mamba_hidden_ratio 76 | self.rescale_prenorm_residual = rescale_prenorm_residual 77 | self.residual_in_fp32 = residual_in_fp32 78 | self.use_cache = use_cache 79 | self.fuse_cross_entropy = fuse_cross_entropy 80 | self.fuse_norm = fuse_norm 81 | self.rope_base = rope_base 82 | 83 | if attn is not None: 84 | if isinstance(attn, (DictConfig)): 85 | attn = OmegaConf.to_container(attn) 86 | if not isinstance(attn, dict): 87 | raise ValueError("attn must be a dictionary") 88 | if 'layers' not in attn: 89 | raise ValueError("Layer indices must be provided to initialize hybrid attention layers") 90 | if 'num_heads' not in attn: 91 | raise ValueError("Number of heads must be provided to initialize hybrid attention layers") 92 | # attn['num_heads'] = attn.get('num_kv_heads', 18) 93 | # attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads']) 94 | # attn['window_size'] = attn.get('window_size', 2048) 95 | # else: 96 | # raise ValueError("attn must not be None") 97 | self.attn = attn 98 | 99 | super().__init__( 100 | bos_token_id=bos_token_id, 101 | eos_token_id=eos_token_id, 102 | pad_token_id=pad_token_id, 103 | tie_word_embeddings=tie_word_embeddings, 104 | **kwargs 105 | ) 106 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_transformer import TransformerConfig 6 | from .modeling_transformer import ( 7 | TransformerForCausalLM, TransformerModel) 8 | 9 | AutoConfig.register(TransformerConfig.model_type, TransformerConfig) 10 | AutoModel.register(TransformerConfig, TransformerModel) 11 | AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM) 12 | 13 | 14 | 15 | __all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel'] 16 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/transformer/configuration_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class TransformerConfig(PretrainedConfig): 9 | 10 | model_type = 'transformer-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | hidden_ratio: Optional[int] = 4, 18 | intermediate_size: 
--------------------------------------------------------------------------------
/src/forgetting_transformer/model/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
4 | 
5 | from .configuration_transformer import TransformerConfig
6 | from .modeling_transformer import (
7 |     TransformerForCausalLM, TransformerModel)
8 | 
9 | AutoConfig.register(TransformerConfig.model_type, TransformerConfig)
10 | AutoModel.register(TransformerConfig, TransformerModel)
11 | AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM)
12 | 
13 | 
14 | 
15 | __all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel']
16 | 
--------------------------------------------------------------------------------
/src/forgetting_transformer/model/transformer/configuration_transformer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from typing import Optional
4 | 
5 | from transformers.configuration_utils import PretrainedConfig
6 | 
7 | 
8 | class TransformerConfig(PretrainedConfig):
9 | 
10 |     model_type = 'transformer-project_fox'
11 |     keys_to_ignore_at_inference = ['past_key_values']
12 | 
13 |     def __init__(
14 |         self,
15 |         vocab_size: int = 32000,
16 |         hidden_size: int = 2048,
17 |         hidden_ratio: Optional[int] = 4,
18 |         intermediate_size: Optional[int] = None,
19 |         num_hidden_layers: int = 24,
20 |         num_heads: int = 32,
21 |         num_kv_heads: Optional[int] = None,
22 |         hidden_act: str = "swish",
23 |         window_size: Optional[int] = None,
24 |         max_position_embeddings: int = 2048,
25 |         initializer_range: float = 0.02,
26 |         elementwise_affine: Optional[bool] = True,
27 |         norm_eps: float = 1e-6,
28 |         use_cache: bool = True,
29 |         pad_token_id: Optional[int] = None,
30 |         bos_token_id: int = 1,
31 |         eos_token_id: int = 2,
32 |         tie_word_embeddings: bool = False,
33 |         attention_bias: bool = False,
34 |         fuse_norm: bool = True,
35 |         fuse_cross_entropy: bool = True,
36 |         rope_base: float = 500000.0,
37 |         use_rope: bool = True,
38 |         **kwargs,
39 |     ):
40 |         self.vocab_size = vocab_size
41 |         self.hidden_size = hidden_size
42 |         self.hidden_ratio = hidden_ratio
43 |         self.intermediate_size = intermediate_size
44 |         self.num_hidden_layers = num_hidden_layers
45 |         self.num_heads = num_heads
46 |         self.num_kv_heads = num_kv_heads
47 |         self.window_size = window_size
48 |         self.max_position_embeddings = max_position_embeddings
49 | 
50 |         self.hidden_act = hidden_act
51 |         self.initializer_range = initializer_range
52 |         self.elementwise_affine = elementwise_affine
53 |         self.norm_eps = norm_eps
54 |         self.use_cache = use_cache
55 |         self.attention_bias = attention_bias
56 |         self.fuse_cross_entropy = fuse_cross_entropy
57 |         self.fuse_norm = fuse_norm
58 |         self.rope_base = rope_base
59 |         self.use_rope = use_rope
60 | 
61 |         super().__init__(
62 |             pad_token_id=pad_token_id,
63 |             bos_token_id=bos_token_id,
64 |             eos_token_id=eos_token_id,
65 |             tie_word_embeddings=tie_word_embeddings,
66 |             **kwargs,
67 |         )
68 | 
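Since `TransformerConfig` is a regular `PretrainedConfig`, it round-trips through the usual Hugging Face serialization. A minimal sketch (not from the repository; the sizes and the output path are placeholders):

    from forgetting_transformer.model.transformer.configuration_transformer import TransformerConfig

    config = TransformerConfig(num_hidden_layers=12, num_heads=12, hidden_size=768)  # placeholder sizes
    config.save_pretrained("/tmp/fox_transformer")  # writes config.json with model_type "transformer-project_fox"
    reloaded = TransformerConfig.from_pretrained("/tmp/fox_transformer")
    assert reloaded.rope_base == 500000.0 and reloaded.use_rope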
--------------------------------------------------------------------------------
/src/forgetting_transformer/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhixuan-lin/forgetting-transformer/f8ce22afe14980628534e06d9ee62baeeddf1dcf/src/forgetting_transformer/ops/__init__.py
--------------------------------------------------------------------------------
/src/forgetting_transformer/schedule/__init__.py:
--------------------------------------------------------------------------------
1 | from .schedule import (
2 |     constant_schedule,
3 |     warmup_cosine_decay_schedule,
4 |     warmup_linear_decay_schedule,
5 |     linear_schedule,
6 |     polynomial_schedule,
7 |     warmup_one_minus_sqrt_schedule
8 | )
9 | 
--------------------------------------------------------------------------------
/src/forgetting_transformer/tokenizer.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Callable, Dict, Union, Optional, Tuple, NamedTuple, Any, List
3 | from transformers import GPT2Tokenizer, PretrainedConfig, AutoTokenizer
4 | 
5 | 
6 | 
7 | class JSONGPT2Tokenizer(GPT2Tokenizer):
8 |     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
9 |         (text, kwargs) = super().prepare_for_tokenization(text, is_split_into_words, **kwargs)
10 |         text = json.dumps(text)
11 |         text = text[1:-1]
12 |         return (text, kwargs)
13 | 
14 |     def decode(
15 |         self,
16 |         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
17 |         skip_special_tokens: bool = False,
18 |         clean_up_tokenization_spaces: bool = None,
19 |         **kwargs,
20 |     ):
21 |         text = super().decode(
22 |             token_ids=token_ids,
23 |             skip_special_tokens=skip_special_tokens,
24 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
25 |             **kwargs,
26 |         )
27 |         try:
28 |             # Unfortunately this is what LongCrawl64 did.
29 |             text = json.loads(f'"{text}"')
30 |         except json.JSONDecodeError:
31 |             # Best effort decoding
32 |             text = text.encode().decode("unicode_escape", "ignore")
33 |         return text
34 | 
35 | class DummyConfig(PretrainedConfig):
36 |     pass
37 | 
38 | AutoTokenizer.register(DummyConfig, JSONGPT2Tokenizer)
39 | 
--------------------------------------------------------------------------------
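The tokenizer above exists because LongCrawl64 stores JSON-escaped text: `prepare_for_tokenization` escapes raw text with `json.dumps` (dropping the surrounding quotes) before BPE, and `decode` undoes the escaping with `json.loads`. A minimal sketch of the round trip (assuming the standard "gpt2" vocabulary files are available locally or from the Hub):

    from forgetting_transformer.tokenizer import JSONGPT2Tokenizer

    tokenizer = JSONGPT2Tokenizer.from_pretrained("gpt2")
    ids = tokenizer("hello\nworld")["input_ids"]    # the newline reaches BPE as the two characters '\' and 'n'
    assert tokenizer.decode(ids) == "hello\nworld"  # decode() reverses the JSON escaping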