├── .gitignore ├── LICENSE ├── README.md ├── configs ├── __init__.py ├── config.py ├── datamodule │ ├── __init__.py │ └── longcrawl64.py ├── experiment │ └── longcrawl64 │ │ ├── delta_net │ │ ├── 125m_2b.yaml │ │ ├── 360m_7b.yaml │ │ ├── 760m_16b.yaml │ │ └── 760m_48b.yaml │ │ ├── forgetting_transformer │ │ ├── llama_125m_2b.yaml │ │ ├── llama_360m_7b.yaml │ │ ├── llama_760m_16b.yaml │ │ ├── llama_760m_48b.yaml │ │ ├── pro_125m_2b.yaml │ │ ├── pro_360m_7b.yaml │ │ ├── pro_760m_16b.yaml │ │ └── pro_760m_48b.yaml │ │ ├── hgrn2 │ │ ├── 125m_2b.yaml │ │ ├── 360m_7b.yaml │ │ ├── 760m_16b.yaml │ │ └── 760m_48b.yaml │ │ ├── mamba2 │ │ ├── 125m_2b.yaml │ │ ├── 360m_7b.yaml │ │ ├── 760m_16b.yaml │ │ └── 760m_48b.yaml │ │ ├── samba │ │ └── 760m_16b.yaml │ │ ├── transformer │ │ ├── llama_125m_2b.yaml │ │ ├── llama_360m_7b.yaml │ │ ├── llama_760m_16b.yaml │ │ ├── llama_760m_48b.yaml │ │ ├── pro_125m_2b.yaml │ │ ├── pro_360m_7b.yaml │ │ ├── pro_760m_16b.yaml │ │ └── pro_760m_48b.yaml │ │ └── transformer_swa │ │ └── 760m_16b.yaml ├── model │ ├── __init__.py │ ├── delta_net.py │ ├── forgetting_transformer.py │ ├── hgrn2.py │ ├── mamba2.py │ ├── samba.py │ └── transformer.py ├── optimizer │ ├── __init__.py │ └── adamw.py ├── schedule │ ├── __init__.py │ ├── constant.py │ ├── warmup_cosine.py │ ├── warmup_linear.py │ └── warmup_one_minus_sqrt.py ├── strategy │ ├── __init__.py │ ├── ddp.py │ └── fsdp.py └── utils.py ├── eval ├── lm_eval_harness │ ├── README.md │ ├── run_lm_eval.py │ └── table_lm_eval.py ├── longbench │ ├── LICENSE │ ├── README.md │ ├── config │ │ ├── dataset2maxlen.json │ │ └── dataset2prompt.json │ ├── eval.py │ ├── llama_flash_attn_monkey_patch.py │ ├── metrics.py │ ├── pred.py │ ├── refs │ │ └── ref.bib │ ├── requirements.txt │ ├── table_longbench.py │ └── task.md ├── niah │ ├── LICENSE │ ├── PaulGrahamEssays │ │ ├── addiction.txt │ │ ├── aord.txt │ │ ├── apple.txt │ │ ├── avg.txt │ │ ├── before.txt │ │ ├── bias.txt │ │ ├── boss.txt │ │ ├── copy.txt │ │ ├── corpdev.txt │ │ ├── desres.txt │ │ ├── diff.txt │ │ ├── ecw.txt │ │ ├── founders.txt │ │ ├── foundervisa.txt │ │ ├── gap.txt │ │ ├── gba.txt │ │ ├── gh.txt │ │ ├── goodtaste.txt │ │ ├── hubs.txt │ │ ├── iflisp.txt │ │ ├── island.txt │ │ ├── know.txt │ │ ├── langdes.txt │ │ ├── laundry.txt │ │ ├── love.txt │ │ ├── mod.txt │ │ ├── newideas.txt │ │ ├── nft.txt │ │ ├── philosophy.txt │ │ ├── popular.txt │ │ ├── pow.txt │ │ ├── rootsoflisp.txt │ │ ├── rss.txt │ │ ├── siliconvalley.txt │ │ ├── startuplessons.txt │ │ ├── submarine.txt │ │ ├── superangels.txt │ │ ├── todo.txt │ │ ├── unions.txt │ │ ├── useful.txt │ │ ├── vb.txt │ │ ├── vcsqueeze.txt │ │ ├── vw.txt │ │ ├── want.txt │ │ ├── web20.txt │ │ ├── weird.txt │ │ └── wisdom.txt │ ├── README.md │ ├── config-eval.yaml │ ├── config-pred.yaml │ ├── config-prompt-debug.yaml │ ├── config-prompt-easy.yaml │ ├── config-prompt-standard.yaml │ ├── eval.py │ ├── llama_flash_attn_monkey_patch.py │ ├── plot_niah.py │ ├── pred.py │ └── prompt.py └── per_token_loss │ ├── README.md │ ├── plot_per_token_loss.py │ └── run_per_token_loss.py ├── pyproject.toml ├── requirements-dev.txt ├── save_model.py ├── src └── forgetting_transformer │ ├── __init__.py │ ├── checkpoint.py │ ├── datamodule │ ├── __init__.py │ ├── common.py │ └── longcrawl64.py │ ├── logger.py │ ├── model │ ├── README.md │ ├── __init__.py │ ├── common.py │ ├── delta_net │ │ ├── __init__.py │ │ ├── configuration_delta_net.py │ │ ├── delta_net_layer.py │ │ └── modeling_delta_net.py │ ├── forgetting_transformer │ │ ├── __init__.py │ │ ├── 
configuration_forgetting_transformer.py │ │ ├── fgate_cache.py │ │ ├── glu_linear.py │ │ ├── modeling_forgetting_transformer.py │ │ └── token_shift.py │ ├── hgrn2 │ │ ├── __init__.py │ │ ├── configuration_hgrn2.py │ │ ├── hgrn2_attention.py │ │ └── modeling_hgrn2.py │ ├── mamba2 │ │ ├── __init__.py │ │ ├── configuration_mamba2.py │ │ └── modeling_mamba2.py │ ├── samba │ │ ├── __init__.py │ │ ├── configuration_samba.py │ │ └── modeling_samba.py │ └── transformer │ │ ├── __init__.py │ │ ├── configuration_transformer.py │ │ └── modeling_transformer.py │ ├── ops │ ├── __init__.py │ └── forgetting_attention.py │ ├── schedule │ ├── __init__.py │ └── schedule.py │ ├── tokenizer.py │ └── utils.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | cache/ 3 | /data/ 4 | output/ 5 | results/ 6 | debug/ 7 | logs/ 8 | pred/ 9 | notebooks/ 10 | figures/ 11 | .psync 12 | # hyena_S5 specific stuff 13 | wandb/ 14 | cache_dir/ 15 | raw_datasets/ 16 | local_scripts/ 17 | 18 | .DS_Store 19 | 20 | # Byte-compiled / optimized / DLL files 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | 25 | # C extensions 26 | *.so 27 | 28 | # Distribution / packaging 29 | .Python 30 | build/ 31 | develop-eggs/ 32 | dist/ 33 | downloads/ 34 | eggs/ 35 | .eggs/ 36 | lib/ 37 | lib64/ 38 | parts/ 39 | sdist/ 40 | var/ 41 | wheels/ 42 | share/python-wheels/ 43 | *.egg-info/ 44 | .installed.cfg 45 | *.egg 46 | MANIFEST 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 51 | *.manifest 52 | *.spec 53 | 54 | # Installer logs 55 | pip-log.txt 56 | pip-delete-this-directory.txt 57 | 58 | # Unit test / coverage reports 59 | htmlcov/ 60 | .tox/ 61 | .nox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *.cover 68 | *.py,cover 69 | .hypothesis/ 70 | .pytest_cache/ 71 | cover/ 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | local_settings.py 80 | db.sqlite3 81 | db.sqlite3-journal 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | .pybuilder/ 95 | target/ 96 | 97 | # Jupyter Notebook 98 | .ipynb_checkpoints 99 | 100 | # IPython 101 | profile_default/ 102 | ipython_config.py 103 | 104 | # pyenv 105 | # For a library or package, you might want to ignore these files since the code is 106 | # intended to run in multiple environments; otherwise, check them in: 107 | # .python-version 108 | 109 | # pipenv 110 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 111 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 112 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 113 | # install all needed dependencies. 114 | #Pipfile.lock 115 | 116 | # poetry 117 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 118 | # This is especially recommended for binary packages to ensure reproducibility, and is more 119 | # commonly ignored for libraries. 
120 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 121 | #poetry.lock 122 | 123 | # pdm 124 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 125 | #pdm.lock 126 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 127 | # in version control. 128 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 129 | .pdm.toml 130 | .pdm-python 131 | .pdm-build/ 132 | 133 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 134 | __pypackages__/ 135 | 136 | # Celery stuff 137 | celerybeat-schedule 138 | celerybeat.pid 139 | 140 | # SageMath parsed files 141 | *.sage.py 142 | 143 | # Environments 144 | .env 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 181 | #.idea/ 182 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Zhixuan Lin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- 1 | from configs.config import Config 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /configs/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Literal, Optional, Union 4 | from pathlib import Path 5 | from hydra.core.config_store import ConfigStore 6 | from configs.optimizer import OptimizerConfig 7 | from configs.schedule import ScheduleConfig 8 | from configs.model import ModelConfig 9 | from configs.datamodule import DataModuleConfig 10 | from configs.utils import auto_register 11 | from configs.strategy import StrategyConfig 12 | 13 | @dataclass 14 | class WandbConfig: 15 | project: str = "forgetting-transformer" 16 | mode: str = "offline" 17 | log_dir: str = MISSING 18 | 19 | @dataclass 20 | class FabricConfig: 21 | devices: Union[int, str] = "auto" 22 | precision: str = 'bf16-mixed' 23 | 24 | 25 | @dataclass 26 | class TrainConfig: 27 | max_tokens: int = MISSING 28 | grad_acc_tokens: int = MISSING 29 | max_grad_norm: float = MISSING 30 | gradient_checkpointing: bool = False 31 | 32 | bias_weight_decay: bool = False 33 | normalization_weight_decay: bool = False 34 | conv_weight_decay: bool = True 35 | 36 | @dataclass 37 | class EvalConfig: 38 | min_val_length: int = 512 39 | 40 | 41 | @dataclass 42 | class Config: 43 | defaults: List[Any] = field( 44 | default_factory=lambda: [ 45 | {"model": "???"}, 46 | {"optimizer": "???"}, 47 | {"schedule": "???"}, 48 | {"datamodule": "???"}, 49 | {"strategy": "???"}, 50 | # If we don't do these hydra will mess up python logging 51 | # Also must none. `disabled` mess up other libraries. 52 | {"override hydra/job_logging": "none"}, 53 | {"override hydra/hydra_logging": "none"}, 54 | "_self_", 55 | ] 56 | ) 57 | 58 | # https://github.com/facebookresearch/hydra/issues/2049 59 | # If we don't do this hydra will create an annoying directory 60 | hydra: Any = field(default_factory=lambda: {"run": {"dir": "${output_dir}"}}) 61 | 62 | exp: str = "debug" 63 | tag: str = "debug" 64 | seed: int = 0 65 | 66 | # Only used for saving HF model 67 | hf_load_dir: Optional[str] = None 68 | hf_save_dir: Optional[str] = None 69 | hf_load_step: Optional[int] = None 70 | 71 | # Everything (config, metrics, checkpoints etc) except for wandb log will be saved here 72 | output_dir: str = MISSING 73 | # Any dataset should reside here 74 | data_dir: str = MISSING 75 | # Don't forget to set wandb.log_dir as well 76 | 77 | # When resuming, we first try to load the latest checkpoint from output_dir / 'checkpoints'. If nothing 78 | # found, we try to start from fork_step from fork_dir if it is not None. 79 | resume: bool = MISSING 80 | fork_dir: Optional[str] = None 81 | fork_step: Optional[int] = None 82 | 83 | log_interval: int = MISSING 84 | eval_interval: int = MISSING 85 | final_eval: bool = True 86 | skip_eval: bool = True 87 | # Save checkpoints every this steps. 
We only keep the latest checkpoint 88 | checkpoint_interval: int = MISSING 89 | # Eval results with training loss 90 | train_eval_interval: int = MISSING 91 | # Besides the latest checkpoint, also keeps permanent checkpoints at these 92 | # interval 93 | checkpoint_keep_interval: int = MISSING 94 | 95 | # Regular hierarhical config 96 | fabric: FabricConfig = FabricConfig() 97 | train: TrainConfig = TrainConfig() 98 | eval: EvalConfig = EvalConfig() 99 | wandb: WandbConfig = WandbConfig() 100 | 101 | # Meant to decided by default list 102 | strategy: StrategyConfig = MISSING 103 | model: ModelConfig = MISSING 104 | schedule: ScheduleConfig = MISSING 105 | datamodule: DataModuleConfig = MISSING 106 | optimizer: OptimizerConfig = MISSING 107 | 108 | cs = ConfigStore.instance() 109 | cs.store(name='config', node=Config) 110 | config_root = Path(__file__).parent 111 | for base_class in [ 112 | OptimizerConfig, 113 | ModelConfig, 114 | DataModuleConfig, 115 | ScheduleConfig, 116 | StrategyConfig, 117 | ]: 118 | auto_register(base_class, config_root) 119 | -------------------------------------------------------------------------------- /configs/datamodule/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class DataModuleConfig: 5 | pass 6 | -------------------------------------------------------------------------------- /configs/datamodule/longcrawl64.py: -------------------------------------------------------------------------------- 1 | from . import DataModuleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | from typing import Optional 5 | 6 | @dataclass 7 | class LongCrawl64Config(DataModuleConfig): 8 | _target_: str = 'forgetting_transformer.datamodule.longcrawl64.LongCrawl64DataModule' 9 | # This is a custom resolver. Note inside data_dir refers to the root config node 10 | data_dir: str = '${join_path:${data_dir},longcrawl64}' 11 | rank: int = MISSING # Should be provided programmatically 12 | world_size: int = MISSING # Should be provided programmatically 13 | train_seq_len: Optional[int] = None 14 | train_batch_len: int = MISSING 15 | train_batch_size: int = MISSING 16 | # train_tokens_per_stage: int = MISSING 17 | train_doc_len: Optional[int] = None 18 | train_num_workers: int = MISSING 19 | 20 | eval_tokens: int = MISSING 21 | eval_seq_len: Optional[int] = None 22 | eval_batch_len: int = MISSING 23 | eval_local_batch_size: int = MISSING 24 | eval_doc_len: Optional[int] = None 25 | eval_num_workers: int = MISSING 26 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 6 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 8 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 
4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | 44 | optimizer: 45 | lr: 0.001 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/delta_net/760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: delta_net 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | 44 | optimizer: 45 | lr: 0.001 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true # Don't do evaluation. 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 2 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end. 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/llama_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end. 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true # Don't do evaluation. 24 | eval_interval: 2684354560 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.002 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 2 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.002 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 1 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end. 
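# For reference, a rough reading of the token-denominated settings in this config
# (assuming one optimizer update per global batch of train_batch_size * train_batch_len
# tokens, i.e. 32 * 16384 = 524288 tokens per update): the intervals are token counts
# in binary units, e.g. log_interval = 33554432 = 32 * 2^20 tokens (one log every
# 64 updates), checkpoint_interval = 268435456 = 256 * 2^20 tokens (one checkpoint
# every 512 updates), and max_tokens = 16106127360 = 15 * 2^30 tokens (about
# 30720 updates in total).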
26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.001 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 1 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/forgetting_transformer/pro_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end. 
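# Note: 48318382080 tokens = 45 * 2^30 (about 48.3 billion), the full token budget
# of this 48B-token run. Like the interval values above, warmup_steps and decay_steps
# in the schedule block below are token counts; decay_steps is tied to
# train.max_tokens, so the cosine decay spans the entire run.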
26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 24 43 | use_rope: false 44 | rope_base: 500000 45 | # Pro config 46 | use_v_shift: true 47 | use_k_shift: true 48 | qk_norm: true 49 | use_output_gate: true 50 | use_output_norm: true 51 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 52 | 53 | optimizer: 54 | lr: 0.002 55 | betas: [0.9, 0.95] 56 | weight_decay: 0.1 57 | 58 | schedule: 59 | init_value: 0.0 60 | peak_value: ${optimizer.lr} 61 | warmup_steps: 268435456 # 256Mi 62 | decay_steps: ${train.max_tokens} 63 | end_value: 0.0 64 | 65 | datamodule: 66 | train_batch_len: 16384 67 | train_batch_size: 32 68 | train_num_workers: 2 69 | 70 | eval_batch_len: 16384 71 | eval_local_batch_size: 1 72 | eval_tokens: 2147483648 # 2Bi 73 | eval_num_workers: 2 74 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | expand_ratio: 128 42 | num_hidden_layers: 12 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 2 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | expand_ratio: 128 42 | num_hidden_layers: 24 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | expand_ratio: 128 42 | num_hidden_layers: 24 43 | 44 | optimizer: 45 | lr: 0.001 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/hgrn2/760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: hgrn2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | expand_ratio: 128 42 | num_hidden_layers: 24 43 | 44 | optimizer: 45 | lr: 0.002 46 | betas: [0.9, 0.95] 47 | weight_decay: 0.1 48 | 49 | schedule: 50 | init_value: 0.0 51 | peak_value: ${optimizer.lr} 52 | warmup_steps: 268435456 # 256Mi 53 | decay_steps: ${train.max_tokens} 54 | end_value: 0.0 55 | 56 | datamodule: 57 | train_batch_len: 16384 58 | train_batch_size: 32 59 | train_num_workers: 2 60 | 61 | eval_batch_len: 16384 62 | eval_local_batch_size: 1 63 | eval_tokens: 2147483648 # 2Bi 64 | eval_num_workers: 2 65 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 
4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 24 43 | num_hidden_layers: 24 44 | hidden_size: 768 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 2 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 32 43 | num_hidden_layers: 48 44 | hidden_size: 1024 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 48 43 | num_hidden_layers: 48 44 | hidden_size: 1536 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/mamba2/760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: mamba2 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | head_dim: 64 41 | state_size: 128 42 | num_heads: 48 43 | num_hidden_layers: 48 44 | hidden_size: 1536 45 | 46 | optimizer: 47 | lr: 0.002 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/samba/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: samba 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | rope_base: 500000 43 | attn: 44 | num_heads: 12 45 | window_size: 2048 46 | layers: null 47 | 48 | 49 | optimizer: 50 | lr: 0.001 51 | betas: [0.9, 0.95] 52 | weight_decay: 0.1 53 | 54 | schedule: 55 | init_value: 0.0 56 | peak_value: ${optimizer.lr} 57 | warmup_steps: 268435456 # 256Mi 58 | decay_steps: ${train.max_tokens} 59 | end_value: 0.0 60 | 61 | datamodule: 62 | train_batch_len: 16384 63 | train_batch_size: 32 64 | train_num_workers: 2 65 | 66 | eval_batch_len: 16384 67 | eval_local_batch_size: 1 68 | eval_tokens: 2147483648 # 2Bi 69 | eval_num_workers: 2 70 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? 
# data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.002 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 2 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. 
Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.002 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 1 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.001 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 1 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/llama_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 
32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 45 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | 45 | optimizer: 46 | lr: 0.0005 47 | betas: [0.9, 0.95] 48 | weight_decay: 0.1 49 | 50 | schedule: 51 | init_value: 0.0 52 | peak_value: ${optimizer.lr} 53 | warmup_steps: 268435456 # 256Mi 54 | decay_steps: ${train.max_tokens} 55 | end_value: 0.0 56 | 57 | datamodule: 58 | train_batch_len: 16384 59 | train_batch_size: 32 60 | train_num_workers: 2 61 | 62 | eval_batch_len: 16384 63 | eval_local_batch_size: 1 64 | eval_tokens: 2147483648 # 2Bi 65 | eval_num_workers: 2 66 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_125m_2b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 2684354560 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 2684354560 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 2684354560 # 2.5 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 32768 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 768 41 | num_hidden_layers: 12 42 | num_heads: 12 43 | rope_base: 500000 44 | # Pro config.
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.002 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 2 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_360m_7b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 7516192768 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 7516192768 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 7516192768 # 7 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: false 37 | 38 | model: 39 | config: 40 | hidden_size: 1024 41 | num_hidden_layers: 24 42 | num_heads: 16 43 | rope_base: 500000 44 | # Pro config. 
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.002 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 1 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | # Pro config. 
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.001 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 1 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer/pro_760m_48b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: forgetting_transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 48318382080 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 48318382080 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 48318382080 # 45 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | # Pro config.
Note this is implemented as FoX with fgate_type="none" and use_rope=true 45 | # See the `- override /model: forgetting_transformer` on top of this file 46 | use_rope: true 47 | fgate_type: "none" 48 | use_v_shift: true 49 | use_k_shift: true 50 | qk_norm: true 51 | use_output_gate: true 52 | use_output_norm: true 53 | hidden_ratio: 3.5 # output gates use extra params so we reduce it here 54 | 55 | optimizer: 56 | lr: 0.001 57 | betas: [0.9, 0.95] 58 | weight_decay: 0.1 59 | 60 | schedule: 61 | init_value: 0.0 62 | peak_value: ${optimizer.lr} 63 | warmup_steps: 268435456 # 256Mi 64 | decay_steps: ${train.max_tokens} 65 | end_value: 0.0 66 | 67 | datamodule: 68 | train_batch_len: 16384 69 | train_batch_size: 32 70 | train_num_workers: 2 71 | 72 | eval_batch_len: 16384 73 | eval_local_batch_size: 1 74 | eval_tokens: 2147483648 # 2Bi 75 | eval_num_workers: 2 76 | -------------------------------------------------------------------------------- /configs/experiment/longcrawl64/transformer_swa/760m_16b.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /model: transformer 4 | - override /optimizer: adamw 5 | - override /schedule: warmup_cosine 6 | - override /datamodule: longcrawl64 7 | - override /strategy: fsdp 8 | - _self_ 9 | 10 | exp: ??? 11 | tag: ??? 12 | seed: 0 13 | 14 | output_dir: ??? 15 | data_dir: ??? # data_dir / 'longcrawl' / 'train.zarr' should exist 16 | 17 | resume: True 18 | 19 | log_interval: 33554432 # 32Mi. 32 times per billion 20 | train_eval_interval: 536870912 # 512Mi 21 | checkpoint_interval: 268435456 # 256Mi In tokens. 4 times per 1Bi. It is worth it 22 | 23 | skip_eval: true 24 | eval_interval: 16106127360 # Only at the end. Not used due to skip_eval 25 | checkpoint_keep_interval: 16106127360 # Only at the end 26 | 27 | fabric: 28 | devices: auto 29 | precision: 'bf16-mixed' 30 | 31 | train: 32 | max_tokens: 16106127360 # 15 Bi 33 | # Used for one gradient accumulation step, must be larger than batch_len 34 | grad_acc_tokens: 16384 35 | max_grad_norm: 1.0 36 | gradient_checkpointing: true 37 | 38 | model: 39 | config: 40 | hidden_size: 1536 41 | num_hidden_layers: 24 42 | num_heads: 12 43 | rope_base: 500000 44 | window_size: 2048 45 | 46 | optimizer: 47 | lr: 0.001 48 | betas: [0.9, 0.95] 49 | weight_decay: 0.1 50 | 51 | schedule: 52 | init_value: 0.0 53 | peak_value: ${optimizer.lr} 54 | warmup_steps: 268435456 # 256Mi 55 | decay_steps: ${train.max_tokens} 56 | end_value: 0.0 57 | 58 | datamodule: 59 | train_batch_len: 16384 60 | train_batch_size: 32 61 | train_num_workers: 2 62 | 63 | eval_batch_len: 16384 64 | eval_local_batch_size: 1 65 | eval_tokens: 2147483648 # 2Bi 66 | eval_num_workers: 2 67 | -------------------------------------------------------------------------------- /configs/model/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class ModelConfig: 5 | pass 6 | -------------------------------------------------------------------------------- /configs/model/delta_net.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . 
import ModelConfig 6 | 7 | 8 | @dataclass 9 | class DeltaNetArgConfig: 10 | _target_: str = "forgetting_transformer.model.delta_net.configuration_delta_net.DeltaNetConfig" 11 | vocab_size: int = MISSING 12 | hidden_size: int = MISSING 13 | expand_k: int = 1 14 | expand_v: int = 1 15 | use_gate: bool = False 16 | use_short_conv: bool = True 17 | conv_size: int = 4 18 | use_beta: bool = True 19 | use_output_norm: bool = True 20 | hidden_ratio: Optional[int] = 4 21 | intermediate_size: Optional[int] = None 22 | num_hidden_layers: int = MISSING 23 | num_heads: int = MISSING 24 | attn_mode: str = "chunk" 25 | qk_norm: str = 'l2' 26 | qk_activation: str = 'silu' 27 | hidden_act: str = "swish" 28 | max_position_embeddings: Optional[int] = None 29 | norm_first: bool = False 30 | norm_eps: float = 1e-6 31 | use_cache: bool = True 32 | pad_token_id: Optional[int] = None 33 | bos_token_id: Optional[int] = None 34 | eos_token_id: Optional[int] = None 35 | tie_word_embeddings: bool = False 36 | initializer_range: float = 0.02 37 | fuse_cross_entropy: bool = True 38 | 39 | 40 | 41 | 42 | @dataclass 43 | class DeltaNetConfig(ModelConfig): 44 | _target_: str = "forgetting_transformer.model.delta_net.modeling_delta_net.DeltaNetForCausalLM" 45 | config: DeltaNetArgConfig = DeltaNetArgConfig() 46 | -------------------------------------------------------------------------------- /configs/model/forgetting_transformer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . import ModelConfig 6 | 7 | 8 | @dataclass 9 | class ForgettingTransformerArgConfig: 10 | _target_: str = "forgetting_transformer.model.forgetting_transformer.configuration_forgetting_transformer.ForgettingTransformerConfig" 11 | vocab_size: int = MISSING # Should be provided programmatically 12 | hidden_size: int = MISSING 13 | hidden_ratio: float = 4 14 | intermediate_size: Optional[int] = None 15 | num_hidden_layers: int = MISSING 16 | num_heads: int = MISSING 17 | num_kv_heads: Optional[int] = None 18 | hidden_act: str = "swish" 19 | window_size: Optional[int] = None 20 | max_position_embeddings: Optional[int] = None 21 | initializer_range: float = 0.02 22 | elementwise_affine: Optional[bool] = True 23 | norm_eps: float = 1e-6 24 | use_cache: bool = True 25 | pad_token_id: Optional[int] = None 26 | bos_token_id: Optional[int] = None 27 | eos_token_id: Optional[int] = None 28 | tie_word_embeddings: bool = False 29 | attention_bias: bool = False 30 | fuse_norm: bool = True 31 | fuse_cross_entropy: bool = True 32 | rope_base: float = 500000 33 | use_rope: bool = False 34 | use_output_gate: bool = False 35 | ogate_act: str = "sigmoid" 36 | fgate_type: str = "full" 37 | fgate_bias_init: bool = False 38 | decay_time_min: Optional[float] = None 39 | decay_time_max: Optional[float] = None 40 | use_output_norm: bool = False 41 | qk_norm: bool = False 42 | qk_norm_share_param_across_head: bool = False 43 | 44 | use_k_shift: bool = False 45 | use_v_shift: bool = False 46 | 47 | 48 | 49 | 50 | @dataclass 51 | class ForgettingTransformerConfig(ModelConfig): 52 | _target_: str = "forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer.ForgettingTransformerForCausalLM" 53 | config: ForgettingTransformerArgConfig = ForgettingTransformerArgConfig() 54 | -------------------------------------------------------------------------------- 
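Aside on how these model config dataclasses are consumed: each one follows the Hydra structured-config pattern, with an outer `*Config` dataclass whose `_target_` points at the `*ForCausalLM` class and an inner `*ArgConfig` whose `_target_` points at the HuggingFace-style configuration class. The snippet below is a minimal, hypothetical sketch of instantiating such a node outside the training script; it is not repository code. It assumes Hydra, OmegaConf, and the `forgetting_transformer` package are installed, that it runs from the repository root, and the concrete field values are placeholders that mirror the "Pro config" block in the `pro_*.yaml` experiments above.

```python
# Hypothetical sketch (not repository code): build a model from the structured
# config above via hydra.utils.instantiate. All field values are placeholders.
from hydra.utils import instantiate
from omegaconf import OmegaConf

from configs.model.forgetting_transformer import ForgettingTransformerConfig

cfg = OmegaConf.structured(ForgettingTransformerConfig)
# Fill the MISSING fields; vocab_size is normally provided programmatically
# (see the field comment in the dataclass above).
cfg.config.vocab_size = 32768           # placeholder
cfg.config.hidden_size = 768
cfg.config.num_hidden_layers = 12
cfg.config.num_heads = 12
# Mirror the "Pro config" block from the pro_*.yaml experiments above:
cfg.config.use_rope = True
cfg.config.fgate_type = "none"
cfg.config.use_v_shift = True
cfg.config.use_k_shift = True
cfg.config.qk_norm = True
cfg.config.use_output_gate = True
cfg.config.use_output_norm = True
cfg.config.hidden_ratio = 3.5

# instantiate() is recursive: it first builds the inner HF-style config object
# (its own _target_), then passes it to ForgettingTransformerForCausalLM.
model = instantiate(cfg)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```

This is roughly what `- override /model: forgetting_transformer` plus the `model.config.*` overrides in the experiment YAMLs resolve to at training time.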
/configs/model/hgrn2.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . import ModelConfig 6 | 7 | 8 | @dataclass 9 | class HGRN2ArgConfig: 10 | _target_: str = "forgetting_transformer.model.hgrn2.configuration_hgrn2.HGRN2Config" 11 | vocab_size: int = MISSING 12 | hidden_size: int = MISSING 13 | num_hidden_layers: int = MISSING 14 | attn_mode: str = "chunk" 15 | num_heads: Optional[int] = None 16 | expand_ratio: Optional[int] = MISSING 17 | use_short_conv: bool = False 18 | conv_size: int = 4 19 | use_lower_bound: bool = True 20 | hidden_ratio: Optional[int] = 4 21 | intermediate_size: Optional[int] = None 22 | hidden_act: str = "swish" 23 | max_position_embeddings: Optional[int] = None 24 | elementwise_affine: Optional[bool] = True 25 | norm_eps: float = 1e-6 26 | use_cache: bool = True 27 | pad_token_id: Optional[int] = None 28 | bos_token_id: Optional[int] = None 29 | eos_token_id: Optional[int] = None 30 | tie_word_embeddings: bool = False 31 | initializer_range: float = 0.02 32 | fuse_cross_entropy: bool = True 33 | 34 | 35 | 36 | 37 | @dataclass 38 | class HGRN2Config(ModelConfig): 39 | _target_: str = "forgetting_transformer.model.hgrn2.modeling_hgrn2.HGRN2ForCausalLM" 40 | config: HGRN2ArgConfig = HGRN2ArgConfig() 41 | -------------------------------------------------------------------------------- /configs/model/mamba2.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . import ModelConfig 6 | 7 | 8 | @dataclass 9 | class Mamba2ArgConfig: 10 | _target_: str = "forgetting_transformer.model.mamba2.configuration_mamba2.Mamba2Config" 11 | num_heads: int = MISSING 12 | head_dim: int = MISSING 13 | vocab_size: int = MISSING 14 | hidden_size: int = MISSING 15 | state_size: int = MISSING 16 | num_hidden_layers: int = MISSING 17 | layer_norm_epsilon: float = 1e-5 18 | pad_token_id: Optional[int] = None 19 | bos_token_id: Optional[int] = None 20 | eos_token_id: Optional[int] = None 21 | expand: int = 2 22 | conv_kernel: int = 4 23 | n_groups: int = 1 24 | use_bias: bool = False 25 | use_conv_bias: bool = True 26 | hidden_act: str = "silu" 27 | initializer_range: float = 0.02 28 | residual_in_fp32: bool = True 29 | time_step_rank: str = "auto" 30 | time_step_min: float = 0.001 31 | time_step_max: float = 0.1 32 | time_step_floor: float = 1e-4 33 | time_step_limit=(0.0, float("inf")) 34 | rescale_prenorm_residual: bool = True 35 | use_cache: bool = True 36 | rms_norm: bool = True 37 | chunk_size: int = 256 38 | fuse_cross_entropy: bool = True 39 | tie_word_embeddings: bool = False 40 | 41 | 42 | 43 | 44 | @dataclass 45 | class Mamba2Config(ModelConfig): 46 | _target_: str = "forgetting_transformer.model.mamba2.modeling_mamba2.Mamba2ForCausalLM" 47 | config: Mamba2ArgConfig = Mamba2ArgConfig() 48 | -------------------------------------------------------------------------------- /configs/model/samba.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional, Dict 4 | 5 | from . 
import ModelConfig 6 | 7 | @dataclass 8 | class SambaAttnConfig: 9 | num_kv_heads: Optional[int] = None 10 | num_heads: int = MISSING 11 | window_size: Optional[int] = MISSING 12 | layers: Optional[List[int]] = MISSING 13 | 14 | @dataclass 15 | class SambaArgConfig: 16 | _target_: str = "forgetting_transformer.model.samba.configuration_samba.SambaConfig" 17 | vocab_size: int = MISSING 18 | hidden_size: int = MISSING 19 | state_size: int = 16 20 | num_hidden_layers: int = MISSING 21 | norm_eps=1e-5 22 | pad_token_id: Optional[int] = None 23 | bos_token_id: Optional[int] = None 24 | eos_token_id: Optional[int] = None 25 | expand: int = 2 26 | conv_kernel: int = 4 27 | use_bias: bool = False 28 | use_conv_bias: bool = True 29 | hidden_act: str = "silu" 30 | initializer_range: float = 0.02 31 | residual_in_fp32: bool = False 32 | time_step_rank: str = "auto" 33 | time_step_scale: float = 1.0 34 | time_step_min: float = 0.001 35 | time_step_max: float = 0.1 36 | time_step_init_scheme: str = "random" 37 | time_step_floor: float = 1e-4 38 | max_position_embeddings: Optional[int] = None 39 | attn: SambaAttnConfig = SambaAttnConfig() 40 | attn_hidden_ratio: Optional[float] = 4 41 | mamba_hidden_ratio: Optional[float] = 3 42 | use_cache: bool = True 43 | fuse_norm: bool = True 44 | fuse_cross_entropy: bool = True 45 | tie_word_embeddings: bool = False 46 | rope_base: float = MISSING 47 | rescale_prenorm_residual: bool = True # To be consistent with other impl 48 | 49 | 50 | 51 | 52 | @dataclass 53 | class SambaConfig(ModelConfig): 54 | _target_: str = "forgetting_transformer.model.samba.modeling_samba.SambaForCausalLM" 55 | config: SambaArgConfig = SambaArgConfig() 56 | -------------------------------------------------------------------------------- /configs/model/transformer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any, Optional 4 | 5 | from . 
import ModelConfig 6 | 7 | 8 | @dataclass 9 | class TransformerArgConfig: 10 | _target_: str = "forgetting_transformer.model.transformer.configuration_transformer.TransformerConfig" 11 | vocab_size: int = MISSING # Should be provided programmatically 12 | hidden_size: int = MISSING 13 | hidden_ratio: int = 4 14 | intermediate_size: Optional[int] = None 15 | num_hidden_layers: int = MISSING 16 | num_heads: int = MISSING 17 | num_kv_heads: Optional[int] = None 18 | hidden_act: str = "swish" 19 | window_size: Optional[int] = None 20 | max_position_embeddings: Optional[int] = None 21 | initializer_range: float = 0.02 22 | elementwise_affine: Optional[bool] = True 23 | norm_eps: float = 1e-6 24 | use_cache: bool = True 25 | pad_token_id: Optional[int] = None 26 | bos_token_id: Optional[int] = None 27 | eos_token_id: Optional[int] = None 28 | tie_word_embeddings: bool = False 29 | attention_bias: bool = False 30 | fuse_norm: bool = True 31 | fuse_cross_entropy: bool = True 32 | rope_base: float = MISSING 33 | use_rope: bool = True 34 | 35 | 36 | 37 | 38 | @dataclass 39 | class TransformerConfig(ModelConfig): 40 | _target_: str = "forgetting_transformer.model.transformer.modeling_transformer.TransformerForCausalLM" 41 | config: TransformerArgConfig = TransformerArgConfig() 42 | -------------------------------------------------------------------------------- /configs/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | @dataclass 4 | class OptimizerConfig: 5 | pass 6 | -------------------------------------------------------------------------------- /configs/optimizer/adamw.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from omegaconf import OmegaConf, MISSING 3 | from typing import List, Any 4 | 5 | from . import OptimizerConfig 6 | 7 | 8 | @dataclass 9 | class AdamWConfig(OptimizerConfig): 10 | _target_: str = "torch.optim.AdamW" 11 | lr: float = MISSING 12 | betas: List[float] = MISSING 13 | weight_decay: float = MISSING 14 | -------------------------------------------------------------------------------- /configs/schedule/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class ScheduleConfig: 6 | pass 7 | -------------------------------------------------------------------------------- /configs/schedule/constant.py: -------------------------------------------------------------------------------- 1 | from . import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class ConstantScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.constant_schedule' 9 | value: float = MISSING 10 | -------------------------------------------------------------------------------- /configs/schedule/warmup_cosine.py: -------------------------------------------------------------------------------- 1 | from . 
import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class WarmupCosineScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.warmup_cosine_decay_schedule' 9 | init_value: float = MISSING 10 | peak_value: float = MISSING 11 | warmup_steps: int = MISSING 12 | decay_steps: int = MISSING 13 | end_value: float = MISSING 14 | -------------------------------------------------------------------------------- /configs/schedule/warmup_linear.py: -------------------------------------------------------------------------------- 1 | from . import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class WarmupLinearScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.warmup_linear_decay_schedule' 9 | init_value: float = MISSING 10 | peak_value: float = MISSING 11 | warmup_steps: int = MISSING 12 | decay_steps: int = MISSING 13 | end_value: float = MISSING 14 | -------------------------------------------------------------------------------- /configs/schedule/warmup_one_minus_sqrt.py: -------------------------------------------------------------------------------- 1 | from . import ScheduleConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class WarmupOneMinusSqrtScheduleConfig(ScheduleConfig): 8 | _target_: str = 'forgetting_transformer.schedule.warmup_one_minus_sqrt_schedule' 9 | init_value: float = MISSING 10 | peak_value: float = MISSING 11 | warmup_steps: int = MISSING 12 | total_steps: int = MISSING 13 | anneal_steps: int = MISSING 14 | end_value: float = MISSING 15 | -------------------------------------------------------------------------------- /configs/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class StrategyConfig: 6 | pass 7 | -------------------------------------------------------------------------------- /configs/strategy/ddp.py: -------------------------------------------------------------------------------- 1 | from . import StrategyConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class DDPConfig(StrategyConfig): 8 | _target_: str = "lightning.fabric.strategies.DDPStrategy" 9 | -------------------------------------------------------------------------------- /configs/strategy/fsdp.py: -------------------------------------------------------------------------------- 1 | from . 
import StrategyConfig 2 | from omegaconf import MISSING 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class FSDPConfig(StrategyConfig): 8 | _target_: str = "lightning.fabric.strategies.FSDPStrategy" 9 | state_dict_type: str = "full" # We don't want any trouble later 10 | sharding_strategy: str = "FULL_SHARD" # We don't want any trouble later 11 | cpu_offload: bool = False 12 | -------------------------------------------------------------------------------- /configs/utils.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import importlib 3 | from hydra.core.config_store import ConfigStore 4 | from pathlib import Path 5 | from typing import Any, Type, Union, Optional 6 | from types import ModuleType 7 | import pkgutil 8 | from dataclasses import is_dataclass 9 | 10 | 11 | def auto_register(base_class: Type, config_root: Optional[Union[str, Path]]): 12 | """Auto register config that inherits a base class. 13 | 14 | This automatically registers all the config class defined in the same package 15 | as baseclass. Rules: 16 | - The base class must be defined in the __init__.py of the package 17 | - Subclasses must be defined in direct modules of that package 18 | - Each module file should only contain one subclass. 19 | """ 20 | config_root = config_root 21 | assert config_root.stem == "configs", "Just a sanity check, you can change this" 22 | 23 | pkg = importlib.import_module(base_class.__module__) 24 | assert hasattr(pkg, "__path__"), ( 25 | f"{base_class}'s module does not have attribute __path__. {base_class}" 26 | f" must be defined in `__init__.py` in order for auto register to work" 27 | ) 28 | pkg_path = Path(pkg.__file__).parent 29 | 30 | try: 31 | group = str(pkg_path.relative_to(config_root)) 32 | except ValueError: 33 | raise ValueError( 34 | f"Node {pkg.__name__}'s path {pkg_path} is not under config root {config_root}." 35 | ) 36 | 37 | cs = ConfigStore.instance() 38 | for loader, module_name, is_pkg in pkgutil.iter_modules(pkg.__path__): 39 | module = importlib.import_module(f"{pkg.__name__}.{module_name}") 40 | # Iterate through the attributes of the module 41 | valid_list = [] 42 | for name, obj in inspect.getmembers(module): 43 | if ( 44 | inspect.isclass(obj) 45 | and issubclass(obj, base_class) 46 | and obj is not base_class 47 | ): 48 | assert is_dataclass(obj), f"{obj} must be dataclass" 49 | valid_list.append((name, obj)) 50 | if len(valid_list) != 1: 51 | raise ValueError( 52 | f"Module {module} should define exactly one subclass of {base_class}, but got {valid_list}" 53 | ) 54 | else: 55 | name, obj = valid_list[0] 56 | cs.store(name=module_name, group=group, node=obj) 57 | -------------------------------------------------------------------------------- /eval/lm_eval_harness/README.md: -------------------------------------------------------------------------------- 1 | # Language Model Evaluation Harness 2 | 3 | This directory contains the code for evaluating trained models on several tasks from [Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness). 
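A note on the `auto_register` helper in `configs/utils.py` above, before the evaluation code continues: it is what makes the `defaults: - override /model: ...`, `/optimizer: ...`, `/schedule: ...`, and `/strategy: ...` entries in the experiment YAMLs resolvable, by storing each dataclass module in Hydra's `ConfigStore` under a group derived from its package path. The call below is a hypothetical usage sketch under those assumptions; the real call site is elsewhere in the config package and is not shown in this excerpt.

```python
# Hypothetical usage sketch for configs/utils.py::auto_register (assumed call site).
# With ModelConfig as the base class, this registers configs/model/forgetting_transformer.py
# as `model=forgetting_transformer`, configs/model/mamba2.py as `model=mamba2`, and so on,
# under the "model" group (the group name comes from the path relative to configs/).
from pathlib import Path

from configs.model import ModelConfig
from configs.utils import auto_register

# Assumes the working directory is the repository root, so Path("configs") resolves
# to the config package and satisfies the `config_root.stem == "configs"` check.
auto_register(ModelConfig, config_root=Path("configs").resolve())
```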
4 | 5 | ## Usage 6 | 7 | Example usage: 8 | 9 | ```bash 10 | export SAVE_DIR="./results" # You can use any other path 11 | python run_lm_eval.py \ 12 | --model "fox-pro-760m-longcrawl64-48b" \ 13 | --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" \ 14 | --device_id 0 \ 15 | --max_len 16384 \ 16 | --batch_size 16 \ 17 | --save_dir $SAVE_DIR 18 | ``` 19 | 20 | After you have the results, you can generate a LaTeX table: 21 | 22 | ```bash 23 | python table_lm_eval.py --result_dir $SAVE_DIR 24 | ``` 25 | 26 | You can change the `MODELS` list in `table_lm_eval.py` to specify which models to include in your table. 27 | 28 | Note that we have observed the evaluation results to be non-deterministic, likely due to GPU non-determinism. Therefore the results you obtain may not exactly match those reported in the paper. However, the difference should be small. 29 | 30 | ## Citation 31 | 32 | If you use this code, consider citing the Language Model Evaluation Harness: 33 | 34 | ``` 35 | @misc{eval-harness, 36 | author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, 37 | title = {A framework for few-shot language model evaluation}, 38 | month = 07, 39 | year = 2024, 40 | publisher = {Zenodo}, 41 | version = {v0.4.3}, 42 | doi = {10.5281/zenodo.12608602}, 43 | url = {https://zenodo.org/records/12608602} 44 | } 45 | ``` 46 | -------------------------------------------------------------------------------- /eval/lm_eval_harness/run_lm_eval.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, Union, Optional, Tuple, NamedTuple, Any, List 2 | import logging 3 | from pathlib import Path 4 | import rich 5 | import rich.syntax 6 | 7 | import torch 8 | import os 9 | import os.path as osp 10 | from torch import nn 11 | import colorlog 12 | from datetime import datetime 13 | import jsonlines 14 | import lm_eval 15 | from lm_eval.models.huggingface import HFLM 16 | 17 | import json 18 | import pprint 19 | import argparse 20 | from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast, LlamaTokenizer 21 | import forgetting_transformer.tokenizer 22 | import forgetting_transformer.model 23 | import pickle 24 | 25 | 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | # parser.add_argument('--model', type=str, required=True, choices=["mamba2-760m", "fot-760m", "hgrn2-760m", "delta_net-760m", "transformer-760m", "fot-qk-norm-760m"]) 31 | parser.add_argument('--model', type=str, required=True) 32 | parser.add_argument('--model_path', type=str, required=True) 33 | parser.add_argument('--device_id', type=int, required=True) 34 | parser.add_argument('--max_len', type=int, required=True) 35 | parser.add_argument('--batch_size', type=int, required=True) 36 | parser.add_argument('--save_dir', type=str, required=True) 37 | args = parser.parse_args() 38 | 39 | assert args.model == Path(args.model_path).name, f"Model name '{args.model}' is different from the last component of model path '{args.model_path}'. You can delete this assertion if you are sure this is correct."
40 | model_name = args.model 41 | device_id = args.device_id 42 | max_len = args.max_len 43 | batch_size = args.batch_size 44 | save_dir = Path(args.save_dir) / model_name 45 | save_dir.mkdir(parents=True, exist_ok=True) 46 | 47 | model_path = args.model_path 48 | 49 | device = torch.device(f"cuda:{device_id}") 50 | 51 | 52 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, add_bos_token=True, clean_up_tokenization_spaces=False) 53 | assert max_len == 16384, "Just in case. You can delete this." 54 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device) 55 | 56 | 57 | 58 | # TODO: note that the models are trained with BOS. Therefore, in principle in all 59 | # evaluation BOS should be added. However, for wikitext perplexity eval except for 60 | # the first rolling window, no BOS is added. This is fine in our case since our 16k 61 | # context length covers most wikitext docs. However, if you use a short training 62 | # context length with BOS, you will need to modify HFLM to implement the correct 63 | # behavior. 64 | hflm = HFLM( 65 | pretrained=model, 66 | batch_size=batch_size, 67 | tokenizer=tokenizer, 68 | max_length=max_len, 69 | add_bos_token=True, # This is basically whether to use add_special_tokens 70 | ) 71 | 72 | task_manager = lm_eval.tasks.TaskManager() 73 | 74 | # Setting `task_manager` to the one above is optional and should generally be 75 | # done 76 | # if you want to include tasks from paths other than ones in `lm_eval/tasks`. 77 | # `simple_evaluate` will instantiate its own task_manager if it is set to None 78 | # here. 79 | 80 | with torch.cuda.device(device): 81 | with torch.autocast(device_type="cuda", dtype=torch.bfloat16): 82 | with torch.no_grad(): 83 | results = lm_eval.simple_evaluate( # call simple_evaluate 84 | model=hflm, 85 | # tasks=["wikitext"], 86 | tasks=["wikitext", "lambada_openai", "piqa", "hellaswag", "winogrande", "arc_easy", "arc_challenge", "boolq", "sciq", "copa", "openbookqa"], 87 | # tasks=["winogrande"], 88 | # tasks=["scrolls_narrativeqa", "scrolls_qasper", "scrolls_quality"], 89 | # tasks=[ 90 | 91 | # "scrolls_govreport", # 10min for mamba2 92 | # "scrolls_qmsum",# 4min for mamba2 93 | # "scrolls_summscreenfd", <10min 94 | 95 | # "scrolls_qasper", 96 | 97 | # "scrolls_quality", 98 | # "scrolls_contractnli", 99 | 100 | # "scrolls_narrativeqa", 101 | # ], 102 | # tasks=["wikitext"], 103 | # tasks=["lambada_openai"], 104 | num_fewshot=0, 105 | task_manager=task_manager, 106 | device="cuda" 107 | ) 108 | pprint.pprint(results["results"]) 109 | save_path = save_dir / "results.json" 110 | with save_path.open("w") as f: 111 | json.dump(results["results"], f, indent=4) 112 | print(f"Results saved to {save_path}") 113 | # import ipdb; ipdb.set_trace() 114 | if __name__ == "__main__": 115 | main() # pylint: disable=no-value-for-parameter 116 | 117 | -------------------------------------------------------------------------------- /eval/longbench/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 THU-KEG & Zhipu AI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software 
is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /eval/longbench/README.md: -------------------------------------------------------------------------------- 1 | # LongBench 2 | 3 | This directory contains the code for evaluation on LongBench. The code is adapted from the original [LongBench-v1 repository](https://github.com/THUDM/LongBench/blob/main/LongBench/README.md). 4 | 5 | ## Usage 6 | 7 | Usage example: 8 | 9 | ```bash 10 | python pred.py --model "fox-pro-760m-longcrawl64-48b" --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" --max_length 15500 11 | python eval.py --model "fox-pro-760m-longcrawl64-48b" 12 | ``` 13 | 14 | After you run these, results will be saved to `./pred`. You can create a latex table using: 15 | 16 | 17 | ```bash 18 | python table_longbench.py 19 | ``` 20 | 21 | You can change `MODELS` in `table_longbench.py` to specify which models you want to include in the table. 22 | 23 | 24 | Note that we observe the evaluation results to be non-deterministic, likely due to GPU non-determinism. Therefore the results you obtain may not exactly match those reported in the paper. However, the difference should be small. 25 | 26 | ## Citation 27 | 28 | If you use this code, consider citing LongBench: 29 | ``` 30 | @article{bai2023longbench, 31 | title={LongBench: A Bilingual, Multitask Benchmark for Long Context Understanding}, 32 | author={Bai, Yushi and Lv, Xin and Zhang, Jiajie and Lyu, Hongchang and Tang, Jiankai and Huang, Zhidian and Du, Zhengxiao and Liu, Xiao and Zeng, Aohan and Hou, Lei and Dong, Yuxiao and Tang, Jie and Li, Juanzi}, 33 | journal={arXiv preprint arXiv:2308.14508}, 34 | year={2023} 35 | } 36 | ``` 37 | When citing LongBench, please kindly consider citing the original dataset papers. The relevant citation information is listed [here](refs/ref.bib). 
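For context on the two JSON files reproduced next: `config/dataset2prompt.json` holds the per-dataset prompt templates and `config/dataset2maxlen.json` the per-dataset generation budgets (in new tokens). The sketch below is a simplified, hypothetical outline of how a LongBench-style `pred.py` consumes them; it is not the repository's exact implementation, and the helper name `build_request` is made up for illustration.

```python
# Simplified, hypothetical outline of how a LongBench-style pred.py uses the two
# config files below; not the repository's exact code.
import json

with open("config/dataset2prompt.json") as f:
    dataset2prompt = json.load(f)
with open("config/dataset2maxlen.json") as f:
    dataset2maxlen = json.load(f)

def build_request(dataset, json_obj, tokenizer, max_length):
    # Fill the dataset-specific template with the raw example fields ({context}, {input}, ...).
    prompt = dataset2prompt[dataset].format(**json_obj)
    # LongBench-style truncation keeps the head and tail of the token sequence and
    # drops the middle so the prompt fits within max_length.
    tokens = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0]
    if len(tokens) > max_length:
        half = max_length // 2
        prompt = (tokenizer.decode(tokens[:half], skip_special_tokens=True)
                  + tokenizer.decode(tokens[-half:], skip_special_tokens=True))
    # The generation budget for this dataset comes from dataset2maxlen.json.
    return prompt, dataset2maxlen[dataset]
```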
38 | -------------------------------------------------------------------------------- /eval/longbench/config/dataset2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "narrativeqa": 128, 3 | "qasper": 128, 4 | "multifieldqa_en": 64, 5 | "multifieldqa_zh": 64, 6 | "hotpotqa": 32, 7 | "2wikimqa": 32, 8 | "musique": 32, 9 | "dureader": 128, 10 | "gov_report": 512, 11 | "qmsum": 512, 12 | "multi_news": 512, 13 | "vcsum": 512, 14 | "trec": 64, 15 | "triviaqa": 32, 16 | "samsum": 128, 17 | "lsht": 64, 18 | "passage_count": 32, 19 | "passage_retrieval_en": 32, 20 | "passage_retrieval_zh": 32, 21 | "lcc": 64, 22 | "repobench-p": 64 23 | } -------------------------------------------------------------------------------- /eval/longbench/config/dataset2prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "narrativeqa": "You are given a story, which can be either a novel or a movie script, and a question. Answer the question asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nStory: {context}\n\nNow, answer the question based on the story asconcisely as you can, using a single phrase if possible. Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", 3 | "qasper": "You are given a scientific article and a question. Answer the question as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nArticle: {context}\n\n Answer the question based on the above article as concisely as you can, using a single phrase or sentence if possible. If the question cannot be answered based on the information in the article, write \"unanswerable\". If the question is a yes/no question, answer \"yes\", \"no\", or \"unanswerable\". Do not provide any explanation.\n\nQuestion: {input}\n\nAnswer:", 4 | "multifieldqa_en": "Read the following text and answer briefly.\n\n{context}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 5 | "multifieldqa_zh": "阅读以下文字并用中文简短回答:\n\n{context}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{input}\n回答:", 6 | "hotpotqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 7 | "2wikimqa": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 8 | "musique": "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{context}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {input}\nAnswer:", 9 | "dureader": "请基于给定的文章回答下述问题。\n\n文章:{context}\n\n请基于上述文章回答下面的问题。\n\n问题:{input}\n回答:", 10 | "gov_report": "You are given a report by a government agency. 
Write a one-page summary of the report.\n\nReport:\n{context}\n\nNow, write a one-page summary of the report.\n\nSummary:", 11 | "qmsum": "You are given a meeting transcript and a query containing a question or instruction. Answer the query in one or more sentences.\n\nTranscript:\n{context}\n\nNow, answer the query based on the above meeting transcript in one or more sentences.\n\nQuery: {input}\nAnswer:", 12 | "multi_news": "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:", 13 | "vcsum": "下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{context}\n\n会议总结:", 14 | "trec": "Please determine the type of the question below. Here are some examples of questions.\n\n{context}\n{input}", 15 | "triviaqa": "Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{context}\n\n{input}", 16 | "samsum": "Summarize the dialogue into a few short sentences. The following are some examples.\n\n{context}\n\n{input}", 17 | "lsht": "请判断给定新闻的类别,下面是一些例子。\n\n{context}\n{input}", 18 | "passage_count": "There are some paragraphs below sourced from Wikipedia. Some of them may be duplicates. Please carefully read these paragraphs and determine how many unique paragraphs there are after removing duplicates. In other words, how many non-repeating paragraphs are there in total?\n\n{context}\n\nPlease enter the final count of unique paragraphs after removing duplicates. The output format should only contain the number, such as 1, 2, 3, and so on.\n\nThe final answer is: ", 19 | "passage_retrieval_en": "Here are 30 paragraphs from Wikipedia, along with an abstract. Please determine which paragraph the abstract is from.\n\n{context}\n\nThe following is an abstract.\n\n{input}\n\nPlease enter the number of the paragraph that the abstract is from. The answer format must be like \"Paragraph 1\", \"Paragraph 2\", etc.\n\nThe answer is: ", 20 | "passage_retrieval_zh": "以下是若干段落文字,以及其中一个段落的摘要。请确定给定的摘要出自哪一段。\n\n{context}\n\n下面是一个摘要\n\n{input}\n\n请输入摘要所属段落的编号。答案格式必须是\"段落1\",\"段落2\"等格式\n\n答案是:", 21 | "lcc": "Please complete the code given below. \n{context}Next line of code:\n", 22 | "repobench-p": "Please complete the code given below. 
\n{context}{input}Next line of code:\n" 23 | } -------------------------------------------------------------------------------- /eval/longbench/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import json 4 | import argparse 5 | import numpy as np 6 | 7 | from metrics import ( 8 | qa_f1_score, 9 | rouge_zh_score, 10 | qa_f1_zh_score, 11 | rouge_score, 12 | classification_score, 13 | retrieval_score, 14 | retrieval_zh_score, 15 | count_score, 16 | code_sim_score, 17 | ) 18 | 19 | dataset2metric = { 20 | "narrativeqa": qa_f1_score, 21 | "qasper": qa_f1_score, 22 | "multifieldqa_en": qa_f1_score, 23 | "multifieldqa_zh": qa_f1_zh_score, 24 | "hotpotqa": qa_f1_score, 25 | "2wikimqa": qa_f1_score, 26 | "musique": qa_f1_score, 27 | "dureader": rouge_zh_score, 28 | "gov_report": rouge_score, 29 | "qmsum": rouge_score, 30 | "multi_news": rouge_score, 31 | "vcsum": rouge_zh_score, 32 | "trec": classification_score, 33 | "triviaqa": qa_f1_score, 34 | "samsum": rouge_score, 35 | "lsht": classification_score, 36 | "passage_retrieval_en": retrieval_score, 37 | "passage_count": count_score, 38 | "passage_retrieval_zh": retrieval_zh_score, 39 | "lcc": code_sim_score, 40 | "repobench-p": code_sim_score, 41 | } 42 | 43 | def parse_args(args=None): 44 | parser = argparse.ArgumentParser() 45 | parser.add_argument('--model', type=str, default=None) 46 | parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E") 47 | return parser.parse_args(args) 48 | 49 | def scorer_e(dataset, predictions, answers, lengths, all_classes): 50 | scores = {"0-4k": [], "4-8k": [], "8k+": []} 51 | for (prediction, ground_truths, length) in zip(predictions, answers, lengths): 52 | score = 0. 53 | # if dataset in ["trec", "triviaqa", "samsum", "lsht"]: 54 | prediction = prediction.lstrip('\n').split('\n')[0] 55 | for ground_truth in ground_truths: 56 | score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes)) 57 | if length < 4000: 58 | scores["0-4k"].append(score) 59 | elif length < 8000: 60 | scores["4-8k"].append(score) 61 | else: 62 | scores["8k+"].append(score) 63 | for key in scores.keys(): 64 | scores[key] = round(100 * np.mean(scores[key]), 2) 65 | return scores 66 | 67 | def scorer(dataset, predictions, answers, all_classes): 68 | total_score = 0. 69 | for (prediction, ground_truths) in zip(predictions, answers): 70 | score = 0. 
71 | # if dataset in ["trec", "triviaqa", "samsum", "lsht"]: 72 | prediction = prediction.lstrip('\n').split('\n')[0] 73 | for ground_truth in ground_truths: 74 | score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes)) 75 | total_score += score 76 | return round(100 * total_score / len(predictions), 2) 77 | 78 | if __name__ == '__main__': 79 | args = parse_args() 80 | scores = dict() 81 | if args.e: 82 | path = f"pred_e/{args.model}/" 83 | else: 84 | path = f"pred/{args.model}/" 85 | 86 | 87 | if args.e: 88 | # datasets = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", \ 89 | # "trec", "triviaqa", "samsum", "passage_count", "passage_retrieval_en", "lcc", "repobench-p"] 90 | datasets = ["qasper", "multifieldqa_en", "hotpotqa", "2wikimqa", "gov_report", "multi_news", \ 91 | "trec", "triviaqa", "samsum", "lcc", "repobench-p"] 92 | # datasets = ["triviaqa"] 93 | else: 94 | # datasets = ["narrativeqa", "qasper", "multifieldqa_en", "multifieldqa_zh", "hotpotqa", "2wikimqa", "musique", \ 95 | # "dureader", "gov_report", "qmsum", "multi_news", "vcsum", "trec", "triviaqa", "samsum", "lsht", \ 96 | # "passage_count", "passage_retrieval_en", "passage_retrieval_zh", "lcc", "repobench-p"] 97 | # English tasks 98 | # datasets = ["2wikimqa", "narrativeqa", "qasper", "multifieldqa_en", "hotpotqa", "musique", \ 99 | # "gov_report", "qmsum", "multi_news", "trec", "triviaqa", "samsum", \ 100 | # "passage_count", "passage_retrieval_en", "lcc", "repobench-p"] 101 | datasets = ["2wikimqa", "narrativeqa", "qasper", "multifieldqa_en", "hotpotqa", "musique", \ 102 | "gov_report", "qmsum", "multi_news", "trec", "triviaqa", "samsum", \ 103 | "lcc", "repobench-p"] 104 | # datasets = ["2wikimqa"] 105 | # datasets = ["2wikimqa", "narrativeqa", "qasper", "multifieldqa_en", "hotpotqa", "musique", \ 106 | for dataset in datasets: 107 | sentinel_path = (Path(path) / f"{dataset}.jsonl.done") 108 | result_path = (Path(path) / f"{dataset}.jsonl") 109 | assert sentinel_path.is_file(), f"{sentinel_path} is missing" 110 | assert result_path.is_file(), f"{result_path} is missing" 111 | all_files = os.listdir(path) 112 | print("Evaluating on:", [file for file in all_files if (Path(path) / f"{file}.done").is_file()]) 113 | for filename in all_files: 114 | if not filename.endswith("jsonl"): 115 | continue 116 | sentinel_path = f"{path}{filename}.done" 117 | if not Path(sentinel_path).exists(): 118 | print(f"{filename} is incomplete. 
Skipping") 119 | # We don't delete things because pred.py might be writing to it 120 | continue 121 | predictions, answers, lengths = [], [], [] 122 | dataset = filename.split('.')[0] 123 | with open(f"{path}{filename}", "r", encoding="utf-8") as f: 124 | for line in f: 125 | data = json.loads(line) 126 | predictions.append(data["pred"]) 127 | answers.append(data["answers"]) 128 | all_classes = data["all_classes"] 129 | if "length" in data: 130 | lengths.append(data["length"]) 131 | if args.e: 132 | score = scorer_e(dataset, predictions, answers, lengths, all_classes) 133 | else: 134 | score = scorer(dataset, predictions, answers, all_classes) 135 | scores[dataset] = score 136 | if args.e: 137 | out_path = f"pred_e/{args.model}/result.json" 138 | else: 139 | out_path = f"pred/{args.model}/result.json" 140 | with open(out_path, "w") as f: 141 | json.dump(scores, f, ensure_ascii=False, indent=4) 142 | print(f"Results written to {out_path}.") 143 | -------------------------------------------------------------------------------- /eval/longbench/metrics.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | import jieba 5 | from fuzzywuzzy import fuzz 6 | import difflib 7 | 8 | from typing import List 9 | from collections import Counter 10 | from rouge import Rouge 11 | 12 | def normalize_answer(s): 13 | """Lower text and remove punctuation, articles and extra whitespace.""" 14 | 15 | def remove_articles(text): 16 | return re.sub(r"\b(a|an|the)\b", " ", text) 17 | 18 | def white_space_fix(text): 19 | return " ".join(text.split()) 20 | 21 | def remove_punc(text): 22 | exclude = set(string.punctuation) 23 | return "".join(ch for ch in text if ch not in exclude) 24 | 25 | def lower(text): 26 | return text.lower() 27 | 28 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 29 | 30 | 31 | def normalize_zh_answer(s): 32 | """Lower text and remove punctuation, extra whitespace.""" 33 | 34 | def white_space_fix(text): 35 | return "".join(text.split()) 36 | 37 | def remove_punc(text): 38 | cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." 
39 | all_punctuation = set(string.punctuation + cn_punctuation) 40 | return "".join(ch for ch in text if ch not in all_punctuation) 41 | 42 | def lower(text): 43 | return text.lower() 44 | 45 | return white_space_fix(remove_punc(lower(s))) 46 | 47 | def count_score(prediction, ground_truth, **kwargs): 48 | numbers = re.findall(r"\d+", prediction) 49 | right_num = 0 50 | for number in numbers: 51 | if str(number) == str(ground_truth): 52 | right_num += 1 53 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) 54 | return float(final_score) 55 | 56 | def retrieval_score(prediction, ground_truth, **kwargs): 57 | pattern = r'Paragraph (\d+)' 58 | matches = re.findall(pattern, ground_truth) 59 | ground_truth_id = matches[0] 60 | numbers = re.findall(r"\d+", prediction) 61 | right_num = 0 62 | for number in numbers: 63 | if str(number) == str(ground_truth_id): 64 | right_num += 1 65 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) 66 | return float(final_score) 67 | 68 | def retrieval_zh_score(prediction, ground_truth, **kwargs): 69 | pattern = r'段落(\d+)' 70 | matches = re.findall(pattern, ground_truth) 71 | ground_truth_id = matches[0] 72 | numbers = re.findall(r"\d+", prediction) 73 | right_num = 0 74 | for number in numbers: 75 | if str(number) == str(ground_truth_id): 76 | right_num += 1 77 | final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers) 78 | return float(final_score) 79 | 80 | def code_sim_score(prediction, ground_truth, **kwargs): 81 | all_lines = prediction.lstrip('\n').split('\n') 82 | prediction = "" 83 | for line in all_lines: 84 | if ('`' not in line) and ('#' not in line) and ('//' not in line): 85 | prediction = line 86 | break 87 | return (fuzz.ratio(prediction, ground_truth) / 100) 88 | 89 | def classification_score(prediction, ground_truth, **kwargs): 90 | em_match_list = [] 91 | all_classes = kwargs["all_classes"] 92 | for class_name in all_classes: 93 | if class_name in prediction: 94 | em_match_list.append(class_name) 95 | for match_term in em_match_list: 96 | if match_term in ground_truth and match_term != ground_truth: 97 | em_match_list.remove(match_term) 98 | if ground_truth in em_match_list: 99 | score = (1.0 / len(em_match_list)) 100 | else: 101 | score = 0.0 102 | return score 103 | 104 | def rouge_score(prediction, ground_truth, **kwargs): 105 | rouge = Rouge() 106 | try: 107 | scores = rouge.get_scores([prediction], [ground_truth], avg=True) 108 | except: 109 | return 0.0 110 | return scores["rouge-l"]["f"] 111 | 112 | def rouge_zh_score(prediction, ground_truth, **kwargs): 113 | prediction = " ".join(list(jieba.cut(prediction, cut_all=False))) 114 | ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False))) 115 | score = rouge_score(prediction, ground_truth) 116 | return score 117 | 118 | def f1_score(prediction, ground_truth, **kwargs): 119 | common = Counter(prediction) & Counter(ground_truth) 120 | num_same = sum(common.values()) 121 | if num_same == 0: 122 | return 0 123 | precision = 1.0 * num_same / len(prediction) 124 | recall = 1.0 * num_same / len(ground_truth) 125 | f1 = (2 * precision * recall) / (precision + recall) 126 | return f1 127 | 128 | def qa_f1_score(prediction, ground_truth, **kwargs): 129 | normalized_prediction = normalize_answer(prediction) 130 | normalized_ground_truth = normalize_answer(ground_truth) 131 | 132 | prediction_tokens = normalized_prediction.split() 133 | ground_truth_tokens = normalized_ground_truth.split() 134 | return f1_score(prediction_tokens, 
ground_truth_tokens) 135 | 136 | 137 | def qa_f1_zh_score(prediction, ground_truth, **kwargs): 138 | prediction_tokens = list(jieba.cut(prediction, cut_all=False)) 139 | ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False)) 140 | prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens] 141 | ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens] 142 | prediction_tokens = [token for token in prediction_tokens if len(token) > 0] 143 | ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0] 144 | return f1_score(prediction_tokens, ground_truth_tokens) 145 | -------------------------------------------------------------------------------- /eval/longbench/refs/ref.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{yang2018hotpotqa, 2 | title={HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering}, 3 | author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William and Salakhutdinov, Ruslan and Manning, Christopher D}, 4 | booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, 5 | pages={2369--2380}, 6 | year={2018} 7 | } 8 | 9 | @inproceedings{ho2020constructing, 10 | title={Constructing A Multi-hop QA Dataset for Comprehensive Evaluation of Reasoning Steps}, 11 | author={Ho, Xanh and Nguyen, Anh-Khoa Duong and Sugawara, Saku and Aizawa, Akiko}, 12 | booktitle={Proceedings of the 28th International Conference on Computational Linguistics}, 13 | pages={6609--6625}, 14 | year={2020} 15 | } 16 | 17 | @article{trivedi2022musique, 18 | title={♫ MuSiQue: Multihop Questions via Single-hop Question Composition}, 19 | author={Trivedi, Harsh and Balasubramanian, Niranjan and Khot, Tushar and Sabharwal, Ashish}, 20 | journal={Transactions of the Association for Computational Linguistics}, 21 | volume={10}, 22 | pages={539--554}, 23 | year={2022}, 24 | publisher={MIT Press One Broadway, 12th Floor, Cambridge, Massachusetts 02142, USA~…} 25 | } 26 | 27 | @article{he2018dureader, 28 | title={DuReader: a Chinese Machine Reading Comprehension Dataset from Real-world Applications}, 29 | author={He, Wei and Liu, Kai and Liu, Jing and Lyu, Yajuan and Zhao, Shiqi and Xiao, Xinyan and Liu, Yuan and Wang, Yizhong and Wu, Hua and She, Qiaoqiao and others}, 30 | journal={ACL 2018}, 31 | pages={37}, 32 | year={2018} 33 | } 34 | 35 | @article{kovcisky2018narrativeqa, 36 | title={The narrativeqa reading comprehension challenge}, 37 | author={Ko{\v{c}}isk{\`y}, Tom{\'a}{\v{s}} and Schwarz, Jonathan and Blunsom, Phil and Dyer, Chris and Hermann, Karl Moritz and Melis, G{\'a}bor and Grefenstette, Edward}, 38 | journal={Transactions of the Association for Computational Linguistics}, 39 | volume={6}, 40 | pages={317--328}, 41 | year={2018}, 42 | publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} 43 | } 44 | 45 | @inproceedings{dasigi2021dataset, 46 | title={A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers}, 47 | author={Dasigi, Pradeep and Lo, Kyle and Beltagy, Iz and Cohan, Arman and Smith, Noah A and Gardner, Matt}, 48 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 49 | pages={4599--4610}, 50 | year={2021} 51 | } 52 | 53 | @inproceedings{huang2021efficient, 54 | title={Efficient Attentions for Long Document 
Summarization}, 55 | author={Huang, Luyang and Cao, Shuyang and Parulian, Nikolaus and Ji, Heng and Wang, Lu}, 56 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 57 | pages={1419--1436}, 58 | year={2021} 59 | } 60 | 61 | @inproceedings{zhong2021qmsum, 62 | title={QMSum: A New Benchmark for Query-based Multi-domain Meeting Summarization}, 63 | author={Zhong, Ming and Yin, Da and Yu, Tao and Zaidi, Ahmad and Mutuma, Mutethia and Jha, Rahul and Hassan, Ahmed and Celikyilmaz, Asli and Liu, Yang and Qiu, Xipeng and others}, 64 | booktitle={Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 65 | pages={5905--5921}, 66 | year={2021} 67 | } 68 | 69 | @article{wu2023vcsum, 70 | title={VCSUM: A Versatile Chinese Meeting Summarization Dataset}, 71 | author={Wu, Han and Zhan, Mingjie and Tan, Haochen and Hou, Zhaohui and Liang, Ding and Song, Linqi}, 72 | journal={arXiv preprint arXiv:2305.05280}, 73 | year={2023} 74 | } 75 | 76 | @inproceedings{joshi2017triviaqa, 77 | title={TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 78 | author={Joshi, Mandar and Choi, Eunsol and Weld, Daniel S and Zettlemoyer, Luke}, 79 | booktitle={Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, 80 | pages={1601--1611}, 81 | year={2017} 82 | } 83 | 84 | @article{gliwa2019samsum, 85 | title={SAMSum Corpus: A Human-annotated Dialogue Dataset for Abstractive Summarization}, 86 | author={Gliwa, Bogdan and Mochol, Iwona and Biesek, Maciej and Wawer, Aleksander}, 87 | journal={EMNLP-IJCNLP 2019}, 88 | pages={70}, 89 | year={2019} 90 | } 91 | 92 | @inproceedings{fabbri2019multi, 93 | title={Multi-News: A Large-Scale Multi-Document Summarization Dataset and Abstractive Hierarchical Model}, 94 | author={Fabbri, Alexander Richard and Li, Irene and She, Tianwei and Li, Suyi and Radev, Dragomir}, 95 | booktitle={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 96 | pages={1074--1084}, 97 | year={2019} 98 | } 99 | 100 | @inproceedings{li2002learning, 101 | title={Learning question classifiers}, 102 | author={Li, Xin and Roth, Dan}, 103 | booktitle={COLING 2002: The 19th International Conference on Computational Linguistics}, 104 | year={2002} 105 | } 106 | 107 | @article{guo2023longcoder, 108 | title={LongCoder: A Long-Range Pre-trained Language Model for Code Completion}, 109 | author={Guo, Daya and Xu, Canwen and Duan, Nan and Yin, Jian and McAuley, Julian}, 110 | journal={arXiv preprint arXiv:2306.14893}, 111 | year={2023} 112 | } 113 | 114 | @article{liu2023repobench, 115 | title={RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems}, 116 | author={Liu, Tianyang and Xu, Canwen and McAuley, Julian}, 117 | journal={arXiv preprint arXiv:2306.03091}, 118 | year={2023} 119 | } -------------------------------------------------------------------------------- /eval/longbench/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets 2 | tqdm 3 | rouge 4 | jieba 5 | fuzzywuzzy 6 | torch 7 | transformers==4.31.0 8 | einops
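The LongBench evaluation code above fits together as follows: pred.py writes one {dataset}.jsonl file of model outputs per task, and eval.py scores each file with the metric functions defined in metrics.py through its dataset2metric mapping. The sketch below is a minimal illustration of that scoring path and is not a file in this repository; it assumes it is run from eval/longbench/ so that metrics.py is importable, and DATASET2METRIC and score_file are illustrative stand-ins for the dataset2metric dict and the scorer()/scorer_e() functions in eval.py.

import json

# These metric functions are defined in metrics.py above; this snippet assumes
# it is run from eval/longbench/ so that module is importable.
from metrics import qa_f1_score, rouge_score, classification_score, code_sim_score

# Illustrative subset of the dataset-to-metric mapping; the full mapping is the
# dataset2metric dict in eval.py.
DATASET2METRIC = {
    "hotpotqa": qa_f1_score,
    "gov_report": rouge_score,
    "trec": classification_score,
    "lcc": code_sim_score,
}


def score_file(dataset, jsonl_path):
    """Average best-over-references score for one dataset, scaled to 0-100."""
    metric = DATASET2METRIC[dataset]
    total, count = 0.0, 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            # Each line is a JSON object with at least these fields
            # (see the read loop in eval.py above).
            prediction, answers = data["pred"], data["answers"]
            all_classes = data.get("all_classes")
            # As in scorer(): keep only the first line of the prediction ...
            prediction = prediction.lstrip("\n").split("\n")[0]
            # ... and credit the best score over all reference answers.
            best = 0.0
            for ground_truth in answers:
                best = max(best, metric(prediction, ground_truth, all_classes=all_classes))
            total += best
            count += 1
    return round(100 * total / count, 2) if count else 0.0


# Example usage:
# print(score_file("hotpotqa", "pred/my_model/hotpotqa.jsonl"))

Like scorer() in eval.py, each prediction is credited with its best score across the reference answers, and the dataset score is the mean of these values reported on a 0-100 scale.

-------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/bias.txt: -------------------------------------------------------------------------------- 1 | 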
October 2015This will come as a surprise to a lot of people, but in some cases 2 | it's possible to detect bias in a selection process without knowing 3 | anything about the applicant pool. Which is exciting because among 4 | other things it means third parties can use this technique to detect 5 | bias whether those doing the selecting want them to or not.You can use this technique whenever (a) you have at least 6 | a random sample of the applicants that were selected, (b) their 7 | subsequent performance is measured, and (c) the groups of 8 | applicants you're comparing have roughly equal distribution of ability.How does it work? Think about what it means to be biased. What 9 | it means for a selection process to be biased against applicants 10 | of type x is that it's harder for them to make it through. Which 11 | means applicants of type x have to be better to get selected than 12 | applicants not of type x. 13 | [1] 14 | Which means applicants of type x 15 | who do make it through the selection process will outperform other 16 | successful applicants. And if the performance of all the successful 17 | applicants is measured, you'll know if they do.Of course, the test you use to measure performance must be a valid 18 | one. And in particular it must not be invalidated by the bias you're 19 | trying to measure. 20 | But there are some domains where performance can be measured, and 21 | in those detecting bias is straightforward. Want to know if the 22 | selection process was biased against some type of applicant? Check 23 | whether they outperform the others. This is not just a heuristic 24 | for detecting bias. It's what bias means.For example, many suspect that venture capital firms are biased 25 | against female founders. This would be easy to detect: among their 26 | portfolio companies, do startups with female founders outperform 27 | those without? A couple months ago, one VC firm (almost certainly 28 | unintentionally) published a study showing bias of this type. First 29 | Round Capital found that among its portfolio companies, startups 30 | with female founders outperformed 31 | those without by 63%. 32 | [2]The reason I began by saying that this technique would come as a 33 | surprise to many people is that we so rarely see analyses of this 34 | type. I'm sure it will come as a surprise to First Round that they 35 | performed one. I doubt anyone there realized that by limiting their 36 | sample to their own portfolio, they were producing a study not of 37 | startup trends but of their own biases when selecting companies.I predict we'll see this technique used more in the future. The 38 | information needed to conduct such studies is increasingly available. 39 | Data about who applies for things is usually closely guarded by the 40 | organizations selecting them, but nowadays data about who gets 41 | selected is often publicly available to anyone who takes the trouble 42 | to aggregate it. 43 | Notes[1] 44 | This technique wouldn't work if the selection process looked 45 | for different things from different types of applicants—for 46 | example, if an employer hired men based on their ability but women 47 | based on their appearance.[2] 48 | As Paul Buchheit points out, First Round excluded their most 49 | successful investment, Uber, from the study. And while it 50 | makes sense to exclude outliers from some types of studies, 51 | studies of returns from startup investing, which is all about 52 | hitting outliers, are not one of them. 
53 | Thanks to Sam Altman, Jessica Livingston, and Geoff Ralston for reading 54 | drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/copy.txt: -------------------------------------------------------------------------------- 1 | July 2006 2 | When I was in high school I spent a lot of time imitating bad 3 | writers. What we studied in English classes was mostly fiction, 4 | so I assumed that was the highest form of writing. Mistake number 5 | one. The stories that seemed to be most admired were ones in which 6 | people suffered in complicated ways. Anything funny or 7 | gripping was ipso facto suspect, unless it was old enough to be hard to 8 | understand, like Shakespeare or Chaucer. Mistake number two. The 9 | ideal medium seemed the short story, which I've since learned had 10 | quite a brief life, roughly coincident with the peak of magazine 11 | publishing. But since their size made them perfect for use in 12 | high school classes, we read a lot of them, which gave us the 13 | impression the short story was flourishing. Mistake number three. 14 | And because they were so short, nothing really had to happen; you 15 | could just show a randomly truncated slice of life, and that was 16 | considered advanced. Mistake number four. The result was that I 17 | wrote a lot of stories in which nothing happened except that someone 18 | was unhappy in a way that seemed deep.For most of college I was a philosophy major. I was very impressed 19 | by the papers published in philosophy journals. They were so 20 | beautifully typeset, and their tone was just captivating—alternately 21 | casual and buffer-overflowingly technical. A fellow would be walking 22 | along a street and suddenly modality qua modality would spring upon 23 | him. I didn't ever quite understand these papers, but I figured 24 | I'd get around to that later, when I had time to reread them more 25 | closely. In the meantime I tried my best to imitate them. This 26 | was, I can now see, a doomed undertaking, because they weren't 27 | really saying anything. No philosopher ever refuted another, for 28 | example, because no one said anything definite enough to refute. 29 | Needless to say, my imitations didn't say anything either.In grad school I was still wasting time imitating the wrong things. 30 | There was then a fashionable type of program called an expert system, 31 | at the core of which was something called an inference engine. I 32 | looked at what these things did and thought "I could write that in 33 | a thousand lines of code." And yet eminent professors were writing 34 | books about them, and startups were selling them for a year's salary 35 | a copy. What an opportunity, I thought; these impressive things 36 | seem easy to me; I must be pretty sharp. Wrong. It was simply a 37 | fad. The books the professors wrote about expert systems are now 38 | ignored. They were not even on a path to anything interesting. 39 | And the customers paying so much for them were largely the same 40 | government agencies that paid thousands for screwdrivers and toilet 41 | seats.How do you avoid copying the wrong things? Copy only what you 42 | genuinely like. That would have saved me in all three cases. I 43 | didn't enjoy the short stories we had to read in English classes; 44 | I didn't learn anything from philosophy papers; I didn't use expert 45 | systems myself. 
I believed these things were good because they 46 | were admired.It can be hard to separate the things you like from the things 47 | you're impressed with. One trick is to ignore presentation. Whenever 48 | I see a painting impressively hung in a museum, I ask myself: how 49 | much would I pay for this if I found it at a garage sale, dirty and 50 | frameless, and with no idea who painted it? If you walk around a 51 | museum trying this experiment, you'll find you get some truly 52 | startling results. Don't ignore this data point just because it's 53 | an outlier.Another way to figure out what you like is to look at what you enjoy 54 | as guilty pleasures. Many things people like, especially if they're 55 | young and ambitious, they like largely for the feeling of virtue 56 | in liking them. 99% of people reading Ulysses are thinking 57 | "I'm reading Ulysses" as they do it. A guilty pleasure is 58 | at least a pure one. What do you read when you don't feel up to being 59 | virtuous? What kind of book do you read and feel sad that there's 60 | only half of it left, instead of being impressed that you're half 61 | way through? That's what you really like.Even when you find genuinely good things to copy, there's another 62 | pitfall to be avoided. Be careful to copy what makes them good, 63 | rather than their flaws. It's easy to be drawn into imitating 64 | flaws, because they're easier to see, and of course easier to copy 65 | too. For example, most painters in the eighteenth and nineteenth 66 | centuries used brownish colors. They were imitating the great 67 | painters of the Renaissance, whose paintings by that time were brown 68 | with dirt. Those paintings have since been cleaned, revealing 69 | brilliant colors; their imitators are of course still brown.It was painting, incidentally, that cured me of copying the wrong 70 | things. Halfway through grad school I decided I wanted to try being 71 | a painter, and the art world was so manifestly corrupt that it 72 | snapped the leash of credulity. These people made philosophy 73 | professors seem as scrupulous as mathematicians. It was so clearly 74 | a choice of doing good work xor being an insider that I was forced 75 | to see the distinction. It's there to some degree in almost every 76 | field, but I had till then managed to avoid facing it.That was one of the most valuable things I learned from painting: 77 | you have to figure out for yourself what's 78 | good. You can't trust 79 | authorities. They'll lie to you on this one. 80 | 81 | Comment on this essay. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/diff.txt: -------------------------------------------------------------------------------- 1 | December 2001 (rev. May 2002) 2 | 3 | (This article came about in response to some questions on 4 | the LL1 mailing list. It is now 5 | incorporated in Revenge of the Nerds.)When McCarthy designed Lisp in the late 1950s, it was 6 | a radical departure from existing languages, 7 | the most important of which was Fortran.Lisp embodied nine new ideas: 8 | 1. Conditionals. A conditional is an if-then-else 9 | construct. We take these for granted now. They were 10 | invented 11 | by McCarthy in the course of developing Lisp. 12 | (Fortran at that time only had a conditional 13 | goto, closely based on the branch instruction in the 14 | underlying hardware.) McCarthy, who was on the Algol committee, got 15 | conditionals into Algol, whence they spread to most other 16 | languages.2. 
A function type. In Lisp, functions are first class 17 | objects-- they're a data type just like integers, strings, 18 | etc, and have a literal representation, can be stored in variables, 19 | can be passed as arguments, and so on.3. Recursion. Recursion existed as a mathematical concept 20 | before Lisp of course, but Lisp was the first programming language to support 21 | it. (It's arguably implicit in making functions first class 22 | objects.)4. A new concept of variables. In Lisp, all variables 23 | are effectively pointers. Values are what 24 | have types, not variables, and assigning or binding 25 | variables means copying pointers, not what they point to.5. Garbage-collection.6. Programs composed of expressions. Lisp programs are 26 | trees of expressions, each of which returns a value. 27 | (In some Lisps expressions 28 | can return multiple values.) This is in contrast to Fortran 29 | and most succeeding languages, which distinguish between 30 | expressions and statements.It was natural to have this 31 | distinction in Fortran because (not surprisingly in a language 32 | where the input format was punched cards) the language was 33 | line-oriented. You could not nest statements. And 34 | so while you needed expressions for math to work, there was 35 | no point in making anything else return a value, because 36 | there could not be anything waiting for it.This limitation 37 | went away with the arrival of block-structured languages, 38 | but by then it was too late. The distinction between 39 | expressions and statements was entrenched. It spread from 40 | Fortran into Algol and thence to both their descendants.When a language is made entirely of expressions, you can 41 | compose expressions however you want. You can say either 42 | (using Arc syntax)(if foo (= x 1) (= x 2))or(= x (if foo 1 2))7. A symbol type. Symbols differ from strings in that 43 | you can test equality by comparing a pointer.8. A notation for code using trees of symbols.9. The whole language always available. 44 | There is 45 | no real distinction between read-time, compile-time, and runtime. 46 | You can compile or run code while reading, read or run code 47 | while compiling, and read or compile code at runtime.Running code at read-time lets users reprogram Lisp's syntax; 48 | running code at compile-time is the basis of macros; compiling 49 | at runtime is the basis of Lisp's use as an extension 50 | language in programs like Emacs; and reading at runtime 51 | enables programs to communicate using s-expressions, an 52 | idea recently reinvented as XML. 53 | When Lisp was first invented, all these ideas were far 54 | removed from ordinary programming practice, which was 55 | dictated largely by the hardware available in the late 1950s.Over time, the default language, embodied 56 | in a succession of popular languages, has 57 | gradually evolved toward Lisp. 1-5 are now widespread. 58 | 6 is starting to appear in the mainstream. 59 | Python has a form of 7, though there doesn't seem to be 60 | any syntax for it. 
61 | 8, which (with 9) is what makes Lisp macros 62 | possible, is so far still unique to Lisp, 63 | perhaps because (a) it requires those parens, or something 64 | just as bad, and (b) if you add that final increment of power, 65 | you can no 66 | longer claim to have invented a new language, but only 67 | to have designed a new dialect of Lisp ; -)Though useful to present-day programmers, it's 68 | strange to describe Lisp in terms of its 69 | variation from the random expedients other languages 70 | adopted. That was not, probably, how McCarthy 71 | thought of it. Lisp wasn't designed to fix the mistakes 72 | in Fortran; it came about more as the byproduct of an 73 | attempt to axiomatize computation. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/founders.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Want to start a startup? Get funded by 4 | Y Combinator. 5 | 6 | 7 | 8 | 9 | October 2010 10 | 11 | (I wrote this for Forbes, who asked me to write something 12 | about the qualities we look for in founders. In print they had to cut 13 | the last item because they didn't have room.)1. DeterminationThis has turned out to be the most important quality in startup 14 | founders. We thought when we started Y Combinator that the most 15 | important quality would be intelligence. That's the myth in the 16 | Valley. And certainly you don't want founders to be stupid. But 17 | as long as you're over a certain threshold of intelligence, what 18 | matters most is determination. You're going to hit a lot of 19 | obstacles. You can't be the sort of person who gets demoralized 20 | easily.Bill Clerico and Rich Aberman of WePay 21 | are a good example. They're 22 | doing a finance startup, which means endless negotiations with big, 23 | bureaucratic companies. When you're starting a startup that depends 24 | on deals with big companies to exist, it often feels like they're 25 | trying to ignore you out of existence. But when Bill Clerico starts 26 | calling you, you may as well do what he asks, because he is not 27 | going away. 28 | 2. FlexibilityYou do not however want the sort of determination implied by phrases 29 | like "don't give up on your dreams." The world of startups is so 30 | unpredictable that you need to be able to modify your dreams on the 31 | fly. The best metaphor I've found for the combination of determination 32 | and flexibility you need is a running back. 33 | He's determined to get 34 | downfield, but at any given moment he may need to go sideways or 35 | even backwards to get there.The current record holder for flexibility may be Daniel Gross of 36 | Greplin. He applied to YC with 37 | some bad ecommerce idea. We told 38 | him we'd fund him if he did something else. He thought for a second, 39 | and said ok. He then went through two more ideas before settling 40 | on Greplin. He'd only been working on it for a couple days when 41 | he presented to investors at Demo Day, but he got a lot of interest. 42 | He always seems to land on his feet. 43 | 3. ImaginationIntelligence does matter a lot of course. It seems like the type 44 | that matters most is imagination. It's not so important to be able 45 | to solve predefined problems quickly as to be able to come up with 46 | surprising new ideas. In the startup world, most good ideas 47 | seem 48 | bad initially. If they were obviously good, someone would already 49 | be doing them. 
So you need the kind of intelligence that produces 50 | ideas with just the right level of craziness.Airbnb is that kind of idea. 51 | In fact, when we funded Airbnb, we 52 | thought it was too crazy. We couldn't believe large numbers of 53 | people would want to stay in other people's places. We funded them 54 | because we liked the founders so much. As soon as we heard they'd 55 | been supporting themselves by selling Obama and McCain branded 56 | breakfast cereal, they were in. And it turned out the idea was on 57 | the right side of crazy after all. 58 | 4. NaughtinessThough the most successful founders are usually good people, they 59 | tend to have a piratical gleam in their eye. They're not Goody 60 | Two-Shoes type good. Morally, they care about getting the big 61 | questions right, but not about observing proprieties. That's why 62 | I'd use the word naughty rather than evil. They delight in 63 | breaking 64 | rules, but not rules that matter. This quality may be redundant 65 | though; it may be implied by imagination.Sam Altman of Loopt 66 | is one of the most successful alumni, so we 67 | asked him what question we could put on the Y Combinator application 68 | that would help us discover more people like him. He said to ask 69 | about a time when they'd hacked something to their advantage—hacked in the sense of beating the system, not breaking into 70 | computers. It has become one of the questions we pay most attention 71 | to when judging applications. 72 | 5. FriendshipEmpirically it seems to be hard to start a startup with just 73 | one 74 | founder. Most of the big successes have two or three. And the 75 | relationship between the founders has to be strong. They must 76 | genuinely like one another, and work well together. Startups do 77 | to the relationship between the founders what a dog does to a sock: 78 | if it can be pulled apart, it will be.Emmett Shear and Justin Kan of Justin.tv 79 | are a good example of close 80 | friends who work well together. They've known each other since 81 | second grade. They can practically read one another's minds. I'm 82 | sure they argue, like all founders, but I have never once sensed 83 | any unresolved tension between them.Thanks to Jessica Livingston and Chris Steiner for reading drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/foundervisa.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | April 2009I usually avoid politics, but since we now seem to have an administration that's open to suggestions, I'm going to risk making one. The single biggest thing the government could do to increase the number of startups in this country is a policy that would cost nothing: establish a new class of visa for startup founders.The biggest constraint on the number of new startups that get created in the US is not tax policy or employment law or even Sarbanes-Oxley. It's that we won't let the people who want to start them into the country.Letting just 10,000 startup founders into the country each year could have a visible effect on the economy. If we assume 4 people per startup, which is probably an overestimate, that's 2500 new companies. Each year. They wouldn't all grow as big as Google, but out of 2500 some would come close.By definition these 10,000 founders wouldn't be taking jobs from Americans: it could be part of the terms of the visa that they couldn't work for existing companies, only new ones they'd founded. 
In fact they'd cause there to be 4 | more jobs for Americans, because the companies they started would hire more employees as they grew.The tricky part might seem to be how one defined a startup. But that could be solved quite easily: let the market decide. Startup investors work hard to find the best startups. The government could not do better than to piggyback on their expertise, and use investment by recognized startup investors as the test of whether a company was a real startup.How would the government decide who's a startup investor? The same way they decide what counts as a university for student visas. We'll establish our own accreditation procedure. We know who one another are.10,000 people is a drop in the bucket by immigration standards, but would represent a huge increase in the pool of startup founders. I think this would have such a visible effect on the economy that it would make the legislator who introduced the bill famous. The only way to know for sure would be to try it, and that would cost practically nothing. 5 | Thanks to Trevor Blackwell, Paul Buchheit, Jeff Clavier, David Hornik, Jessica Livingston, Greg Mcadoo, Aydin Senkut, and Fred Wilson for reading drafts of this.Related: -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/goodtaste.txt: -------------------------------------------------------------------------------- 1 | November 2021(This essay is derived from a talk at the Cambridge Union.)When I was a kid, I'd have said there wasn't. My father told me so. 2 | Some people like some things, and other people like other things, 3 | and who's to say who's right?It seemed so obvious that there was no such thing as good taste 4 | that it was only through indirect evidence that I realized my father 5 | was wrong. And that's what I'm going to give you here: a proof by 6 | reductio ad absurdum. If we start from the premise that there's no 7 | such thing as good taste, we end up with conclusions that are 8 | obviously false, and therefore the premise must be wrong.We'd better start by saying what good taste is. There's a narrow 9 | sense in which it refers to aesthetic judgements and a broader one 10 | in which it refers to preferences of any kind. The strongest proof 11 | would be to show that taste exists in the narrowest sense, so I'm 12 | going to talk about taste in art. You have better taste than me if 13 | the art you like is better than the art I like.If there's no such thing as good taste, then there's no such thing 14 | as good art. Because if there is such a 15 | thing as good art, it's 16 | easy to tell which of two people has better taste. Show them a lot 17 | of works by artists they've never seen before and ask them to 18 | choose the best, and whoever chooses the better art has better 19 | taste.So if you want to discard the concept of good taste, you also have 20 | to discard the concept of good art. And that means you have to 21 | discard the possibility of people being good at making it. Which 22 | means there's no way for artists to be good at their jobs. And not 23 | just visual artists, but anyone who is in any sense an artist. You 24 | can't have good actors, or novelists, or composers, or dancers 25 | either. You can have popular novelists, but not good ones.We don't realize how far we'd have to go if we discarded the concept 26 | of good taste, because we don't even debate the most obvious cases. 27 | But it doesn't just mean we can't say which of two famous painters 28 | is better. 
It means we can't say that any painter is better than a 29 | randomly chosen eight year old.That was how I realized my father was wrong. I started studying 30 | painting. And it was just like other kinds of work I'd done: you 31 | could do it well, or badly, and if you tried hard, you could get 32 | better at it. And it was obvious that Leonardo and Bellini were 33 | much better at it than me. That gap between us was not imaginary. 34 | They were so good. And if they could be good, then art could be 35 | good, and there was such a thing as good taste after all.Now that I've explained how to show there is such a thing as good 36 | taste, I should also explain why people think there isn't. There 37 | are two reasons. One is that there's always so much disagreement 38 | about taste. Most people's response to art is a tangle of unexamined 39 | impulses. Is the artist famous? Is the subject attractive? Is this 40 | the sort of art they're supposed to like? Is it hanging in a famous 41 | museum, or reproduced in a big, expensive book? In practice most 42 | people's response to art is dominated by such extraneous factors.And the people who do claim to have good taste are so often mistaken. 43 | The paintings admired by the so-called experts in one generation 44 | are often so different from those admired a few generations later. 45 | It's easy to conclude there's nothing real there at all. It's only 46 | when you isolate this force, for example by trying to paint and 47 | comparing your work to Bellini's, that you can see that it does in 48 | fact exist.The other reason people doubt that art can be good is that there 49 | doesn't seem to be any room in the art for this goodness. The 50 | argument goes like this. Imagine several people looking at a work 51 | of art and judging how good it is. If being good art really is a 52 | property of objects, it should be in the object somehow. But it 53 | doesn't seem to be; it seems to be something happening in the heads 54 | of each of the observers. And if they disagree, how do you choose 55 | between them?The solution to this puzzle is to realize that the purpose of art 56 | is to work on its human audience, and humans have a lot in common. 57 | And to the extent the things an object acts upon respond in the 58 | same way, that's arguably what it means for the object to have the 59 | corresponding property. If everything a particle interacts with 60 | behaves as if the particle had a mass of m, then it has a mass of 61 | m. So the distinction between "objective" and "subjective" is not 62 | binary, but a matter of degree, depending on how much the subjects 63 | have in common. Particles interacting with one another are at one 64 | pole, but people interacting with art are not all the way at the 65 | other; their reactions aren't random.Because people's responses to art aren't random, art can be designed 66 | to operate on people, and be good or bad depending on how effectively 67 | it does so. Much as a vaccine can be. If someone were talking about 68 | the ability of a vaccine to confer immunity, it would seem very 69 | frivolous to object that conferring immunity wasn't really a property 70 | of vaccines, because acquiring immunity is something that happens 71 | in the immune system of each individual person. Sure, people's 72 | immune systems vary, and a vaccine that worked on one might not 73 | work on another, but that doesn't make it meaningless to talk about 74 | the effectiveness of a vaccine.The situation with art is messier, of course. 
You can't measure 75 | effectiveness by simply taking a vote, as you do with vaccines. 76 | You have to imagine the responses of subjects with a deep knowledge 77 | of art, and enough clarity of mind to be able to ignore extraneous 78 | influences like the fame of the artist. And even then you'd still 79 | see some disagreement. People do vary, and judging art is hard, 80 | especially recent art. There is definitely not a total order either 81 | of works or of people's ability to judge them. But there is equally 82 | definitely a partial order of both. So while it's not possible to 83 | have perfect taste, it is possible to have good taste. 84 | Thanks to the Cambridge Union for inviting me, and to Trevor 85 | Blackwell, Jessica Livingston, and Robert Morris for reading drafts 86 | of this. 87 | -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/iflisp.txt: -------------------------------------------------------------------------------- 1 | May 2003If Lisp is so great, why don't more people use it? I was 2 | asked this question by a student in the audience at a 3 | talk I gave recently. Not for the first time, either.In languages, as in so many things, there's not much 4 | correlation between popularity and quality. Why does 5 | John Grisham (King of Torts sales rank, 44) outsell 6 | Jane Austen (Pride and Prejudice sales rank, 6191)? 7 | Would even Grisham claim that it's because he's a better 8 | writer?Here's the first sentence of Pride and Prejudice: 9 | 10 | It is a truth universally acknowledged, that a single man 11 | in possession of a good fortune must be in want of a 12 | wife. 13 | 14 | "It is a truth universally acknowledged?" Long words for 15 | the first sentence of a love story.Like Jane Austen, Lisp looks hard. Its syntax, or lack 16 | of syntax, makes it look completely unlike 17 | the languages 18 | most people are used to. Before I learned Lisp, I was afraid 19 | of it too. I recently came across a notebook from 1983 20 | in which I'd written: 21 | 22 | I suppose I should learn Lisp, but it seems so foreign. 23 | 24 | Fortunately, I was 19 at the time and not too resistant to learning 25 | new things. I was so ignorant that learning 26 | almost anything meant learning new things.People frightened by Lisp make up other reasons for not 27 | using it. The standard 28 | excuse, back when C was the default language, was that Lisp 29 | was too slow. Now that Lisp dialects are among 30 | the faster 31 | languages available, that excuse has gone away. 32 | Now the standard excuse is openly circular: that other languages 33 | are more popular.(Beware of such reasoning. It gets you Windows.)Popularity is always self-perpetuating, but it's especially 34 | so in programming languages. More libraries 35 | get written for popular languages, which makes them still 36 | more popular. Programs often have to work with existing programs, 37 | and this is easier if they're written in the same language, 38 | so languages spread from program to program like a virus. 39 | And managers prefer popular languages, because they give them 40 | more leverage over developers, who can more easily be replaced.Indeed, if programming languages were all more or less equivalent, 41 | there would be little justification for using any but the most 42 | popular. But they aren't all equivalent, not by a long 43 | shot. And that's why less popular languages, like Jane Austen's 44 | novels, continue to survive at all. 
When everyone else is reading 45 | the latest John Grisham novel, there will always be a few people 46 | reading Jane Austen instead. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/island.txt: -------------------------------------------------------------------------------- 1 | July 2006I've discovered a handy test for figuring out what you're addicted 2 | to. Imagine you were going to spend the weekend at a friend's house 3 | on a little island off the coast of Maine. There are no shops on 4 | the island and you won't be able to leave while you're there. Also, 5 | you've never been to this house before, so you can't assume it will 6 | have more than any house might.What, besides clothes and toiletries, do you make a point of packing? 7 | That's what you're addicted to. For example, if you find yourself 8 | packing a bottle of vodka (just in case), you may want to stop and 9 | think about that.For me the list is four things: books, earplugs, a notebook, and a 10 | pen.There are other things I might bring if I thought of it, like music, 11 | or tea, but I can live without them. I'm not so addicted to caffeine 12 | that I wouldn't risk the house not having any tea, just for a 13 | weekend.Quiet is another matter. I realize it seems a bit eccentric to 14 | take earplugs on a trip to an island off the coast of Maine. If 15 | anywhere should be quiet, that should. But what if the person in 16 | the next room snored? What if there was a kid playing basketball? 17 | (Thump, thump, thump... thump.) Why risk it? Earplugs are small.Sometimes I can think with noise. If I already have momentum on 18 | some project, I can work in noisy places. I can edit an essay or 19 | debug code in an airport. But airports are not so bad: most of the 20 | noise is whitish. I couldn't work with the sound of a sitcom coming 21 | through the wall, or a car in the street playing thump-thump music.And of course there's another kind of thinking, when you're starting 22 | something new, that requires complete quiet. You never 23 | know when this will strike. It's just as well to carry plugs.The notebook and pen are professional equipment, as it were. Though 24 | actually there is something druglike about them, in the sense that 25 | their main purpose is to make me feel better. I hardly ever go 26 | back and read stuff I write down in notebooks. It's just that if 27 | I can't write things down, worrying about remembering one idea gets 28 | in the way of having the next. Pen and paper wick ideas.The best notebooks I've found are made by a company called Miquelrius. 29 | I use their smallest size, which is about 2.5 x 4 in. 30 | The secret to writing on such 31 | narrow pages is to break words only when you run out of space, like 32 | a Latin inscription. I use the cheapest plastic Bic ballpoints, 33 | partly because their gluey ink doesn't seep through pages, and 34 | partly so I don't worry about losing them.I only started carrying a notebook about three years ago. Before 35 | that I used whatever scraps of paper I could find. But the problem 36 | with scraps of paper is that they're not ordered. In a notebook 37 | you can guess what a scribble means by looking at the pages 38 | around it. In the scrap era I was constantly finding notes I'd 39 | written years before that might say something I needed to remember, 40 | if I could only figure out what.As for books, I know the house would probably have something to 41 | read. 
On the average trip I bring four books and only read one of 42 | them, because I find new books to read en route. Really bringing 43 | books is insurance.I realize this dependence on books is not entirely good—that what 44 | I need them for is distraction. The books I bring on trips are 45 | often quite virtuous, the sort of stuff that might be assigned 46 | reading in a college class. But I know my motives aren't virtuous. 47 | I bring books because if the world gets boring I need to be able 48 | to slip into another distilled by some writer. It's like eating 49 | jam when you know you should be eating fruit.There is a point where I'll do without books. I was walking in 50 | some steep mountains once, and decided I'd rather just think, if I 51 | was bored, rather than carry a single unnecessary ounce. It wasn't 52 | so bad. I found I could entertain myself by having ideas instead 53 | of reading other people's. If you stop eating jam, fruit starts 54 | to taste better.So maybe I'll try not bringing books on some future trip. They're 55 | going to have to pry the plugs out of my cold, dead ears, however. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/know.txt: -------------------------------------------------------------------------------- 1 | December 2014I've read Villehardouin's chronicle of the Fourth Crusade at least 2 | two times, maybe three. And yet if I had to write down everything 3 | I remember from it, I doubt it would amount to much more than a 4 | page. Multiply this times several hundred, and I get an uneasy 5 | feeling when I look at my bookshelves. What use is it to read all 6 | these books if I remember so little from them?A few months ago, as I was reading Constance Reid's excellent 7 | biography of Hilbert, I figured out if not the answer to this 8 | question, at least something that made me feel better about it. 9 | She writes: 10 | 11 | Hilbert had no patience with mathematical lectures which filled 12 | the students with facts but did not teach them how to frame a 13 | problem and solve it. He often used to tell them that "a perfect 14 | formulation of a problem is already half its solution." 15 | 16 | That has always seemed to me an important point, and I was even 17 | more convinced of it after hearing it confirmed by Hilbert.But how had I come to believe in this idea in the first place? A 18 | combination of my own experience and other things I'd read. None 19 | of which I could at that moment remember! And eventually I'd forget 20 | that Hilbert had confirmed it too. But my increased belief in the 21 | importance of this idea would remain something I'd learned from 22 | this book, even after I'd forgotten I'd learned it.Reading and experience train your model of the world. And even if 23 | you forget the experience or what you read, its effect on your model 24 | of the world persists. Your mind is like a compiled program you've 25 | lost the source of. It works, but you don't know why.The place to look for what I learned from Villehardouin's chronicle 26 | is not what I remember from it, but my mental models of the crusades, 27 | Venice, medieval culture, siege warfare, and so on. Which doesn't 28 | mean I couldn't have read more attentively, but at least the harvest 29 | of reading is not so miserably small as it might seem.This is one of those things that seem obvious in retrospect. 
But 30 | it was a surprise to me and presumably would be to anyone else who 31 | felt uneasy about (apparently) forgetting so much they'd read.Realizing it does more than make you feel a little better about 32 | forgetting, though. There are specific implications.For example, reading and experience are usually "compiled" at the 33 | time they happen, using the state of your brain at that time. The 34 | same book would get compiled differently at different points in 35 | your life. Which means it is very much worth reading important 36 | books multiple times. I always used to feel some misgivings about 37 | rereading books. I unconsciously lumped reading together with work 38 | like carpentry, where having to do something again is a sign you 39 | did it wrong the first time. Whereas now the phrase "already read" 40 | seems almost ill-formed.Intriguingly, this implication isn't limited to books. Technology 41 | will increasingly make it possible to relive our experiences. When 42 | people do that today it's usually to enjoy them again (e.g. when 43 | looking at pictures of a trip) or to find the origin of some bug in 44 | their compiled code (e.g. when Stephen Fry succeeded in remembering 45 | the childhood trauma that prevented him from singing). But as 46 | technologies for recording and playing back your life improve, it 47 | may become common for people to relive experiences without any goal 48 | in mind, simply to learn from them again as one might when rereading 49 | a book.Eventually we may be able not just to play back experiences but 50 | also to index and even edit them. So although not knowing how you 51 | know things may seem part of being human, it may not be. 52 | Thanks to Sam Altman, Jessica Livingston, and Robert Morris for reading 53 | drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/mod.txt: -------------------------------------------------------------------------------- 1 | December 2019There are two distinct ways to be politically moderate: on purpose 2 | and by accident. Intentional moderates are trimmers, deliberately 3 | choosing a position mid-way between the extremes of right and left. 4 | Accidental moderates end up in the middle, on average, because they 5 | make up their own minds about each question, and the far right and 6 | far left are roughly equally wrong.You can distinguish intentional from accidental moderates by the 7 | distribution of their opinions. If the far left opinion on some 8 | matter is 0 and the far right opinion 100, an intentional moderate's 9 | opinion on every question will be near 50. Whereas an accidental 10 | moderate's opinions will be scattered over a broad range, but will, 11 | like those of the intentional moderate, average to about 50.Intentional moderates are similar to those on the far left and the 12 | far right in that their opinions are, in a sense, not their own. 13 | The defining quality of an ideologue, whether on the left or the 14 | right, is to acquire one's opinions in bulk. You don't get to pick 15 | and choose. Your opinions about taxation can be predicted from your 16 | opinions about sex. And although intentional moderates 17 | might seem to be the opposite of ideologues, their beliefs (though 18 | in their case the word "positions" might be more accurate) are also 19 | acquired in bulk. If the median opinion shifts to the right or left, 20 | the intentional moderate must shift with it. 
Otherwise they stop 21 | being moderate.Accidental moderates, on the other hand, not only choose their own 22 | answers, but choose their own questions. They may not care at all 23 | about questions that the left and right both think are terribly 24 | important. So you can only even measure the politics of an accidental 25 | moderate from the intersection of the questions they care about and 26 | those the left and right care about, and this can 27 | sometimes be vanishingly small.It is not merely a manipulative rhetorical trick to say "if you're 28 | not with us, you're against us," but often simply false.Moderates are sometimes derided as cowards, particularly by 29 | the extreme left. But while it may be accurate to call intentional 30 | moderates cowards, openly being an accidental moderate requires the 31 | most courage of all, because you get attacked from both right and 32 | left, and you don't have the comfort of being an orthodox member 33 | of a large group to sustain you.Nearly all the most impressive people I know are accidental moderates. 34 | If I knew a lot of professional athletes, or people in the entertainment 35 | business, that might be different. Being on the far left or far 36 | right doesn't affect how fast you run or how well you sing. But 37 | someone who works with ideas has to be independent-minded to do it 38 | well.Or more precisely, you have to be independent-minded about the ideas 39 | you work with. You could be mindlessly doctrinaire in your politics 40 | and still be a good mathematician. In the 20th century, a lot of 41 | very smart people were Marxists — just no one who was smart about 42 | the subjects Marxism involves. But if the ideas you use in your 43 | work intersect with the politics of your time, you have two choices: 44 | be an accidental moderate, or be mediocre.Notes[1] It's possible in theory for one side to be entirely right and 45 | the other to be entirely wrong. Indeed, ideologues must always 46 | believe this is the case. But historically it rarely has been.[2] For some reason the far right tend to ignore moderates rather 47 | than despise them as backsliders. I'm not sure why. Perhaps it 48 | means that the far right is less ideological than the far left. Or 49 | perhaps that they are more confident, or more resigned, or simply 50 | more disorganized. I just don't know.[3] Having heretical opinions doesn't mean you have to express 51 | them openly. It may be 52 | easier to have them if you don't. 53 | Thanks to Austen Allred, Trevor Blackwell, Patrick Collison, Jessica Livingston, 54 | Amjad Masad, Ryan Petersen, and Harj Taggar for reading drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/nft.txt: -------------------------------------------------------------------------------- 1 | May 2021Noora Health, a nonprofit I've 2 | supported for years, just launched 3 | a new NFT. It has a dramatic name, Save Thousands of Lives, 4 | because that's what the proceeds will do.Noora has been saving lives for 7 years. They run programs in 5 | hospitals in South Asia to teach new mothers how to take care of 6 | their babies once they get home. They're in 165 hospitals now. And 7 | because they know the numbers before and after they start at a new 8 | hospital, they can measure the impact they have. It is massive. 
9 | For every 1000 live births, they save 9 babies.This number comes from a study 10 | of 133,733 families at 28 different 11 | hospitals that Noora conducted in collaboration with the Better 12 | Birth team at Ariadne Labs, a joint center for health systems 13 | innovation at Brigham and Women’s Hospital and Harvard T.H. Chan 14 | School of Public Health.Noora is so effective that even if you measure their costs in the 15 | most conservative way, by dividing their entire budget by the number 16 | of lives saved, the cost of saving a life is the lowest I've seen. 17 | $1,235.For this NFT, they're going to issue a public report tracking how 18 | this specific tranche of money is spent, and estimating the number 19 | of lives saved as a result.NFTs are a new territory, and this way of using them is especially 20 | new, but I'm excited about its potential. And I'm excited to see 21 | what happens with this particular auction, because unlike an NFT 22 | representing something that has already happened, 23 | this NFT gets better as the price gets higher.The reserve price was about $2.5 million, because that's what it 24 | takes for the name to be accurate: that's what it costs to save 25 | 2000 lives. But the higher the price of this NFT goes, the more 26 | lives will be saved. What a sentence to be able to write. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/pow.txt: -------------------------------------------------------------------------------- 1 | January 2017People who are powerful but uncharismatic will tend to be disliked. 2 | Their power makes them a target for criticism that they don't have 3 | the charisma to disarm. That was Hillary Clinton's problem. It also 4 | tends to be a problem for any CEO who is more of a builder than a 5 | schmoozer. And yet the builder-type CEO is (like Hillary) probably 6 | the best person for the job.I don't think there is any solution to this problem. It's human 7 | nature. The best we can do is to recognize that it's happening, and 8 | to understand that being a magnet for criticism is sometimes a sign 9 | not that someone is the wrong person for a job, but that they're 10 | the right one. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/rootsoflisp.txt: -------------------------------------------------------------------------------- 1 | May 2001 2 | 3 | (I wrote this article to help myself understand exactly 4 | what McCarthy discovered. You don't need to know this stuff 5 | to program in Lisp, but it should be helpful to 6 | anyone who wants to 7 | understand the essence of Lisp — both in the sense of its 8 | origins and its semantic core. The fact that it has such a core 9 | is one of Lisp's distinguishing features, and the reason why, 10 | unlike other languages, Lisp has dialects.)In 1960, John 11 | McCarthy published a remarkable paper in 12 | which he did for programming something like what Euclid did for 13 | geometry. He showed how, given a handful of simple 14 | operators and a notation for functions, you can 15 | build a whole programming language. 16 | He called this language Lisp, for "List Processing," 17 | because one of his key ideas was to use a simple 18 | data structure called a list for both 19 | code and data.It's worth understanding what McCarthy discovered, not 20 | just as a landmark in the history of computers, but as 21 | a model for what programming is tending to become in 22 | our own time. 
It seems to me that there have been 23 | two really clean, consistent models of programming so 24 | far: the C model and the Lisp model. 25 | These two seem points of high ground, with swampy lowlands 26 | between them. As computers have grown more powerful, 27 | the new languages being developed have been moving 28 | steadily toward the Lisp model. A popular recipe 29 | for new programming languages in the past 20 years 30 | has been to take the C model of computing and add to 31 | it, piecemeal, parts taken from the Lisp model, 32 | like runtime typing and garbage collection.In this article I'm going to try to explain in the 33 | simplest possible terms what McCarthy discovered. 34 | The point is not just to learn about an interesting 35 | theoretical result someone figured out forty years ago, 36 | but to show where languages are heading. 37 | The unusual thing about Lisp — in fact, the defining 38 | quality of Lisp — is that it can be written in 39 | itself. To understand what McCarthy meant by this, 40 | we're going to retrace his steps, with his mathematical 41 | notation translated into running Common Lisp code. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/rss.txt: -------------------------------------------------------------------------------- 1 | Aaron Swartz created a scraped 2 | feed 3 | of the essays page. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/todo.txt: -------------------------------------------------------------------------------- 1 | April 2012A palliative care nurse called Bronnie Ware made a list of the 2 | biggest regrets 3 | of the dying. Her list seems plausible. I could see 4 | myself — can see myself — making at least 4 of these 5 | 5 mistakes.If you had to compress them into a single piece of advice, it might 6 | be: don't be a cog. The 5 regrets paint a portrait of post-industrial 7 | man, who shrinks himself into a shape that fits his circumstances, 8 | then turns dutifully till he stops.The alarming thing is, the mistakes that produce these regrets are 9 | all errors of omission. You forget your dreams, ignore your family, 10 | suppress your feelings, neglect your friends, and forget to be 11 | happy. Errors of omission are a particularly dangerous type of 12 | mistake, because you make them by default.I would like to avoid making these mistakes. But how do you avoid 13 | mistakes you make by default? Ideally you transform your life so 14 | it has other defaults. But it may not be possible to do that 15 | completely. As long as these mistakes happen by default, you probably 16 | have to be reminded not to make them. So I inverted the 5 regrets, 17 | yielding a list of 5 commands 18 | 19 | Don't ignore your dreams; don't work too much; say what you 20 | think; cultivate friendships; be happy. 21 | 22 | which I then put at the top of the file I use as a todo list. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/unions.txt: -------------------------------------------------------------------------------- 1 | May 2007People who worry about the increasing gap between rich and poor 2 | generally look back on the mid twentieth century as a golden age. 3 | In those days we had a large number of high-paying union manufacturing 4 | jobs that boosted the median income. 
I wouldn't quite call the 5 | high-paying union job a myth, but I think people who dwell on it 6 | are reading too much into it.Oddly enough, it was working with startups that made me realize 7 | where the high-paying union job came from. In a rapidly growing 8 | market, you don't worry too much about efficiency. It's more 9 | important to grow fast. If there's some mundane problem getting 10 | in your way, and there's a simple solution that's somewhat expensive, 11 | just take it and get on with more important things. EBay didn't 12 | win by paying less for servers than their competitors.Difficult though it may be to imagine now, manufacturing was a 13 | growth industry in the mid twentieth century. This was an era when 14 | small firms making everything from cars to candy were getting 15 | consolidated into a new kind of corporation with national reach and 16 | huge economies of scale. You had to grow fast or die. Workers 17 | were for these companies what servers are for an Internet startup. 18 | A reliable supply was more important than low cost.If you looked in the head of a 1950s auto executive, the attitude 19 | must have been: sure, give 'em whatever they ask for, so long as 20 | the new model isn't delayed.In other words, those workers were not paid what their work was 21 | worth. Circumstances being what they were, companies would have 22 | been stupid to insist on paying them so little.If you want a less controversial example of this phenomenon, ask 23 | anyone who worked as a consultant building web sites during the 24 | Internet Bubble. In the late nineties you could get paid huge sums 25 | of money for building the most trivial things. And yet does anyone 26 | who was there have any expectation those days will ever return? I 27 | doubt it. Surely everyone realizes that was just a temporary 28 | aberration.The era of labor unions seems to have been the same kind of aberration, 29 | just spread 30 | over a longer period, and mixed together with a lot of ideology 31 | that prevents people from viewing it with as cold an eye as they 32 | would something like consulting during the Bubble.Basically, unions were just Razorfish.People who think the labor movement was the creation of heroic union 33 | organizers have a problem to explain: why are unions shrinking now? 34 | The best they can do is fall back on the default explanation of 35 | people living in fallen civilizations. Our ancestors were giants. 36 | The workers of the early twentieth century must have had a moral 37 | courage that's lacking today.In fact there's a simpler explanation. The early twentieth century 38 | was just a fast-growing startup overpaying for infrastructure. And 39 | we in the present are not a fallen people, who have abandoned 40 | whatever mysterious high-minded principles produced the high-paying 41 | union job. We simply live in a time when the fast-growing companies 42 | overspend on different things. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/vw.txt: -------------------------------------------------------------------------------- 1 | January 2012A few hours before the Yahoo acquisition was announced in June 1998 2 | I took a snapshot of Viaweb's 3 | site. I thought it might be interesting to look at one day.The first thing one notices is is how tiny the pages are. Screens 4 | were a lot smaller in 1998. 
If I remember correctly, our frontpage 5 | used to just fit in the size window people typically used then.Browsers then (IE 6 was still 3 years in the future) had few fonts 6 | and they weren't antialiased. If you wanted to make pages that 7 | looked good, you had to render display text as images.You may notice a certain similarity between the Viaweb and Y Combinator logos. We did that 8 | as an inside joke when we started YC. Considering how basic a red 9 | circle is, it seemed surprising to me when we started Viaweb how 10 | few other companies used one as their logo. A bit later I realized 11 | why.On the Company 12 | page you'll notice a mysterious individual called John McArtyem. 13 | Robert Morris (aka Rtm) was so publicity averse after the 14 | Worm that he 15 | didn't want his name on the site. I managed to get him to agree 16 | to a compromise: we could use his bio but not his name. He has 17 | since relaxed a bit 18 | on that point.Trevor graduated at about the same time the acquisition closed, so in the 19 | course of 4 days he went from impecunious grad student to millionaire 20 | PhD. The culmination of my career as a writer of press releases 21 | was one celebrating 22 | his graduation, illustrated with a drawing I did of him during 23 | a meeting.(Trevor also appears as Trevino 24 | Bagwell in our directory of web designers merchants could hire 25 | to build stores for them. We inserted him as a ringer in case some 26 | competitor tried to spam our web designers. We assumed his logo 27 | would deter any actual customers, but it did not.)Back in the 90s, to get users you had to get mentioned in magazines 28 | and newspapers. There were not the same ways to get found online 29 | that there are today. So we used to pay a PR 30 | firm $16,000 a month to get us mentioned in the press. Fortunately 31 | reporters liked 32 | us.In our advice about 33 | getting traffic from search engines (I don't think the term SEO 34 | had been coined yet), we say there are only 7 that matter: Yahoo, 35 | AltaVista, Excite, WebCrawler, InfoSeek, Lycos, and HotBot. Notice 36 | anything missing? Google was incorporated that September.We supported online transactions via a company called 37 | Cybercash, 38 | since if we lacked that feature we'd have gotten beaten up in product 39 | comparisons. But Cybercash was so bad and most stores' order volumes 40 | were so low that it was better if merchants processed orders like phone orders. We had a page in our site trying to talk merchants 41 | out of doing real time authorizations.The whole site was organized like a funnel, directing people to the 42 | test drive. 43 | It was a novel thing to be able to try out software online. We put 44 | cgi-bin in our dynamic urls to fool competitors about how our 45 | software worked.We had some well 46 | known users. Needless to say, Frederick's of Hollywood got the 47 | most traffic. We charged a flat fee of $300/month for big stores, 48 | so it was a little alarming to have users who got lots of traffic. 49 | I once calculated how much Frederick's was costing us in bandwidth, 50 | and it was about $300/month.Since we hosted all the stores, which together were getting just 51 | over 10 million page views per month in June 1998, we consumed what 52 | at the time seemed a lot of bandwidth. We had 2 T1s (3 Mb/sec) 53 | coming into our offices. In those days there was no AWS. Even 54 | colocating servers seemed too risky, considering how often things 55 | went wrong with them. So we had our servers in our offices. 
Or 56 | more precisely, in Trevor's office. In return for the unique 57 | privilege of sharing his office with no other humans, he had to 58 | share it with 6 shrieking tower servers. His office was nicknamed 59 | the Hot Tub on account of the heat they generated. Most days his 60 | stack of window air conditioners could keep up.For describing pages, we had a template language called RTML, which 61 | supposedly stood for something, but which in fact I named after 62 | Rtm. RTML was Common Lisp augmented by some macros and libraries, 63 | and concealed under a structure editor that made it look like it 64 | had syntax.Since we did continuous releases, our software didn't actually have 65 | versions. But in those days the trade press expected versions, so 66 | we made them up. If we wanted to get lots of attention, we made 67 | the version number an 68 | integer. That "version 4.0" icon was generated by our own 69 | button generator, incidentally. The whole Viaweb site was made 70 | with our software, even though it wasn't an online store, because 71 | we wanted to experience what our users did.At the end of 1997, we released a general purpose shopping search 72 | engine called Shopfind. It 73 | was pretty advanced for the time. It had a programmable crawler 74 | that could crawl most of the different stores online and pick out 75 | the products. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/want.txt: -------------------------------------------------------------------------------- 1 | November 2022Since I was about 9 I've been puzzled by the apparent contradiction 2 | between being made of matter that behaves in a predictable way, and 3 | the feeling that I could choose to do whatever I wanted. At the 4 | time I had a self-interested motive for exploring the question. At 5 | that age (like most succeeding ages) I was always in trouble with 6 | the authorities, and it seemed to me that there might possibly be 7 | some way to get out of trouble by arguing that I wasn't responsible 8 | for my actions. I gradually lost hope of that, but the puzzle 9 | remained: How do you reconcile being a machine made of matter with 10 | the feeling that you're free to choose what you do? 11 | [1]The best way to explain the answer may be to start with a slightly 12 | wrong version, and then fix it. The wrong version is: You can do 13 | what you want, but you can't want what you want. Yes, you can control 14 | what you do, but you'll do what you want, and you can't control 15 | that.The reason this is mistaken is that people do sometimes change what 16 | they want. People who don't want to want something — drug addicts, 17 | for example — can sometimes make themselves stop wanting it. And 18 | people who want to want something — who want to like classical 19 | music, or broccoli — sometimes succeed.So we modify our initial statement: You can do what you want, but 20 | you can't want to want what you want.That's still not quite true. It's possible to change what you want 21 | to want. I can imagine someone saying "I decided to stop wanting 22 | to like classical music." But we're getting closer to the truth. 23 | It's rare for people to change what they want to want, and the more 24 | "want to"s we add, the rarer it gets.We can get arbitrarily close to a true statement by adding more "want 25 | to"s in much the same way we can get arbitrarily close to 1 by adding 26 | more 9s to a string of 9s following a decimal point. 
In practice 27 | three or four "want to"s must surely be enough. It's hard even to 28 | envision what it would mean to change what you want to want to want 29 | to want, let alone actually do it.So one way to express the correct answer is to use a regular 30 | expression. You can do what you want, but there's some statement 31 | of the form "you can't (want to)* want what you want" that's true. 32 | Ultimately you get back to a want that you don't control. 33 | [2] 34 | Notes[1] 35 | I didn't know when I was 9 that matter might behave randomly, 36 | but I don't think it affects the problem much. Randomness destroys 37 | the ghost in the machine as effectively as determinism.[2] 38 | If you don't like using an expression, you can make the same 39 | point using higher-order desires: There is some n such that you 40 | don't control your nth-order desires. 41 | Thanks to Trevor Blackwell, 42 | Jessica Livingston, Robert Morris, and 43 | Michael Nielsen for reading drafts of this. -------------------------------------------------------------------------------- /eval/niah/PaulGrahamEssays/weird.txt: -------------------------------------------------------------------------------- 1 | August 2021When people say that in their experience all programming languages 2 | are basically equivalent, they're making a statement not about 3 | languages but about the kind of programming they've done.99.5% of programming consists of gluing together calls to library 4 | functions. All popular languages are equally good at this. So one 5 | can easily spend one's whole career operating in the intersection 6 | of popular programming languages.But the other .5% of programming is disproportionately interesting. 7 | If you want to learn what it consists of, the weirdness of weird 8 | languages is a good clue to follow.Weird languages aren't weird by accident. Not the good ones, at 9 | least. The weirdness of the good ones usually implies the existence 10 | of some form of programming that's not just the usual gluing together 11 | of library calls.A concrete example: Lisp macros. Lisp macros seem weird even to 12 | many Lisp programmers. They're not only not in the intersection of 13 | popular languages, but by their nature would be hard to implement 14 | properly in a language without turning it into a dialect of 15 | Lisp. And macros are definitely evidence of techniques that go 16 | beyond glue programming. For example, solving problems by first 17 | writing a language for problems of that type, and then writing 18 | your specific application in it. Nor is this all you can do with 19 | macros; it's just one region in a space of program-manipulating 20 | techniques that even now is far from fully explored.So if you want to expand your concept of what programming can be, 21 | one way to do it is by learning weird languages. Pick a language 22 | that most programmers consider weird but whose median user is smart, 23 | and then focus on the differences between this language and the 24 | intersection of popular languages. What can you say in this language 25 | that would be impossibly inconvenient to say in others? In the 26 | process of learning how to say things you couldn't previously say, 27 | you'll probably be learning how to think things you couldn't 28 | previously think. 29 | Thanks to Trevor Blackwell, Patrick Collison, Daniel Gackle, Amjad 30 | Masad, and Robert Morris for reading drafts of this. 
31 | -------------------------------------------------------------------------------- /eval/niah/README.md: -------------------------------------------------------------------------------- 1 | # Needle-in-a-Haystack Evaluation 2 | 3 | 4 | This directory contains the code for the needle-in-a-haystack experiments in the paper. The code is adapted from [the needle test in LongAlign](https://github.com/THUDM/LongAlign/tree/main/Needle_test). 5 | 6 | ## Usage 7 | 8 | First, generate the prompts for the easy and the standard modes: 9 | 10 | ```bash 11 | python prompt.py --config config-prompt-easy.yaml --exp max_len_32k_easy 12 | python prompt.py --config config-prompt-standard.yaml --exp max_len_32k_standard 13 | ``` 14 | 15 | Then you can run the actual retrieval task. For example, this is how you can evaluate FoX (Pro): 16 | 17 | ```bash 18 | python pred.py --exp max_len_32k_easy --model "fox-pro-760m-longcrawl64-48b" --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" --device_id 0 19 | python pred.py --exp max_len_32k_standard --model "fox-pro-760m-longcrawl64-48b" --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" --device_id 0 20 | ``` 21 | 22 | The results will be saved to `./pred`. After this, we use `gpt-4o-2024-08-06` to score the retrieval results. This requires an OpenAI API key to be set in `$API_KEY`. Then you can run the following: 23 | 24 | ```bash 25 | python eval.py --exp max_len_32k_easy --model fox-pro-760m-longcrawl64-48b --api-key $API_KEY 26 | python eval.py --exp max_len_32k_standard --model fox-pro-760m-longcrawl64-48b --api-key $API_KEY 27 | ``` 28 | 29 | The scores will be saved to `./results`. After this, you can visualize the results as follows: 30 | 31 | ```bash 32 | FIGURE_DIR="./figures" # You can use any other path 33 | python plot_niah.py --figure_dir=$FIGURE_DIR 34 | ``` 35 | 36 | You can change `MODEL_LIST` in `plot_niah.py` to specify the set of models for which you want to visualize results. 37 | 38 | Note that we observe the evaluation results to be non-deterministic, likely due to GPU non-determinism. Therefore, the results you obtain may not exactly match those reported in the paper. However, the difference should be small. 39 | 40 | ## Citation 41 | 42 | If you use this code, consider citing LongAlign: 43 | 44 | ``` 45 | @inproceedings{bai2024longalign, 46 | title = "{L}ong{A}lign: A Recipe for Long Context Alignment of Large Language Models", 47 | author = "Bai, Yushi and Lv, Xin and Zhang, Jiajie and He, Yuze and Qi, Ji and Hou, Lei and Tang, Jie and Dong, Yuxiao and Li, Juanzi", 48 | booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024", 49 | month = nov, 50 | year = "2024", 51 | address = "Miami, Florida, USA", 52 | publisher = "Association for Computational Linguistics", 53 | url = "https://aclanthology.org/2024.findings-emnlp.74", 54 | doi = "10.18653/v1/2024.findings-emnlp.74", 55 | pages = "1376--1395", 56 | } 57 | ``` 58 | -------------------------------------------------------------------------------- /eval/niah/config-eval.yaml: -------------------------------------------------------------------------------- 1 | pred_dir: 'pred' 2 | save_dir: 'results' 3 | 4 | model: 5 | model_provider: "OpenAI" 6 | model_name: "gpt-4o-2024-08-06" 7 | 8 | prompt: 9 | needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 10 | retrieval_question: "What is the best thing to do in San Francisco?"
11 | -------------------------------------------------------------------------------- /eval/niah/config-pred.yaml: -------------------------------------------------------------------------------- 1 | prompt_dir: 'prompts' 2 | save_dir: 'pred' 3 | 4 | model: 5 | model_provider: "Huggingface" # "OpenAI", "Anthropic" or "Huggingface" 6 | -------------------------------------------------------------------------------- /eval/niah/config-prompt-debug.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | # needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 3 | needle: "\nWhat is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 4 | haystack_dir: "PaulGrahamEssays" 5 | retrieval_question: "What is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is" # We use the Anthropic's retrieval question as the default one 6 | 7 | context: 8 | min_len: 1000 9 | max_len: 32000 10 | interval: 2 11 | manually_select_list: null # null or a list of context lengths to manually select 12 | 13 | document_depth: 14 | min_percent: 0 15 | max_percent: 100 16 | interval: 2 17 | interval_type: "linear" # "linear", "sigmoid" or null 18 | manually_select_list: null # null or a list of document percents to manually select 19 | 20 | tokenizer: 21 | tokenizer_type: "Huggingface" 22 | model_name: "zhixuan-lin/longcrawl64-json-gpt2-tokenizer" # Change it to your own model name / HF model path 23 | 24 | save_dir: 'prompts' 25 | 26 | -------------------------------------------------------------------------------- /eval/niah/config-prompt-easy.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | # needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 3 | needle: "\nWhat is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 4 | haystack_dir: "PaulGrahamEssays" 5 | retrieval_question: "What is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is" # We use the Anthropic's retrieval question as the default one 6 | 7 | context: 8 | min_len: 1000 9 | max_len: 32000 10 | interval: 11 11 | manually_select_list: null # null or a list of context lengths to manually select 12 | 13 | document_depth: 14 | min_percent: 0 15 | max_percent: 100 16 | interval: 11 17 | interval_type: "linear" # "linear", "sigmoid" or null 18 | manually_select_list: null # null or a list of document percents to manually select 19 | 20 | tokenizer: 21 | tokenizer_type: "Huggingface" 22 | model_name: "zhixuan-lin/longcrawl64-json-gpt2-tokenizer" # Change it to your own model name / HF model path 23 | 24 | save_dir: 'prompts' 25 | 26 | -------------------------------------------------------------------------------- /eval/niah/config-prompt-standard.yaml: -------------------------------------------------------------------------------- 1 | prompt: 2 | needle: "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 3 | # needle: "\nWhat is the best thing to do in San Francisco? 
Answer: The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n" 4 | haystack_dir: "PaulGrahamEssays" 5 | retrieval_question: "What is the best thing to do in San Francisco? Answer: The best thing to do in San Francisco is" # We use the Anthropic's retrieval question as the default one 6 | 7 | context: 8 | min_len: 1000 9 | max_len: 32000 10 | interval: 11 11 | manually_select_list: null # null or a list of context lengths to manually select 12 | 13 | document_depth: 14 | min_percent: 0 15 | max_percent: 100 16 | interval: 11 17 | interval_type: "linear" # "linear", "sigmoid" or null 18 | manually_select_list: null # null or a list of document percents to manually select 19 | 20 | tokenizer: 21 | tokenizer_type: "Huggingface" 22 | model_name: "zhixuan-lin/longcrawl64-json-gpt2-tokenizer" # Change it to your own model name / HF model path 23 | 24 | save_dir: 'prompts' 25 | 26 | -------------------------------------------------------------------------------- /eval/per_token_loss/README.md: -------------------------------------------------------------------------------- 1 | # Per-token loss 2 | 3 | This directory contains code to compute and plot per-token loss using LongCrawl64. 4 | 5 | Before you run the evaluation, make sure you have downloaded the heldout set of LongCrawl64. If you haven't, run the following: 6 | 7 | ```bash 8 | DATA_DIR="./data" # You can use any other path 9 | mkdir -p ${DATA_DIR}/longcrawl64 10 | # Install gsutil 11 | curl https://sdk.cloud.google.com | bash 12 | GSUTIL_PARALLEL_THREAD_COUNT=5 GSUTIL_PARALLEL_PROCESS_COUNT=5 gsutil -m cp -r 'gs://longcrawl64/heldout.zarr' ${DATA_DIR}/longcrawl64 13 | ``` 14 | 15 | 16 | After this, you can run the evaluation as follows: 17 | 18 | ```bash 19 | DATA_DIR="./data" # Or whatever path that contains the LongCrawl64 dataset 20 | SAVE_DIR="./results" # You can use any other path 21 | fabric run run_per_token_loss.py \ 22 | --devices 1 \ 23 | --model "fox-pro-760m-longcrawl64-48b" \ 24 | --model_path "zhixuan-lin/fox-pro-760m-longcrawl64-48b" \ 25 | --data_path $DATA_DIR/longcrawl64 \ 26 | --save_dir $SAVE_DIR \ 27 | --resume \ 28 | --save_interval 128 29 | ``` 30 | We also support multi-GPU evaluation and resuming. However, resuming requires that you use the same number of GPUs as the run being resumed, **otherwise the results will be incorrect**. 31 | 32 | After this, you can plot the per-token loss: 33 | 34 | ```bash 35 | RESULT_DIR=$SAVE_DIR 36 | FIGURE_DIR="./figures" # You can use any other path 37 | python plot_per_token_loss.py \ 38 | --result_dir $RESULT_DIR \ 39 | --figure_dir $FIGURE_DIR 40 | ``` 41 | 42 | You can change `MODELS` in `plot_per_token_loss.py` to specify the set of models for which you want to plot the per-token loss.
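
For reference, the per-token loss here is simply the cross-entropy at each token position, averaged over sequences. Below is a minimal sketch of that computation for a single batch; it is not the actual `run_per_token_loss.py` script (which also handles multi-GPU evaluation, resuming, and periodic saving), and `model` / `input_ids` are placeholders for any HuggingFace-style causal LM and a `(batch, seq_len)` token tensor:

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def per_token_loss(model, input_ids):
    # input_ids: (batch, seq_len). Each position t predicts token t + 1.
    logits = model(input_ids=input_ids).logits           # (batch, seq_len, vocab)
    loss = F.cross_entropy(
        logits[:, :-1].transpose(1, 2),                   # (batch, vocab, seq_len - 1)
        input_ids[:, 1:],                                 # (batch, seq_len - 1)
        reduction="none",
    )                                                     # (batch, seq_len - 1)
    # Averaging over the batch dimension (and over all evaluation batches)
    # gives the loss as a function of token position, which is what gets plotted.
    return loss.mean(dim=0)
```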
43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "forgetting-transformer" 7 | dynamic = ["version"] 8 | description = "Official implementation of the Forgetting Transformer" 9 | readme = "README.md" 10 | authors = [ 11 | { name = "Zhixuan Lin", email = "zxlin.cs@gmail.com" }, 12 | ] 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 18 | ] 19 | requires-python = ">=3.10" 20 | dependencies = [ 21 | ] 22 | 23 | [project.optional-dependencies] 24 | dev = ["pytest"] 25 | 26 | [project.urls] 27 | Homepage = "https://github.com/zhixuan-lin/forgetting-transformer" 28 | 29 | [tool.setuptools.dynamic] 30 | version = {attr = "forgetting_transformer.__version__"} 31 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | einops 3 | numpy 4 | zarr 5 | colorlog 6 | rich 7 | wandb 8 | jsonlines 9 | matplotlib 10 | seaborn 11 | hydra-core==1.3.2 12 | torch==2.4.0 13 | click==8.1.7 # Needed for lightning CLI 14 | lightning==2.4.0 15 | transformers==4.44.0 16 | datasets==2.20.0 17 | lm_eval==0.4.4 18 | 19 | # Optional: if you want to run baselines 20 | flash-attn==2.6.3 # Needed for transformer LLaMA 21 | causal-conv1d==1.4.0 # For Mamba-2 and DeltaNet 22 | mamba-ssm==2.2.2 # For Mamba-2 23 | 24 | # Optional: for evaluation 25 | tqdm 26 | rouge 27 | jieba 28 | fuzzywuzzy 29 | -------------------------------------------------------------------------------- /save_model.py: -------------------------------------------------------------------------------- 1 | 2 | from typing import Callable, Dict, Union, Optional, Tuple, NamedTuple, Any, List 3 | import logging 4 | from pathlib import Path 5 | import rich 6 | import rich.syntax 7 | 8 | import hydra 9 | from omegaconf import OmegaConf, DictConfig 10 | import torch 11 | import lightning as L 12 | from lightning.fabric.utilities.rank_zero import rank_zero_only 13 | import os 14 | import os.path as osp 15 | from torch import nn 16 | import colorlog 17 | from datetime import datetime 18 | import jsonlines 19 | 20 | # from forgetting_transformer.model.common import LMOutput 21 | from transformers.modeling_outputs import ModelOutput 22 | from forgetting_transformer.datamodule.common import DataInfo, Batch 23 | from forgetting_transformer.checkpoint import Checkpointer 24 | from configs.config import Config 25 | from collections import defaultdict, OrderedDict 26 | import numpy as np 27 | import time 28 | from dataclasses import dataclass, field, asdict 29 | from torch.distributed.fsdp import FullyShardedDataParallel 30 | import torch.utils.flop_counter 31 | from transformers import AutoTokenizer 32 | from transformers import GPT2Tokenizer 33 | import json 34 | import pprint 35 | from forgetting_transformer.tokenizer import JSONGPT2Tokenizer 36 | import argparse 37 | 38 | 39 | @dataclass 40 | class ModelInfo: 41 | total_params: int 42 | trainable_params: int 43 | embedding_params: int 44 | flops_per_token: int # Note this depends how we train the model 45 | non_embedding_params: int = 
field(init=False) 46 | 47 | def __post_init__(self): 48 | self.non_embedding_params = self.total_params - self.embedding_params 49 | 50 | 51 | 52 | # @hydra.main(version_base=None, config_name="config", config_path="configs") 53 | def save_model(): 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument("--hf_load_dir", type=str, required=True) 56 | parser.add_argument("--hf_save_dir", type=str, required=True) 57 | parser.add_argument("--hf_load_step", type=int, required=False) 58 | args = parser.parse_args() 59 | 60 | 61 | assert args.hf_load_dir is not None 62 | assert args.hf_save_dir is not None 63 | assert args.hf_load_step is None, "You can remove this if you know what you are doing" 64 | 65 | args.hf_load_dir = osp.realpath(args.hf_load_dir) 66 | load_config_path = Path(args.hf_load_dir) / "config.yaml" 67 | config: Config = OmegaConf.load(load_config_path) 68 | 69 | assert Path(args.hf_load_dir).exists() 70 | # with fabric.init_module(empty_init=False): 71 | assert OmegaConf.is_missing( 72 | config.model.config, "vocab_size" 73 | ), "Vocab size should be left missing" 74 | data_info_path = Path(args.hf_load_dir) / "metrics" / "jsonlines" / "train_data_info.jsonl" 75 | with jsonlines.open(data_info_path) as reader: 76 | data_info: Dict = reader.read() 77 | config.model.config.vocab_size = data_info['train_data_info/vocab_size'] 78 | model: nn.Module = hydra.utils.instantiate(config.model) 79 | 80 | if args.hf_load_step is None: 81 | resume_step, checkpoint_path = Checkpointer.get_checkpoint_path( 82 | checkpoint_dir=Path(args.hf_load_dir) / "checkpoints", 83 | step=None, 84 | ) 85 | print(f"step: {resume_step}") 86 | assert resume_step == config.train.max_tokens 87 | else: 88 | resume_step, checkpoint_path = Checkpointer.get_checkpoint_path( 89 | checkpoint_dir=Path(args.hf_load_dir) / "checkpoints", 90 | step=args.hf_load_step, 91 | ) 92 | print(f"step: {resume_step}") 93 | assert resume_step == args.hf_load_step 94 | # if input("not checking step. proceed? 
(y/n)").strip() == 'y': 95 | # pass 96 | # else: 97 | # import sys; sys.exit() 98 | checkpoint = torch.load(checkpoint_path, weights_only=False) 99 | 100 | model.load_state_dict(checkpoint["model"]) 101 | del checkpoint 102 | 103 | if "SlimPajama" in config.datamodule._target_: 104 | tokenizer = AutoTokenizer.from_pretrained("fla-hub/gla-1.3B-100B") 105 | elif "LongCrawl" in config.datamodule._target_: 106 | tokenizer = JSONGPT2Tokenizer.from_pretrained("gpt2", add_bos_token=True, clean_up_tokenization_spaces=False, add_prefix_space=False) 107 | else: 108 | raise ValueError(f"Unknow data module {config.datamodule._target_}") 109 | # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_bos_token=False, clean_up_tokenization_spaces=False, add_prefix_space=False) 110 | tokenizer.model_max_length = data_info["train_data_info/batch_len"] 111 | 112 | path = Path(args.hf_save_dir) 113 | path.mkdir(parents=True, exist_ok=True) 114 | model.save_pretrained(path,) 115 | tokenizer.save_pretrained(path) 116 | print(f"Model and tokenizer saved to {path}") 117 | 118 | # tokenizer = GPT2Tokenizer.from_pretrained("gpt2", add_bos_token=False, clean_up_tokenization_spaces=False, add_prefix_space=True) 119 | 120 | # import ipdb; ipdb.set_trace() 121 | if __name__ == "__main__": 122 | save_model() # pylint: disable=no-value-for-parameter 123 | -------------------------------------------------------------------------------- /src/forgetting_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from forgetting_transformer.ops.forgetting_attention import ( 2 | forgetting_attention 3 | ) 4 | __version__ = '0.0.1' 5 | -------------------------------------------------------------------------------- /src/forgetting_transformer/datamodule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhixuan-lin/forgetting-transformer/f8ce22afe14980628534e06d9ee62baeeddf1dcf/src/forgetting_transformer/datamodule/__init__.py -------------------------------------------------------------------------------- /src/forgetting_transformer/datamodule/common.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, NamedTuple 2 | import torch 3 | from forgetting_transformer.utils import safe_divide 4 | from dataclasses import dataclass, field 5 | 6 | 7 | @dataclass 8 | class DataInfo: 9 | vocab_size: int 10 | global_tokens_per_batch: int 11 | local_tokens_per_batch: int 12 | # tokens_per_stage: int 13 | batch_len: int 14 | seq_len: Optional[int] 15 | total_tokens: int 16 | global_batch_size: int = field(init=False) 17 | local_batch_size: int = field(init=False) 18 | """General dataloader information 19 | 20 | Each local batch has shape (local_batch_size, batch_len) 21 | 22 | Arguments: 23 | - `tokens_per_stage`: the following should always be true: as long as 24 | two dataloaders 25 | - use the same data source 26 | - have the same tokens_per_stage 27 | Then within each stage, the set of tokens they emit must be the same, even 28 | though the order these tokens are emitted are different. 29 | - `seq_len`: if None, the sequences are variable length. Otherwise all 30 | sequences should have the same length. The practical implication is 31 | that resets should either all be False, or only the first timestep is 32 | True. 
33 | 34 | """ 35 | def __post_init__(self): 36 | self.global_batch_size = safe_divide(self.global_tokens_per_batch, self.batch_len) 37 | self.local_batch_size = safe_divide(self.local_tokens_per_batch, self.batch_len) 38 | 39 | 40 | class Batch(NamedTuple): 41 | input_ids: torch.LongTensor 42 | labels: torch.LongTensor 43 | resets: torch.BoolTensor 44 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/README.md: -------------------------------------------------------------------------------- 1 | Most files in this directory are adapted from the [Flash Linear Attention](https://github.com/fla-org/flash-linear-attention) repository. 2 | 3 | The original license info from the Flash Linear Attention repository: 4 | 5 | ``` 6 | MIT License 7 | 8 | Copyright (c) 2023-2025 Songlin Yang 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | ``` 28 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/__init__.py: -------------------------------------------------------------------------------- 1 | # from .mamba2 import Mamba2ForCausalLM, Mamba2Config 2 | # from .forgetting_transformer import ForgettingTransformerForCausalLM, ForgettingTransformerConfig 3 | # from .transformer import TransformerForCausalLM, TransformerConfig 4 | # from .delta_net import DeltaNetForCausalLM, DeltaNetConfig 5 | # from .hgrn2 import HGRN2ForCausalLM, HGRN2Config 6 | # from .samba import SambaForCausalLM, SambaConfig 7 | 8 | import importlib 9 | import warnings 10 | with warnings.catch_warnings(): 11 | warnings.filterwarnings(action="ignore", message="Flash Attention is not installed") 12 | warnings.filterwarnings(action="ignore", message="`torch.cuda.amp") 13 | from forgetting_transformer.model.forgetting_transformer import ( 14 | ForgettingTransformerForCausalLM, 15 | ForgettingTransformerConfig, 16 | ) 17 | from forgetting_transformer.model.forgetting_transformer.modeling_forgetting_transformer import ( 18 | ForgettingAttentionLayer 19 | ) 20 | 21 | for model in ["mamba2", "forgetting_transformer", "transformer", "delta_net", "hgrn2", "samba"]: 22 | # We do not want to espose the names. 
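    # Importing each submodule runs its registration side effects (the AutoConfig.register,
    # AutoModel.register, and AutoModelForCausalLM.register calls in each submodule's __init__.py).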
23 | importlib.import_module(f".{model}", __name__) 24 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/common.py: -------------------------------------------------------------------------------- 1 | from typing import NamedTuple, Optional, Any 2 | import torch 3 | 4 | 5 | class LMOutput(NamedTuple): 6 | loss: torch.Tensor 7 | carry: Any 8 | logits: Optional[torch.Tensor] = None 9 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/delta_net/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_delta_net import \ 6 | DeltaNetConfig 7 | from .modeling_delta_net import ( 8 | DeltaNetForCausalLM, DeltaNetModel) 9 | 10 | AutoConfig.register(DeltaNetConfig.model_type, DeltaNetConfig) 11 | AutoModel.register(DeltaNetConfig, DeltaNetModel) 12 | AutoModelForCausalLM.register(DeltaNetConfig, DeltaNetForCausalLM) 13 | 14 | __all__ = ['DeltaNetConfig', 'DeltaNetForCausalLM', 'DeltaNetModel'] 15 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/delta_net/configuration_delta_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class DeltaNetConfig(PretrainedConfig): 9 | 10 | model_type = 'delta_net-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | expand_k: int = 1, 18 | expand_v: int = 1, 19 | use_gate: bool = False, 20 | use_short_conv: bool = True, 21 | conv_size: int = 4, 22 | use_beta: bool = True, 23 | use_output_norm: bool = True, 24 | hidden_ratio: Optional[int] = 4, 25 | intermediate_size: Optional[int] = None, 26 | num_hidden_layers: int = 24, 27 | num_heads: int = 16, 28 | attn_mode: str = "chunk", 29 | qk_norm: str = 'l2', 30 | qk_activation: str = 'silu', 31 | hidden_act: str = "swish", 32 | max_position_embeddings: int = 2048, 33 | norm_first: bool = False, 34 | norm_eps: float = 1e-6, 35 | use_cache: bool = True, 36 | pad_token_id: int = None, 37 | bos_token_id: int = 1, 38 | eos_token_id: int = 2, 39 | tie_word_embeddings: bool = False, 40 | initializer_range: float = 0.02, 41 | fuse_cross_entropy: bool = True, 42 | **kwargs 43 | ): 44 | self.vocab_size = vocab_size 45 | self.max_position_embeddings = max_position_embeddings 46 | self.hidden_size = hidden_size 47 | self.expand_k = expand_k 48 | self.expand_v = expand_v 49 | self.hidden_ratio = hidden_ratio 50 | self.intermediate_size = intermediate_size 51 | self.num_hidden_layers = num_hidden_layers 52 | self.num_heads = num_heads 53 | self.attn_mode = attn_mode 54 | self.hidden_act = hidden_act 55 | self.norm_first = norm_first 56 | self.norm_eps = norm_eps 57 | self.use_cache = use_cache 58 | self.initializer_range = initializer_range 59 | self.fuse_cross_entropy = fuse_cross_entropy 60 | self.use_gate = use_gate 61 | self.use_short_conv = use_short_conv 62 | self.conv_size = conv_size 63 | self.use_beta = use_beta 64 | self.use_output_norm = use_output_norm 65 | self.qk_norm = qk_norm 66 | self.qk_activation = qk_activation 67 | 68 | super().__init__( 69 | 
pad_token_id=pad_token_id, 70 | bos_token_id=bos_token_id, 71 | eos_token_id=eos_token_id, 72 | tie_word_embeddings=tie_word_embeddings, 73 | **kwargs, 74 | ) 75 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/forgetting_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_forgetting_transformer import ForgettingTransformerConfig 6 | from .modeling_forgetting_transformer import ( 7 | ForgettingTransformerForCausalLM, ForgettingTransformerModel) 8 | 9 | AutoConfig.register(ForgettingTransformerConfig.model_type, ForgettingTransformerConfig) 10 | AutoModel.register(ForgettingTransformerConfig, ForgettingTransformerModel) 11 | AutoModelForCausalLM.register(ForgettingTransformerConfig, ForgettingTransformerForCausalLM) 12 | 13 | 14 | 15 | __all__ = ['ForgettingTransformerConfig', 'ForgettingTransformerForCausalLM', 'ForgettingTransformerModel'] 16 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/forgetting_transformer/configuration_forgetting_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class ForgettingTransformerConfig(PretrainedConfig): 9 | 10 | model_type = 'forgetting_transformer-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | hidden_ratio: Optional[float] = 4, 18 | intermediate_size: Optional[int] = None, 19 | num_hidden_layers: int = 24, 20 | num_heads: int = 32, 21 | num_kv_heads: int = None, 22 | hidden_act: str = "swish", 23 | window_size: Optional[int] = None, 24 | max_position_embeddings: int = 2048, 25 | initializer_range: float = 0.02, 26 | elementwise_affine: Optional[bool] = True, 27 | norm_eps: float = 1e-6, 28 | use_cache: bool = True, 29 | pad_token_id: int = None, 30 | bos_token_id: int = 1, 31 | eos_token_id: int = 2, 32 | tie_word_embeddings: bool = False, 33 | attention_bias: bool = False, 34 | fuse_norm: bool = True, 35 | fuse_cross_entropy: bool = True, 36 | rope_base: float = 500000.0, 37 | use_rope: bool = False, 38 | use_output_gate: bool = False, 39 | ogate_act: str = "sigmoid", 40 | fgate_type: str = "full", 41 | fgate_bias_init: bool = False, 42 | decay_time_min: Optional[float] = None, 43 | decay_time_max: Optional[float] = None, 44 | use_output_norm: bool = False, 45 | qk_norm: bool = False, 46 | qk_norm_share_param_across_head: bool = False, 47 | use_k_shift: bool = False, 48 | use_v_shift: bool = False, 49 | **kwargs, 50 | ): 51 | self.vocab_size = vocab_size 52 | self.hidden_size = hidden_size 53 | self.hidden_ratio = hidden_ratio 54 | self.intermediate_size = intermediate_size 55 | self.num_hidden_layers = num_hidden_layers 56 | self.num_heads = num_heads 57 | self.num_kv_heads = num_kv_heads 58 | self.window_size = window_size 59 | self.max_position_embeddings = max_position_embeddings 60 | 61 | self.hidden_act = hidden_act 62 | self.initializer_range = initializer_range 63 | self.elementwise_affine = elementwise_affine 64 | self.norm_eps = norm_eps 65 | self.use_cache = use_cache 66 | self.attention_bias = attention_bias 67 | 
self.fuse_cross_entropy = fuse_cross_entropy 68 | self.fuse_norm = fuse_norm 69 | self.rope_base = rope_base 70 | self.use_rope = use_rope 71 | self.use_output_gate = use_output_gate 72 | self.ogate_act = ogate_act 73 | self.fgate_type = fgate_type 74 | self.fgate_bias_init = fgate_bias_init 75 | self.decay_time_min = decay_time_min 76 | self.decay_time_max = decay_time_max 77 | self.use_output_norm = use_output_norm 78 | self.qk_norm = qk_norm 79 | self.qk_norm_share_param_across_head = qk_norm_share_param_across_head 80 | self.use_k_shift = use_k_shift 81 | self.use_v_shift = use_v_shift 82 | 83 | super().__init__( 84 | pad_token_id=pad_token_id, 85 | bos_token_id=bos_token_id, 86 | eos_token_id=eos_token_id, 87 | tie_word_embeddings=tie_word_embeddings, 88 | **kwargs, 89 | ) 90 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/forgetting_transformer/glu_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | glu_fwd_codestring = """ 6 | template T glu_fwd(T x, T y) { 7 | return float(y) / (1.0f + ::exp(-float(x))); 8 | } 9 | """ 10 | glu_bwd_codestring = """ 11 | template T glu_bwd(T x, T y, T g, T& dx, T& dy) { 12 | float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); 13 | dx = x_sigmoid * (1.0f - x_sigmoid) * float(g) * float(y); 14 | dy = x_sigmoid * float(g); 15 | } 16 | """ 17 | 18 | glu_bwd_with_output_codestring = """ 19 | template T glu_bwd_with_output(T x, T y, T g, T& dx, T& dy, T& z) { 20 | float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); 21 | dx = x_sigmoid * (1.0f - x_sigmoid) * float(g) * float(y); 22 | dy = x_sigmoid * float(g); 23 | z = x_sigmoid * float(y); 24 | } 25 | """ 26 | 27 | glu_fwd = torch.cuda.jiterator._create_jit_fn(glu_fwd_codestring) 28 | glu_bwd = torch.cuda.jiterator._create_multi_output_jit_fn(glu_bwd_codestring, num_outputs=2) 29 | glu_bwd_with_output = torch.cuda.jiterator._create_multi_output_jit_fn(glu_bwd_with_output_codestring, num_outputs=3) 30 | 31 | 32 | class GLULinearFunction(torch.autograd.Function): 33 | r""" 34 | Gated Linear Unit (GLU) function followed by a linear transformation. 35 | 36 | .. math:: 37 | \text{GLULinear}(x, y, W, b) = (sh(x) * y) W + b 38 | 39 | This simple wrap discards the intermediate results of GLU(x, y) to save memory. 
40 | """ 41 | 42 | @staticmethod 43 | def forward(ctx, x, y, weight, bias): 44 | z = glu_fwd(x, y) 45 | out = F.linear(z.to(weight.dtype), weight, bias) 46 | # We don't store z, will be recomputed in the backward pass to save memory 47 | ctx.save_for_backward(x, y, weight) 48 | ctx.linear_bias_is_none = bias is None 49 | return out 50 | 51 | @staticmethod 52 | def backward(ctx, dout, *args): 53 | x, y, weight = ctx.saved_tensors 54 | dout = dout.reshape(-1, dout.shape[-1]) 55 | dz = F.linear(dout, weight.t()).view_as(x) 56 | dx, dy, z = glu_bwd_with_output(x, y, dz) 57 | dlinear_weight = torch.einsum("bo,bi->oi", dout, z.reshape(-1, z.shape[-1])) 58 | dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) 59 | return dx, dy, dlinear_weight, dlinear_bias 60 | 61 | glu_linear = GLULinearFunction.apply 62 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/hgrn2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_hgrn2 import HGRN2Config 6 | from .modeling_hgrn2 import HGRN2ForCausalLM, HGRN2Model 7 | 8 | AutoConfig.register(HGRN2Config.model_type, HGRN2Config) 9 | AutoModel.register(HGRN2Config, HGRN2Model) 10 | AutoModelForCausalLM.register(HGRN2Config, HGRN2ForCausalLM) 11 | 12 | 13 | __all__ = ['HGRN2Config', 'HGRN2ForCausalLM', 'HGRN2Model'] 14 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/hgrn2/configuration_hgrn2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class HGRN2Config(PretrainedConfig): 9 | 10 | model_type = 'hgrn2-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | num_hidden_layers: int = 24, 18 | attn_mode: str = "chunk", 19 | num_heads: Optional[int] = None, 20 | expand_ratio: Optional[int] = 128, 21 | use_short_conv: bool = False, 22 | conv_size: int = 4, 23 | use_lower_bound: bool = True, 24 | hidden_ratio: Optional[int] = 4, 25 | intermediate_size: Optional[int] = None, 26 | hidden_act: str = "swish", 27 | max_position_embeddings: int = 2048, 28 | elementwise_affine: Optional[bool] = True, 29 | norm_eps: float = 1e-6, 30 | use_cache: bool = True, 31 | pad_token_id: int = None, 32 | bos_token_id: int = 1, 33 | eos_token_id: int = 2, 34 | tie_word_embeddings: bool = False, 35 | initializer_range: float = 0.02, 36 | fuse_cross_entropy: bool = True, 37 | **kwargs 38 | ): 39 | self.vocab_size = vocab_size 40 | self.max_position_embeddings = max_position_embeddings 41 | self.hidden_size = hidden_size 42 | self.num_hidden_layers = num_hidden_layers 43 | self.attn_mode = attn_mode 44 | self.num_heads = num_heads 45 | self.expand_ratio = expand_ratio 46 | self.use_short_conv = use_short_conv 47 | self.conv_size = conv_size 48 | self.use_lower_bound = use_lower_bound 49 | self.hidden_ratio = hidden_ratio 50 | self.intermediate_size = intermediate_size 51 | self.hidden_act = hidden_act 52 | self.elementwise_affine = elementwise_affine 53 | self.norm_eps = norm_eps 54 | self.use_cache = use_cache 55 | self.initializer_range = initializer_range 56 | self.fuse_cross_entropy = 
fuse_cross_entropy 57 | 58 | super().__init__( 59 | pad_token_id=pad_token_id, 60 | bos_token_id=bos_token_id, 61 | eos_token_id=eos_token_id, 62 | tie_word_embeddings=tie_word_embeddings, 63 | **kwargs, 64 | ) 65 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/mamba2/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_mamba2 import Mamba2Config 6 | from .modeling_mamba2 import Mamba2ForCausalLM, Mamba2Model 7 | 8 | AutoConfig.register(Mamba2Config.model_type, Mamba2Config, True) 9 | AutoModel.register(Mamba2Config, Mamba2Model, True) 10 | AutoModelForCausalLM.register(Mamba2Config, Mamba2ForCausalLM, True) 11 | 12 | 13 | __all__ = ['Mamba2Config', 'Mamba2ForCausalLM', 'Mamba2Model'] 14 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/samba/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_samba import SambaConfig 6 | from .modeling_samba import (SambaBlock, SambaForCausalLM, 7 | SambaModel) 8 | 9 | AutoConfig.register(SambaConfig.model_type, SambaConfig, True) 10 | AutoModel.register(SambaConfig, SambaModel, True) 11 | AutoModelForCausalLM.register(SambaConfig, SambaForCausalLM, True) 12 | 13 | 14 | __all__ = ['SambaConfig', 'SambaForCausalLM', 'SambaModel', 'SambaBlock'] 15 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/samba/configuration_samba.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | from typing import Dict, Optional 5 | 6 | from transformers.configuration_utils import PretrainedConfig 7 | try: 8 | from omegaconf import DictConfig, OmegaConf 9 | except ImportError: 10 | DictConfig, OmegaConf = None, None 11 | 12 | 13 | class SambaConfig(PretrainedConfig): 14 | 15 | model_type = "samba-project_fox" 16 | 17 | def __init__( 18 | self, 19 | vocab_size: int = 32000, 20 | hidden_size: int = 2304, 21 | state_size: int = 16, 22 | num_hidden_layers: int = 18, 23 | norm_eps=1e-5, 24 | pad_token_id: int = 0, 25 | bos_token_id: int = 1, 26 | eos_token_id: int = 2, 27 | expand: int = 2, 28 | conv_kernel: int = 4, 29 | use_bias: bool = False, 30 | use_conv_bias: bool = True, 31 | hidden_act: str = "silu", 32 | initializer_range: str = 0.02, 33 | residual_in_fp32: bool = False, 34 | time_step_rank: str = "auto", 35 | time_step_scale: float = 1.0, 36 | time_step_min: float = 0.001, 37 | time_step_max: float = 0.1, 38 | time_step_init_scheme: str = "random", 39 | time_step_floor: float = 1e-4, 40 | max_position_embeddings: int = 2048, 41 | attn: Optional[Dict] = None, 42 | attn_hidden_ratio: Optional[float] = 4, 43 | mamba_hidden_ratio: Optional[float] = 3, 44 | rescale_prenorm_residual: bool = False, 45 | use_cache: bool = True, 46 | fuse_norm: bool = True, 47 | fuse_cross_entropy: bool = True, 48 | tie_word_embeddings: bool = False, 49 | rope_base: float = 500000.0, 50 | **kwargs, 51 | ): 52 | self.vocab_size = vocab_size 53 | self.hidden_size = hidden_size 54 | self.state_size = state_size 55 | self.num_hidden_layers = num_hidden_layers 56 | self.norm_eps = 
norm_eps 57 | self.conv_kernel = conv_kernel 58 | self.expand = expand 59 | self.intermediate_size = int(expand * self.hidden_size) 60 | self.bos_token_id = bos_token_id 61 | self.eos_token_id = eos_token_id 62 | self.pad_token_id = pad_token_id 63 | self.use_bias = use_bias 64 | self.use_conv_bias = use_conv_bias 65 | self.hidden_act = hidden_act 66 | self.initializer_range = initializer_range 67 | self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank 68 | self.time_step_scale = time_step_scale 69 | self.time_step_min = time_step_min 70 | self.time_step_max = time_step_max 71 | self.time_step_init_scheme = time_step_init_scheme 72 | self.time_step_floor = time_step_floor 73 | self.max_position_embeddings = max_position_embeddings 74 | self.attn_hidden_ratio = attn_hidden_ratio 75 | self.mamba_hidden_ratio = mamba_hidden_ratio 76 | self.rescale_prenorm_residual = rescale_prenorm_residual 77 | self.residual_in_fp32 = residual_in_fp32 78 | self.use_cache = use_cache 79 | self.fuse_cross_entropy = fuse_cross_entropy 80 | self.fuse_norm = fuse_norm 81 | self.rope_base = rope_base 82 | 83 | if attn is not None: 84 | if isinstance(attn, (DictConfig)): 85 | attn = OmegaConf.to_container(attn) 86 | if not isinstance(attn, dict): 87 | raise ValueError("attn must be a dictionary") 88 | if 'layers' not in attn: 89 | raise ValueError("Layer indices must be provided to initialize hybrid attention layers") 90 | if 'num_heads' not in attn: 91 | raise ValueError("Number of heads must be provided to initialize hybrid attention layers") 92 | # attn['num_heads'] = attn.get('num_kv_heads', 18) 93 | # attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads']) 94 | # attn['window_size'] = attn.get('window_size', 2048) 95 | # else: 96 | # raise ValueError("attn must not be None") 97 | self.attn = attn 98 | 99 | super().__init__( 100 | bos_token_id=bos_token_id, 101 | eos_token_id=eos_token_id, 102 | pad_token_id=pad_token_id, 103 | tie_word_embeddings=tie_word_embeddings, 104 | **kwargs 105 | ) 106 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM 4 | 5 | from .configuration_transformer import TransformerConfig 6 | from .modeling_transformer import ( 7 | TransformerForCausalLM, TransformerModel) 8 | 9 | AutoConfig.register(TransformerConfig.model_type, TransformerConfig) 10 | AutoModel.register(TransformerConfig, TransformerModel) 11 | AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM) 12 | 13 | 14 | 15 | __all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel'] 16 | -------------------------------------------------------------------------------- /src/forgetting_transformer/model/transformer/configuration_transformer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Optional 4 | 5 | from transformers.configuration_utils import PretrainedConfig 6 | 7 | 8 | class TransformerConfig(PretrainedConfig): 9 | 10 | model_type = 'transformer-project_fox' 11 | keys_to_ignore_at_inference = ['past_key_values'] 12 | 13 | def __init__( 14 | self, 15 | vocab_size: int = 32000, 16 | hidden_size: int = 2048, 17 | hidden_ratio: Optional[int] = 4, 18 | intermediate_size: 
--------------------------------------------------------------------------------
/src/forgetting_transformer/model/transformer/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
4 | 
5 | from .configuration_transformer import TransformerConfig
6 | from .modeling_transformer import (
7 |     TransformerForCausalLM, TransformerModel)
8 | 
9 | AutoConfig.register(TransformerConfig.model_type, TransformerConfig)
10 | AutoModel.register(TransformerConfig, TransformerModel)
11 | AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM)
12 | 
13 | 
14 | 
15 | __all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel']
16 | 
--------------------------------------------------------------------------------
/src/forgetting_transformer/model/transformer/configuration_transformer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from typing import Optional
4 | 
5 | from transformers.configuration_utils import PretrainedConfig
6 | 
7 | 
8 | class TransformerConfig(PretrainedConfig):
9 | 
10 |     model_type = 'transformer-project_fox'
11 |     keys_to_ignore_at_inference = ['past_key_values']
12 | 
13 |     def __init__(
14 |         self,
15 |         vocab_size: int = 32000,
16 |         hidden_size: int = 2048,
17 |         hidden_ratio: Optional[int] = 4,
18 |         intermediate_size: Optional[int] = None,
19 |         num_hidden_layers: int = 24,
20 |         num_heads: int = 32,
21 |         num_kv_heads: Optional[int] = None,
22 |         hidden_act: str = "swish",
23 |         window_size: Optional[int] = None,
24 |         max_position_embeddings: int = 2048,
25 |         initializer_range: float = 0.02,
26 |         elementwise_affine: Optional[bool] = True,
27 |         norm_eps: float = 1e-6,
28 |         use_cache: bool = True,
29 |         pad_token_id: Optional[int] = None,
30 |         bos_token_id: int = 1,
31 |         eos_token_id: int = 2,
32 |         tie_word_embeddings: bool = False,
33 |         attention_bias: bool = False,
34 |         fuse_norm: bool = True,
35 |         fuse_cross_entropy: bool = True,
36 |         rope_base: float = 500000.0,
37 |         use_rope: bool = True,
38 |         **kwargs,
39 |     ):
40 |         self.vocab_size = vocab_size
41 |         self.hidden_size = hidden_size
42 |         self.hidden_ratio = hidden_ratio
43 |         self.intermediate_size = intermediate_size
44 |         self.num_hidden_layers = num_hidden_layers
45 |         self.num_heads = num_heads
46 |         self.num_kv_heads = num_kv_heads
47 |         self.window_size = window_size
48 |         self.max_position_embeddings = max_position_embeddings
49 | 
50 |         self.hidden_act = hidden_act
51 |         self.initializer_range = initializer_range
52 |         self.elementwise_affine = elementwise_affine
53 |         self.norm_eps = norm_eps
54 |         self.use_cache = use_cache
55 |         self.attention_bias = attention_bias
56 |         self.fuse_cross_entropy = fuse_cross_entropy
57 |         self.fuse_norm = fuse_norm
58 |         self.rope_base = rope_base
59 |         self.use_rope = use_rope
60 | 
61 |         super().__init__(
62 |             pad_token_id=pad_token_id,
63 |             bos_token_id=bos_token_id,
64 |             eos_token_id=eos_token_id,
65 |             tie_word_embeddings=tie_word_embeddings,
66 |             **kwargs,
67 |         )
68 | 
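Since `TransformerConfig` is a regular `PretrainedConfig`, it round-trips through the usual Hugging Face serialization. A minimal sketch (not from the repository; the sizes and the output path are placeholders):

    from forgetting_transformer.model.transformer.configuration_transformer import TransformerConfig

    config = TransformerConfig(num_hidden_layers=12, num_heads=12, hidden_size=768)  # placeholder sizes
    config.save_pretrained("/tmp/fox_transformer")  # writes config.json with model_type "transformer-project_fox"
    reloaded = TransformerConfig.from_pretrained("/tmp/fox_transformer")
    assert reloaded.rope_base == 500000.0 and reloaded.use_rope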
--------------------------------------------------------------------------------
/src/forgetting_transformer/ops/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhixuan-lin/forgetting-transformer/f8ce22afe14980628534e06d9ee62baeeddf1dcf/src/forgetting_transformer/ops/__init__.py
--------------------------------------------------------------------------------
/src/forgetting_transformer/schedule/__init__.py:
--------------------------------------------------------------------------------
1 | from .schedule import (
2 |     constant_schedule,
3 |     warmup_cosine_decay_schedule,
4 |     warmup_linear_decay_schedule,
5 |     linear_schedule,
6 |     polynomial_schedule,
7 |     warmup_one_minus_sqrt_schedule
8 | )
9 | 
--------------------------------------------------------------------------------
/src/forgetting_transformer/tokenizer.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Callable, Dict, Union, Optional, Tuple, NamedTuple, Any, List
3 | from transformers import GPT2Tokenizer, PretrainedConfig, AutoTokenizer
4 | 
5 | 
6 | 
7 | class JSONGPT2Tokenizer(GPT2Tokenizer):
8 |     def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
9 |         (text, kwargs) = super().prepare_for_tokenization(text, is_split_into_words, **kwargs)
10 |         text = json.dumps(text)
11 |         text = text[1:-1]
12 |         return (text, kwargs)
13 | 
14 |     def decode(
15 |         self,
16 |         token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
17 |         skip_special_tokens: bool = False,
18 |         clean_up_tokenization_spaces: bool = None,
19 |         **kwargs,
20 |     ):
21 |         text = super().decode(
22 |             token_ids=token_ids,
23 |             skip_special_tokens=skip_special_tokens,
24 |             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
25 |             **kwargs,
26 |         )
27 |         try:
28 |             # Unfortunately this is what LongCrawl64 did.
29 |             text = json.loads(f'"{text}"')
30 |         except json.JSONDecodeError:
31 |             # Best effort decoding
32 |             text = text.encode().decode("unicode_escape", "ignore")
33 |         return text
34 | 
35 | class DummyConfig(PretrainedConfig):
36 |     pass
37 | 
38 | AutoTokenizer.register(DummyConfig, JSONGPT2Tokenizer)
39 | 
--------------------------------------------------------------------------------
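The tokenizer above exists because LongCrawl64 stores JSON-escaped text: `prepare_for_tokenization` escapes raw text with `json.dumps` (dropping the surrounding quotes) before BPE, and `decode` undoes the escaping with `json.loads`. A minimal sketch of the round trip (assuming the standard "gpt2" vocabulary files are available locally or from the Hub):

    from forgetting_transformer.tokenizer import JSONGPT2Tokenizer

    tokenizer = JSONGPT2Tokenizer.from_pretrained("gpt2")
    ids = tokenizer("hello\nworld")["input_ids"]    # the newline reaches BPE as the two characters '\' and 'n'
    assert tokenizer.decode(ids) == "hello\nworld"  # decode() reverses the JSON escaping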