├── .env.example ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── VERSION ├── conf ├── cpt │ └── lora.yaml └── deepspeed │ ├── bf16.json │ ├── bf16_zero1.json │ ├── bf16_zero1_default.json │ ├── bf16_zero2_default.json │ ├── bf16_zero3.json │ └── fp16.json ├── docs ├── Contribution.md ├── Installation.md ├── LLaMA_MoE.pdf ├── Notification.md ├── continual_pretraining │ └── README.md ├── expert_construction │ └── README.md ├── imgs │ ├── MoE-Routing.gif │ ├── title-favicon.png │ ├── wechat-notification-config.png │ └── wechat-notification.jpg └── supervised_fine_tuning │ └── SFT.md ├── example.py ├── requirements.txt ├── scripts ├── analysis │ └── get_layer_wise_score_scale_factor.sh ├── cpt │ ├── 16_2 │ │ ├── baseline_112gpus_sheared_llama_portion_fluency_sf4.sh │ │ └── baseline_112gpus_sheared_llama_portion_fluency_sf8.sh │ ├── 8_2 │ │ ├── baseline_112gpus_8_2_sheared_llama_portion_fluency_sf4.sh │ │ └── mixtral_112gpus_8_2_sheared_llama_portion_fluency_sf4.sh │ ├── README.md │ ├── dynamic_data_selection │ │ ├── baseline.sh │ │ ├── baseline_112gpus.sh │ │ ├── baseline_112gpus_linear_gate.sh │ │ ├── baseline_112gpus_scale2.0.sh │ │ ├── baseline_112gpus_sheared_llama_portion.sh │ │ ├── baseline_112gpus_sheared_llama_portion_fluency.sh │ │ ├── baseline_112gpus_sheared_llama_portion_gate_balance_loss0.1.sh │ │ ├── baseline_112gpus_sheared_llama_portion_no_ad.sh │ │ ├── baseline_32gpus.sh │ │ ├── sheared_llama_112gpus.sh │ │ ├── sheared_llama_112gpus_100B.sh │ │ ├── sheared_llama_32gpus.sh │ │ └── sheared_llama_paper.sh │ ├── fpt.sh │ ├── fpt_13b.sh │ ├── fpt_7b.sh │ ├── fpt_7b_residual.sh │ ├── fpt_7b_weight_norm.sh │ ├── fpt_llama2_7b_moefication.sh │ ├── fpt_llama2_7b_share.sh │ ├── fpt_resume.sh │ ├── fpt_switch.sh │ ├── fpt_test_lr.sh │ ├── gate_loss.sh │ ├── lora.sh │ ├── multi_jobs.sh │ ├── test │ │ ├── fpt_7b_residual_test.sh │ │ ├── fpt_7b_test.sh │ │ └── test_conn.sh │ └── vs_upcycle_dense │ │ └── fpt_3b_total_10b.sh ├── eval │ ├── eval_mmlu_moe.sh │ ├── ref_loss.sh │ └── ref_loss_random_split.sh ├── examples │ ├── create_noise_llama_moe.sh │ ├── create_noise_llama_moe_residual.sh │ ├── create_switch_llama_moe.sh │ ├── load_llama_moe.sh │ ├── load_llama_moe_hf.sh │ └── load_llama_moe_residual.sh ├── expert_construction │ ├── convert │ │ ├── run_convert.sh │ │ ├── run_convert_gradient.sh │ │ ├── run_convert_gradient_residual.sh │ │ └── run_convert_mistral.sh │ ├── get_hidden_features │ │ ├── run_get_hidden_features.sh │ │ └── run_prepare_datasets.sh │ ├── prune │ │ ├── run_prune_gradient.sh │ │ ├── run_prune_gradient_convert.sh │ │ ├── run_prune_gradient_convert_one4all.sh │ │ ├── run_prune_gradient_one4all.sh │ │ ├── run_prune_random.sh │ │ ├── run_prune_random_convert.sh │ │ ├── run_prune_random_convert_one4all.sh │ │ └── run_prune_random_one4all.sh │ ├── select │ │ ├── run_select.sh │ │ └── run_select_multiprocess.sh │ └── split │ │ ├── run_split_clustering.sh │ │ ├── run_split_gradient.sh │ │ ├── run_split_gradient_get_grads.sh │ │ ├── run_split_gradient_one4all.sh │ │ ├── run_split_gradient_residual.sh │ │ ├── run_split_gradient_residual_one4all.sh │ │ ├── run_split_graph.sh │ │ ├── run_split_random.sh │ │ └── run_split_random_one4all.sh ├── sft │ ├── 2_16.sh │ ├── 2_8.sh │ └── 4_16.sh ├── test │ ├── test_args.sh │ └── test_conn.sh ├── tokenize │ ├── clustering.sh │ ├── lines.sh │ ├── redpajama.sh │ └── slimpajama_convert.sh └── visualization │ ├── 
run_visualize_expert_load_one4all.sh │ ├── run_visualize_expert_neuron_overlap.sh │ ├── run_visualize_expert_neuron_overlap_one4all.sh │ ├── run_visualize_expert_neuron_overlap_overview.sh │ ├── run_visualize_expert_select_mlp.sh │ ├── run_visualize_expert_select_mlp_one4all.sh │ ├── run_visualize_mlp_output_scale.sh │ ├── run_visualize_swiglu_output.sh │ └── run_visualize_swiglu_output_one4all.sh ├── setup.py ├── smoe ├── __init__.py ├── callbacks │ ├── __init__.py │ ├── save_model.py │ └── tensorboard.py ├── data │ ├── __init__.py │ ├── aggregation.py │ ├── collate_fn.py │ ├── datasets_moe.py │ ├── dynamic_selection.py │ ├── redpajama.py │ ├── single_file.py │ └── streaming.py ├── entrypoint │ ├── __init__.py │ ├── analysis │ │ ├── __init__.py │ │ ├── act_scale.py │ │ ├── clustering_distribution.py │ │ ├── gate_load_vis.py │ │ ├── get_layer_wise_score_scale_factor.py │ │ ├── hidden_before_gate_vis.py │ │ └── scale_factor_simulation.py │ ├── compress_png_images.py │ ├── cpt │ │ ├── __init__.py │ │ ├── cpt_fpt.py │ │ └── cpt_lora.py │ ├── download_llama.py │ ├── eval │ │ ├── __init__.py │ │ ├── eval_mmlu_moe_0.py │ │ ├── eval_mmlu_moe_1.py │ │ ├── eval_mmlu_moe_2.py │ │ └── eval_mmlu_moe_3.py │ ├── examples │ │ ├── __init__.py │ │ ├── create_noise_llama_moe.py │ │ ├── create_noise_llama_moe_residual.py │ │ ├── create_switch_llama_moe.py │ │ ├── load_llama_moe.py │ │ ├── load_llama_moe_hf.py │ │ ├── load_llama_moe_residual.py │ │ └── load_relu_llama.py │ ├── expert_construction │ │ ├── __init__.py │ │ ├── llama_convert.py │ │ ├── llama_convert_neuron_index.py │ │ ├── llama_convert_neuron_index_residual.py │ │ ├── llama_get_hidden_features.py │ │ ├── llama_prepare_datasets.py │ │ ├── llama_prune_gradient.py │ │ ├── llama_prune_random.py │ │ ├── llama_select_mlp.py │ │ ├── llama_select_mlp_multiprocess.py │ │ ├── llama_split_clustering.py │ │ ├── llama_split_gradient.py │ │ ├── llama_split_gradient_get_grads.py │ │ ├── llama_split_gradient_residual.py │ │ ├── llama_split_graph.py │ │ ├── llama_split_graph_trans_gp.py │ │ ├── llama_split_random.py │ │ └── mistral_convert.py │ ├── sft │ │ ├── __init__.py │ │ └── train_sft.py │ ├── text_clustering.py │ └── visualization │ │ ├── __init__.py │ │ ├── visualize_expert_load.py │ │ ├── visualize_expert_neuron_overlap.py │ │ ├── visualize_expert_neuron_overlap_overview.py │ │ ├── visualize_expert_select_mlp.py │ │ ├── visualize_gate_loss.py │ │ ├── visualize_mlp_output_scale.py │ │ └── visualize_swiglu_output.py ├── metrics │ ├── __init__.py │ ├── accuracy.py │ └── preprocess.py ├── models │ ├── __init__.py │ ├── llama_moe │ │ ├── __init__.py │ │ ├── configuration_llama_moe.py │ │ ├── modeling_llama_moe.py │ │ └── modeling_llama_moe_hf.py │ ├── llama_moe_residual │ │ ├── __init__.py │ │ ├── configuration_llama_moe_residual.py │ │ └── modeling_llama_moe_residual.py │ ├── mistral │ │ ├── __init__.py │ │ ├── configuration_mistral.py │ │ └── modeling_mistral.py │ └── mixtral │ │ ├── __init__.py │ │ ├── configuration_mixtral.py │ │ └── modeling_mixtral.py ├── modules │ ├── __init__.py │ ├── flash_attn.py │ ├── moe │ │ ├── __init__.py │ │ ├── moe_calculators.py │ │ ├── moe_experts.py │ │ ├── moe_gates.py │ │ └── moe_layers.py │ ├── moe_residual │ │ ├── __init__.py │ │ ├── moe_residual_layers.py │ │ └── residual_blocks.py │ └── norm.py ├── trainer │ ├── __init__.py │ ├── llama_lr_scheduling.py │ └── moefy │ │ ├── __init__.py │ │ └── expert_split_gradient.py └── utils │ ├── __init__.py │ ├── cache_utils.py │ ├── config.py │ ├── conversation.py │ ├── 
convert_moe_to_dense.py │ ├── debugging.py │ ├── eval │ ├── __init__.py │ ├── crop.py │ └── gather_results.py │ ├── expert_construction │ ├── __init__.py │ ├── convert_llama_moe.py │ ├── convert_llama_moe_neuron_index.py │ ├── convert_llama_moe_neuron_index_residual.py │ ├── expert_select.py │ ├── expert_split.py │ ├── expert_split_residual.py │ ├── k_means_constrained_cos.py │ └── prune_llama.py │ ├── extract_text_from_jsonl.py │ ├── io.py │ ├── kernel_function.py │ ├── logging.py │ ├── merge_llama_with_lora.py │ ├── model_operation │ ├── __init__.py │ ├── change_llama_forward.py │ ├── change_llama_moe_forward.py │ ├── modify_llama_model.py │ └── modify_llama_moe_model.py │ ├── modeling_attn_mask_utils.py │ ├── notification.py │ ├── operations │ ├── __init__.py │ ├── operation_list.py │ ├── operation_string.py │ └── operation_tensor.py │ ├── param.py │ ├── param_estimation.py │ ├── random_utils.py │ ├── seed.py │ ├── split_files.py │ ├── text_clustering.py │ ├── tokenize.py │ ├── vars.py │ └── visualization │ ├── __init__.py │ ├── bar.py │ ├── convert_gif.py │ ├── line.py │ ├── plotter.py │ ├── tsne_torch_model.py │ └── visualize.py ├── tests ├── __init__.py ├── data │ ├── __init__.py │ ├── test_aggregation.py │ ├── test_redpajama.py │ └── test_streaming.py ├── entrypoint │ ├── __init__.py │ └── test_conn.py ├── models │ ├── __init__.py │ ├── test_noise_moe.py │ ├── test_noise_moe_residual.py │ ├── test_switch_moe.py │ └── test_switch_moe_residual.py ├── modules │ ├── __init__.py │ ├── test_hook.py │ └── test_hook_llama_mlp.py └── utils │ ├── __init__.py │ ├── test_gumble.py │ ├── test_logging.py │ └── visualization │ ├── __init__.py │ └── test_expert_load.py ├── tools ├── check_killed.py ├── cp_files.py ├── listen.py ├── queue_submit.py └── scl_jobs.sh └── tox.ini /.env.example: -------------------------------------------------------------------------------- 1 | WECHAT_ROBOT_WEBHOOK="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=blahblah" 2 | WECHAT_ROBOT_MENTIONS="wechat_user1,user2" 3 | WECHAT_ROBOT_MENTIONS_MOBILE="15600000000,16700000000" 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | docs/imgs/title-favicon.png filter=lfs diff=lfs merge=lfs -text 2 | docs/imgs/MoE-Routing.gif filter=lfs diff=lfs merge=lfs -text 3 | docs/LLaMA_MoE.pdf filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | args: ["--profile", "black", "--filter-files"] 8 | - repo: https://github.com/psf/black 9 | rev: 22.12.0 10 | hooks: 11 | - id: black 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v4.4.0 14 | hooks: 15 | - id: trailing-whitespace 16 | - id: end-of-file-fixer 17 | - id: check-yaml 18 | - id: check-added-large-files 19 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "tokenize", 9 | "type": "python", 10 | "request": "launch", 11 | "module": "smoe.utils.tokenize", 12 | "justMyCode": true 13 | }, 14 | { 15 | "name": "Python: Remote Attach", 16 | "type": "python", 17 | "request": "attach", 18 | "connect": { 19 | "host": "x.x.x.x", 20 | "port": 5678 21 | }, 22 | "pathMappings": [ 23 | { 24 | "localRoot": "${workspaceFolder}", 25 | "remoteRoot": "." 26 | } 27 | ], 28 | "justMyCode": false 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true 7 | } 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: format clean pre test 2 | echo 'finished' 3 | 4 | .PHONY: format 5 | format: 6 | isort --profile black --filter-files . 7 | black . 8 | 9 | .PHONY: test 10 | test: 11 | coverage run --source smoe -m pytest -vv . 12 | coverage report -m 13 | flake8 14 | 15 | .PHONY: pre 16 | pre: 17 | pre-commit run --all-files 18 | 19 | .PHONY: debug 20 | debug: 21 | pytest -vv tests/utils/test_logging.py 22 | 23 | .PHONY: clean 24 | clean: 25 | rm -rf build/ 26 | rm -rf dist/ 27 | rm -rf *.egg-info/ 28 | rm -f .coverage 29 | rm -f coverage.xml 30 | find . | grep -E '(__pycache__|\.pyc|\.pyo$$)' | xargs rm -rf 31 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.0 2 | -------------------------------------------------------------------------------- /conf/cpt/lora.yaml: -------------------------------------------------------------------------------- 1 | # this config file is for demonstration usage only 2 | deepspeed: conf/deepspeed/bf16.json 3 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 1, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 1e8, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 1e8, 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_accumulation_steps": "auto", 15 | "gradient_clipping": "auto", 16 | "steps_per_print": 2000, 17 | "train_batch_size": "auto", 18 | "train_micro_batch_size_per_gpu": "auto", 19 | "wall_clock_breakdown": false 20 | } 21 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 1, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 1e8, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 1e8, 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_accumulation_steps": "auto", 15 | "gradient_clipping": "auto", 16 | "steps_per_print": 2000, 17 | "train_batch_size": "auto", 18 | 
"train_micro_batch_size_per_gpu": "auto", 19 | "wall_clock_breakdown": false 20 | } 21 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero1_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "gradient_accumulation_steps": "auto", 9 | "gradient_clipping": "auto", 10 | "steps_per_print": 2000, 11 | "train_batch_size": "auto", 12 | "train_micro_batch_size_per_gpu": "auto", 13 | "wall_clock_breakdown": false, 14 | "reduce_bucket_size": 536870912 15 | } 16 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero2_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 2 7 | }, 8 | "gradient_accumulation_steps": "auto", 9 | "gradient_clipping": "auto", 10 | "steps_per_print": 2000, 11 | "train_batch_size": "auto", 12 | "train_micro_batch_size_per_gpu": "auto", 13 | "wall_clock_breakdown": false 14 | } 15 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 3 7 | }, 8 | "gradient_accumulation_steps": "auto", 9 | "gradient_clipping": "auto", 10 | "steps_per_print": 2000, 11 | "train_batch_size": "auto", 12 | "train_micro_batch_size_per_gpu": "auto", 13 | "wall_clock_breakdown": false, 14 | "reduce_bucket_size": 536870912 15 | } 16 | -------------------------------------------------------------------------------- /conf/deepspeed/fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 100, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1e-10 9 | }, 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 1e8, 14 | "overlap_comm": true, 15 | "reduce_scatter": true, 16 | "reduce_bucket_size": 1e8, 17 | "contiguous_gradients": true 18 | }, 19 | 20 | "gradient_accumulation_steps": "auto", 21 | "gradient_clipping": "auto", 22 | "steps_per_print": 2000, 23 | "train_batch_size": "auto", 24 | "train_micro_batch_size_per_gpu": "auto", 25 | "wall_clock_breakdown": false 26 | } 27 | -------------------------------------------------------------------------------- /docs/Contribution.md: -------------------------------------------------------------------------------- 1 | # 🤝 Contribution 2 | 3 | - Make sure the Python version `>=3.10` (a strict version contraint for better type hinting) 4 | 5 | ```bash 6 | $ conda install git # upgrade git 7 | $ git clone git@github.com:pjlab-sys4nlp/llama-moe.git 8 | $ cd llama-moe 9 | $ pip install -e .[dev] 10 | $ pre-commit install 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/Installation.md: -------------------------------------------------------------------------------- 1 | # 🌴 Installation 2 | 3 | 1. Prepare conda environment: `conda create -n smoe python=3.11` (If your environment name is not `smoe`, you may need to change environment in launching scripts) 4 | 2. 
Add the correct environment variables to `~/.bashrc` (`gcc` is set to a newer version for installing `flash-attn`), e.g.: 5 | ```bash 6 | export PATH=/mnt/petrelfs/share/cuda-11.8/bin:$PATH 7 | export LD_LIBRARY_PATH=/mnt/petrelfs/share/cuda-11.8/lib64:$LD_LIBRARY_PATH 8 | export PATH=/mnt/petrelfs/share/gcc-10.1.0/bin:$PATH 9 | export LD_LIBRARY_PATH=/mnt/petrelfs/share/gcc-10.1.0/lib64:$LD_LIBRARY_PATH 10 | ``` 11 | 3. Apply the variables: `source ~/.bashrc` 12 | 4. Install PyTorch (CUDA-11.8): `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` 13 | 5. Install dependencies: `pip install -r requirements.txt` 14 | 6. Install `flash-attn`: `pip install flash-attn==2.0.1 --no-build-isolation`. You may need to follow the [flash-attn installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) to avoid some errors. 15 | 7. Install the latest Git: `conda install git` 16 | 8. Clone the repo: `git clone git@github.com:pjlab-sys4nlp/llama-moe.git` (If you haven't set up an SSH key on GitHub, you may not be able to clone through SSH. Check the [docs](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account) for details.) 17 | 9. Change the current directory: `cd llama-moe` 18 | 10. Install `smoe` in [editable mode](https://pip.pypa.io/en/stable/cli/pip_install/#cmdoption-e): `pip install -e .[dev]` 19 | 11. Set up `pre-commit` hooks: `pre-commit install` 20 | -------------------------------------------------------------------------------- /docs/LLaMA_MoE.pdf: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a00df5a444cd37a7ea94f28062c73e0cd3f77ecdb5c6bdb163f57f18d22acc7f 3 | size 914313 4 | -------------------------------------------------------------------------------- /docs/Notification.md: -------------------------------------------------------------------------------- 1 | # 💬 Notification 2 | 3 | ## WeChatWork Notification 4 | 5 | ![WeChatWork Notification Example](imgs/wechat-notification.jpg) 6 | 7 | 1. You should create a WeChat Work group if you don't have one. 8 | 2. Add a group robot from the group settings panel, and get the **webhook url**. 9 | 3. Create an env file: `cp .env.example .env` 10 | 4. Update the content in the `.env` file and make sure the webhook URL is correctly configured. If you'd like to @-mention someone in the group, update the mobile phone numbers accordingly. 11 | ![Configuration](imgs/wechat-notification-config.png) 12 | 5. 
Add the notification decorator to your code, and you will see notification messages in the chat group: 13 | ```python 14 | from smoe.utils.notification import wechat_sender 15 | 16 | @wechat_sender() 17 | def main(): 18 | raise RuntimeError("error testing") 19 | 20 | if __name__ == "__main__": 21 | main() 22 | ``` 23 | -------------------------------------------------------------------------------- /docs/imgs/MoE-Routing.gif: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d0a31562b85a1ad8d7e62c58dcb3f60bdf19ed70b4becf3f3b0ae51ae1ec19bd 3 | size 608200 4 | -------------------------------------------------------------------------------- /docs/imgs/title-favicon.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:656e5f3de4440b469d9b7bb928a14872dfb69329b7d539bce620cdef782d804c 3 | size 1167538 4 | -------------------------------------------------------------------------------- /docs/imgs/wechat-notification-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/docs/imgs/wechat-notification-config.png -------------------------------------------------------------------------------- /docs/imgs/wechat-notification.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/docs/imgs/wechat-notification.jpg -------------------------------------------------------------------------------- /docs/supervised_fine_tuning/SFT.md: -------------------------------------------------------------------------------- 1 | # Supervised Fine-Tuning (SFT) 2 | 3 | ## Data Preparation 4 | 5 | Download [Deita 6K](https://huggingface.co/datasets/hkust-nlp/deita-6k-v0) to `data/deita/deita_6k.jsonl`. 6 | 7 | ## Training 8 | 9 | Start training on a Slurm cluster: `sbatch scripts/sft/2_8.sh`. 
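The repository also provides `scripts/sft/2_16.sh` and `scripts/sft/4_16.sh`. Judging from the released model names used below (e.g. `LLaMA-MoE-v1-3_5B-2_8`), the script names appear to encode the routing configuration (number of selected experts and total experts), so pick the one matching your base checkpoint. The mapping below is only an assumption inferred from the file and model names, not something stated in this document:
```bash
# Assumed mapping between SFT scripts and base MoE checkpoints
# (inferred from the names; check the model path configured inside each script).
sbatch scripts/sft/2_8.sh    # e.g. a 2-of-8 base model such as LLaMA-MoE-v1-3_5B-2_8
sbatch scripts/sft/2_16.sh   # e.g. a 2-of-16 base model such as LLaMA-MoE-v1-3_0B-2_16
sbatch scripts/sft/4_16.sh   # e.g. a 4-of-16 base model
```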
10 | 11 | ## Inference 12 | 13 | ```python 14 | from transformers import AutoModelForCausalLM 15 | from transformers import AutoTokenizer 16 | 17 | from smoe.utils.conversation import Conversation 18 | 19 | conv = Conversation() 20 | conv.append_message("human", "Give me a three-day plan in Suzhou.") 21 | conv.append_message("gpt", None) 22 | prompt = conv.get_prompt() 23 | print(prompt) 24 | print(prompt[-1] == " ") 25 | 26 | model_dir = "llama-moe/LLaMA-MoE-v1-3_5B-2_8-sft" 27 | 28 | tok = AutoTokenizer.from_pretrained(model_dir) 29 | m = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True) 30 | m.eval() 31 | m.cuda() 32 | 33 | inputs = tok(prompt, return_tensors="pt") 34 | input_ids = inputs["input_ids"].cuda() 35 | 36 | output = m.generate(input_ids, max_length=100, temperature=1.0, do_sample=True, use_cache=True) 37 | response = tok.decode(output[0], skip_special_tokens=True) 38 | print(response) 39 | ``` 40 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | # python>=3.10 2 | 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | 6 | model_dir = "llama-moe/LLaMA-MoE-v1-3_5B-2_8" 7 | tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) 8 | model = AutoModelForCausalLM.from_pretrained( 9 | model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True 10 | ) 11 | model.eval() 12 | model.to("cuda:0") 13 | 14 | input_text = "Suzhou is famous of" 15 | inputs = tokenizer(input_text, return_tensors="pt") 16 | inputs = inputs.to("cuda:0") 17 | 18 | pred = model.generate(**inputs, max_length=50, temperature=0.0) 19 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 20 | # Suzhou is famous of its beautiful gardens. The most famous one is the Humble Administrator's Garden. It is a classical Chinese garden with a history of more than 600 years. 
The garden is divided into three 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | black==23.7.0 3 | coverage==7.2.7 4 | datasets==2.14.1 5 | debugpy==1.6.7 6 | deepspeed==0.10.0 7 | flake8==6.0.0 8 | huggingface-hub==0.16.4 9 | isort==5.12.0 10 | k-means-constrained==0.7.3 11 | nltk==3.8.1 12 | ninja==1.11.1 13 | omegaconf==2.0.6 14 | packaging==23.1 15 | peft==0.4.0 16 | pre-commit==3.3.3 17 | pytest==7.4.0 18 | safetensors==0.3.1 19 | scikit-learn==1.3.0 20 | sentencepiece==0.1.99 21 | tensorboard==2.13.0 22 | tokenizers==0.13.3 23 | torch==2.0.1 24 | torchaudio==2.0.2 25 | torchvision==0.15.2 26 | tqdm==4.65.0 27 | transformers==4.31.0 28 | triton==2.0.0 29 | trl==0.4.7 30 | wandb==0.15.6 31 | xformers==0.0.20 32 | pebble==5.0.3 33 | matplotlib==3.7.2 34 | python-dotenv==1.0.0 35 | sentence-transformers==2.2.2 36 | Pillow==9.4.0 37 | numpy==1.25.0 38 | opencv-python==4.8.1.78 39 | pynvml==11.5.0 40 | PyYaml==6.0.1 41 | pandas<2.1.0 42 | -------------------------------------------------------------------------------- /scripts/analysis/get_layer_wise_score_scale_factor.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | # model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons 7 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama2_7B 8 | 9 | data_begin_index=0 10 | data_end_index=500 11 | batch_size=8 12 | # block_size=2048 13 | block_size=4096 14 | 15 | #save_folder=${llama_size}_dense 16 | save_folder=${llama_size}_moe_trained 17 | 18 | share_path=/mnt/petrelfs/share_data/quxiaoye 19 | tokenizer_path=${share_path}/models/${llama_size} 20 | data_path=${share_path}/data/vis_data/head30_shuffled_output/shuffled_20.txt 21 | target_scale_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-outputs-scale/${save_folder} 22 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/${save_folder} 23 | 24 | gpus=1 25 | cpus=16 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 27 | python -m smoe.entrypoint.analysis.get_layer_wise_score_scale_factor \ 28 | --tokenizer_path ${tokenizer_path} \ 29 | --model_path ${model_path} \ 30 | --target_scale_file_path ${target_scale_file_path} \ 31 | --data_path ${data_path} \ 32 | --save_path ${save_path} \ 33 | --data_begin_index ${data_begin_index} \ 34 | --data_end_index ${data_end_index} \ 35 | --batch_size ${batch_size} \ 36 | --block_size ${block_size} 37 | -------------------------------------------------------------------------------- /scripts/cpt/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for Continual Pre-training 2 | 3 | - `lora.sh`: Parameter-efficient tuning 4 | - `fpt.sh`: Full-parameter pretraining 5 | -------------------------------------------------------------------------------- /scripts/cpt/multi_jobs.sh: -------------------------------------------------------------------------------- 1 | models=( 2 | 
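# Converted MoE checkpoints to continually pre-train; the loop below submits
# one `sbatch scripts/cpt/fpt_13b.sh <model>` job per listed path.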
/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj 3 | /mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Graph-l2_norm/llama_13B-16Select4-up_proj 4 | /mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama_13B-16Select4-up_proj 5 | ) 6 | 7 | for model in "${models[@]}" 8 | do 9 | sbatch scripts/cpt/fpt_13b.sh $model 10 | done 11 | 12 | 13 | # Submitted batch job 1904066 14 | # Submitted batch job 1904067 15 | # Submitted batch job 1904068 16 | -------------------------------------------------------------------------------- /scripts/cpt/test/test_conn.sh: -------------------------------------------------------------------------------- 1 | # !/usr/bin/bash 2 | 3 | # SBATCH --job-name=test_conn 4 | # SBATCH --output=logs/test_conn.log 5 | # SBATCH --error=logs/test_conn.log 6 | 7 | # SBATCH --partition=MoE_T 8 | # SBATCH --ntasks-per-node=1 9 | # SBATCH --cpus-per-task=26 10 | # SBATCH --mem=0 11 | 12 | # SBATCH --nodes=8 13 | # SBATCH --gres=gpu:1 14 | # SBATCH --quotatype=reserved 15 | 16 | # srun -p MoE_T -N8 -n8 --gres=gpu:1 -w HOST-10-140-60-[134,141,163,180-181,184] torchrun --nnodes 8 --nproc_per_node 1 tests/entrypoint/test_conn.py 17 | # $ srun -p MoE_T -N8 -n8 --gres=gpu:1 -w HOST-10-140-60-[134,141,163,180-181,184] bash scripts/cpt/test/test_conn.sh 18 | 19 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 20 | nodes_array=($nodes) 21 | head_node=${nodes_array[0]} 22 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 23 | echo "Node: $head_node" 24 | echo "Node IP: $head_node_ip" 25 | echo "Node list: $nodes" 26 | 27 | torchrun \ 28 | --nnodes ${num_nodes} \ 29 | --nproc_per_node ${num_gpu_per_node} \ 30 | --node_rank $SLURM_NODEID \ 31 | --rdzv_id $RANDOM \ 32 | --rdzv_backend c10d \ 33 | --rdzv_endpoint $head_node:29519 \ 34 | tests/entrypoint/test_conn.py 35 | -------------------------------------------------------------------------------- /scripts/eval/eval_mmlu_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | num_experts=8 # 8 16 8 | num_selects=2 # 2 4 9 | split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 10 | select_type=l2_norm # plain positive l1_norm l2_norm 11 | proj_type=up_proj # gate_proj up_proj 12 | 13 | set_num_selects=2 # manually set the number of experts selected during evaluation 14 | 15 | data_path=/mnt/petrelfs/share_data/quxiaoye 16 | tokenizer_path=${data_path}/models/${llama_size} 17 | data_dir=${data_path}/llama_data/mmlu_data/ 18 | model_path=${data_path}/models/LlamaMoEForCausalLM/${split_type}-${select_type}/${llama_size}_${num_experts}Select${num_selects}-${proj_type} 19 | save_path=${data_path}/eval_mmlu_outputs/${split_type}-${select_type}/${llama_size}_${num_experts}Select${num_selects}-${proj_type}-S${set_num_selects} 20 | 21 | # model_path=${data_path}/models/llama_7B-16Select4-up_proj 22 | # save_path=${data_path}/eval_mmlu_outputs/16select4_16card_bs16_checkpoint15000 23 | 24 | gpus=1 25 | cpus=$((gpus * 16)) 26 | for i in '0' '1' '2' '3'; do 27 | OMP_NUM_THREADS=16 srun --partition=MoE --mpi=pmi2 --gres=gpu:${gpus} -n1 -c ${cpus} --ntasks-per-node=1 --job-name=test --kill-on-bad-exit=1 \ 28 | python -m smoe.entrypoint.eval.eval_mmlu_moe_${i} \ 29 | --data_dir ${data_dir} \ 30 | --save_dir ${save_path} \ 31 | --tokenizer_path ${tokenizer_path} \ 
32 | --model_path ${model_path} \ 33 | --select_num ${set_num_selects} & 34 | sleep 0.5s 35 | done 36 | 37 | wait 38 | chmod -R 755 ${save_path} 39 | -------------------------------------------------------------------------------- /scripts/examples/create_noise_llama_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/ 6 | 7 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 8 | 9 | gpus=1 10 | cpus=16 11 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 12 | python -m smoe.entrypoint.examples.create_noise_llama_moe \ 13 | --tokenizer_path ${tokenizer_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/create_noise_llama_moe_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/ 6 | 7 | model_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification 8 | 9 | gpus=1 10 | cpus=16 11 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 12 | python -m smoe.entrypoint.examples.create_noise_llama_moe_residual \ 13 | --tokenizer_path ${tokenizer_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/create_switch_llama_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 6 | 7 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 8 | 9 | gpus=1 10 | cpus=16 11 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 12 | python -m smoe.entrypoint.examples.create_switch_llama_moe \ 13 | --tokenizer_path ${tokenizer_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/load_llama_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | base_model=llama_7B 6 | 7 | num_experts=16 # 8 16 8 | num_selects=4 # 2 4 9 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 10 | split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 11 | select_type="" # plain positive l2_norm 12 | proj_type=up_proj # gate_proj up_proj 13 | 14 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/${base_model}/ 15 | 16 | if [ "${select_type}" = "" ]; then 17 | 
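# No select_type given: the converted checkpoint directory carries no -<select_type> suffix.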
model_path=/mnt/petrelfs/share_data/quxiaoye/models/${model_type}/${split_type}/${base_model}-${num_experts}Select${num_selects}-${proj_type}/ 18 | else 19 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/${model_type}/${split_type}-${select_type}/${base_model}-${num_experts}Select${num_selects}-${proj_type}/ 20 | fi 21 | 22 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-Prune/Gradient-max-l1_norm-total-feature_grad/llama2_7B-0-0.20Percent-2201Neurons 23 | 24 | gpus=1 25 | cpus=8 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 27 | python -m smoe.entrypoint.examples.load_llama_moe \ 28 | --tokenizer_path ${tokenizer_path} \ 29 | --model_path ${model_path} \ 30 | --model_type ${model_type} 31 | -------------------------------------------------------------------------------- /scripts/examples/load_llama_moe_hf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama-moe-models/LLaMA-MoE-v1-3_0B-2_16 4 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama-moe-models/LLaMA-MoE-v1-3_0B-2_16 5 | 6 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM 7 | 8 | gpus=1 9 | cpus=8 10 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name="☝☝☝" --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 11 | python -m smoe.entrypoint.examples.load_llama_moe_hf \ 12 | --tokenizer_path ${tokenizer_path} \ 13 | --model_path ${model_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/load_llama_moe_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | base_model=llama_13B 6 | 7 | num_experts=15 # 13 14 15 8 | num_experts_residual=1 # 1 2 3 9 | num_selects=3 # 1 2 3 10 | expert_size=864 11 | # 540 1080 2160 4320 8640 12 | # 688 1376 2752 5504 11008 13 | # 864 1728 3456 6912 13824 14 | model_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification 15 | 16 | kernel=l1_norm 17 | criterion=max # min max 18 | accumulate_level=sample # sample total 19 | importance_type=feature_change # feature_grad feature_change 20 | 21 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/${base_model}/ 22 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/${model_type}/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${base_model}-${num_experts}Select${num_selects}-${num_experts_residual}Residuals-${expert_size}Neurons-Share 23 | 24 | gpus=1 25 | cpus=8 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 27 | python -m smoe.entrypoint.examples.load_llama_moe_residual \ 28 | --tokenizer_path ${tokenizer_path} \ 29 | --model_path ${model_path} \ 30 | --model_type ${model_type} 31 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B 
llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # open_llama_7b 6 | # ReluLLaMA-7B 7 | llama_size="ReluLLaMA-7B" 8 | 9 | num_experts=16 # 4 8 16 32 10 | num_selects=4 # 1 2 4 8 11 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 12 | proj_type=gate_proj # gate_proj up_proj 13 | select_type=positive # plain positive l1_norm l2_norm 14 | 15 | use_random_gate="False" # True False 16 | gate_type="mlp" # mlp linear 17 | use_softmax="False" 18 | multiply_gate_scores="False" 19 | 20 | score_scale_factor=1.0 # 1.0 2.0 4.0 8.0 16.0 21 | score_scale_factor_file_path="" 22 | #score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense 23 | 24 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 25 | 26 | data_path=/mnt/petrelfs/share_data/quxiaoye 27 | model_path=${data_path}/models/${llama_size} 28 | split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type} 29 | 30 | if [ ${use_random_gate} = "True" ]; then 31 | select_file_path="" 32 | save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-Scale${score_scale_factor} 33 | else 34 | select_file_path="/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/Clustering-l2/ReluLLaMA-7B-16Expert-Select-MLP-positive-random" 35 | save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-HardBCE 36 | # select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type} 37 | # save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 38 | fi 39 | 40 | gpus=0 41 | cpus=8 42 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 43 | python -m smoe.entrypoint.expert_construction.llama_convert \ 44 | --model_path ${model_path} \ 45 | --split_file_path ${split_file_path} \ 46 | --select_file_path "${select_file_path}" \ 47 | --save_path ${save_path} \ 48 | --template layers.{}.mlp.${proj_type}.weight \ 49 | --num_experts ${num_experts} \ 50 | --num_selects ${num_selects} \ 51 | --use_random_gate ${use_random_gate} \ 52 | --gate_type ${gate_type} \ 53 | --use_softmax ${use_softmax} \ 54 | --multiply_gate_scores ${multiply_gate_scores} \ 55 | --score_scale_factor ${score_scale_factor} \ 56 | --score_scale_factor_file_path "${score_scale_factor_file_path}" \ 57 | --convert_type ${convert_type} 58 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert_gradient.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | share_neurons=False # True False 8 | num_experts=8 # 2 4 8 16 32 9 | num_selects=2 # 1 2 4 10 | expert_size=1376 11 | # 540 1080 2160 4320 8640 12 | # 688 1376 2752 5504 11008 13 | # 864 1728 3456 6912 13824 14 | 15 | score_scale_factor=8.0 # 1.0 2.0 4.0 8.0 16.0 16 | score_scale_factor_file_path="" 17 | 
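# Optionally point this to a file of layer-wise score scale factors (apparently
# produced by scripts/analysis/get_layer_wise_score_scale_factor.sh; see the
# commented examples below) instead of the single score_scale_factor above.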
#score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense 18 | #score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_moe_trained 19 | 20 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 21 | 22 | kernel=l1_norm 23 | criterion=max # min max 24 | accumulate_level=sample # sample total 25 | importance_type=feature_change # feature_grad feature_change 26 | proj_type=up_proj # gate_proj up_proj 27 | 28 | data_path=/mnt/petrelfs/share_data/quxiaoye 29 | model_path=${data_path}/models/${llama_size} 30 | split_file_path=${data_path}/moefication_results/split/${llama_size}-Split-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${num_experts}Experts-${expert_size}Neurons 31 | save_path=${data_path}/models/${convert_type}/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${num_experts}Select${num_selects}-${expert_size}Neurons 32 | 33 | gpus=0 34 | cpus=8 35 | if [ ${share_neurons} = "True" ]; then 36 | split_file_path=${split_file_path}-Share 37 | save_path=${save_path}-Share 38 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 39 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 40 | --model_path ${model_path} \ 41 | --split_file_path ${split_file_path} \ 42 | --select_file_path "" \ 43 | --save_path ${save_path} \ 44 | --template layers.{}.mlp.${proj_type}.weight \ 45 | --num_experts ${num_experts} \ 46 | --num_selects ${num_selects} \ 47 | --score_scale_factor ${score_scale_factor} \ 48 | --score_scale_factor_file_path "${score_scale_factor_file_path}" \ 49 | --convert_type ${convert_type} \ 50 | --use_random_gate True 51 | else 52 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 53 | python -m smoe.entrypoint.expert_construction.llama_convert \ 54 | --model_path ${model_path} \ 55 | --split_file_path ${split_file_path} \ 56 | --select_file_path "" \ 57 | --save_path ${save_path} \ 58 | --template layers.{}.mlp.${proj_type}.weight \ 59 | --num_experts ${num_experts} \ 60 | --num_selects ${num_selects} \ 61 | --score_scale_factor ${score_scale_factor} \ 62 | --score_scale_factor_file_path "${score_scale_factor_file_path}" \ 63 | --convert_type ${convert_type} \ 64 | --use_random_gate True 65 | fi 66 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert_gradient_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | share_neurons=True # True False 8 | num_experts=7 # 7 14 28 9 | num_experts_residual=1 # 1 2 3 4 10 | num_selects=1 # 1 2 3 4 11 | expert_size=1376 12 | # 540 1080 2160 4320 8640 13 | # 688 1376 2752 5504 11008 14 | # 864 1728 3456 6912 13824 15 | 16 | score_scale_factor_residual=1.0 # 4.0 8.0 12.0 16.0 17 | score_scale_factor=4.0 # 4.0 8.0 12.0 16.0 18 | 19 | convert_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification 20 | 
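# These split settings must match the ones used for the gradient split, since
# split_file_path below is derived from them.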
21 | kernel=l1_norm 22 | criterion=max # min max 23 | accumulate_level=sample # sample total 24 | importance_type=feature_change # feature_grad feature_change 25 | proj_type=up_proj # gate_proj up_proj 26 | 27 | data_path=/mnt/petrelfs/share_data/quxiaoye 28 | model_path=${data_path}/models/${llama_size} 29 | split_file_path=${data_path}/moefication_results/split/${llama_size}-Split-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${num_experts}Experts-${num_experts_residual}Residuals-${expert_size}Neurons 30 | save_path=${data_path}/models/${convert_type}/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${num_experts}Select${num_selects}-${num_experts_residual}Residuals-${expert_size}Neurons 31 | if [ ${share_neurons} = "True" ]; then 32 | split_file_path=${split_file_path}-Share 33 | save_path=${save_path}-Share 34 | fi 35 | 36 | gpus=0 37 | cpus=8 38 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 39 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index_residual \ 40 | --model_path ${model_path} \ 41 | --split_file_path ${split_file_path} \ 42 | --select_file_path "" \ 43 | --save_path ${save_path} \ 44 | --template layers.{}.mlp.${proj_type}.weight \ 45 | --num_experts ${num_experts} \ 46 | --num_experts_residual ${num_experts_residual} \ 47 | --num_selects ${num_selects} \ 48 | --score_scale_factor ${score_scale_factor} \ 49 | --score_scale_factor_residual ${score_scale_factor_residual} \ 50 | --convert_type ${convert_type} \ 51 | --use_random_gate True 52 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert_mistral.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/Mistral-7B-v0.1 4 | moe_config_path=/mnt/petrelfs/share_data/quxiaoye/models/Mixtral-8x7B-v0.1 5 | split_file_path=/mnt/petrelfs/share_data/quxiaoye/moefication_results/split/Mistral-7B-v0.1-8Expert-Split-Random 6 | save_path=/mnt/petrelfs/share_data/quxiaoye/models/Mixtral-8x7B-v0.1-Random-8Select2 7 | 8 | template=layers.{}.mlp.up_proj.weight 9 | num_experts=8 10 | num_selects=2 11 | 12 | gpus=0 13 | cpus=8 14 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 15 | python -m smoe.entrypoint.expert_construction.mistral_convert \ 16 | --model_path ${model_path} \ 17 | --moe_config_path ${moe_config_path} \ 18 | --split_file_path ${split_file_path} \ 19 | --save_path ${save_path} \ 20 | --template ${template} \ 21 | --num_experts ${num_experts} \ 22 | --num_selects ${num_selects} 23 | 24 | chmod -R 755 ${save_path} >/dev/null 2>&1 25 | -------------------------------------------------------------------------------- /scripts/expert_construction/get_hidden_features/run_get_hidden_features.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # ReluLLaMA-7B 6 | llama_size="ReluLLaMA-7B" 7 | save_interval=1 8 | batch_size=4 9 | block_size=2048 10 | data_use_percent=0.002 11 | 12 | proj_type=gate_proj # gate_proj up_proj 13 | 14 | data_path=/mnt/petrelfs/share_data/quxiaoye 15 | 
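# Cluster-specific shared storage root; the model, training data, cache, and
# output feature paths below are all derived from it.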
model_path=${data_path}/models/${llama_size} 16 | train_data_path=${data_path}/data/moefication_LLAMA_data 17 | train_data_cache_path=${data_path}/data/moefication_LLAMA_data_cache 18 | save_path=${data_path}/moefication_results/features 19 | 20 | gpus=4 21 | cpus=$((gpus * 16)) 22 | quotatype=auto # auto spot reserved 23 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=${quotatype} \ 24 | torchrun --nproc_per_node=${gpus} -m smoe.entrypoint.expert_construction.llama_get_hidden_features \ 25 | --model_path ${model_path} \ 26 | --train_data_path ${train_data_path} \ 27 | --train_data_cache_path ${train_data_cache_path} \ 28 | --save_path ${save_path} \ 29 | --template layers.{}.mlp.${proj_type}.weight \ 30 | --data_use_percent ${data_use_percent} \ 31 | --save_interval ${save_interval} \ 32 | --batch_size ${batch_size} \ 33 | --block_size ${block_size} 34 | 35 | wait 36 | -------------------------------------------------------------------------------- /scripts/expert_construction/get_hidden_features/run_prepare_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | data_path=/mnt/petrelfs/share_data/quxiaoye 8 | model_path=${data_path}/models/${llama_size} 9 | train_data_path=${data_path}/data/moefication_LLAMA_data 10 | train_data_cache_path=${data_path}/data/moefication_LLAMA_data_cache 11 | 12 | gpus=0 13 | cpus=16 14 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=datasets --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 15 | python -m smoe.entrypoint.expert_construction.llama_prepare_datasets \ 16 | --model_path ${model_path} \ 17 | --train_data_path ${train_data_path} \ 18 | --train_data_cache_path ${train_data_cache_path} 19 | 20 | wait 21 | chmod -R 755 ${train_data_cache_path} >/dev/null 2>&1 22 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | retain_percent=0.20 # 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05 8 | use_grad_sum=True # True False 9 | 10 | if [ ${use_grad_sum} = "True" ]; then 11 | expert_index=All 12 | else 13 | expert_index=0 14 | fi 15 | 16 | criterion=max # min max 17 | kernel=l1_norm # plain l1_norm l2_norm 18 | accumulate_level=sample # sample total 19 | importance_type=feature_change # feature_grad feature_change 20 | proj_type=up_proj # gate_proj up_proj 21 | 22 | data_path=/mnt/petrelfs/share_data/quxiaoye 23 | model_path=${data_path}/models/${llama_size} 24 | grad_file_path=${data_path}/moefication_results/split/Gradients/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 25 | save_path=${data_path}/moefication_results/prune 26 | 27 | gpus=0 28 | cpus=8 29 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 30 | python -m smoe.entrypoint.expert_construction.llama_prune_gradient \ 31 | --model_path 
${model_path} \ 32 | --grad_file_path ${grad_file_path} \ 33 | --save_path ${save_path} \ 34 | --retain_percent ${retain_percent} \ 35 | --expert_index ${expert_index} \ 36 | --template layers.{}.mlp.${proj_type}.weight \ 37 | --kernel ${kernel} \ 38 | --accumulate_level ${accumulate_level} \ 39 | --importance_type ${importance_type} \ 40 | --criterion ${criterion} \ 41 | --use_grad_sum ${use_grad_sum} 42 | 43 | chmod -R 755 ${save_path} >/dev/null 2>&1 44 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | retain_percent=99 # 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05 9 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 10 | echo ${expert_size} 11 | 12 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 13 | use_grad_sum=True # True False 14 | 15 | if [ ${use_grad_sum} = "True" ]; then 16 | expert_index=All 17 | else 18 | expert_index=0 19 | fi 20 | 21 | criterion=min # min max 22 | kernel=l1_norm # plain l1_norm l2_norm 23 | accumulate_level=sample # sample total 24 | importance_type=feature_change # feature_grad feature_change 25 | proj_type=up_proj # gate_proj up_proj 26 | 27 | data_path=/mnt/petrelfs/share_data/quxiaoye 28 | model_path=${data_path}/models/${llama_size} 29 | split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 30 | save_path=${data_path}/models/${convert_type}-Prune/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 31 | 32 | gpus=0 33 | cpus=16 34 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 35 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 36 | --model_path ${model_path} \ 37 | --split_file_path ${split_file_path} \ 38 | --select_file_path "" \ 39 | --save_path ${save_path} \ 40 | --template layers.{}.mlp.${proj_type}.weight \ 41 | --num_experts 1 \ 42 | --num_selects 1 \ 43 | --convert_type ${convert_type} \ 44 | --use_random_gate True 45 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient_convert_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | 9 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 10 | use_grad_sum=True # True False 11 | 12 | if [ ${use_grad_sum} = "True" ]; then 13 | expert_index=All 14 | else 15 | expert_index=0 16 | fi 17 | 18 | criterion=max # min max 19 | kernel=l1_norm # plain l1_norm l2_norm 20 | accumulate_level=sample # sample total 21 | importance_type=feature_change # feature_grad feature_change 22 | proj_type=up_proj # gate_proj up_proj 23 | 24 | 
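# "one4all": the loop below converts a pruned checkpoint for every retain_percent value.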
data_path=/mnt/petrelfs/share_data/quxiaoye 25 | model_path=${data_path}/models/${llama_size} 26 | 27 | gpus=0 28 | cpus=16 29 | for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do 30 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 31 | echo ${expert_size} 32 | split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 33 | save_path=${data_path}/models/${convert_type}-Prune/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 34 | 35 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 36 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 37 | --model_path ${model_path} \ 38 | --split_file_path ${split_file_path} \ 39 | --select_file_path "" \ 40 | --save_path ${save_path} \ 41 | --template layers.{}.mlp.${proj_type}.weight \ 42 | --num_experts 1 \ 43 | --num_selects 1 \ 44 | --convert_type ${convert_type} \ 45 | --use_random_gate True & 46 | sleep 1 47 | done 48 | 49 | wait 50 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | kernel=l1_norm 8 | proj_type=up_proj 9 | 10 | use_grad_sum=True # True False 11 | 12 | if [ ${use_grad_sum} = "True" ]; then 13 | expert_index=All 14 | else 15 | expert_index=0 16 | fi 17 | 18 | data_path=/mnt/petrelfs/share_data/quxiaoye 19 | model_path=${data_path}/models/${llama_size} 20 | save_path=${data_path}/moefication_results/prune 21 | 22 | gpus=0 23 | cpus=8 24 | for retain_percent in 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05; do 25 | for criterion in min max; do 26 | for accumulate_level in sample total; do 27 | for importance_type in feature_grad feature_change; do 28 | grad_file_path=${data_path}/moefication_results/split/Gradients/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 29 | 30 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 31 | python -m smoe.entrypoint.expert_construction.llama_prune_gradient \ 32 | --model_path ${model_path} \ 33 | --grad_file_path ${grad_file_path} \ 34 | --save_path ${save_path} \ 35 | --retain_percent ${retain_percent} \ 36 | --expert_index ${expert_index} \ 37 | --template layers.{}.mlp.${proj_type}.weight \ 38 | --kernel ${kernel} \ 39 | --accumulate_level ${accumulate_level} \ 40 | --importance_type ${importance_type} \ 41 | --criterion ${criterion} \ 42 | --use_grad_sum ${use_grad_sum} & 43 | sleep 1 44 | 45 | done 46 | done 47 | done 48 | done 49 | 50 | wait 51 | chmod -R 755 ${save_path} >/dev/null 2>&1 52 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B 
llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | retain_percent=0.05 # 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05 8 | proj_type=up_proj # gate_proj up_proj 9 | 10 | data_path=/mnt/petrelfs/share_data/quxiaoye 11 | model_path=${data_path}/models/${llama_size} 12 | save_path=${data_path}/moefication_results/prune 13 | 14 | gpus=0 15 | cpus=8 16 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 17 | python -m smoe.entrypoint.expert_construction.llama_prune_random \ 18 | --model_path ${model_path} \ 19 | --save_path ${save_path} \ 20 | --retain_percent ${retain_percent} \ 21 | --template layers.{}.mlp.${proj_type}.weight 22 | 23 | chmod -R 755 ${save_path} >/dev/null 2>&1 24 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | retain_percent=99 # 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05 9 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 10 | echo ${expert_size} 11 | 12 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 13 | proj_type=up_proj # gate_proj up_proj 14 | 15 | data_path=/mnt/petrelfs/share_data/quxiaoye 16 | model_path=${data_path}/models/${llama_size} 17 | split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Random/0.${retain_percent}Percent-${expert_size}Neurons 18 | save_path=${data_path}/models/${convert_type}-Prune/Random/${llama_size}-0.${retain_percent}Percent-${expert_size}Neurons 19 | 20 | gpus=0 21 | cpus=16 22 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 23 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 24 | --model_path ${model_path} \ 25 | --split_file_path ${split_file_path} \ 26 | --select_file_path "" \ 27 | --save_path ${save_path} \ 28 | --template layers.{}.mlp.${proj_type}.weight \ 29 | --num_experts 1 \ 30 | --num_selects 1 \ 31 | --convert_type ${convert_type} \ 32 | --use_random_gate True 33 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random_convert_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | 9 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 10 | proj_type=up_proj # gate_proj up_proj 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | model_path=${data_path}/models/${llama_size} 14 | 15 | gpus=0 16 | cpus=16 17 | for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do 18 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 19 | echo ${expert_size} 20 | 
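# For reference (values taken from this script's own settings): bash $(( )) uses integer arithmetic,
# so with intermediate_size=11008 and retain_percent=25 the computation above gives
# expert_size = 25 * 11008 / 100 = 2752, i.e. 2752 of the 11008 MLP neurons are kept per layer
# (hence directory suffixes like "0.25Percent-2752Neurons" below).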
split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Random/0.${retain_percent}Percent-${expert_size}Neurons 21 | save_path=${data_path}/models/${convert_type}-Prune/Random/${llama_size}-0.${retain_percent}Percent-${expert_size}Neurons 22 | 23 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 24 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 25 | --model_path ${model_path} \ 26 | --split_file_path ${split_file_path} \ 27 | --select_file_path "" \ 28 | --save_path ${save_path} \ 29 | --template layers.{}.mlp.${proj_type}.weight \ 30 | --num_experts 1 \ 31 | --num_selects 1 \ 32 | --convert_type ${convert_type} \ 33 | --use_random_gate True & 34 | sleep 1 35 | done 36 | 37 | wait 38 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | proj_type=up_proj # gate_proj up_proj 8 | 9 | data_path=/mnt/petrelfs/share_data/quxiaoye 10 | model_path=${data_path}/models/${llama_size} 11 | save_path=${data_path}/moefication_results/prune 12 | 13 | gpus=0 14 | cpus=8 15 | for retain_percent in 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05; do 16 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 17 | python -m smoe.entrypoint.expert_construction.llama_prune_random \ 18 | --model_path ${model_path} \ 19 | --save_path ${save_path} \ 20 | --retain_percent ${retain_percent} \ 21 | --template layers.{}.mlp.${proj_type}.weight & 22 | sleep 1 23 | done 24 | 25 | wait 26 | chmod -R 755 ${save_path} >/dev/null 2>&1 27 | -------------------------------------------------------------------------------- /scripts/expert_construction/select/run_select.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # ReluLLaMA-7B 6 | llama_size="ReluLLaMA-7B" 7 | 8 | num_experts=16 # 8 16 9 | num_selects=4 # 2 4 10 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 11 | select_type=positive # plain positive l1_norm l2_norm 12 | mlp_init_criterion=random # weight random 13 | proj_type=gate_proj # gate_proj up_proj 14 | 15 | use_balance="False" 16 | balance_loss_lambda=0.0 # 0.0001 17 | add_noise="False" 18 | use_softmax="False" 19 | 20 | data_use_percent=1.0 # 1.0 0.71 0.43 21 | train_percent=0.97 22 | batch_size=1024 23 | epochs=800 24 | lr=0.5 25 | 26 | data_path=/mnt/petrelfs/share_data/quxiaoye 27 | model_path=${data_path}/models/${llama_size} 28 | split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type} 29 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 30 | save_path=${data_path}/moefication_results/select/${split_type} 31 | 32 | 
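# Note: judging by the arguments passed below, llama_select_mlp trains a small per-layer gate
# (selector) MLP on the cached hidden features in ${hidden_features_path}, learning to pick
# ${num_selects} of the ${num_experts} experts defined by ${split_file_path}; the batch_size /
# epochs / lr values above are hyperparameters of this gate training only, not of full-model training.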
save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}-${mlp_init_criterion}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 33 | 34 | gpus=1 35 | cpus=16 36 | for specify_layer in "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31"; do # 并行启用任务 37 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=select --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 38 | python -m smoe.entrypoint.expert_construction.llama_select_mlp \ 39 | --model_path ${model_path} \ 40 | --split_file_path ${split_file_path} \ 41 | --hidden_features_path ${hidden_features_path} \ 42 | --save_path ${save_path} \ 43 | --save_visualization_path ${save_visualization_path} \ 44 | --specify_layer ${specify_layer} \ 45 | --template layers.{}.mlp.${proj_type}.weight \ 46 | --num_experts ${num_experts} \ 47 | --num_selects ${num_selects} \ 48 | --select_criterion ${select_type} \ 49 | --mlp_init_criterion ${mlp_init_criterion} \ 50 | --use_balance ${use_balance} \ 51 | --balance_loss_lambda ${balance_loss_lambda} \ 52 | --add_noise ${add_noise} \ 53 | --use_softmax ${use_softmax} \ 54 | --data_use_percent ${data_use_percent} \ 55 | --train_percent ${train_percent} \ 56 | --batch_size ${batch_size} \ 57 | --epochs ${epochs} \ 58 | --lr ${lr} & # 并行运行下一命令 59 | sleep 1 60 | done 61 | # "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31" 62 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 63 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 64 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 65 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 66 | wait 67 | -------------------------------------------------------------------------------- /scripts/expert_construction/select/run_select_multiprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | num_experts=16 # 8 16 8 | num_selects=4 # 2 4 9 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 10 | select_type=l2_norm # plain positive l1_norm l2_norm 11 | proj_type=gate_proj # gate_proj up_proj 12 | 13 | data_use_percent=1.0 14 | train_percent=0.95 15 | batch_size=1024 16 | epochs=200 17 | lr=0.01 18 | 19 | data_path=/mnt/petrelfs/share_data/quxiaoye 20 | model_path=${data_path}/models/${llama_size} 21 | split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type} 22 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 23 | save_path=${data_path}/moefication_results/select/${split_type} 24 | 25 | save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 26 | 27 | gpus=4 28 | cpus=$((gpus * 16)) 29 | for specify_layer in "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31"; do # 
并行启用任务 30 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=select --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 31 | python -m smoe.entrypoint.expert_construction.llama_select_mlp_multiprocess \ 32 | --model_path ${model_path} \ 33 | --split_file_path ${split_file_path} \ 34 | --hidden_features_path ${hidden_features_path} \ 35 | --save_path ${save_path} \ 36 | --save_visualization_path ${save_visualization_path} \ 37 | --specify_layer ${specify_layer} \ 38 | --template layers.{}.mlp.${proj_type}.weight \ 39 | --num_experts ${num_experts} \ 40 | --num_selects ${num_selects} \ 41 | --select_criterion ${select_type} \ 42 | --num_threads ${gpus} \ 43 | --use_softmax \ 44 | --data_use_percent ${data_use_percent} \ 45 | --train_percent ${train_percent} \ 46 | --batch_size ${batch_size} \ 47 | --epochs ${epochs} \ 48 | --lr ${lr} & # 并行运行下一命令 49 | sleep 0.5 # 等待0.5s 50 | done 51 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 52 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 53 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 54 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 55 | 56 | wait 57 | chmod -R 755 ${save_path} >/dev/null 2>&1 58 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # ReluLLaMA-7B 6 | llama_size="ReluLLaMA-7B" 7 | 8 | num_experts=16 # 8 16 9 | metric=l2 # l2 cos 10 | proj_type=up_proj # gate_proj up_proj 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | model_path=${data_path}/models/${llama_size} 14 | save_path=${data_path}/moefication_results/split 15 | 16 | gpus=0 17 | cpus=16 18 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 19 | python -m smoe.entrypoint.expert_construction.llama_split_clustering \ 20 | --model_path ${model_path} \ 21 | --save_path ${save_path} \ 22 | --template layers.{}.mlp.${proj_type}.weight \ 23 | --num_experts ${num_experts} \ 24 | --metric ${metric} \ 25 | --cpu_threads ${cpus} 26 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_3B" 6 | 7 | share_neurons=True # True False 8 | expert_num=4 9 | 10 | #intermediate_size=8640 # 8640 11008 13824 11 | #scale_factor=4 12 | #expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${expert_num}) 13 | 14 | expert_size=8640 15 | # 540 1080 2160 4320 8640 16 | # 688 1376 2752 5504 11008 17 | # 864 1728 3456 6912 13824 18 | 19 | echo ${expert_num} ${expert_size} ${share_neurons} 20 | 21 | kernel=l1_norm 22 | accumulate_level=sample # sample total 23 | importance_type=feature_change # feature_grad feature_change 24 | criterion=max # min max 25 | proj_type=up_proj 26 | 27 | 
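# Note on the expert_size candidates listed above: 540/1080/2160/4320/8640, 688/1376/2752/5504/11008
# and 864/1728/3456/6912/13824 are 1/16, 1/8, 1/4, 1/2 and the full value of the three intermediate
# sizes (8640, 11008, 13824) named in the commented-out intermediate_size line, so expert_size
# directly sets how many MLP neurons each of the ${expert_num} experts receives.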
data_path=/mnt/petrelfs/share_data/quxiaoye 28 | model_path=${data_path}/models/${llama_size} 29 | score_file_path=${data_path}/moefication_results/split/Gradients${expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 30 | save_path=${data_path}/moefication_results/split 31 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${expert_num}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 32 | 33 | gpus=0 34 | cpus=8 35 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 36 | python -m smoe.entrypoint.expert_construction.llama_split_gradient \ 37 | --model_path ${model_path} \ 38 | --score_file_path ${score_file_path} \ 39 | --save_path ${save_path} \ 40 | --visualization_path ${visualization_path} \ 41 | --expert_num ${expert_num} \ 42 | --expert_size ${expert_size} \ 43 | --template layers.{}.mlp.${proj_type}.weight \ 44 | --kernel ${kernel} \ 45 | --accumulate_level ${accumulate_level} \ 46 | --importance_type ${importance_type} \ 47 | --criterion ${criterion} \ 48 | --share_neurons ${share_neurons} 49 | 50 | chmod -R 755 ${save_path} >/dev/null 2>&1 51 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | echo ${llama_size} 7 | 8 | intermediate_size=11008 # 8640 11008 13824 9 | expert_num_list=(8) 10 | expert_size_list=(1376 2752 5504 11008) 11 | # 540 1080 2160 4320 8640 12 | # 688 1376 2752 5504 11008 13 | # 864 1728 3456 6912 13824 14 | 15 | kernel=l1_norm 16 | accumulate_level=sample # sample total 17 | importance_type=feature_change # feature_grad feature_change 18 | criterion=max # min max 19 | proj_type=up_proj 20 | 21 | data_path=/mnt/petrelfs/share_data/quxiaoye 22 | model_path=${data_path}/models/${llama_size} 23 | save_path=${data_path}/moefication_results/split 24 | 25 | gpus=0 26 | cpus=8 27 | 28 | share_neurons=True 29 | for expert_num in "${expert_num_list[@]}"; do 30 | for expert_size in "${expert_size_list[@]}"; do 31 | echo ${expert_num} ${expert_size} ${share_neurons} 32 | score_file_path=${data_path}/moefication_results/split/Gradients${expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 33 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${expert_num}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 34 | 35 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 36 | python -m smoe.entrypoint.expert_construction.llama_split_gradient \ 37 | --model_path ${model_path} \ 38 | --score_file_path ${score_file_path} \ 39 | --save_path ${save_path} \ 40 | --visualization_path ${visualization_path} \ 41 | --expert_num ${expert_num} \ 42 | --expert_size ${expert_size} \ 43 | --template layers.{}.mlp.${proj_type}.weight \ 44 | --kernel ${kernel} \ 45 | --accumulate_level ${accumulate_level} \ 46 | --importance_type ${importance_type} \ 47 | 
--criterion ${criterion} \ 48 | --share_neurons ${share_neurons} & 49 | sleep 1 50 | done 51 | done 52 | 53 | scale_factor=1 54 | share_neurons=False 55 | for expert_num in "${expert_num_list[@]}"; do 56 | expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${expert_num}) 57 | echo ${expert_num} ${expert_size} ${share_neurons} 58 | 59 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 60 | python -m smoe.entrypoint.expert_construction.llama_split_gradient \ 61 | --model_path ${model_path} \ 62 | --score_file_path ${score_file_path} \ 63 | --save_path ${save_path} \ 64 | --expert_num ${expert_num} \ 65 | --expert_size ${expert_size} \ 66 | --template layers.{}.mlp.${proj_type}.weight \ 67 | --kernel ${kernel} \ 68 | --accumulate_level ${accumulate_level} \ 69 | --importance_type ${importance_type} \ 70 | --criterion ${criterion} \ 71 | --share_neurons ${share_neurons} 72 | done 73 | 74 | wait 75 | chmod -R 755 ${save_path} >/dev/null 2>&1 76 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | share_neurons=False # True False 8 | expert_num_moe=7 9 | expert_num_residual=1 10 | total_expert_num=$((${expert_num_moe} + ${expert_num_residual})) 11 | 12 | #intermediate_size=8640 # 8640 11008 13824 13 | #scale_factor=1 14 | #expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${total_expert_num}) 15 | 16 | expert_size=1376 17 | # 540 1080 2160 4320 8640 18 | # 688 1376 2752 5504 11008 19 | # 864 1728 3456 6912 13824 20 | 21 | echo ${total_expert_num}\(${expert_num_moe}+${expert_num_residual}\) ${expert_size} ${share_neurons} 22 | 23 | kernel=l1_norm 24 | accumulate_level=sample # sample total 25 | importance_type=feature_change # feature_grad feature_change 26 | criterion=max # min max 27 | proj_type=up_proj 28 | 29 | data_path=/mnt/petrelfs/share_data/quxiaoye 30 | model_path=${data_path}/models/${llama_size} 31 | score_file_path=${data_path}/moefication_results/split/Gradients${total_expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 32 | save_path=${data_path}/moefication_results/split 33 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_expert_num}-${expert_num_residual}residual-${expert_num_moe}moe/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 34 | 35 | gpus=0 36 | cpus=8 37 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 38 | python -m smoe.entrypoint.expert_construction.llama_split_gradient_residual \ 39 | --model_path ${model_path} \ 40 | --score_file_path ${score_file_path} \ 41 | --save_path ${save_path} \ 42 | --visualization_path ${visualization_path} \ 43 | --expert_num_moe ${expert_num_moe} \ 44 | --expert_num_residual ${expert_num_residual} \ 45 | --expert_size ${expert_size} \ 46 | --template layers.{}.mlp.${proj_type}.weight \ 47 | --kernel ${kernel} \ 48 | --accumulate_level ${accumulate_level} \ 49 | --importance_type ${importance_type} \ 50 | 
--criterion ${criterion} \ 51 | --share_neurons ${share_neurons} 52 | 53 | chmod -R 755 ${save_path} >/dev/null 2>&1 54 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient_residual_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | echo ${llama_size} 7 | 8 | expert_num_moe_list=(13 14 15) 9 | expert_num_residual_list=(3 2 1) 10 | 11 | #intermediate_size=8640 # 8640 11008 13824 12 | #scale_factor=1 13 | #expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${total_expert_num}) 14 | 15 | expert_size=864 16 | # 540 1080 2160 4320 8640 17 | # 688 1376 2752 5504 11008 18 | # 864 1728 3456 6912 13824 19 | 20 | kernel=l1_norm 21 | accumulate_level=sample # sample total 22 | importance_type=feature_change # feature_grad feature_change 23 | criterion=max # min max 24 | proj_type=up_proj 25 | 26 | data_path=/mnt/petrelfs/share_data/quxiaoye 27 | model_path=${data_path}/models/${llama_size} 28 | save_path=${data_path}/moefication_results/split 29 | 30 | gpus=0 31 | cpus=8 32 | for idx in "${!expert_num_moe_list[@]}"; do 33 | expert_num_moe=${expert_num_moe_list[${idx}]} 34 | expert_num_residual=${expert_num_residual_list[${idx}]} 35 | total_expert_num=$((${expert_num_moe} + ${expert_num_residual})) 36 | score_file_path=${data_path}/moefication_results/split/Gradients${total_expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 37 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_expert_num}-${expert_num_residual}residual-${expert_num_moe}moe/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 38 | 39 | for share_neurons in "True" "False"; do 40 | echo ${total_expert_num}\(${expert_num_moe}+${expert_num_residual}\) ${expert_size} ${share_neurons} 41 | 42 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 43 | python -m smoe.entrypoint.expert_construction.llama_split_gradient_residual \ 44 | --model_path ${model_path} \ 45 | --score_file_path ${score_file_path} \ 46 | --save_path ${save_path} \ 47 | --visualization_path ${visualization_path} \ 48 | --expert_num_moe ${expert_num_moe} \ 49 | --expert_num_residual ${expert_num_residual} \ 50 | --expert_size ${expert_size} \ 51 | --template layers.{}.mlp.${proj_type}.weight \ 52 | --kernel ${kernel} \ 53 | --accumulate_level ${accumulate_level} \ 54 | --importance_type ${importance_type} \ 55 | --criterion ${criterion} \ 56 | --share_neurons ${share_neurons} & 57 | sleep 1.0 58 | done 59 | done 60 | 61 | chmod -R 755 ${save_path} >/dev/null 2>&1 62 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_graph.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size=llama_13B 6 | 7 | num_experts=16 # 8 16 8 | metric=l1_norm # l1_norm l2_norm plain 9 | proj_type=up_proj # gate_proj up_proj 10 | threshold=1 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | 
model_path=${data_path}/models/${llama_size} 14 | save_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-Graph-${metric}/ 15 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 16 | 17 | gpus=0 18 | cpus=16 19 | 20 | # STEP1 21 | 22 | for specify_layer in {0..39}; do 23 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 24 | python -m smoe.entrypoint.expert_construction.llama_split_graph \ 25 | --model_path ${model_path} \ 26 | --save_path ${save_path} \ 27 | --specify_layer ${specify_layer} \ 28 | --template layers.{}.mlp.${proj_type}.weight \ 29 | --num_experts ${num_experts} \ 30 | --threshold ${threshold} \ 31 | --metric ${metric} \ 32 | --hidden_features_path ${hidden_features_path} & 33 | sleep 0.7 34 | done 35 | wait 36 | 37 | # STEP2 38 | 39 | gpmetis_run=/mnt/petrelfs/share_data/quxiaoye/metis_for_graph_split/bin/gpmetis 40 | template1=layers. 41 | template2=.mlp.${proj_type}.weight 42 | 43 | for layer in {0..39}; do 44 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 45 | ${gpmetis_run} ${save_path}/${template1}${layer}${template2} ${num_experts} & 46 | sleep 0.7 47 | done 48 | wait 49 | 50 | # STEP3 51 | 52 | template3=.part.${num_experts} 53 | 54 | for layer in {0..39}; do 55 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 56 | python -m smoe.entrypoint.expert_construction.llama_split_graph_trans_gp \ 57 | --gpmetised_file_path ${save_path}/${template1}${layer}${template2}${template3} & 58 | sleep 0.7 59 | done 60 | wait 61 | 62 | chmod -R 755 ${save_path} >/dev/null 2>&1 63 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # open_llama_7b 6 | # Mistral-7B-v0.1 7 | # ReluLLaMA-7B 8 | llama_size="ReluLLaMA-7B" 9 | 10 | num_experts=16 # 8 16 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | model_path=${data_path}/models/${llama_size} 14 | save_path=${data_path}/moefication_results/split 15 | 16 | gpus=0 17 | cpus=8 18 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 19 | python -m smoe.entrypoint.expert_construction.llama_split_random \ 20 | --model_path ${model_path} \ 21 | --save_path ${save_path} \ 22 | --template layers.{}.mlp.gate_proj.weight \ 23 | --num_experts ${num_experts} 24 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_random_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # open_llama_7b 6 | llama_size="open_llama_7b" 7 | 8 | data_path=/mnt/petrelfs/share_data/quxiaoye 9 | model_path=${data_path}/models/${llama_size} 10 | save_path=${data_path}/moefication_results/split 11 | 12 | gpus=0 13 | cpus=8 
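# The nested loops below launch one background srun job per (num_experts, proj_type) combination,
# i.e. random splits into 4/8/16/32 experts for both the gate_proj and up_proj templates; the
# trailing `wait` keeps the script alive until every job finishes before permissions on save_path
# are relaxed.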
14 | for num_experts in 4 8 16 32; do 15 | for proj_type in "gate_proj" "up_proj"; do 16 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 17 | python -m smoe.entrypoint.expert_construction.llama_split_random \ 18 | --model_path ${model_path} \ 19 | --save_path ${save_path} \ 20 | --template layers.{}.mlp.${proj_type}.weight \ 21 | --num_experts ${num_experts} & 22 | sleep 0.7 23 | done 24 | done 25 | 26 | wait 27 | chmod -R 755 ${save_path} >/dev/null 2>&1 28 | -------------------------------------------------------------------------------- /scripts/sft/2_16.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=llama_moe_2_16_deita 4 | #SBATCH --output=logs/%x-%j.log 5 | #SBATCH --error=logs/%x-%j.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=16 10 | #SBATCH --mem=64G 11 | 12 | #SBATCH --nodes=1 13 | #SBATCH --gres=gpu:4 14 | #SBATCH --quotatype=auto 15 | 16 | export WANDB_PROJECT="llama_moe_sft" 17 | num_gpus=4 18 | 19 | { 20 | task_name="llama_moe_2_16_deita" 21 | model_type="auto" 22 | model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_0B-2_16" 23 | dataset_dir_or_path="data/deita/deita_6k.jsonl" 24 | 25 | comment="llama-moe 2/16, deita, w/ balance loss, w/ freeze gate, w/ gate noise" 26 | base_dir="outputs/llama_moe_sft" 27 | output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" 28 | mkdir -p $output_dir 29 | scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh 30 | git diff > $output_dir/diff.patch 31 | env > $output_dir/env 32 | echo -e "Job ID: ${SLURM_JOB_ID}\n\nLog: logs/llama_moe_2_16_deita-$SLURM_JOB_ID.log\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt 33 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $output_dir/log.log 34 | echo "$SLURM_JOB_ID" > $base_dir/latest.jobid 35 | ln -snf $output_dir $base_dir/latest.dir 36 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log 37 | 38 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 39 | nodes_array=($nodes) 40 | head_node=${nodes_array[0]} 41 | echo "Node: $head_node" 42 | 43 | torchrun \ 44 | --nnodes 1 \ 45 | --nproc_per_node $num_gpus \ 46 | --node_rank $SLURM_NODEID \ 47 | --rdzv_id $RANDOM \ 48 | --rdzv_backend c10d \ 49 | --rdzv_endpoint $head_node:29522 \ 50 | -m smoe.entrypoint.sft.train_sft \ 51 | --do_train \ 52 | --freeze_gate True \ 53 | --evaluation_strategy no \ 54 | --run_name $task_name \ 55 | --model_type $model_type \ 56 | --model_name_or_path $model_name_or_path \ 57 | --dataset_dir_or_path $dataset_dir_or_path \ 58 | --output_dir $output_dir \ 59 | --deepspeed conf/ds_bf16_zero1.json \ 60 | --seed 12306 \ 61 | --bf16 True \ 62 | --tf32 True \ 63 | --torch_dtype bfloat16 \ 64 | --per_device_train_batch_size 4 \ 65 | --per_device_eval_batch_size 4 \ 66 | --gradient_accumulation_steps 8 \ 67 | --num_train_epochs 2 \ 68 | --save_strategy steps \ 69 | --save_steps 9999999999999 \ 70 | --save_total_limit 1 \ 71 | --learning_rate 2e-5 \ 72 | --weight_decay 0. 
\ 73 | --warmup_ratio 0.03 \ 74 | --lr_scheduler_type cosine \ 75 | --logging_steps 1 \ 76 | --model_max_length 2048 \ 77 | --gradient_checkpointing True \ 78 | --report_to wandb 79 | 80 | } 81 | -------------------------------------------------------------------------------- /scripts/sft/2_8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=llama_moe_2_8_deita 4 | #SBATCH --output=logs/%x-%j.log 5 | #SBATCH --error=logs/%x-%j.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=16 10 | #SBATCH --mem=64G 11 | 12 | #SBATCH --nodes=1 13 | #SBATCH --gres=gpu:4 14 | #SBATCH --quotatype=auto 15 | 16 | export WANDB_PROJECT="llama_moe_sft" 17 | num_gpus=4 18 | 19 | { 20 | task_name="llama_moe_2_8_deita" 21 | model_type="auto" 22 | model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" 23 | dataset_dir_or_path="data/deita/deita_6k.jsonl" 24 | 25 | comment="llama-moe 2/8, deita, w/ balance loss, w/ freeze gate, w/ gate noise" 26 | base_dir="outputs/llama_moe_sft" 27 | output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" 28 | mkdir -p $output_dir 29 | scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh 30 | git diff > $output_dir/diff.patch 31 | env > $output_dir/env 32 | echo -e "Job ID: ${SLURM_JOB_ID}\n\nLog: logs/llama_moe_2_8_deita-$SLURM_JOB_ID.log\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt 33 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $output_dir/log.log 34 | echo "$SLURM_JOB_ID" > $base_dir/latest.jobid 35 | ln -snf $output_dir $base_dir/latest.dir 36 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log 37 | 38 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 39 | nodes_array=($nodes) 40 | head_node=${nodes_array[0]} 41 | echo "Node: $head_node" 42 | 43 | torchrun \ 44 | --nnodes 1 \ 45 | --nproc_per_node $num_gpus \ 46 | --node_rank $SLURM_NODEID \ 47 | --rdzv_id $RANDOM \ 48 | --rdzv_backend c10d \ 49 | --rdzv_endpoint $head_node:29522 \ 50 | -m smoe.entrypoint.sft.train_sft \ 51 | --do_train \ 52 | --freeze_gate True \ 53 | --evaluation_strategy no \ 54 | --run_name $task_name \ 55 | --model_type $model_type \ 56 | --model_name_or_path $model_name_or_path \ 57 | --dataset_dir_or_path $dataset_dir_or_path \ 58 | --output_dir $output_dir \ 59 | --deepspeed conf/deepspeed/bf16_zero1.json \ 60 | --seed 12306 \ 61 | --bf16 True \ 62 | --tf32 True \ 63 | --torch_dtype bfloat16 \ 64 | --per_device_train_batch_size 4 \ 65 | --per_device_eval_batch_size 4 \ 66 | --gradient_accumulation_steps 8 \ 67 | --num_train_epochs 2 \ 68 | --save_strategy steps \ 69 | --save_steps 9999999999999 \ 70 | --save_total_limit 1 \ 71 | --learning_rate 2e-5 \ 72 | --weight_decay 0. 
\ 73 | --warmup_ratio 0.03 \ 74 | --lr_scheduler_type cosine \ 75 | --logging_steps 1 \ 76 | --model_max_length 2048 \ 77 | --gradient_checkpointing True \ 78 | --report_to wandb 79 | 80 | } 81 | -------------------------------------------------------------------------------- /scripts/sft/4_16.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=llama_moe_4_16_deita 4 | #SBATCH --output=logs/%x-%j.log 5 | #SBATCH --error=logs/%x-%j.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=16 10 | #SBATCH --mem=64G 11 | 12 | #SBATCH --nodes=1 13 | #SBATCH --gres=gpu:4 14 | #SBATCH --quotatype=auto 15 | 16 | export WANDB_PROJECT="llama_moe_sft" 17 | num_gpus=4 18 | 19 | { 20 | task_name="llama_moe_4_16_deita" 21 | model_type="auto" 22 | model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-4_16-new" 23 | dataset_dir_or_path="data/deita/deita_6k.jsonl" 24 | 25 | comment="llama-moe 4/16, deita, w/ balance loss, w/ freeze gate, w/ gate noise" 26 | base_dir="outputs/llama_moe_sft" 27 | output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" 28 | mkdir -p $output_dir 29 | scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh 30 | git diff > $output_dir/diff.patch 31 | env > $output_dir/env 32 | echo -e "Job ID: ${SLURM_JOB_ID}\n\nLog: logs/llama_moe_4_16_deita-$SLURM_JOB_ID.log\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt 33 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $output_dir/log.log 34 | echo "$SLURM_JOB_ID" > $base_dir/latest.jobid 35 | ln -snf $output_dir $base_dir/latest.dir 36 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log 37 | 38 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 39 | nodes_array=($nodes) 40 | head_node=${nodes_array[0]} 41 | echo "Node: $head_node" 42 | 43 | torchrun \ 44 | --nnodes 1 \ 45 | --nproc_per_node $num_gpus \ 46 | --node_rank $SLURM_NODEID \ 47 | --rdzv_id $RANDOM \ 48 | --rdzv_backend c10d \ 49 | --rdzv_endpoint $head_node:29522 \ 50 | -m smoe.entrypoint.sft.train_sft \ 51 | --do_train \ 52 | --freeze_gate True \ 53 | --evaluation_strategy no \ 54 | --run_name $task_name \ 55 | --model_type $model_type \ 56 | --model_name_or_path $model_name_or_path \ 57 | --dataset_dir_or_path $dataset_dir_or_path \ 58 | --output_dir $output_dir \ 59 | --deepspeed conf/ds_bf16_zero1.json \ 60 | --seed 12306 \ 61 | --bf16 True \ 62 | --tf32 True \ 63 | --torch_dtype bfloat16 \ 64 | --per_device_train_batch_size 4 \ 65 | --per_device_eval_batch_size 4 \ 66 | --gradient_accumulation_steps 8 \ 67 | --num_train_epochs 2 \ 68 | --save_strategy steps \ 69 | --save_steps 9999999999999 \ 70 | --save_total_limit 1 \ 71 | --learning_rate 2e-5 \ 72 | --weight_decay 0. 
\ 73 | --warmup_ratio 0.03 \ 74 | --lr_scheduler_type cosine \ 75 | --logging_steps 1 \ 76 | --model_max_length 2048 \ 77 | --gradient_checkpointing True \ 78 | --report_to wandb 79 | 80 | } 81 | -------------------------------------------------------------------------------- /scripts/test/test_args.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | templates="1234567" 4 | 5 | gpus=0 6 | cpus=1 7 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 8 | python -m tests.utils.test_args \ 9 | --t ${templates} 10 | -------------------------------------------------------------------------------- /scripts/test/test_conn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=test_conn 4 | #SBATCH --output=logs/%x.log 5 | #SBATCH --error=logs/%x.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=64 10 | #SBATCH --mem=0 11 | 12 | #SBATCH --nodes=3 13 | #SBATCH --gres=gpu:8 14 | #SBATCH --quotatype=reserved 15 | 16 | export OMP_NUM_THREADS=4 17 | 18 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 19 | nodes_array=($nodes) 20 | head_node=${nodes_array[0]} 21 | 22 | srun torchrun \ 23 | --nnodes 3 \ 24 | --nproc_per_node 8 \ 25 | --node_rank $SLURM_NODEID \ 26 | --rdzv_id $RANDOM \ 27 | --rdzv_backend c10d \ 28 | --rdzv_endpoint $head_node:29520 \ 29 | tests/entrypoint/test_conn.py 30 | -------------------------------------------------------------------------------- /scripts/tokenize/clustering.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -vx 4 | 5 | tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 6 | data_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32 7 | out_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32_tokenized 8 | logs_dir=logs 9 | 10 | mkdir -p $out_dir 11 | mkdir -p $logs_dir 12 | 13 | # for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github 14 | for data_type in $(ls $data_dir) 15 | do 16 | log_path=logs/tokenize_${data_type}_32clusters.log 17 | nohup srun -p MoE -N1 -n1 --cpus-per-task=32 \ 18 | python -m smoe.utils.tokenize \ 19 | -f jsonl \ 20 | -t $tokenizer_dir \ 21 | -i $data_dir/$data_type \ 22 | -o $out_dir/$data_type \ 23 | 1>${log_path} 2>&1 & 24 | echo "$data_type > $log_path" 25 | done 26 | -------------------------------------------------------------------------------- /scripts/tokenize/lines.sh: -------------------------------------------------------------------------------- 1 | # srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_8/3.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/8clusters/3.jsonl 2 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/5.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/5.jsonl 1>logs/tokenize_32_5.log 2>&1 & 3 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/7.jsonl -o 
/mnt/petrelfs/share_data/quxiaoye/data/32clusters/7.jsonl 1>logs/tokenize_32_7.log 2>&1 & 4 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/8.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/8.jsonl 1>logs/tokenize_32_8.log 2>&1 & 5 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/12.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/12.jsonl 1>logs/tokenize_32_12.log 2>&1 & 6 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/26.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/26.jsonl 1>logs/tokenize_32_26.log 2>&1 & 7 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/31.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/31.jsonl 1>logs/tokenize_32_31.log 2>&1 & 8 | -------------------------------------------------------------------------------- /scripts/tokenize/redpajama.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -vx 4 | 5 | tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 6 | data_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data 7 | out_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed 8 | 9 | # tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_3B 10 | # data_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples 11 | # out_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized 12 | 13 | # tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 14 | # data_dir=/mnt/petrelfs/zhutong/lm-evaluation-harness-b281b0921b636bc36ad05c0b0b0763bd6dd43463/val_set/final 15 | # out_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized 16 | 17 | logs_dir=logs 18 | 19 | mkdir -p $logs_dir 20 | 21 | # for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github 22 | for data_type in $(ls $data_dir) 23 | do 24 | log_path=logs/tokenize_$data_type.log 25 | nohup srun -p MoE -N1 -n1 --cpus-per-task=32 \ 26 | python -m smoe.utils.tokenize \ 27 | -f jsonl \ 28 | -t $tokenizer_dir \ 29 | -i $data_dir/$data_type \ 30 | -o $out_dir/$data_type \ 31 | 1>$logs_dir/tokenize_$data_type.log 2>&1 & 32 | echo "$data_type > $logs_dir/tokenize_$data_type.log" 33 | done 34 | -------------------------------------------------------------------------------- /scripts/tokenize/slimpajama_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # set -vx 4 | 5 | content_column=input_ids 6 | src_tokenizer_dir=/mnt/petrelfs/share_data/zhutong/models/llama2_7B 7 | tokenizer_dir=/mnt/petrelfs/share_data/zhutong/models/Mistral-7B-v0.1 8 | 9 | data_dir=/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_llama_middle_parts 10 | out_dir=/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_mistral_middle_parts 11 | # data_dir=/mnt/petrelfs/share_data/zhutong/data/llama1_7B_val_set_tokenized 12 | # 
out_dir=/mnt/petrelfs/share_data/zhutong/data/mixtral_val_set_tokenized 13 | 14 | 15 | logs_dir=logs 16 | 17 | mkdir -p $logs_dir 18 | 19 | # for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github 20 | # for data_type in $(ls $data_dir) 21 | for data_type in "en_arxiv" "en_book" "en_c4" "en_stack" "en_wikipedia" "github" 22 | do 23 | # get all parts from source data dir 24 | for part in $(ls $data_dir/$data_type) 25 | do 26 | echo "tokenizing $data_dir/$data_type/$part - $(ls $data_dir/$data_type/$part | wc -l)" 27 | log_path=logs/tokenize-$data_type-$part.log 28 | nohup srun -p MoE_T -N1 -n1 --cpus-per-task=32 \ 29 | python -m smoe.utils.tokenize \ 30 | -f jsonl \ 31 | -c $content_column \ 32 | -s $src_tokenizer_dir \ 33 | -t $tokenizer_dir \ 34 | -i $data_dir/$data_type/$part \ 35 | -o $out_dir/$data_type/$part \ 36 | 1>$log_path 2>&1 & 37 | # echo "$data_type/$part > $log_path" 38 | sleep 3 39 | done 40 | 41 | # log_path=logs/tokenize_$data_type.log 42 | # nohup srun -p MoE_T -N1 -n1 --cpus-per-task=32 \ 43 | # python -m smoe.utils.tokenize \ 44 | # -f jsonl \ 45 | # -s $src_tokenizer_dir \ 46 | # -c $content_column \ 47 | # -t $tokenizer_dir \ 48 | # -i $data_dir/$data_type \ 49 | # -o $out_dir/$data_type \ 50 | # 1>$logs_dir/tokenize_$data_type.log 2>&1 & 51 | # echo "$data_type > $logs_dir/tokenize_$data_type.log" 52 | done 53 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_neuron_overlap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_13B" 6 | total_clusters=16 7 | expert_size=864 8 | # 540 1080 2160 4320 8640 9 | # 688 1376 2752 5504 11008 10 | # 864 1728 3456 6912 13824 11 | 12 | criterion=max # min max 13 | kernel=l1_norm # plain l1_norm l2_norm 14 | accumulate_level=sample # sample total 15 | importance_type=feature_change # feature_grad feature_change 16 | proj_type=up_proj # gate_proj up_proj 17 | 18 | if [ ${importance_type} = "feature_grad" ]; then 19 | template_postfix=grad 20 | else 21 | template_postfix=change 22 | fi 23 | 24 | data_path=/mnt/petrelfs/share_data/quxiaoye 25 | model_path=${data_path}/models/${llama_size} 26 | score_file_path=${data_path}/moefication_results/split/Gradients${total_clusters}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 27 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_clusters}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 28 | 29 | gpus=0 30 | cpus=4 31 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 32 | python -m smoe.entrypoint.visualization.visualize_expert_neuron_overlap \ 33 | --model_path ${model_path} \ 34 | --score_file_path ${score_file_path} \ 35 | --save_path ${save_path} \ 36 | --expert_size ${expert_size} \ 37 | --score_file_template layers.{}.mlp.${proj_type}.weight.${template_postfix} \ 38 | --criterion ${criterion} 39 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_neuron_overlap_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # 
llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_3B" 6 | total_clusters=16 7 | 8 | criterion=max # min max 9 | kernel=l1_norm # plain l1_norm l2_norm 10 | accumulate_level=sample # sample total 11 | importance_type=feature_change # feature_grad feature_change 12 | proj_type=up_proj # gate_proj up_proj 13 | 14 | if [ ${importance_type} = "feature_grad" ]; then 15 | template_postfix=grad 16 | else 17 | template_postfix=change 18 | fi 19 | 20 | data_path=/mnt/petrelfs/share_data/quxiaoye 21 | model_path=${data_path}/models/${llama_size} 22 | score_file_path=${data_path}/moefication_results/split/Gradients${total_clusters}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 23 | 24 | gpus=0 25 | cpus=4 26 | for expert_size in 540 1080 2160 4320; do 27 | # 540 1080 2160 4320 8640 28 | # 688 1376 2752 5504 11008 29 | # 864 1728 3456 6912 13824 30 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_clusters}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 31 | 32 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 33 | python -m smoe.entrypoint.visualization.visualize_expert_neuron_overlap \ 34 | --model_path ${model_path} \ 35 | --score_file_path ${score_file_path} \ 36 | --save_path ${save_path} \ 37 | --expert_size ${expert_size} \ 38 | --score_file_template layers.{}.mlp.${proj_type}.weight.${template_postfix} \ 39 | --criterion ${criterion} & 40 | sleep 0.7 41 | done 42 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_neuron_overlap_overview.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_3B" 6 | total_clusters=16 7 | 8 | criterion=max # min max 9 | kernel=l1_norm # plain l1_norm l2_norm 10 | accumulate_level=sample # sample total 11 | importance_type=feature_change # feature_grad feature_change 12 | proj_type=up_proj # gate_proj up_proj 13 | 14 | if [ ${importance_type} = "feature_grad" ]; then 15 | template_postfix=grad 16 | else 17 | template_postfix=change 18 | fi 19 | 20 | data_path=/mnt/petrelfs/share_data/quxiaoye 21 | model_path=${data_path}/models/${llama_size} 22 | score_file_path=${data_path}/moefication_results/split/Gradients${total_clusters}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 23 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap-overview/cluster${total_clusters}/${llama_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 24 | 25 | gpus=0 26 | cpus=4 27 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 28 | python -m smoe.entrypoint.visualization.visualize_expert_neuron_overlap_overview \ 29 | --model_path ${model_path} \ 30 | --score_file_path ${score_file_path} \ 31 | --save_path ${save_path} \ 32 | --score_file_template layers.{}.mlp.${proj_type}.weight.${template_postfix} \ 33 | --criterion ${criterion} 34 | 
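# Relationship between the three overlap-visualization scripts, as far as can be read from their
# arguments: run_visualize_expert_neuron_overlap.sh plots one fixed expert_size, the *_one4all.sh
# variant sweeps several expert_size values as parallel background jobs, and this overview script
# passes no --expert_size at all and calls the *_overview entrypoint for a model-level summary.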
-------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_select_mlp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | num_experts=8 # 8 16 8 | num_selects=2 # 2 4 9 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 10 | select_type=l2_norm # plain positive l1_norm l2_norm 11 | proj_type=gate_proj # up_proj gate_proj 12 | 13 | result_path=/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type} 14 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 15 | 16 | gpus=0 17 | cpus=4 18 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 19 | python -m smoe.entrypoint.visualization.visualize_expert_select_mlp \ 20 | --result_path ${result_path} \ 21 | --save_path ${save_path} \ 22 | --proj_type ${proj_type} 23 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_select_mlp_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | # 定义 num_selects 数组,与 num_experts 一一对应 8 | declare -a num_experts_array=(8 16) 9 | declare -a num_selects_array=(2 4) 10 | 11 | # 可视化所有可能的结果组合,无效进程会自动报错退出 12 | gpus=0 13 | cpus=4 14 | for idx in "${!num_selects_array[@]}"; do 15 | num_experts="${num_experts_array[$idx]}" 16 | num_selects="${num_selects_array[$idx]}" 17 | for split_type in "Graph-l1_norm" "Graph-l2_norm" "Clustering-l2" "Clustering-cos" "Random"; do 18 | for select_type in "plain" "positive" "l1_norm" "l2_norm"; do 19 | for proj_type in "gate_proj" "up_proj"; do 20 | 21 | result_path=/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type} 22 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 23 | 24 | # 若result_path存在,则执行可视化 25 | if [ -d "$result_path" ]; then 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 27 | python -m smoe.entrypoint.visualization.visualize_expert_select_mlp \ 28 | --result_path ${result_path} \ 29 | --save_path ${save_path} \ 30 | --proj_type ${proj_type} & # 并行运行下一命令 31 | sleep 0.5 # 等待0.5s 32 | else 33 | echo "Directory does not exist: $result_path" 34 | fi 35 | 36 | done 37 | done 38 | done 39 | done 40 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_mlp_output_scale.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_13B" 6 | 7 | data_begin_index=0 8 
| data_end_index=500 9 | batch_size=8 10 | block_size=2048 11 | moe_score_scale_factor=1 12 | 13 | #save_folder=${llama_size}_dense 14 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_13B 15 | #is_moe="False" 16 | 17 | #save_folder=${llama_size}_moe 18 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons 19 | #is_moe="True" 20 | 21 | #moe_score_scale_factor=5 22 | #save_folder=${llama_size}_moe_scale${moe_score_scale_factor} 23 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons 24 | #is_moe="True" 25 | 26 | save_folder=${llama_size}_moe_trained 27 | model_path=/mnt/petrelfs/share_data/quxiaoye/checkpoint-18000 28 | is_moe="True" 29 | 30 | share_path=/mnt/petrelfs/share_data/quxiaoye 31 | tokenizer_path=${share_path}/models/${llama_size} 32 | data_path=${share_path}/data/vis_data/head30_shuffled_output/shuffled_20.txt 33 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-outputs-scale/${save_folder} 34 | 35 | gpus=1 36 | cpus=16 37 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 38 | python -m smoe.entrypoint.visualization.visualize_mlp_output_scale \ 39 | --tokenizer_path ${tokenizer_path} \ 40 | --model_path ${model_path} \ 41 | --data_path ${data_path} \ 42 | --save_path ${save_path} \ 43 | --data_begin_index ${data_begin_index} \ 44 | --data_end_index ${data_end_index} \ 45 | --batch_size ${batch_size} \ 46 | --block_size ${block_size} \ 47 | --is_moe ${is_moe} \ 48 | --moe_score_scale_factor ${moe_score_scale_factor} 49 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_swiglu_output.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | proj_type=up_proj # gate_proj up_proj 8 | visualize_criterion=l2_norm # plain l1_norm l2_norm 9 | 10 | data_path=/mnt/petrelfs/share_data/quxiaoye 11 | model_path=${data_path}/models/${llama_size} 12 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 13 | 14 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/swiglu-output/${llama_size}/${proj_type}-${visualize_criterion} 15 | 16 | gpus=1 17 | cpus=16 18 | for specify_layer in "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31"; do # 并行启用任务 19 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 20 | python -m smoe.entrypoint.visualization.visualize_swiglu_output \ 21 | --model_path ${model_path} \ 22 | --hidden_features_path ${hidden_features_path} \ 23 | --save_path ${save_path} \ 24 | --template layers.{}.mlp.${proj_type}.weight \ 25 | --specify_layer ${specify_layer} \ 26 | --visualize_criterion ${visualize_criterion} & # 并行运行下一命令 27 | sleep 0.5 # 等待0.5s 28 | done 29 | # "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31" 30 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 
15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 31 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 32 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 33 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 34 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_swiglu_output_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | data_path=/mnt/petrelfs/share_data/quxiaoye 8 | model_path=${data_path}/models/${llama_size} 9 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 10 | 11 | # visualize all possible result combinations; invalid runs will error out and exit on their own 12 | gpus=1 13 | cpus=16 14 | for visualize_criterion in "plain" "l1_norm" "l2_norm"; do 15 | for proj_type in "gate_proj" "up_proj"; do 16 | 17 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/swiglu-output/${llama_size}/${proj_type}-${visualize_criterion} 18 | 19 | for specify_layer in "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31"; do # launch jobs in parallel 20 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 21 | python -m smoe.entrypoint.visualization.visualize_swiglu_output \ 22 | --model_path ${model_path} \ 23 | --hidden_features_path ${hidden_features_path} \ 24 | --save_path ${save_path} \ 25 | --template layers.{}.mlp.${proj_type}.weight \ 26 | --specify_layer ${specify_layer} \ 27 | --visualize_criterion ${visualize_criterion} & # run in the background so the next job starts in parallel 28 | sleep 0.5 # wait 0.5s 29 | 30 | done 31 | done 32 | done 33 | # "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31" 34 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 35 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 36 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 37 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import setuptools 4 | 5 | readme_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md") 6 | with open(readme_filepath, "r", encoding="utf8") as fh: 7 | long_description = fh.read() 8 | 9 | version_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "VERSION") 10 | with open(version_filepath, "r", encoding="utf8") as fh: 11 | version = fh.read().strip() 12 | 13 | setuptools.setup( 14 | name="smoe", 15 | version=version, 16 | author="MoE Group", 17 | author_email="tzhu1997@outlook.com", 18 | description="A toolkit for LLM MoE and continual pretraining.", 19 | long_description_content_type="text/markdown", 20 | long_description=long_description, 21 | url="https://github.com/Spico197/smoe", 22 | 
packages=setuptools.find_packages(exclude=["tests", "tests.*", "docs", "docs.*"]), 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: Apache Software License", 26 | "Operating System :: OS Independent", 27 | ], 28 | python_requires=">=3.10", 29 | install_requires=[ 30 | "scikit-learn==1.3.0", 31 | "omegaconf==2.0.6", 32 | "tqdm==4.65.0", 33 | "datasets==2.14.1", 34 | "transformers==4.31.0", 35 | "peft==0.4.0", 36 | "tensorboard==2.13.0", 37 | ], 38 | extras_require={ 39 | "dev": [ 40 | "pytest==7.4.0", 41 | "coverage==7.2.7", 42 | "black==23.7.0", 43 | "isort==5.12.0", 44 | "flake8==6.0.0", 45 | "pre-commit==3.3.3", 46 | ] 47 | }, 48 | include_package_data=True, 49 | entry_points={}, 50 | ) 51 | -------------------------------------------------------------------------------- /smoe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/__init__.py -------------------------------------------------------------------------------- /smoe/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/callbacks/__init__.py -------------------------------------------------------------------------------- /smoe/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/data/__init__.py -------------------------------------------------------------------------------- /smoe/data/aggregation.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | 4 | def group_texts(examples: dict, block_size: int = 1024): 5 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 6 | # Concatenate all texts. 7 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 8 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 9 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 10 | # customize this part to your needs. 11 | if total_length >= block_size: 12 | total_length = (total_length // block_size) * block_size 13 | # Split by chunks of max_len. 14 | result = { 15 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 16 | for k, t in concatenated_examples.items() 17 | } 18 | result["labels"] = result["input_ids"].copy() 19 | return result 20 | 21 | 22 | def group_instances(examples: list[dict], block_size: int = 2048) -> list[dict]: 23 | """ 24 | Concate examples to a length of block size. 25 | 26 | Args: 27 | examples: a list of dict instances that have multiple keys 28 | block_size: the length of the concatenated examples 29 | """ 30 | 31 | def _concat(examples: list[dict]) -> dict: 32 | """ 33 | Concatenate the values of each key in the examples. 
34 | 35 | Args: 36 | examples: a list of dict instances that have multiple keys 37 | """ 38 | concatenated_examples = {} 39 | keys = examples[0].keys() 40 | for k in keys: 41 | concatenated_examples[k] = list(chain(*[e[k] for e in examples])) 42 | if "labels" not in keys and "input_ids" in keys: 43 | concatenated_examples["labels"] = concatenated_examples["input_ids"] 44 | return concatenated_examples 45 | 46 | def _chunk(examples: dict, block_size: int) -> list[dict]: 47 | """ 48 | Split the concatenated examples into chunks of block_size. 49 | 50 | Args: 51 | examples: a dict instance that has multiple keys 52 | block_size: the length of the concatenated examples 53 | """ 54 | total_length = len(examples[list(examples.keys())[0]]) 55 | if total_length >= block_size: 56 | total_length = (total_length // block_size) * block_size 57 | result = { 58 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 59 | for k, t in examples.items() 60 | } 61 | return result 62 | 63 | def _decompose(example: dict) -> list[dict]: 64 | """ 65 | Decompose the example into a list of dict instances. 66 | 67 | Args: 68 | example: a dict instance that has multiple keys 69 | """ 70 | num_chunks = len(example[list(example.keys())[0]]) 71 | return [{k: example[k][i] for k in example.keys()} for i in range(num_chunks)] 72 | 73 | concatenated_examples = _concat(examples) 74 | chunk = _chunk(concatenated_examples, block_size) 75 | return _decompose(chunk) 76 | -------------------------------------------------------------------------------- /smoe/data/redpajama.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | from functools import partial 4 | from pathlib import Path 5 | 6 | from datasets import IterableDataset, load_dataset 7 | from datasets.combine import interleave_datasets 8 | from tqdm import tqdm 9 | 10 | from smoe.data.aggregation import group_texts 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def load_streaming_datasets( 16 | data_dir: str, 17 | prob_map: dict[str, float] = None, 18 | num_proc: int = None, 19 | debug_mode: bool = False, 20 | block_size: int = 1024, 21 | split: str = "train", 22 | verbose: bool = True, 23 | ) -> IterableDataset: 24 | dataset_dir = Path(data_dir) 25 | files = list(dataset_dir.glob("**/*.jsonl")) 26 | if debug_mode is True: 27 | files = [files[0]] 28 | 29 | fbar = files 30 | if verbose: 31 | fbar = tqdm(files, desc="Loading files") 32 | 33 | data_type_to_filepaths = defaultdict(list) 34 | for filepath in fbar: 35 | data_type = filepath.parent.stem 36 | assert ( 37 | data_type in prob_map if prob_map else True 38 | ), f"{data_type} not in {prob_map.keys()}" 39 | data_type_to_filepaths[data_type].append(str(filepath)) 40 | 41 | data_type_to_dataset_list = {} 42 | grouping_func = partial(group_texts, block_size=block_size) 43 | 44 | fbar = None 45 | if verbose: 46 | fbar = tqdm(total=len(data_type_to_filepaths), desc="Indexing files") 47 | for data_type, filepaths in data_type_to_filepaths.items(): 48 | ds = load_dataset( 49 | "json", 50 | data_files=filepaths, 51 | num_proc=num_proc, 52 | streaming=True, 53 | split=split, 54 | ) 55 | grouped_datasets = ds.map( 56 | grouping_func, 57 | batched=True, 58 | ) 59 | data_type_to_dataset_list[data_type] = grouped_datasets 60 | 61 | datasets_in_diff_types = [] 62 | probs = [] 63 | dbar = None 64 | if verbose: 65 | dbar = tqdm( 66 | total=len(data_type_to_dataset_list), desc="Mapping datasets with probs" 67 | 
) 68 | for data_type, dataset in data_type_to_dataset_list.items(): 69 | prob = None 70 | if prob_map: 71 | prob = prob_map[data_type] 72 | probs.append(prob) 73 | datasets_in_diff_types.append(dataset) 74 | if dbar: 75 | dbar.update(1) 76 | dbar.set_postfix({data_type: f"{prob:.3%}%"}) 77 | 78 | if len(probs) == 0: 79 | probs = None 80 | else: 81 | sum_probs = sum(probs) 82 | if sum_probs != 1.0: 83 | logger.warn(f"Summation of prob_map is {sum_probs}, scaling to 1.0") 84 | probs = [p / sum_probs for p in probs] 85 | 86 | if verbose: 87 | logger.info("Grouping datasets") 88 | lm_datasets = interleave_datasets(datasets_in_diff_types, probs) 89 | 90 | return lm_datasets 91 | -------------------------------------------------------------------------------- /smoe/data/single_file.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from datasets import IterableDataset, load_dataset 4 | 5 | from smoe.data.aggregation import group_texts 6 | 7 | 8 | def load_cached_dataset( 9 | filepath: str, 10 | num_proc: int = None, 11 | block_size: int = 2048, 12 | split: str = "train", 13 | ) -> IterableDataset: 14 | grouping_func = partial(group_texts, block_size=block_size) 15 | ds = load_dataset( 16 | "json", 17 | data_files=filepath, 18 | num_proc=num_proc, 19 | split=split, 20 | ) 21 | grouped_datasets = ds.map( 22 | grouping_func, 23 | batched=True, 24 | ) 25 | return grouped_datasets 26 | -------------------------------------------------------------------------------- /smoe/entrypoint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/analysis/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/analysis/clustering_distribution.py: -------------------------------------------------------------------------------- 1 | """ 2 | python -m smoe.entrypoint.analysis.clustering_distribution -d resources/clustering_7_samples -o results/analysis_clustering7 3 | """ 4 | 5 | import argparse 6 | from pathlib import Path 7 | 8 | from smoe.utils.io import load_jsonlines 9 | from smoe.utils.visualization.bar import barh 10 | 11 | 12 | def main(args): 13 | data_dir = Path(args.data_dir) 14 | 15 | for file in data_dir.glob("*.jsonl"): 16 | cluster_idx = file.stem 17 | source_to_num = { 18 | "arxiv": 0, 19 | "books": 0, 20 | "c4": 0, 21 | "commoncrawl": 0, 22 | "github": 0, 23 | "stackexchange": 0, 24 | "wikipedia": 0, 25 | } 26 | data = load_jsonlines(file) 27 | for ins in data: 28 | source = ins["file"].split("-")[0] 29 | source_to_num[source] += 1 30 | barh( 31 | source_to_num, 32 | title=f"Cluster {cluster_idx}", 33 | save_filepath=f"{args.out_dir}/cluster_{cluster_idx}.png", 34 | ) 35 | print(f"Done: {file}") 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("-d", "--data_dir", required=True) 41 | parser.add_argument("-o", "--out_dir", required=True) 42 | args = parser.parse_args() 43 | main(args) 44 | 
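# Illustrative note (an assumption for clarity, not taken from the repository): judging from
# `ins["file"].split("-")[0]` above, each record in the per-cluster *.jsonl files is expected
# to carry a "file" field whose prefix names the RedPajama source, e.g.
#
#     ins = {"file": "github-0003.jsonl", "content": "..."}   # hypothetical record
#     ins["file"].split("-")[0]  # -> "github", counted into source_to_num["github"]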
-------------------------------------------------------------------------------- /smoe/entrypoint/compress_png_images.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from smoe.utils.io import compress_png_image 5 | 6 | 7 | def main(args): 8 | for dir_path, dir_names, file_names in os.walk(args.root_path): 9 | for name in file_names: 10 | if name.endswith(".png"): 11 | compress_png_image(os.path.join(dir_path, name), print_info=True) 12 | print("All done.") 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--root_path", type=str) 18 | args = parser.parse_args() 19 | 20 | args.root_path = "/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization" 21 | 22 | main(args) 23 | -------------------------------------------------------------------------------- /smoe/entrypoint/cpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/cpt/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/download_llama.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from huggingface_hub import snapshot_download 3 | from transformers.models.llama.modeling_llama import LlamaForCausalLM 4 | from transformers.models.llama.tokenization_llama_fast import LlamaTokenizer 5 | 6 | repo_to_download = "openlm-research/open_llama_3b" 7 | target_dir = "/mnt/petrelfs/share_data/quxiaoye/models/llama_3B" 8 | 9 | snapshot_download( 10 | repo_id=repo_to_download, local_dir=target_dir, local_dir_use_symlinks=False 11 | ) 12 | 13 | tokenizer = LlamaTokenizer.from_pretrained(target_dir) 14 | model = LlamaForCausalLM.from_pretrained( 15 | target_dir, 16 | torch_dtype=torch.float16, 17 | device_map="cpu", 18 | ) 19 | 20 | prompt = "Q: What is the largest animal?\nA:" 21 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 22 | 23 | generation_output = model.generate(input_ids=input_ids, max_new_tokens=32) 24 | print(tokenizer.decode(generation_output[0])) 25 | -------------------------------------------------------------------------------- /smoe/entrypoint/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/eval/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/examples/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/examples/load_llama_moe_hf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load LLaMA MoE model from file. 
3 | """ 4 | 5 | import argparse 6 | 7 | import torch.cuda 8 | from transformers import LlamaTokenizer 9 | 10 | from smoe.models.llama_moe.modeling_llama_moe_hf import LlamaMoEModel, LlamaMoEForCausalLM 11 | 12 | 13 | def main(args): 14 | device = "cuda" if torch.cuda.is_available() else "cpu" 15 | print("Loading model...") 16 | 17 | if args.model_type == "LlamaMoEModel": 18 | model = LlamaMoEModel.from_pretrained(args.model_path) 19 | elif args.model_type == "LlamaMoEForCausalLM": 20 | model = LlamaMoEForCausalLM.from_pretrained(args.model_path) 21 | else: 22 | raise ValueError 23 | 24 | model.config.use_cache = False 25 | 26 | # set moe configs 27 | model.set_moe_num_selects(1) # change the number of selected experts 28 | 29 | # set gate configs 30 | model.set_moe_gate_use_softmax(True) # whether to apply softmax to the gate outputs 31 | model.set_moe_gate_use_balance(True) # whether to use a balance loss during training to even out the number of samples routed to each expert 32 | model.set_moe_gate_balance_loss_weight(0.02) # weight of the balance loss 33 | model.set_moe_gate_add_noise(True) # whether to add random noise to the gate outputs during training 34 | model.set_moe_gate_noise_epsilon(0.02) # magnitude of the noise 35 | 36 | # set calculator configs 37 | model.set_moe_calculator_multiply_gate_scores(True) # whether to weight the expert outputs by the gate scores 38 | model.set_moe_calculator_score_scale_factor(16.0) # scale factor applied to the expert outputs 39 | 40 | # reset 41 | model.reset_gate_network() # randomly re-initialize the gate network 42 | model.reset_experts() # re-initialize the expert parameters 43 | 44 | """prepare data""" 45 | sentence_list = [ 46 | "hi hi hi hi hi, hi hi hi hi hi, hi hi hi hi hi", 47 | "How are you? I'm fine, and you?", 48 | " ", 49 | "I am stupid. Are you sure?", 50 | "The past is never dead. It is not even past.", 51 | ] 52 | 53 | tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) 54 | tokenizer.pad_token = tokenizer.eos_token 55 | tokens = tokenizer(sentence_list, padding=True, return_tensors="pt") 56 | print(tokens) 57 | 58 | """forward test""" 59 | print("Forwarding inputs...") 60 | model.half() 61 | model.to(device) 62 | tokens.to(device) 63 | result = model.generate(**tokens, repetition_penalty=1.3, max_length=256) 64 | print(result) 65 | 66 | for i in range(result.shape[0]): 67 | print(result[i]) 68 | decoded_text = tokenizer.decode(result[i], skip_special_tokens=True) 69 | print(decoded_text) 70 | 71 | print("Done!") 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--tokenizer_path", type=str) 77 | parser.add_argument("--model_path", type=str) 78 | parser.add_argument( 79 | "--model_type", 80 | type=str, 81 | choices=( 82 | "LlamaMoEModel", 83 | "LlamaMoEForCausalLM", 84 | "LlamaMoEForSequenceClassification", 85 | ), 86 | ) 87 | args = parser.parse_args() 88 | main(args) 89 | -------------------------------------------------------------------------------- /smoe/entrypoint/examples/load_relu_llama.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load ReLU LLaMA model from file. 3 | """ 4 | 5 | import argparse 6 | 7 | import torch.cuda 8 | from transformers import LlamaForCausalLM, LlamaTokenizer 9 | 10 | from smoe.utils.model_operation.modify_llama_model import llama_with_relu_activation 11 | 12 | 13 | def main(args): 14 | device = "cuda" if torch.cuda.is_available() else "cpu" 15 | print("Loading model...") 16 | 17 | model = LlamaForCausalLM.from_pretrained(args.model_path) 18 | model.model = llama_with_relu_activation(model.model) 19 | model.config.use_cache = True 20 | 21 | """prepare data""" 22 | sentence_list = [ 23 | "hi hi hi hi hi, hi hi hi hi hi, hi hi hi hi hi", 24 | "How are you? 
I'm fine, and you?", 25 | " ", 26 | "I am stupid. Are you sure?", 27 | "The past is never dead. It is not even past.", 28 | ] 29 | 30 | tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) 31 | tokenizer.pad_token = tokenizer.eos_token 32 | tokens = tokenizer(sentence_list, padding=True, return_tensors="pt") 33 | print(tokens) 34 | 35 | """forward test""" 36 | print("Forwarding inputs...") 37 | model.half() 38 | model.to(device) 39 | tokens.to(device) 40 | result = model.generate(**tokens, repetition_penalty=2.0, max_length=256) 41 | print(result) 42 | 43 | for i in range(result.shape[0]): 44 | print(result[i]) 45 | decoded_text = tokenizer.decode(result[i], skip_special_tokens=True) 46 | print(decoded_text) 47 | 48 | print("Done!") 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument( 54 | "--tokenizer_path", 55 | type=str, 56 | default="/mnt/petrelfs/share_data/quxiaoye/models/ReluLLaMA-7B", 57 | ) 58 | parser.add_argument( 59 | "--model_path", 60 | type=str, 61 | default="/mnt/petrelfs/share_data/quxiaoye/models/ReluLLaMA-7B", 62 | ) 63 | args = parser.parse_args() 64 | main(args) 65 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/expert_construction/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_prepare_datasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | import pickle 5 | 6 | import torch 7 | from transformers import LlamaTokenizer 8 | 9 | from smoe.data.datasets_moe import LineByLineJsonlTextDataset 10 | 11 | 12 | # fmt: off 13 | def process_dataset(args, tokenizer, key, file_name): 14 | raw_file_path = os.path.join(args.train_data_path, file_name) 15 | print("\nReading dataset \"" + key + "\" from raw file \"" + raw_file_path + "\"...") 16 | 17 | datasets = LineByLineJsonlTextDataset(tokenizer, file_path=raw_file_path, block_size=2048) 18 | 19 | if not os.path.exists(args.train_data_cache_path): 20 | os.makedirs(args.train_data_cache_path) 21 | 22 | cached_file_path = os.path.join(args.train_data_cache_path, key + "_cached.pth") 23 | torch.save(datasets.examples, cached_file_path, pickle_protocol=pickle.HIGHEST_PROTOCOL) 24 | print(f"Dataset {key}: {sum([torch.sum(datasets[i]['attention_mask']).item() for i in range(len(datasets))])} total tokens.") # count non-special tokens 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B") 30 | parser.add_argument('--train_data_path', type=str, default="/home/dongdz/workspace/moefication/llama_data/") 31 | parser.add_argument('--train_data_cache_path', type=str, default="/home/dongdz/workspace/moefication/llama_data_cache/") 32 | 33 | args = parser.parse_args() 34 | print(args, "\n") 35 | 36 | """load tokenizer""" 37 | tokenizer = LlamaTokenizer.from_pretrained(args.model_path) 38 | tokenizer.pad_token = tokenizer.eos_token 39 | 40 | """prepare datasets""" 41 | dataset_names = [ 42 | "commoncrawl", 43 | "c4", 44 | "github", 45 | "wikipedia", 46 | "books", 47 | "arxiv", 48 | "stackexchange" 
49 | ] 50 | 51 | # read datasets 52 | pool = multiprocessing.Pool(processes=len(dataset_names)) 53 | for key in dataset_names: 54 | for file_name in os.listdir(args.train_data_path): 55 | if key in file_name and file_name.endswith(".jsonl"): 56 | pool.apply_async(process_dataset, args=(args, tokenizer, key, file_name)) 57 | pool.close() 58 | pool.join() 59 | 60 | print("Done.") 61 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_prune_gradient.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | from transformers import LlamaConfig 7 | 8 | from smoe.utils.expert_construction.prune_llama import GradientPrune 9 | from smoe.utils.io import torch_load_template_score_file 10 | from smoe.utils.operations.operation_string import str2bool 11 | 12 | if __name__ == "__main__": 13 | # fmt: off 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--model_path', type=str) 16 | parser.add_argument('--grad_file_path', type=str) 17 | parser.add_argument('--save_path', type=str) 18 | parser.add_argument('--expert_index', type=str) 19 | parser.add_argument('--retain_percent', type=float) 20 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 21 | 22 | parser.add_argument('--kernel', type=str, default="plain", choices=("plain", "l1_norm", "l2_norm")) 23 | parser.add_argument('--accumulate_level', type=str, default="sample", choices=("sample", "total")) 24 | parser.add_argument('--importance_type', type=str, default="feature_grad", choices=("feature_grad", "feature_change")) 25 | parser.add_argument('--criterion', type=str, default="min", choices=("min", "max")) 26 | 27 | parser.add_argument('--use_grad_sum', type=str, default="False") 28 | 29 | args = parser.parse_args() 30 | args.use_grad_sum = str2bool(args.use_grad_sum) 31 | if args.expert_index != "All": 32 | args.expert_index = int(args.expert_index) 33 | print(args, "\n") 34 | 35 | print("Loading llama config...") 36 | config = LlamaConfig.from_pretrained(args.model_path) 37 | expert_size = int(config.intermediate_size * args.retain_percent) 38 | 39 | print("Processing layers...") 40 | save_root_path = args.save_path 41 | 42 | if args.importance_type == "feature_grad": 43 | file_postfix = ".grad" 44 | elif args.importance_type == "feature_change": 45 | file_postfix = ".change" 46 | else: 47 | raise NotImplementedError 48 | 49 | for i in tqdm(range(config.num_hidden_layers)): 50 | grad_list = torch_load_template_score_file(args.grad_file_path, args.template + file_postfix, i) 51 | 52 | if args.use_grad_sum: 53 | grad_list = torch.stack(grad_list, dim=0).sum(0) 54 | else: 55 | grad_list = grad_list[args.expert_index] 56 | 57 | args.save_path = os.path.join( 58 | save_root_path, 59 | f"{os.path.split(args.model_path)[1]}-Prune-Gradient-{args.criterion}-{args.kernel}-{args.accumulate_level}-{args.importance_type}", 60 | f"{args.expert_index}-{format(args.retain_percent, '.2f')}Percent-{expert_size}Neurons" 61 | ) 62 | 63 | split = GradientPrune(args, args.template, i, grad_list) 64 | split.prune(expert_size, criterion=args.criterion) 65 | split.save() 66 | print("Done.") 67 | # fmt: on 68 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_prune_random.py: -------------------------------------------------------------------------------- 1 | 
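# Random pruning: for every layer, keep expert_size = int(intermediate_size * retain_percent)
# randomly chosen intermediate neurons (RandomPrune, fixed seed) and save the resulting
# selection under save_path -- the random counterpart of the gradient-based pruning script above.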
import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.expert_construction.prune_llama import RandomPrune 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str) 13 | parser.add_argument('--save_path', type=str) 14 | parser.add_argument('--retain_percent', type=float) 15 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 16 | 17 | args = parser.parse_args() 18 | print(args, "\n") 19 | 20 | print("Loading llama config...") 21 | config = LlamaConfig.from_pretrained(args.model_path) 22 | expert_size = int(config.intermediate_size * args.retain_percent) 23 | 24 | args.save_path = os.path.join( 25 | args.save_path, 26 | f"{os.path.split(args.model_path)[1]}-Prune-Random", 27 | f"{format(args.retain_percent, '.2f')}Percent-{expert_size}Neurons" 28 | ) 29 | 30 | for i in tqdm(range(config.num_hidden_layers)): 31 | split = RandomPrune(args, args.template, i, config.intermediate_size) 32 | split.prune(expert_size, seed=0) 33 | split.save() 34 | print("Done.") 35 | # fmt: on 36 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_clustering.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaForCausalLM 6 | 7 | from smoe.utils.expert_construction.expert_split import ClusteringSplit 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B") 13 | parser.add_argument('--save_path', type=str, default="/home/dongdz/workspace/moefication/llama_moe_temp_files/") 14 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 15 | parser.add_argument('--num_experts', type=int, default=8, help='number of experts') 16 | parser.add_argument('--metric', type=str, default="l2", choices=("l2", "cos")) 17 | parser.add_argument('--cpu_threads', type=int, default=-1) 18 | 19 | args = parser.parse_args() 20 | args.save_path = os.path.join(args.save_path, os.path.split(args.model_path)[1] + "-" + str(args.num_experts) + "Expert-Split-Clustering-" + args.metric) 21 | 22 | print("Loading llama model...") 23 | model = LlamaForCausalLM.from_pretrained(args.model_path).model 24 | 25 | for i in tqdm(range(model.config.num_hidden_layers)): 26 | split = ClusteringSplit(args, model, args.template, i) 27 | split.split(cpu_threads=args.cpu_threads) 28 | split.cnt() 29 | split.save() 30 | print("Done.") 31 | # fmt: on 32 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_gradient.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.expert_construction.expert_split import GradientSplit 8 | from smoe.utils.io import delete_file_or_dir, torch_load_template_score_file 9 | from smoe.utils.operations.operation_string import str2bool 10 | 11 | if __name__ == "__main__": 12 | # fmt: off 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--model_path', type=str) 15 | parser.add_argument('--score_file_path', type=str) 16 | 
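# Note: --save_path is treated as a root directory; inside the per-layer loop below it is
# re-joined with a model- and criterion-specific subfolder (see save_root_path) before saving.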
parser.add_argument('--save_path', type=str) 17 | parser.add_argument('--visualization_path', type=str, default=None) 18 | parser.add_argument('--expert_num', type=int, default=None) 19 | parser.add_argument('--expert_size', type=int) 20 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 21 | 22 | parser.add_argument('--kernel', type=str, default="plain", choices=("plain", "l1_norm", "l2_norm")) 23 | parser.add_argument('--accumulate_level', type=str, default="sample", choices=("sample", "total")) 24 | parser.add_argument('--criterion', type=str, default="min", choices=("min", "max")) 25 | parser.add_argument('--importance_type', type=str, default="feature_grad", choices=("feature_grad", "feature_change")) 26 | parser.add_argument('--share_neurons', type=str, default="False") 27 | 28 | args = parser.parse_args() 29 | args.share_neurons = str2bool(args.share_neurons) 30 | print(args, "\n") 31 | 32 | print("Loading llama config...") 33 | config = LlamaConfig.from_pretrained(args.model_path) 34 | 35 | print("Processing layers...") 36 | save_root_path = args.save_path 37 | 38 | if args.importance_type == "feature_grad": 39 | file_postfix = ".grad" 40 | elif args.importance_type == "feature_change": 41 | file_postfix = ".change" 42 | else: 43 | raise NotImplementedError 44 | 45 | if args.visualization_path is not None: 46 | delete_file_or_dir(os.path.join(args.save_path, "total_neurons.txt")) 47 | 48 | for i in tqdm(range(config.num_hidden_layers)): 49 | score_list = torch_load_template_score_file(args.score_file_path, args.template + file_postfix, i) 50 | 51 | if args.expert_num is None: 52 | args.expert_num = len(score_list) 53 | else: 54 | assert args.expert_num <= len(score_list) 55 | 56 | args.save_path = os.path.join( 57 | save_root_path, 58 | f"{os.path.split(args.model_path)[1]}-Split-Gradient-{args.criterion}-{args.kernel}-{args.accumulate_level}-{args.importance_type}", 59 | f"{args.expert_num}Experts-{args.expert_size}Neurons{'-Share' if args.share_neurons else ''}" 60 | ) 61 | 62 | split = GradientSplit(args, args.template, i, score_list) 63 | split.split(args.expert_num, args.expert_size, criterion=args.criterion, share_neurons=args.share_neurons) 64 | if not args.share_neurons: 65 | split.cnt() 66 | split.save() 67 | 68 | if args.visualization_path is not None: 69 | split.visualize(args.visualization_path, share_neurons=args.share_neurons) 70 | print("Done.") 71 | # fmt: on 72 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_gradient_residual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.expert_construction.expert_split_residual import GradientSplitResidual 8 | from smoe.utils.io import delete_file_or_dir, torch_load_template_score_file 9 | from smoe.utils.operations.operation_string import str2bool 10 | 11 | if __name__ == "__main__": 12 | # fmt: off 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--model_path', type=str) 15 | parser.add_argument('--score_file_path', type=str) 16 | parser.add_argument('--save_path', type=str) 17 | parser.add_argument('--visualization_path', type=str, default=None) 18 | parser.add_argument('--expert_num_moe', type=int) 19 | parser.add_argument('--expert_num_residual', type=int) 20 | parser.add_argument('--expert_size', type=int) 21 | 
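# The "{}" in --template is filled with the layer index when the per-layer score files are
# loaded below (torch_load_template_score_file), with ".grad" or ".change" appended
# according to --importance_type.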
parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 22 | 23 | parser.add_argument('--kernel', type=str, default="plain", choices=("plain", "l1_norm", "l2_norm")) 24 | parser.add_argument('--accumulate_level', type=str, default="sample", choices=("sample", "total")) 25 | parser.add_argument('--criterion', type=str, default="min", choices=("min", "max")) 26 | parser.add_argument('--importance_type', type=str, default="feature_grad", choices=("feature_grad", "feature_change")) 27 | parser.add_argument('--share_neurons', type=str, default="False") 28 | 29 | args = parser.parse_args() 30 | args.share_neurons = str2bool(args.share_neurons) 31 | print(args, "\n") 32 | 33 | print("Loading llama config...") 34 | config = LlamaConfig.from_pretrained(args.model_path) 35 | 36 | print("Processing layers...") 37 | save_root_path = args.save_path 38 | 39 | if args.importance_type == "feature_grad": 40 | file_postfix = ".grad" 41 | elif args.importance_type == "feature_change": 42 | file_postfix = ".change" 43 | else: 44 | raise NotImplementedError 45 | 46 | if args.visualization_path is not None: 47 | delete_file_or_dir(os.path.join(args.save_path, "total_neurons.txt")) 48 | 49 | for i in tqdm(range(config.num_hidden_layers)): 50 | score_list = torch_load_template_score_file(args.score_file_path, args.template + file_postfix, i) 51 | 52 | args.save_path = os.path.join( 53 | save_root_path, 54 | f"{os.path.split(args.model_path)[1]}-Split-Gradient-{args.criterion}-{args.kernel}-{args.accumulate_level}-{args.importance_type}", 55 | f"{args.expert_num_moe}Experts-{args.expert_num_residual}Residuals-{args.expert_size}Neurons{'-Share' if args.share_neurons else ''}" 56 | ) 57 | 58 | split = GradientSplitResidual(args, args.template, i, score_list) 59 | split.split(args.expert_num_moe, args.expert_num_residual, args.expert_size, criterion=args.criterion, share_neurons=args.share_neurons) 60 | split.save() 61 | 62 | if args.visualization_path is not None: 63 | split.visualize(args.visualization_path, share_neurons=args.share_neurons) 64 | print("Done.") 65 | # fmt: on 66 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_graph.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaForCausalLM 6 | 7 | from smoe.utils.expert_construction.expert_split import GraphSplit 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_path", type=str, default="./model_path") 13 | parser.add_argument("--hidden_features_path", type=str, default="./hidden_features_path", ) 14 | parser.add_argument("--save_path", type=str, default="./save_path", ) 15 | parser.add_argument('--specify_layer', nargs='+', help='used to specify train layers, example \"--specify_layer 0 1 2 3\"') 16 | 17 | parser.add_argument("--template", type=str, default="layers.{}.mlp.gate_proj.weight") 18 | parser.add_argument("--num_experts", type=int, default=8, help="number of experts") 19 | parser.add_argument("--metric", type=str, default="l1_norm") 20 | parser.add_argument("--threshold", type=int, default=1) 21 | 22 | args = parser.parse_args() 23 | # args.save_path = os.path.join( 24 | # args.save_path, 25 | # os.path.split(args.model_path)[1] + "-" + str(args.num_experts) + "Expert-Split-Graph-" + str(args.metric), 26 | # ) 27 | 28 | if not 
os.path.exists(args.save_path): 29 | os.makedirs(args.save_path) 30 | 31 | print("Loading llama model...") 32 | model = LlamaForCausalLM.from_pretrained(args.model_path).model 33 | 34 | if "specify_layer" in args: 35 | train_layers = [int(layer) for layer in args.specify_layer] 36 | else: 37 | train_layers = range(model.config.num_hidden_layers) 38 | 39 | for layer_idx in train_layers: 40 | print(f"Creating co-activation matrix for layer {layer_idx}...") 41 | split = GraphSplit(args, model, args.template, layer_idx) 42 | split.split_and_save() 43 | print("Done.") 44 | # fmt: on 45 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_graph_trans_gp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | from collections import defaultdict 6 | 7 | import torch 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--gpmetised_file_path", type=str, default="./file_path") 12 | 13 | args = parser.parse_args() 14 | 15 | labels = [] 16 | 17 | with open(args.gpmetised_file_path) as fin: 18 | d = defaultdict(list) 19 | for i, line in enumerate(fin): 20 | labels.append(int(line.strip())) 21 | d[labels[-1]].append(i) 22 | 23 | need_move = [] 24 | 25 | for i in range(max(d.keys()) + 1): 26 | if i not in d: 27 | d[i] = [] 28 | print(len(labels), len(d.keys())) 29 | 30 | num = len(labels) // len(d.keys()) 31 | for k, v in d.items(): 32 | if len(v) > num: 33 | random.shuffle(v) 34 | for i in range(num, len(v)): 35 | need_move.append(v[i]) 36 | d[k] = v[:num] 37 | 38 | print("need_move", need_move) 39 | 40 | random.shuffle(need_move) 41 | for k, v in d.items(): 42 | if len(v) < num: 43 | pos = num - len(v) 44 | v += need_move[:pos] 45 | need_move = need_move[pos:] 46 | for x in v: 47 | labels[x] = k 48 | 49 | vec = os.path.basename(args.gpmetised_file_path).split(".")[:-2] 50 | target = ".".join(vec) 51 | 52 | save_folder = os.path.join(os.path.dirname(args.gpmetised_file_path), "gp_split") 53 | 54 | if not os.path.exists(save_folder): 55 | os.makedirs(save_folder) 56 | 57 | torch.save(labels, os.path.join(save_folder, target)) 58 | 59 | from collections import Counter 60 | 61 | print(Counter(labels)) 62 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_random.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import AutoConfig 6 | 7 | from smoe.utils.expert_construction.expert_split import RandomSplit 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B") 13 | parser.add_argument('--save_path', type=str, default="/home/dongdz/workspace/moefication/llama_moe_temp_files/") 14 | parser.add_argument('--template', type=str, default='layers.{}.mlp.up_proj.weight') 15 | parser.add_argument('--num_experts', type=int, default=8, help='number of experts') 16 | 17 | args = parser.parse_args() 18 | args.save_path = os.path.join(args.save_path, os.path.split(args.model_path)[1] + "-" + str(args.num_experts) + "Expert-Split-Random") 19 | print(args, "\n") 20 | 21 | print("Loading llama config...") 22 | config = AutoConfig.from_pretrained(args.model_path) 23 | 24 | 
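# For every decoder layer: randomly partition the MLP intermediate neurons addressed by
# --template into --num_experts groups (RandomSplit), report the per-expert counts (cnt),
# and save the neuron-to-expert assignment under save_path.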
for i in tqdm(range(config.num_hidden_layers)): 25 | split = RandomSplit(args, config, args.template, i) 26 | split.split() 27 | split.cnt() 28 | split.save() 29 | print("Done.") 30 | -------------------------------------------------------------------------------- /smoe/entrypoint/sft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/sft/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/visualization/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_expert_neuron_overlap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization of pair-wise overlap rate & overlap neuron count for moe models constructed by importance criterion (Share=True). 3 | """ 4 | import argparse 5 | import os 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import LlamaConfig 10 | 11 | from smoe.utils.io import delete_file_or_dir, torch_load_template_score_file 12 | from smoe.utils.visualization.visualize import visualize_expert_neuron_overlap 13 | 14 | # fmt: off 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--model_path', type=str) 18 | parser.add_argument('--score_file_path', type=str) 19 | parser.add_argument('--save_path', type=str) 20 | parser.add_argument('--expert_size', type=int) 21 | parser.add_argument('--score_file_template', type=str, default="layers.{}.mlp.up_proj.weight.change") 22 | parser.add_argument('--criterion', type=str, default="max", choices=("min", "max")) 23 | 24 | args = parser.parse_args() 25 | print("\n", args) 26 | 27 | print("Loading llama config...") 28 | config = LlamaConfig.from_pretrained(args.model_path) 29 | 30 | delete_file_or_dir(os.path.join(args.save_path, "total_neurons.txt")) 31 | 32 | for layer_id in tqdm(range(config.num_hidden_layers)): 33 | """read scores from files""" 34 | score_list = torch_load_template_score_file(args.score_file_path, args.score_file_template, layer_id) 35 | num_experts = len(score_list) 36 | scores = torch.stack(score_list, dim=0) 37 | 38 | """get selected mask""" 39 | selected_mask_list = [] 40 | for j, score in enumerate(score_list): 41 | if args.criterion == "min": 42 | sorted_score, index = torch.sort(score) 43 | elif args.criterion == "max": 44 | sorted_score, index = torch.sort(score, descending=True) 45 | else: 46 | raise NotImplementedError 47 | selected_mask = torch.zeros_like(score, dtype=torch.int) 48 | selected_mask[index[:args.expert_size]] += 1 49 | selected_mask_list.append(selected_mask) 50 | selected_masks = torch.stack(selected_mask_list, dim=0) # shape(num_experts, intermediate_size) 51 | 52 | """visualize""" 53 | visualize_expert_neuron_overlap(selected_masks, num_experts, config.intermediate_size, args.expert_size, layer_id, save_dir=args.save_path) 54 | 55 | print("done.") 56 | -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_expert_select_mlp.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from smoe.utils.visualization.visualize import visualize_expert_select_mlp 4 | 5 | # fmt: off 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--result_path', type=str) 9 | parser.add_argument('--save_path', type=str) 10 | parser.add_argument('--proj_type', type=str) 11 | 12 | args = parser.parse_args() 13 | print(args, "\n") 14 | 15 | visualize_expert_select_mlp(args.result_path, args.save_path, args.proj_type) 16 | -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_gate_loss.py: -------------------------------------------------------------------------------- 1 | import re 2 | import statistics as sts 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | from smoe.utils.io import load_nums_from_txt 7 | from smoe.utils.visualization.line import line_plot 8 | 9 | if __name__ == "__main__": 10 | folders = [ 11 | ["L2", "/mnt/petrelfs/zhutong/smoe/results/llama_7B_MoE_16Select4-l2_norm"], 12 | ["Random Params", "/mnt/petrelfs/zhutong/smoe/results/random_16select4_moe"], 13 | [ 14 | "Random Split", 15 | "/mnt/petrelfs/zhutong/smoe/results/RandomSplit-l2_norm-llama_7B-16Select4-up_proj", 16 | ], 17 | ] 18 | output_fig_file = "results/gate_loss.png" 19 | 20 | label_to_nums = defaultdict(list) 21 | for name, folder in folders: 22 | folder_path = Path(folder) 23 | txt_files = list(folder_path.glob("gate_loss_R*_L*.txt")) 24 | regex = re.compile(r"gate_loss_R(\d+)_L(\d+).txt") 25 | layer_to_loss = defaultdict(list) 26 | for txt_file in txt_files: 27 | rank, layer = regex.search(str(txt_file)).groups() 28 | rank, layer = int(rank), int(layer) 29 | layer_to_loss[layer].extend(load_nums_from_txt(txt_file)) 30 | 31 | layers = [] 32 | for layer, losses in sorted(layer_to_loss.items(), key=lambda item: item[0]): 33 | layers.append(layer) 34 | label_to_nums[name].append(sts.mean(losses)) 35 | 36 | line_plot( 37 | layers, 38 | label_to_nums, 39 | title="gate loss", 40 | xlabel="layer", 41 | ylabel="loss", 42 | save_path=output_fig_file, 43 | ) 44 | -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_swiglu_output.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch.cuda 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.visualization.visualize import visualize_swiglu_output 8 | 9 | # fmt: off 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str) 13 | parser.add_argument('--hidden_features_path', type=str) 14 | parser.add_argument('--save_path', type=str) 15 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 16 | parser.add_argument('--specify_layer', nargs='+', help='used to specify layers for visualization, example \"--specify_layer 0 1 2 3\"') 17 | parser.add_argument('--visualize_criterion', default='plain', choices=["plain", "l1_norm", "l2_norm"]) 18 | 19 | args = parser.parse_args() 20 | print(args, "\n") 21 | 22 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 23 | print(device) 24 | 25 | print("Loading llama config...") 26 | config = LlamaConfig.from_pretrained(args.model_path) 27 | 28 | if "specify_layer" in args: 29 | visualize_layers = [int(layer) for layer in 
args.specify_layer] 30 | else: 31 | visualize_layers = range(config.num_hidden_layers) 32 | print(visualize_layers) 33 | 34 | for layer_idx in visualize_layers: 35 | print(f"Visualizing SiwGLU output for layer {layer_idx}...") 36 | 37 | if "gate_proj" in args.template: 38 | hidden_outputs_path = os.path.join(args.hidden_features_path, "hidden_gate_outputs", "layer" + str(layer_idx)) 39 | neuron_type = "gate_proj" 40 | elif "up_proj" in args.template: 41 | hidden_outputs_path = os.path.join(args.hidden_features_path, "hidden_up_outputs", "layer" + str(layer_idx)) 42 | neuron_type = "up_proj" 43 | else: 44 | raise ValueError 45 | 46 | if args.visualize_criterion == "plain": 47 | edge = (-0.5, 0.5) 48 | elif args.visualize_criterion == "l1_norm": 49 | edge = (0, 0.5) 50 | elif args.visualize_criterion == "l2_norm": 51 | edge = (0, 0.1) 52 | else: 53 | raise ValueError 54 | 55 | visualize_swiglu_output(hidden_outputs_path, args.save_path, neuron_type, layer_idx, criterion=args.visualize_criterion, 56 | num_bins=1000, edge=edge, device=device) 57 | -------------------------------------------------------------------------------- /smoe/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/metrics/__init__.py -------------------------------------------------------------------------------- /smoe/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | 4 | def accuracy(predictions, references, normalize=True, sample_weight=None): 5 | return { 6 | "accuracy": float( 7 | accuracy_score( 8 | references, 9 | predictions, 10 | normalize=normalize, 11 | sample_weight=sample_weight, 12 | ) 13 | ) 14 | } 15 | 16 | 17 | def compute_metrics(eval_preds): 18 | preds, labels = eval_preds 19 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 20 | # by preprocess_logits_for_metrics but we need to shift the labels 21 | labels = labels[:, 1:].reshape(-1) 22 | preds = preds[:, :-1].reshape(-1) 23 | return accuracy(predictions=preds, references=labels) 24 | -------------------------------------------------------------------------------- /smoe/metrics/preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def logits_argmax(logits: torch.Tensor | tuple[torch.Tensor], labels): 5 | if isinstance(logits, tuple): 6 | # Depending on the model and config, logits may contain extra tensors, 7 | # like past_key_values, but logits always come first 8 | logits = logits[0] 9 | return logits.argmax(dim=-1) 10 | -------------------------------------------------------------------------------- /smoe/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/models/__init__.py -------------------------------------------------------------------------------- /smoe/models/llama_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_llama_moe import LlamaMoEConfig # noqa: F401 2 | from .modeling_llama_moe import ( # noqa: F401 3 | BaseMoEModelOutputWithPast, 4 | LlamaMoEDecoderLayer, 5 | LlamaMoEForCausalLM, 6 | LlamaMoEForSequenceClassification, 7 | LlamaMoEModel, 8 | 
LlamaMoEPreTrainedModel, 9 | ) 10 | -------------------------------------------------------------------------------- /smoe/models/llama_moe_residual/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_llama_moe_residual import LlamaMoEResidualConfig # noqa: F401 2 | from .modeling_llama_moe_residual import ( # noqa: F401 3 | LlamaMoEResidualDecoderLayer, 4 | LlamaMoEResidualForCausalLM, 5 | LlamaMoEResidualForSequenceClassification, 6 | LlamaMoEResidualModel, 7 | LlamaMoEResidualPreTrainedModel, 8 | ) 9 | -------------------------------------------------------------------------------- /smoe/models/mistral/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_torch_available, 20 | ) 21 | 22 | _import_structure = { 23 | "configuration_mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"], 24 | } 25 | 26 | 27 | try: 28 | if not is_torch_available(): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | pass 32 | else: 33 | _import_structure["modeling_mistral"] = [ 34 | "MistralForCausalLM", 35 | "MistralModel", 36 | "MistralPreTrainedModel", 37 | "MistralForSequenceClassification", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_mistral import ( 43 | MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, 44 | MistralConfig, 45 | ) 46 | 47 | try: 48 | if not is_torch_available(): 49 | raise OptionalDependencyNotAvailable() 50 | except OptionalDependencyNotAvailable: 51 | pass 52 | else: 53 | from .modeling_mistral import ( 54 | MistralForCausalLM, 55 | MistralForSequenceClassification, 56 | MistralModel, 57 | MistralPreTrainedModel, 58 | ) 59 | 60 | 61 | else: 62 | import sys 63 | 64 | sys.modules[__name__] = _LazyModule( 65 | __name__, globals()["__file__"], _import_structure, module_spec=__spec__ 66 | ) 67 | -------------------------------------------------------------------------------- /smoe/models/mixtral/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Mixtral AI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_torch_available, 20 | ) 21 | 22 | _import_structure = { 23 | "configuration_mixtral": ["MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MixtralConfig"], 24 | } 25 | 26 | 27 | try: 28 | if not is_torch_available(): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | pass 32 | else: 33 | _import_structure["modeling_mixtral"] = [ 34 | "MixtralForCausalLM", 35 | "MixtralModel", 36 | "MixtralPreTrainedModel", 37 | "MixtralForSequenceClassification", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_mixtral import ( 43 | MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, 44 | MixtralConfig, 45 | ) 46 | 47 | try: 48 | if not is_torch_available(): 49 | raise OptionalDependencyNotAvailable() 50 | except OptionalDependencyNotAvailable: 51 | pass 52 | else: 53 | from .modeling_mixtral import ( 54 | MixtralForCausalLM, 55 | MixtralForSequenceClassification, 56 | MixtralModel, 57 | MixtralPreTrainedModel, 58 | ) 59 | 60 | 61 | else: 62 | import sys 63 | 64 | sys.modules[__name__] = _LazyModule( 65 | __name__, globals()["__file__"], _import_structure, module_spec=__spec__ 66 | ) 67 | -------------------------------------------------------------------------------- /smoe/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/modules/__init__.py -------------------------------------------------------------------------------- /smoe/modules/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/modules/moe/__init__.py -------------------------------------------------------------------------------- /smoe/modules/moe_residual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/modules/moe_residual/__init__.py -------------------------------------------------------------------------------- /smoe/modules/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | 5 | 6 | class WeightNorm(nn.Module): 7 | def __init__( 8 | self, hidden_size: int, scale: float = 1.0, device=None, dtype=None 9 | ) -> None: 10 | super().__init__() 11 | 12 | self.hsz = hidden_size 13 | self.scale = scale 14 | 15 | self.weight = nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype)) 16 | 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | # init.ones_(self.weight) 21 | init.constant_(self.weight, self.scale) 22 | 23 | def forward(self, hidden: torch.Tensor) -> torch.Tensor: 24 | # if torch.isnan(self.weight).any(): 25 | # remote_breakpoint() 26 | # return hidden * (self.scale * F.sigmoid(self.weight) + 1.0) 27 | return hidden * self.weight 28 | 29 | def extra_repr(self) -> str: 30 | return "hsz={}, scale={}".format(self.hsz, self.scale) 31 | -------------------------------------------------------------------------------- /smoe/trainer/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/trainer/__init__.py -------------------------------------------------------------------------------- /smoe/trainer/moefy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/trainer/moefy/__init__.py -------------------------------------------------------------------------------- /smoe/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/__init__.py -------------------------------------------------------------------------------- /smoe/utils/debugging.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import debugpy 4 | import torch.distributed as dist 5 | 6 | 7 | def remote_breakpoint(host: str = "0.0.0.0", port: int = 5678, rank: int = 0): 8 | """ 9 | This function helps debug programs running on a remote computing node. 10 | 11 | In VSCode, add a configuration like the following to `.vscode/launch.json` 👇 12 | ```json 13 | { 14 | // Use IntelliSense to learn about possible attributes. 15 | // Hover to view descriptions of existing attributes. 16 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 17 | "version": "0.2.0", 18 | "configurations": [ 19 | { 20 | "name": "Python: Remote Attach", 21 | "type": "python", 22 | "request": "attach", 23 | "connect": { 24 | "host": "", 25 | "port": 5678 26 | }, 27 | "pathMappings": [ 28 | { 29 | "localRoot": "${workspaceFolder}", 30 | "remoteRoot": "." 31 | } 32 | ], 33 | "justMyCode": false 34 | } 35 | ] 36 | } 37 | ``` 38 | 39 | Then, insert this line at the position you want to debug: 40 | ```python 41 | from smoe.utils.debugging import remote_breakpoint; remote_breakpoint() 42 | ``` 43 | 44 | After the program starts and hits the breakpoint, you can attach the debugger remotely. 45 | """ 46 | 47 | def _dp(): 48 | print( 49 | f"Waiting for debugger to attach on {host}:{port}, server: {socket.gethostname()}..."
50 | ) 51 | debugpy.listen((host, port)) 52 | debugpy.wait_for_client() 53 | breakpoint() 54 | 55 | if dist.is_available() and dist.is_initialized(): 56 | if dist.get_rank() == rank: 57 | _dp() 58 | dist.barrier() 59 | else: 60 | _dp() 61 | -------------------------------------------------------------------------------- /smoe/utils/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/eval/__init__.py -------------------------------------------------------------------------------- /smoe/utils/eval/gather_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | 7 | def gather_results(args): 8 | df = pd.DataFrame(columns=["dataset", "accuracy"]) 9 | for dir_path, dir_names, file_names in os.walk(args.save_dir): 10 | print(dir_path) 11 | for name in sorted(file_names): 12 | print(name) 13 | if name == "all_datasets_0.txt": 14 | file_path = os.path.join(dir_path, name) 15 | with open(file_path, "r") as file: 16 | for i, line in enumerate(file.readlines()): 17 | acc = float(line[17:22]) 18 | dataset = line[25:-2] 19 | if not dataset in df["dataset"].values.tolist(): 20 | df.loc[i] = [dataset, acc] 21 | 22 | if name == "all_datasets_1.txt": 23 | file_path = os.path.join(dir_path, name) 24 | with open(file_path, "r") as file: 25 | for i, line in enumerate(file.readlines()): 26 | acc = float(line[17:22]) 27 | dataset = line[25:-2] 28 | if not dataset in df["dataset"].values.tolist(): 29 | df.loc[i + 28] = [dataset, acc] 30 | 31 | if name == "all_datasets_2.txt": 32 | file_path = os.path.join(dir_path, name) 33 | with open(file_path, "r") as file: 34 | for i, line in enumerate(file.readlines()): 35 | acc = float(line[17:22]) 36 | dataset = line[25:-2] 37 | if not dataset in df["dataset"].values.tolist(): 38 | df.loc[i + 44] = [dataset, acc] 39 | 40 | if name == "all_datasets_3.txt": 41 | file_path = os.path.join(dir_path, name) 42 | with open(file_path, "r") as file: 43 | for i, line in enumerate(file.readlines()): 44 | acc = float(line[17:22]) 45 | dataset = line[25:-2] 46 | if not dataset in df["dataset"].values.tolist(): 47 | df.loc[i + 57] = [dataset, acc] 48 | 49 | avg_value = float(df["accuracy"].mean()) 50 | df.loc[60] = ["avg_value", avg_value] 51 | df.to_csv(os.path.join(dir_path, "all_datasets.csv"), index=None) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("--save_dir", "-s", type=str, default="results_moe") 57 | args = parser.parse_args() 58 | gather_results(args) 59 | -------------------------------------------------------------------------------- /smoe/utils/expert_construction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/expert_construction/__init__.py -------------------------------------------------------------------------------- /smoe/utils/expert_construction/prune_llama.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from smoe.utils.seed import set_seed 8 | 9 | 10 | class LayerPrune: 11 | def __init__(self, config, template, layer): 12 | self.config = config 13 | self.template = template 
= template 14 | self.layer = layer 15 | 16 | def save(self): 17 | if not os.path.exists(self.config.save_path): 18 | os.makedirs(self.config.save_path) 19 | 20 | filename = os.path.join(self.config.save_path, self.template.format(self.layer)) 21 | torch.save(self.labels, filename, pickle_protocol=pickle.HIGHEST_PROTOCOL) 22 | print(f'Expert indices for layer {self.layer} saved to "{filename}".') 23 | 24 | 25 | class GradientPrune(LayerPrune): 26 | # fmt: off 27 | def __init__(self, config, template, layer, score): 28 | super().__init__(config, template, layer) 29 | self.score = score 30 | self.num_experts = 1 31 | self.neuron_num = score.size(0) 32 | 33 | def sort_by_criterion(self, criterion): 34 | if criterion == "min": 35 | sorted_score, sorted_index = self.score.sort(0) 36 | elif criterion == "max": 37 | sorted_score, sorted_index = self.score.sort(0, descending=True) 38 | else: 39 | raise NotImplementedError 40 | return sorted_score.tolist(), sorted_index.tolist() 41 | 42 | def prune(self, expert_size, criterion="min"): 43 | sorted_score, sorted_index = self.sort_by_criterion(criterion) 44 | self.labels = [sorted_index[:expert_size]] # unlike the other `labels`, these are neuron indices rather than expert indices 45 | # print(self.labels) 46 | # fmt: on 47 | 48 | 49 | class RandomPrune(LayerPrune): 50 | # fmt: off 51 | def __init__(self, config, template, layer, neuron_num): 52 | super().__init__(config, template, layer) 53 | self.num_experts = 1 54 | self.neuron_num = neuron_num 55 | 56 | def prune(self, expert_size, seed=None): 57 | if seed is not None: 58 | set_seed(seed) 59 | index = torch.randperm(self.neuron_num).tolist() 60 | self.labels = [index[:expert_size]] # unlike the other `labels`, these are neuron indices rather than expert indices 61 | # print(self.labels) 62 | # fmt: on 63 | -------------------------------------------------------------------------------- /smoe/utils/extract_text_from_jsonl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extract texts from a jsonlines file.
3 | 4 | Example: 5 | $ python -m smoe.utils.extract_text_from_jsonl -c content -i resources/redpajama/commoncrawl.jsonl -o resources/redpajama-processed/commoncrawl.txt 6 | """ 7 | 8 | import argparse 9 | import json 10 | 11 | 12 | def get_parser(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "-c", "--column_name", default="content", help="text column name" 16 | ) 17 | parser.add_argument("-i", "--input_filepath", help="filepath with text to tokenize") 18 | parser.add_argument("-o", "--output_filepath", help="output filepath") 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def extract_text(): 24 | args = get_parser() 25 | 26 | with open(args.input_filepath, "r", encoding="utf8") as fin: 27 | with open(args.output_filepath, "w", encoding="utf8") as fout: 28 | for line in fin: 29 | ins = json.loads(line) 30 | text = ins[args.column_name] 31 | fout.write(f"{text.strip()}\n") 32 | 33 | 34 | if __name__ == "__main__": 35 | extract_text() 36 | -------------------------------------------------------------------------------- /smoe/utils/io.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import lzma 4 | import os 5 | import pickle 6 | import shutil 7 | 8 | import cv2 9 | import torch 10 | 11 | 12 | def delete_file_or_dir(dir): 13 | if os.path.isfile(dir): 14 | os.remove(dir) 15 | elif os.path.exists(dir): 16 | shutil.rmtree(dir) 17 | else: 18 | pass 19 | 20 | 21 | def torch_load_template_file(path, template, layer): 22 | target = os.path.join(path, template.format(layer)) 23 | return torch.load(target) 24 | 25 | 26 | def torch_load_template_score_file(path, template, layer): 27 | score_list = [] 28 | for expert_folder_name in sorted(os.listdir(path)): 29 | score_file = os.path.join(path, expert_folder_name, template.format(layer)) 30 | score = torch.load(score_file, map_location="cpu") 31 | score_list.append(score) 32 | return score_list 33 | 34 | 35 | def save_compressed_file_7z(tensor, path): # 7z 36 | with lzma.open(path, "wb") as file: 37 | pickle.dump(tensor, file) 38 | 39 | 40 | def load_compressed_file_7z(path): # 7z 41 | with lzma.open(path, "rb") as file: 42 | data = pickle.load(file) 43 | return data 44 | 45 | 46 | def save_compressed_file_gz(tensor, path, compresslevel=6): # gz 47 | with gzip.open(path, "wb", compresslevel=compresslevel) as file: 48 | pickle.dump(tensor, file) 49 | 50 | 51 | def load_compressed_file_gz(path): # gz 52 | with gzip.open(path, "rb") as file: 53 | data = pickle.load(file) 54 | return data 55 | 56 | 57 | class load_jsonlines_iter: 58 | def __init__(self, filepath, start_from: int = None) -> None: 59 | self.fin = open(filepath, "r", encoding="utf8") 60 | if start_from: 61 | self.fin.seek(start_from, os.SEEK_SET) 62 | 63 | def skip_lines(self, num_skip_lines: int): 64 | for i, _ in enumerate(self.fin, 1): 65 | if i == num_skip_lines: 66 | break 67 | 68 | def tell(self): 69 | return self.fin.tell() 70 | 71 | def __iter__(self): 72 | for line in self.fin: 73 | try: 74 | yield json.loads(line) 75 | except json.JSONDecodeError: 76 | pass 77 | self.fin.close() 78 | 79 | 80 | def load_json(filepath): 81 | with open(filepath, "r", encoding="utf8") as fin: 82 | return json.load(fin) 83 | 84 | 85 | def dump_json(obj, filepath, **kwargs): 86 | with open(filepath, "w", encoding="utf8") as fout: 87 | json.dump(obj, fout, ensure_ascii=False, **kwargs) 88 | 89 | 90 | def load_jsonlines(filepath): 91 | data = [] 92 | with open(filepath, "r", encoding="utf8") as 
fin: 93 | for line in fin: 94 | data.append(json.loads(line)) 95 | return data 96 | 97 | 98 | def dump_jsonlines(obj, filepath, **kwargs): 99 | with open(filepath, "w", encoding="utf8") as fout: 100 | for ins in obj: 101 | fout.write(f"{json.dumps(ins, ensure_ascii=False, **kwargs)}\n") 102 | 103 | 104 | def compress_png_image(image_path, print_info=False): 105 | img = cv2.imread(image_path, cv2.IMREAD_COLOR) 106 | cv2.imwrite(image_path, img, [cv2.IMWRITE_PNG_COMPRESSION, 9]) 107 | if print_info: 108 | print(f'Done for "{image_path}".') 109 | -------------------------------------------------------------------------------- /smoe/utils/kernel_function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pass_kernel_function(tensor, criterion): 5 | if criterion == "plain": 6 | return tensor 7 | elif criterion == "l1_norm": 8 | return torch.abs(tensor) 9 | elif criterion == "l2_norm": 10 | return tensor * tensor 11 | else: 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- /smoe/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | import datasets 5 | import transformers 6 | from transformers import TrainingArguments 7 | 8 | # Setup logging 9 | logging.basicConfig( 10 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 11 | datefmt="%m/%d/%Y %H:%M:%S", 12 | level=logging.INFO, 13 | handlers=[logging.StreamHandler(sys.stdout)], 14 | ) 15 | 16 | transformers.utils.logging.enable_default_handler() 17 | transformers.utils.logging.enable_explicit_format() 18 | transformers.tokenization_utils.logging.set_verbosity_warning() 19 | 20 | 21 | def set_logging(should_log, log_level): 22 | if should_log: 23 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
24 | transformers.utils.logging.set_verbosity_info() 25 | 26 | datasets.utils.logging.set_verbosity(log_level) 27 | transformers.utils.logging.set_verbosity(log_level) 28 | 29 | 30 | def get_logger(name, log_level=None): 31 | logger = logging.getLogger(name) 32 | if log_level: 33 | logger.setLevel(log_level) 34 | return logger 35 | 36 | 37 | def get_logger_from_training_args(name: str, training_args: TrainingArguments): 38 | should_log = training_args.should_log 39 | log_level = training_args.get_process_log_level() 40 | set_logging(should_log, log_level) 41 | logger = get_logger(name, log_level=log_level) 42 | return logger 43 | -------------------------------------------------------------------------------- /smoe/utils/model_operation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/model_operation/__init__.py -------------------------------------------------------------------------------- /smoe/utils/operations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/operations/__init__.py -------------------------------------------------------------------------------- /smoe/utils/operations/operation_string.py: -------------------------------------------------------------------------------- 1 | import re 2 | from argparse import ArgumentTypeError 3 | 4 | 5 | def str2bool(v): 6 | if isinstance(v, bool): 7 | return v 8 | if v.lower() in ("yes", "true", "t", "y", "1"): 9 | return True 10 | elif v.lower() in ("no", "false", "f", "n", "0"): 11 | return False 12 | else: 13 | raise ArgumentTypeError("Boolean value expected.") 14 | 15 | 16 | def string2list(string, sep=","): 17 | if isinstance(string, list) or string is None: 18 | return string 19 | else: 20 | split_string = string.split(sep) 21 | return [int(num) for num in split_string] 22 | 23 | 24 | def extract_numbers(string): 25 | """Extract numbers (int, float) from a given string.""" 26 | pattern = r"[-+]?\d*\.\d+|\d+" 27 | matches = re.findall(pattern, string) 28 | numbers = [float(match) if "." in match else int(match) for match in matches] 29 | return numbers 30 | 31 | 32 | def calculate_non_ascii_ratio(string): 33 | """Calculate the non-ASCII ratio of a given string.""" 34 | if len(string) == 0: 35 | non_ascii_ratio = 0.0 36 | else: 37 | non_ascii_count = sum(1 for char in string if ord(char) >= 128) 38 | non_ascii_ratio = non_ascii_count / len(string) 39 | return non_ascii_ratio 40 | 41 | 42 | def remove_non_ascii_code(string): 43 | """Use a regular expression to remove all non-ASCII characters""" 44 | string = re.sub(r"[^\x00-\x7F]+", "", string) 45 | return string 46 | 47 | 48 | def replace_non_ascii_code(string): 49 | """ 50 | Replace common non-ASCII characters with their ASCII counterparts in the given string. 51 | 52 | :param string: Input string with non-ASCII characters. 53 | :return: String with non-ASCII characters replaced. 
54 | """ 55 | string = re.sub(r"“|”", '"', string) 56 | string = re.sub(r"‘|’", "'", string) 57 | string = re.sub(r"—|–", "-", string) 58 | string = re.sub(r"…", "...", string) 59 | 60 | return string 61 | -------------------------------------------------------------------------------- /smoe/utils/operations/operation_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def move_tensors_to_device(input, device): 5 | if isinstance(input, dict): 6 | for key, value in input.items(): 7 | if isinstance(value, torch.Tensor): 8 | input[key] = value.to(device) 9 | return input 10 | 11 | elif isinstance(input, list): 12 | for i in range(len(input)): 13 | if isinstance(input[i], torch.Tensor): 14 | input[i] = input[i].to(device) 15 | return input 16 | 17 | elif isinstance(input, torch.Tensor): 18 | return input.to(device) 19 | 20 | else: 21 | raise TypeError(input) 22 | 23 | 24 | def tensor2numbers(input): 25 | if isinstance(input, dict): 26 | for key, value in input.items(): 27 | if isinstance(value, torch.Tensor): 28 | input[key] = value.tolist() 29 | return input 30 | 31 | elif isinstance(input, list): 32 | for i in range(len(input)): 33 | if isinstance(input[i], torch.Tensor): 34 | input[i] = input[i].tolist() 35 | return input 36 | 37 | elif isinstance(input, torch.Tensor): 38 | return input.tolist() 39 | 40 | else: 41 | raise TypeError(input) 42 | 43 | 44 | def turn_last_true_mask_to_false(mask, true_mask_cnt=None): 45 | """Turn the last true value to false for each row in a mask matrix.""" 46 | # mask: shape(batch_size, seq_len) 47 | if true_mask_cnt is None: 48 | true_mask_cnt = torch.sum(mask, dim=1).unsqueeze(1) 49 | turn_position_indices = mask.cumsum(dim=1) == true_mask_cnt 50 | converted_mask = mask.clone() 51 | converted_mask[turn_position_indices] = False 52 | return converted_mask 53 | 54 | 55 | def turn_first_true_mask_to_false(mask): 56 | """Turn the first true value to false for each row in a mask matrix.""" 57 | # mask: shape(batch_size, seq_len) 58 | turn_position_indices = mask.cumsum(dim=1) == 1 59 | converted_mask = mask.clone() 60 | converted_mask[turn_position_indices] = False 61 | return converted_mask 62 | 63 | 64 | def last_true_position(mask): 65 | """Return the index of the last true value in each row in a mask matrix.""" 66 | # mask: shape(batch_size, seq_len) 67 | true_mask_cnt = torch.sum(mask, dim=1).unsqueeze(1) 68 | last_true_mask = (mask.cumsum(dim=1) == true_mask_cnt) & mask 69 | last_true_position = last_true_mask.nonzero()[:, 1].unsqueeze(1) 70 | return last_true_position 71 | 72 | 73 | def pass_kernel_function(tensor, criterion): 74 | if criterion == "plain": 75 | return tensor 76 | elif criterion == "l1_norm": 77 | return torch.abs(tensor) 78 | elif criterion == "l2_norm": 79 | return tensor * tensor 80 | else: 81 | raise NotImplementedError 82 | -------------------------------------------------------------------------------- /smoe/utils/param.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from smoe.utils.logging import get_logger 4 | 5 | logger = get_logger(__name__) 6 | 7 | 8 | def get_trainable_parameters(model: nn.Module, verbose: bool = True): 9 | """ 10 | Prints the number of trainable parameters in the model. 
11 | 12 | Credit to https://github.com/huggingface/peft/blob/main/src/peft/peft_model.py 13 | """ 14 | trainable_params = 0 15 | all_param = 0 16 | for _, param in model.named_parameters(): 17 | num_params = param.numel() 18 | # if using DS Zero 3 and the weights are initialized empty 19 | if num_params == 0 and hasattr(param, "ds_numel"): 20 | num_params = param.ds_numel 21 | 22 | # Due to the design of 4bit linear layers from bitsandbytes 23 | # one needs to multiply the number of parameters by 2 to get 24 | # the correct number of parameters 25 | if param.__class__.__name__ == "Params4bit": 26 | num_params = num_params * 2 27 | 28 | all_param += num_params 29 | if param.requires_grad: 30 | trainable_params += num_params 31 | if verbose: 32 | logger.info( 33 | f"trainable params: {trainable_params:,d}" 34 | f" || all params: {all_param:,d}" 35 | f" || trainable%: {100 * trainable_params / all_param}" 36 | ) 37 | 38 | return trainable_params, all_param 39 | -------------------------------------------------------------------------------- /smoe/utils/random_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | 5 | def get_random_string(length: int = 8) -> str: 6 | """Generate a random alphanumeric string. 7 | 8 | Args: 9 | length (int, optional): Length of the random string. Defaults to 8. 10 | 11 | Returns: 12 | str: A random alphanumeric string. 13 | """ 14 | return "".join(random.choices(string.ascii_letters + string.digits, k=length)) 15 | -------------------------------------------------------------------------------- /smoe/utils/seed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_seed(seed: Optional[int] = 1227, set_cudnn: Optional[bool] = False): 10 | os.environ["PYTHONHASHSEED"] = str(seed) 11 | random.seed(seed) 12 | np.random.seed(seed) 13 | torch.manual_seed(seed) 14 | torch.cuda.manual_seed(seed) 15 | 16 | if set_cudnn: 17 | torch.backends.cudnn.deterministic = True 18 | torch.backends.cudnn.benchmark = False 19 | -------------------------------------------------------------------------------- /smoe/utils/split_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | Split files in a folder into separate part folders 3 | 4 | src: en_arxiv/* 5 | tgt: output/part0, output/part1, ...
6 | """ 7 | 8 | from pathlib import Path 9 | 10 | 11 | def split_files(src_dir, tgt_dir, num_parts): 12 | src_dir = Path(src_dir) 13 | tgt_dir = Path(tgt_dir) 14 | tgt_dir.mkdir(parents=True, exist_ok=True) 15 | 16 | filepaths = sorted(src_dir.glob("*.jsonl")) 17 | num_files = len(filepaths) 18 | num_files_per_part = num_files // num_parts 19 | print(f"{src_dir} --> {tgt_dir}") 20 | print(f"num_files_per_part: {num_files_per_part}") 21 | 22 | for i in range(num_parts): 23 | start = i * num_files_per_part 24 | end = (i + 1) * num_files_per_part 25 | if i == num_parts - 1: 26 | end = num_files 27 | print(f"part-{i}, start: {start}, end: {end}") 28 | 29 | part_dir = tgt_dir / f"part-{i:06d}" 30 | part_dir.mkdir(parents=True, exist_ok=True) 31 | for j in range(start, end): 32 | filepath = filepaths[j] 33 | tgt_filepath = part_dir / filepath.name 34 | tgt_filepath.symlink_to(filepath) 35 | 36 | 37 | if __name__ == "__main__": 38 | for data_type in [ 39 | # "en_arxiv", 40 | # "en_book", 41 | # "en_c4", 42 | "en_cc", 43 | # "en_stack", 44 | # "en_wikipedia", 45 | # "github", 46 | ]: 47 | split_files( 48 | f"/mnt/hwfile/share_data/zhutong/slimpajama_fluency_llama/{data_type}", 49 | f"/mnt/hwfile/share_data/zhutong/data/slimpajama_fluency_llama_middle_parts/{data_type}", 50 | 30, 51 | ) 52 | # split_files( 53 | # "/mnt/hwfile/share_data/zhutong/slimpajama_fluency_llama/en_arxiv", 54 | # "/mnt/hwfile/share_data/zhutong/data/slimpajama_fluency_llama_middle_parts/en_arxiv", 55 | # 30, 56 | # ) 57 | -------------------------------------------------------------------------------- /smoe/utils/text_clustering.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import joblib 4 | import numpy as np 5 | from sentence_transformers import SentenceTransformer 6 | from sklearn.cluster import KMeans 7 | 8 | from smoe.utils.vars import CLUSTERING_MODEL_NAME 9 | 10 | 11 | class TextClustering: 12 | def __init__( 13 | self, num_clusters: int = 16, encoder: str = "all-mpnet-base-v2" 14 | ) -> None: 15 | self.kmeans = KMeans(n_clusters=num_clusters) 16 | self.emb = SentenceTransformer(encoder) 17 | 18 | @property 19 | def num_clusters(self) -> int: 20 | return self.kmeans.n_clusters 21 | 22 | def encode_emb(self, sentences: list[str]) -> np.ndarray: 23 | arr: np.ndarray = self.emb.encode(sentences=sentences, show_progress_bar=False) 24 | return arr 25 | 26 | def fit_emb(self, emb: np.ndarray): 27 | self.kmeans.fit(emb) 28 | 29 | def fit(self, sentences: list[str]): 30 | emb_arr = self.encode_emb(sentences) 31 | self.kmeans.fit(emb_arr) 32 | 33 | def predict_emb(self, emb: np.ndarray) -> list[int]: 34 | return self.kmeans.predict(emb).tolist() 35 | 36 | def predict(self, sentences: list[str]) -> list[int]: 37 | emb_arr = self.encode_emb(sentences) 38 | return self.predict_emb(emb_arr) 39 | 40 | def save_pretrained(self, folder: str): 41 | model_path = Path(folder) / CLUSTERING_MODEL_NAME 42 | model_path.parent.mkdir(exist_ok=True, parents=True) 43 | joblib.dump(self.kmeans, model_path) 44 | 45 | @classmethod 46 | def from_pretrained(cls, folder: str): 47 | model_path = Path(folder) / CLUSTERING_MODEL_NAME 48 | kmeans = joblib.load(model_path) 49 | model = cls() 50 | model.kmeans = kmeans 51 | return model 52 | -------------------------------------------------------------------------------- /smoe/utils/vars.py: -------------------------------------------------------------------------------- 1 | IGNORE_INDEX = -100 2 | BEST_MODEL_CKPT_DIR = "best" 3 | 
MIDDLE_MODEL_CKPT_DIR = "middle" 4 | CLUSTERING_MODEL_NAME = "clustering.model" 5 | JSONL_DATASET_CACHE_NAME = "jsonl_dataset-{}.bin" 6 | META_SUFFIX = ".meta" 7 | -------------------------------------------------------------------------------- /smoe/utils/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/visualization/__init__.py -------------------------------------------------------------------------------- /smoe/utils/visualization/bar.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def barh( 7 | label2num: dict, 8 | title: str = "No Title", 9 | save_filepath=None, 10 | sort_type="label", 11 | limit=None, 12 | ): 13 | """ 14 | Refers to https://gist.github.com/Spico197/40f0224f9202ef645ac86637a958eaff 15 | 16 | Args: 17 | sort_type: label or num 18 | """ 19 | assert sort_type in ["label", "num"] 20 | if sort_type == "label": 21 | label2num_sorted = sorted(label2num.items(), key=lambda x: x[0]) 22 | else: 23 | label2num_sorted = sorted(label2num.items(), key=lambda x: x[1]) 24 | if limit: 25 | label2num_sorted = label2num_sorted[:limit] 26 | tot = sum([x[1] for x in label2num_sorted]) 27 | fig = plt.figure(figsize=(16, 9), dpi=350) 28 | ax = fig.add_subplot(111) 29 | ax.barh(range(len(label2num_sorted)), [x[1] for x in label2num_sorted], zorder=3) 30 | ax.set_yticks(range(len(label2num_sorted))) 31 | ax.set_yticklabels( 32 | [ 33 | "{} - {} ({:.2f}%)".format(x[0], x[1], float(x[1]) / tot * 100) 34 | for x in label2num_sorted 35 | ], 36 | fontsize=16, 37 | ) 38 | ax.set_xlabel("Total: {}".format(tot), fontsize=16) 39 | ax.set_title(title) 40 | ax.grid(zorder=0) 41 | plt.rc("axes", axisbelow=True) 42 | plt.rc("ytick", labelsize=16) 43 | plt.tight_layout() 44 | # plt.show() 45 | if save_filepath: 46 | Path(save_filepath).parent.mkdir(exist_ok=True, parents=True) 47 | plt.savefig(save_filepath) 48 | -------------------------------------------------------------------------------- /smoe/utils/visualization/convert_gif.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | 4 | def save_images_as_gif(image_paths, output_path, duration=200): 5 | """ 6 | Save the images at the given file paths as a GIF animation. 7 | 8 | :param image_paths: list of image file paths 9 | :param output_path: file path to save the GIF animation to 10 | :param duration: interval between frames in milliseconds 11 | """ 12 | if not image_paths: 13 | print("Error: No image paths provided.") 14 | return 15 | 16 | try: 17 | # open the image files and collect them into a list 18 | images = [Image.open(image_path) for image_path in image_paths] 19 | 20 | # save the GIF animation 21 | images[0].save( 22 | output_path, 23 | save_all=True, 24 | append_images=images[1:], 25 | loop=0, 26 | duration=duration, 27 | ) 28 | print(f"GIF animation saved as {output_path}") 29 | except Exception as e: 30 | print(f"Error: {e}") 31 | 32 | 33 | # usage example (guarded so that importing this module has no side effects) 34 | if __name__ == "__main__": 35 | image_paths = ["image1.png", "image2.png", "image3.png"] 36 | output_gif = "output.gif" 37 | save_images_as_gif(image_paths, output_gif) 38 | -------------------------------------------------------------------------------- /smoe/utils/visualization/line.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from smoe.utils.io import compress_png_image 5 | 6 | 7 | def line_plot( 8 | xs,
9 | label_to_nums, 10 | title: str = None, 11 | xlabel: str = None, 12 | ylabel: str = None, 13 | save_path: str = None, 14 | ): 15 | fig = plt.figure(figsize=(16, 9)) 16 | ax = fig.add_subplot(111) 17 | for label, nums in label_to_nums.items(): 18 | ax.plot(xs, nums, label=label) 19 | ax.set_xticks(xs) 20 | if title: 21 | ax.set_title(title) 22 | if xlabel: 23 | ax.set_xlabel(xlabel) 24 | if ylabel: 25 | ax.set_ylabel(ylabel) 26 | ax.legend() 27 | ax.grid(True) 28 | ax.set_axisbelow(True) 29 | plt.tight_layout() 30 | if save_path: 31 | plt.savefig(save_path, dpi=320) 32 | compress_png_image(save_path, print_info=False) 33 | plt.close() 34 | 35 | 36 | def line_plot_with_highlight( 37 | xs, 38 | label_to_nums, 39 | highlight_label_to_nums: dict = None, 40 | highlight_linewidth: int = 4, 41 | highlight_color: str = "black", 42 | cmap: str = "viridis", 43 | legend_columns: int = 1, 44 | title: str = None, 45 | xlabel: str = None, 46 | ylabel: str = None, 47 | save_path: str = None, 48 | ): 49 | fig = plt.figure(figsize=(16, 9)) 50 | ax = fig.add_subplot(111) 51 | 52 | cmap = plt.get_cmap(cmap) 53 | colors = np.linspace(0, 1, len(label_to_nums)) 54 | 55 | for i, (label, nums) in enumerate(label_to_nums.items()): 56 | ax.plot(xs, nums, label=label, c=cmap(colors)[i, :3]) 57 | 58 | if highlight_label_to_nums is not None: 59 | for i, (label, nums) in enumerate(highlight_label_to_nums.items()): 60 | ax.plot( 61 | xs, nums, label=label, linewidth=highlight_linewidth, c=highlight_color 62 | ) 63 | 64 | ax.set_xticks(xs) 65 | if title: 66 | ax.set_title(title) 67 | if xlabel: 68 | ax.set_xlabel(xlabel) 69 | if ylabel: 70 | ax.set_ylabel(ylabel) 71 | 72 | ax.legend(ncols=legend_columns) 73 | ax.grid(True) 74 | ax.set_axisbelow(True) 75 | plt.tight_layout() 76 | if save_path: 77 | plt.savefig(save_path, dpi=320) 78 | compress_png_image(save_path, print_info=False) 79 | plt.close() 80 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/data/__init__.py -------------------------------------------------------------------------------- /tests/data/test_aggregation.py: -------------------------------------------------------------------------------- 1 | from smoe.data.aggregation import group_instances 2 | 3 | 4 | def test_group_instances(): 5 | instances = [ 6 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 7 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 8 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 9 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 10 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 11 | ] 12 | results = group_instances(instances, block_size=4) 13 | assert results == [ 14 | {"input_ids": [1, 2, 3, 1], "labels": [4, 5, 6, 4]}, 15 | {"input_ids": [2, 3, 1, 2], "labels": [5, 6, 4, 5]}, 16 | {"input_ids": [3, 1, 2, 3], "labels": [6, 4, 5, 6]}, 17 | ] 18 | 19 | 20 | if __name__ == "__main__": 21 | test_group_instances() 22 | -------------------------------------------------------------------------------- /tests/data/test_redpajama.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | from torch.utils.data import DataLoader 6 | 7 | from smoe.data.redpajama import load_streaming_datasets 8 | from smoe.utils.io import dump_jsonlines, load_jsonlines 9 | 10 | 11 | def test_load_streaming_datasets(): 12 | output_dir = Path("/mnt/petrelfs/zhutong/smoe/resources/data_test_with_task_type") 13 | output_dir.mkdir(parents=True, exist_ok=True) 14 | # dataset_dir = Path("resources/data_test") 15 | dataset_dir = Path("resources/data_test_with_task_type") 16 | 17 | # # update new dataset with task type 18 | # for subtask_dir in dataset_dir.glob("*"): 19 | # task_type = subtask_dir.stem 20 | # subtask_out_dir = output_dir.joinpath(task_type) 21 | # subtask_out_dir.mkdir(parents=True, exist_ok=True) 22 | # for file in subtask_dir.glob("*.jsonl"): 23 | # data = load_jsonlines(file) 24 | # for ins in data: 25 | # ins["src"] = task_type 26 | # dump_jsonlines(data, subtask_out_dir.joinpath(file.name)) 27 | 28 | dataset = load_streaming_datasets( 29 | str(dataset_dir), 30 | prob_map={"en_arxiv": 0.5, "en_book": 0.2, "en_c4": 0.3}, 31 | block_size=2048, 32 | ) 33 | num_ds = 0 34 | num_src = defaultdict(lambda: 0) 35 | 36 | start = time.time() 37 | for ds in iter(dataset): 38 | num_ds += 1 39 | # print(num_ds, ds["src"]) 40 | # num_src[ds["src"]] += 1 41 | time_span = time.time() - start 42 | print(num_ds) 43 | print(dict(num_src)) 44 | print(f"Time (ins/s): {num_ds / time_span:.2f}" "") 45 | 46 | """ 47 | block_size: -1 48 | {'en_arxiv': 400, 'en_c4': 214} 49 | Time (ins/s): 64.05 50 | 51 | block_size: 2048 52 | Time (ins/s): 59.94 53 | """ 54 | 55 | 56 | if __name__ == "__main__": 57 | test_load_streaming_datasets() 58 | -------------------------------------------------------------------------------- /tests/entrypoint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/entrypoint/__init__.py -------------------------------------------------------------------------------- /tests/entrypoint/test_conn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | 4 | import torch 5 | import torch.distributed as dist 6 | import torch.nn as nn 7 | 8 | # from accelerate import Accelerator 9 | 10 | 11 | def test_connection(): 12 | string = f"{socket.gethostname()} - MASTER_ADDR: {os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']} - WORLD_SIZE: {os.environ['WORLD_SIZE']} - RANK: {os.environ['RANK']}" 13 | print(string) 14 | dist.init_process_group("nccl") 15 | # ac = Accelerator() 16 | m = nn.Linear(5, 10) 17 | m = nn.parallel.DistributedDataParallel(m, device_ids=[dist.get_rank()]) 18 | # m = ac.prepare_model(m) 19 | x = torch.randn(3, 5, device=m.device) 20 | y = m(x) 21 | # dist.all_reduce(y, op=dist.ReduceOp.SUM) 22 | assert y.shape == (3, 10) 23 | # print(f"Done - local: {ac.local_process_index} - rank: {ac.process_index} - world: {ac.num_processes}") 24 | print( 25 | f"Done - {socket.gethostname()} - local: {os.environ['LOCAL_RANK']} - rank: {dist.get_rank()} - world: {dist.get_world_size()}" 26 | ) 27 | 28 | 29 | if __name__ == "__main__": 30 | test_connection() 31 | -------------------------------------------------------------------------------- /tests/models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/test_noise_moe.py: -------------------------------------------------------------------------------- 1 | import types 2 | 3 | import torch 4 | 5 | from smoe.modules.moe.moe_layers import LinearGLUMoELayer 6 | from smoe.utils.model_operation.change_llama_moe_forward import ( 7 | forward_topk_balanced_noisy_gate_with_random_expert_selection, 8 | ) 9 | from smoe.utils.seed import set_seed 10 | 11 | 12 | def test_noise_moe(): 13 | input_size = 128 14 | hidden_size = 4096 15 | output_size = 128 16 | hidden_act = "silu" 17 | num_experts = 16 18 | num_selects = 16 19 | size_experts = None 20 | bias = True 21 | 22 | gating_config = { 23 | "gate_type": "TopKBalancedNoisyGate", 24 | "gate_network": "mlp", 25 | "gate_use_softmax": True, 26 | "gate_use_balance": True, 27 | "gate_balance_loss_weight": 0.01, 28 | "gate_add_noise": True, 29 | "gate_noise_epsilon": 1e-2, 30 | } 31 | 32 | calculator_config = { 33 | "calculator_type": "UniversalCalculator", 34 | "multiply_gate_scores": False, 35 | "score_scale_factor": 1.0, 36 | } 37 | 38 | layer = LinearGLUMoELayer( 39 | input_size=input_size, 40 | hidden_size=hidden_size, 41 | output_size=output_size, 42 | hidden_act=hidden_act, 43 | num_experts=num_experts, 44 | num_selects=num_selects, 45 | size_experts=size_experts, 46 | bias=bias, 47 | **gating_config, 48 | **calculator_config, 49 | ) 50 | 51 | batch_size = 64 52 | 53 | layer.gate.forward = types.MethodType( 54 | forward_topk_balanced_noisy_gate_with_random_expert_selection, layer.gate 55 | ) 56 | set_seed(0) 57 | 58 | input = torch.rand((batch_size, input_size)) 59 | output = layer(input) 60 | 61 | 62 | if __name__ == "__main__": 63 | test_noise_moe() 64 | -------------------------------------------------------------------------------- /tests/models/test_noise_moe_residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from smoe.modules.moe_residual.moe_residual_layers import LinearGLUMoEResidualLayer 4 | 5 | 6 | def test_noise_moe_residual(): 7 | input_size = 4096 8 | hidden_size = 688 * 14 9 | output_size = 4096 10 | hidden_act = "silu" 11 | num_experts = 14 12 | num_selects = 2 13 | size_experts = None 14 | bias = True 15 | 16 | num_experts_residual = 2 17 | size_experts_residual = None # 688 18 | score_scale_factor_residual = 8.0 19 | use_weighting = False 20 | 21 | gating_config = { 22 | "gate_type": "TopKBalancedNoisyGate", 23 | "gate_network": "mlp", 24 | "gate_use_softmax": True, 25 | "gate_use_balance": True, 26 | "gate_balance_loss_weight": 0.01, 27 | "gate_add_noise": True, 28 | "gate_noise_epsilon": 0.01, 29 | } 30 | 31 | calculator_config = { 32 | "calculator_type": "UniversalCalculator", 33 | "multiply_gate_scores": True, 34 | "score_scale_factor": 8.0, 35 | } 36 | 37 | layer = LinearGLUMoEResidualLayer( 38 | input_size=input_size, 39 | hidden_size=hidden_size, 40 | output_size=output_size, 41 | hidden_act=hidden_act, 42 | num_experts=num_experts, 43 | num_selects=num_selects, 44 | size_experts=size_experts, 45 | bias=bias, 46 | num_experts_residual=num_experts_residual, 47 | size_experts_residual=size_experts_residual, 48 | score_scale_factor_residual=score_scale_factor_residual, 49 | use_weighting=use_weighting, 50 | 
**gating_config, 51 | **calculator_config, 52 | ) 53 | 54 | batch_size = 64 55 | 56 | input = torch.rand((batch_size, input_size)) 57 | output = layer(input) 58 | 59 | 60 | if __name__ == "__main__": 61 | test_noise_moe_residual() 62 | -------------------------------------------------------------------------------- /tests/models/test_switch_moe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from smoe.modules.moe.moe_layers import LinearGLUMoELayer 4 | 5 | 6 | def test_switch_moe(): 7 | input_size = 128 8 | hidden_size = 4096 9 | output_size = 128 10 | hidden_act = "silu" 11 | num_experts = 16 12 | num_selects = 1 13 | size_experts = None 14 | bias = True 15 | 16 | gating_config = { 17 | "gate_type": "SwitchBalancedGate", 18 | "gate_network": "mlp", 19 | "gate_use_softmax": True, 20 | "gate_use_balance": True, 21 | "gate_balance_loss_weight": 0.01, 22 | "gate_add_noise": True, 23 | } 24 | 25 | calculator_config = { 26 | "calculator_type": "SwitchDropTokenCalculator", 27 | "multiply_gate_scores": True, 28 | "score_scale_factor": 1.0, 29 | "drop_tokens": True, 30 | "capacity_factor": 1.25, 31 | } 32 | 33 | layer = LinearGLUMoELayer( 34 | input_size=input_size, 35 | hidden_size=hidden_size, 36 | output_size=output_size, 37 | hidden_act=hidden_act, 38 | num_experts=num_experts, 39 | num_selects=num_selects, 40 | size_experts=size_experts, 41 | bias=bias, 42 | **gating_config, 43 | **calculator_config, 44 | ) 45 | 46 | batch_size = 64 47 | 48 | input = torch.rand((batch_size, input_size)) 49 | output = layer(input) 50 | 51 | 52 | if __name__ == "__main__": 53 | test_switch_moe() 54 | -------------------------------------------------------------------------------- /tests/models/test_switch_moe_residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from smoe.modules.moe_residual.moe_residual_layers import LinearGLUMoEResidualLayer 4 | 5 | 6 | def test_switch_moe_residual(): 7 | input_size = 4096 8 | hidden_size = 688 * 13 9 | output_size = 4096 10 | hidden_act = "silu" 11 | num_experts = 13 12 | num_selects = 1 13 | size_experts = None 14 | bias = True 15 | 16 | num_experts_residual = 3 17 | size_experts_residual = None # 688 18 | score_scale_factor_residual = 12.0 19 | use_weighting = False 20 | 21 | gating_config = { 22 | "gate_type": "SwitchBalancedGate", 23 | "gate_network": "mlp", 24 | "gate_use_softmax": True, 25 | "gate_use_balance": True, 26 | "gate_balance_loss_weight": 0.01, 27 | "gate_add_noise": False, 28 | } 29 | 30 | calculator_config = { 31 | "calculator_type": "SwitchDropTokenCalculator", 32 | "multiply_gate_scores": True, 33 | "score_scale_factor": 4.0, 34 | "drop_tokens": True, 35 | "capacity_factor": 1.25, 36 | } 37 | 38 | layer = LinearGLUMoEResidualLayer( 39 | input_size=input_size, 40 | hidden_size=hidden_size, 41 | output_size=output_size, 42 | hidden_act=hidden_act, 43 | num_experts=num_experts, 44 | num_selects=num_selects, 45 | size_experts=size_experts, 46 | bias=bias, 47 | num_experts_residual=num_experts_residual, 48 | size_experts_residual=size_experts_residual, 49 | score_scale_factor_residual=score_scale_factor_residual, 50 | use_weighting=use_weighting, 51 | **gating_config, 52 | **calculator_config, 53 | ) 54 | 55 | batch_size = 64 56 | 57 | input = torch.rand((batch_size, input_size)) 58 | output = layer(input) 59 | 60 | 61 | if __name__ == "__main__": 62 | test_switch_moe_residual() 63 | 
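# NOTE (added remark, hedged): the MoE-layer tests above (noise / switch, plain / residual)
# only verify that layer construction and a forward pass run without raising; nothing is
# asserted about `output`. Assuming the layer returns the combined hidden states as its
# first element, a minimal sanity check could look like:
#     out = output[0] if isinstance(output, (tuple, list)) else output
#     assert out.shape == (batch_size, output_size)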
-------------------------------------------------------------------------------- /tests/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/modules/__init__.py -------------------------------------------------------------------------------- /tests/modules/test_hook.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | # fmt: off 6 | def backward_hook(module, grad_in, grad_out): 7 | print(module.name, "grad_in", len(grad_in), [grad_in[i].shape if grad_in[i] is not None else None for i in range(len(grad_in))], grad_in, sep='\n') 8 | print(module.name, "grad_out", len(grad_out), [grad_out[i].shape if grad_out[i] is not None else None for i in range(len(grad_out))], grad_out, sep='\n') 9 | 10 | 11 | class Model(nn.Module): 12 | def __init__(self, input_dim, hidden_dim, output_dim): 13 | super().__init__() 14 | self.layer1 = nn.Linear(input_dim, hidden_dim, bias=False) 15 | self.layer2 = nn.Linear(hidden_dim, output_dim, bias=False) 16 | self.activation = nn.Sigmoid() 17 | 18 | self.layer1.name = "layer1" 19 | self.layer2.name = "layer2" 20 | 21 | self.layer1.register_backward_hook(backward_hook) 22 | self.layer2.register_backward_hook(backward_hook) 23 | 24 | def forward(self, x): 25 | z1 = self.layer1(x) 26 | z2 = self.layer2(z1) 27 | a2 = self.activation(z2) 28 | return a2 29 | 30 | 31 | def test_hook(): 32 | batch_size = 4 33 | input_dim = 128 34 | hidden_dim = 1024 35 | output_dim = 64 36 | 37 | model = Model(input_dim, hidden_dim, output_dim) 38 | loss_func = nn.MSELoss() 39 | 40 | x = torch.rand((batch_size, input_dim)) 41 | target = torch.rand((batch_size, output_dim)) 42 | 43 | y = model(x) 44 | loss = loss_func(y, target) 45 | loss.backward() 46 | 47 | print(model.layer1.weight.grad, model.layer1.weight.grad.shape) 48 | print(model.layer2.weight.grad, model.layer2.weight.grad.shape) 49 | 50 | 51 | if __name__ == "__main__": 52 | test_hook() 53 | -------------------------------------------------------------------------------- /tests/modules/test_hook_llama_mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from transformers.models.llama.modeling_llama import LlamaMLP 4 | 5 | 6 | # fmt: off 7 | def backward_hook(module, grad_in, grad_out): 8 | print(module.name, "grad_in", len(grad_in), [grad_in[i].shape if grad_in[i] is not None else None for i in range(len(grad_in))], grad_in, sep='\n') 9 | print(module.name, "grad_out", len(grad_out), [grad_out[i].shape if grad_out[i] is not None else None for i in range(len(grad_out))], grad_out, sep='\n') 10 | 11 | 12 | class Config: 13 | def __init__(self): 14 | self.pretraining_tp = 1 15 | self.hidden_size = 128 16 | self.intermediate_size = 1024 17 | self.hidden_act = "silu" 18 | 19 | 20 | def test_hook_llama_mlp(): 21 | batch_size = 2 22 | seq_len = 4 23 | 24 | config = Config() 25 | model = LlamaMLP(config) 26 | 27 | model.up_proj.name = "up_proj" 28 | model.gate_proj.name = "gate_proj" 29 | model.down_proj.name = "down_proj" 30 | 31 | model.up_proj.register_backward_hook(backward_hook) 32 | model.gate_proj.register_backward_hook(backward_hook) 33 | model.down_proj.register_backward_hook(backward_hook) 34 | 35 | loss_func = nn.MSELoss() 36 | 37 | x = torch.rand((batch_size * seq_len, config.hidden_size)) 38 | target = 
torch.rand((batch_size * seq_len, config.hidden_size)) 39 | 40 | # Wrong "grad_in" and "grad_out" will be captured when using inputs with (batch_size, seq_len, *) format ! 41 | ################################################################# 42 | # x = torch.rand((batch_size, seq_len, config.hidden_size)) 43 | # target = torch.rand((batch_size, seq_len, config.hidden_size)) 44 | ################################################################# 45 | 46 | y = model(x) 47 | loss = loss_func(y, target) 48 | loss.backward() 49 | 50 | print(model.up_proj.name, "grad", model.up_proj.weight.grad, model.up_proj.weight.grad.shape, sep='\n') 51 | print(model.gate_proj.name, "grad", model.gate_proj.weight.grad, model.gate_proj.weight.grad.shape, sep='\n') 52 | print(model.down_proj.name, "grad", model.down_proj.weight.grad, model.down_proj.weight.grad.shape, sep='\n') 53 | 54 | 55 | if __name__ == "__main__": 56 | test_hook_llama_mlp() 57 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_gumble.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def gumbel_rsample(shape): 5 | one = torch.tensor(1.0) 6 | zero = torch.tensor(0.0) 7 | gumbel = torch.distributions.gumbel.Gumbel(zero, one).rsample 8 | return gumbel(shape) 9 | 10 | 11 | def test_gumble(): 12 | shape = (16, 16) 13 | gumbel = gumbel_rsample(shape) * 0.01 14 | print(gumbel) 15 | 16 | normal = torch.randn(shape) * 0.01 17 | print(normal) 18 | 19 | 20 | if __name__ == "__main__": 21 | test_gumble() 22 | -------------------------------------------------------------------------------- /tests/utils/test_logging.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from smoe.utils.logging import get_logger 4 | 5 | 6 | def err_func(): 7 | return 1 / 0 8 | 9 | 10 | def test_log(): 11 | logger = get_logger("test") # noqa: F841 12 | 13 | 14 | def test_err_func(): 15 | with pytest.raises(ZeroDivisionError): 16 | res = err_func() # noqa: F841 17 | -------------------------------------------------------------------------------- /tests/utils/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/utils/visualization/__init__.py -------------------------------------------------------------------------------- /tests/utils/visualization/test_expert_load.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from smoe.utils.visualization.visualize import ( 7 | visualize_expert_load_barv, 8 | visualize_expert_load_heatmap, 9 | ) 10 | 11 | 12 | def test_visualization_expert_load_heatmap(): 13 | load_sum = np.random.rand(16) 14 | visualize_expert_load_heatmap( 15 | load_sum, 16 | layer_idx=0, 17 | dataset_name="test", 18 | shape=(4, 4), 19 | save_dir=tempfile.mktemp(), 20 | ) 21 | load_sum = np.random.randint(0, 5, size=(16,)) 22 | visualize_expert_load_heatmap( 23 | load_sum, 24 | layer_idx=0, 25 | dataset_name="test", 26 | shape=(4, 
4), 27 | save_dir=tempfile.mktemp(), 28 | ) 29 | with pytest.raises(ValueError): 30 | visualize_expert_load_heatmap( 31 | load_sum, 32 | layer_idx=0, 33 | dataset_name="test", 34 | shape=(4, 4), 35 | save_dir=".gitignore", 36 | ) 37 | 38 | 39 | def test_visualization_expert_load_barv(): 40 | load_sum = np.random.rand(16) 41 | visualize_expert_load_barv( 42 | load_sum, 43 | layer_idx=0, 44 | dataset_name="test", 45 | y_max=10, 46 | x_label="experts", 47 | save_dir=tempfile.mktemp(), 48 | ) 49 | with pytest.raises(ValueError): 50 | visualize_expert_load_barv( 51 | load_sum, 52 | layer_idx=0, 53 | dataset_name="test", 54 | y_max=10, 55 | x_label="experts", 56 | save_dir=".gitignore", 57 | ) 58 | -------------------------------------------------------------------------------- /tools/check_killed.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | from collections import Counter, defaultdict 4 | from pathlib import Path 5 | 6 | 7 | def get_jobstate(job_id): 8 | cmd = f"sacct -j {job_id} -o state -n" 9 | p = subprocess.Popen( 10 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT 11 | ) 12 | ret = p.stdout.read().decode("utf8").strip() 13 | return ret 14 | 15 | 16 | def get_data_type_and_part_id(filepath): 17 | path = Path(filepath) 18 | obj = re.search(r"tokenize-(.*?)-part-(\d+).log", path.name) 19 | if obj is None: 20 | return None 21 | data_type, part_id = obj.groups() 22 | return data_type, part_id 23 | 24 | 25 | def check_result(filepath): 26 | path = Path(filepath) 27 | ret = get_data_type_and_part_id(filepath) 28 | if ret is None: 29 | return None 30 | data_type, part_id = ret 31 | content = path.read_text(encoding="utf8") 32 | 33 | if ( 34 | "srun: error: Unable to allocate resources: Reach max user active rpc limit" 35 | in content 36 | or "srun: error: Unable to allocate resources: Socket timed out on send/recv operation" 37 | in content 38 | ): 39 | print(f"Error: {data_type}/{part_id}") 40 | return "error" 41 | 42 | obj = re.search(r"srun: job (\d+) queued and waiting for resources", content) 43 | if obj is None: 44 | print(f"Unknown: {data_type}/{part_id}") 45 | return "unknown" 46 | 47 | job_id = obj.group(1) 48 | jobstate = get_jobstate(job_id) 49 | obj = re.search(r"Tokenization Progress:\s*100%\s*\|.*\|\s*(\d+)/(\d+)", content) 50 | if obj is not None: 51 | progress, total = obj.groups() 52 | if ( 53 | progress == total 54 | and progress is not None 55 | and total is not None 56 | and jobstate != "COMPLETED" 57 | ): 58 | print(f"DEAD_COMPLETED: {data_type}/{part_id} - job: {job_id}") 59 | return "DEAD_COMPLETED" 60 | 61 | print(f"{jobstate}: {data_type}/{part_id}") 62 | return jobstate 63 | 64 | 65 | if __name__ == "__main__": 66 | status = defaultdict(list) 67 | for filepath in Path("logs").glob("tokenize-*.log"): 68 | s = check_result(filepath) 69 | res = get_data_type_and_part_id(filepath) 70 | status[s].append(res) 71 | 72 | print(Counter({k: len(v) for k, v in status.items()}).most_common()) 73 | 74 | def print_val(v, k): 75 | print(f"# {k} = {len(v[k])}") 76 | for path in v[k]: 77 | print(path) 78 | 79 | for key in ["CANCELLED+", "DEAD_COMPLETED", "error", None]: 80 | print_val(status, key) 81 | -------------------------------------------------------------------------------- /tools/cp_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | def 
copy_files(src_folder: str, dest_folder: str): 9 | src_folder = Path(src_folder) 10 | dest_folder = Path(dest_folder) 11 | dest_folder.mkdir(parents=True, exist_ok=True) 12 | files = src_folder.glob("**/*.jsonl") 13 | for file in tqdm(files): 14 | dest_file = dest_folder / file.name 15 | if not dest_file.exists(): 16 | # print(str(file), str(dest_file)) 17 | # shutil.copy2(str(file), str(dest_file)) 18 | # link the file to dest_folder 19 | # os.link(str(file), str(dest_file)) 20 | os.symlink(str(file), str(dest_file)) 21 | 22 | 23 | if __name__ == "__main__": 24 | # copy_files( 25 | # "/mnt/petrelfs/share_data/quxiaoye/SlimPajama-fluency-processed/c4_split_fluency/", 26 | # "/mnt/petrelfs/share_data/quxiaoye/SlimPajama-fluency-processed-agg/en_c4/" 27 | # ) 28 | for domain in [ 29 | "en_book", 30 | "en_c4", 31 | "en_cc", 32 | "en_arxiv", 33 | "en_wikipedia", 34 | "en_stack", 35 | "github", 36 | ]: 37 | copy_files( 38 | f"/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_mistral_middle_parts/{domain}", 39 | f"/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_mistral/{domain}", 40 | ) 41 | -------------------------------------------------------------------------------- /tools/listen.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | from smoe.utils.notification import send_to_wechat 5 | 6 | 7 | def check_sme_pending(): 8 | # run sme | grep "normal PD" | wc -l, if the returned value is 0, then send a notification 9 | cmd = "squeue --me | grep 'normal PD' | wc -l" 10 | p = subprocess.Popen( 11 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT 12 | ) 13 | for line in p.stdout.readlines(): 14 | line = line.decode("utf-8") 15 | if int(line) == 0: 16 | send_to_wechat("pending jobs all clear!!!") 17 | return True 18 | return False 19 | 20 | 21 | def check_sme_running(): 22 | # run sme | grep "normal R" | wc -l, if the returned value is 0, then send a notification 23 | cmd = "squeue --me | grep 'normal R' | wc -l" 24 | p = subprocess.Popen( 25 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT 26 | ) 27 | for line in p.stdout.readlines(): 28 | line = line.decode("utf-8") 29 | if int(line) == 0: 30 | send_to_wechat("running jobs all clear!!!") 31 | return True 32 | return False 33 | 34 | 35 | def listen(): 36 | # check pending jobs every 10 seconds, if all pending jobs are done, send a notification 37 | no_pending = False 38 | no_running = False 39 | while True: 40 | if not no_pending: 41 | no_pending = check_sme_pending() 42 | time.sleep(10) 43 | if not no_running: 44 | no_running = check_sme_running() 45 | time.sleep(10) 46 | 47 | 48 | if __name__ == "__main__": 49 | listen() 50 | -------------------------------------------------------------------------------- /tools/scl_jobs.sh: -------------------------------------------------------------------------------- 1 | # scancel from the list below 2 | 3 | list=( 4 | "2384204" 5 | "2384206" 6 | "2384207" 7 | "2384208" 8 | "2384209" 9 | "2384210" 10 | "2384211" 11 | "2384213" 12 | "2384215" 13 | "2384216" 14 | "2384217" 15 | "2384218" 16 | "2384220" 17 | "2384221" 18 | "2384222" 19 | "2384223" 20 | "2384226" 21 | "2384228" 22 | "2384230" 23 | "2384231" 24 | "2384233" 25 | "2384234" 26 | "2384264" 27 | "2384262" 28 | "2384261" 29 | "2384259" 30 | "2384257" 31 | "2384255" 32 | "2384253" 33 | "2384251" 34 | "2384249" 35 | "2384244" 36 | "2384242" 37 | "2384240" 38 | "2384238" 39 | "2384236" 40 | ) 41 | 42 | for i in "${list[@]}" 43 
| do 44 | scancel $i 45 | done 46 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore= 3 | # line length 4 | E501, 5 | # whitespace before ':' 6 | E203, 7 | # line break before binary operator 8 | W503 9 | exclude = 10 | # No need to traverse our git directory 11 | .git, 12 | # There's no value in checking cache directories 13 | __pycache__, 14 | # This contains our built documentation 15 | build, 16 | # This contains builds of flake8 that we don't want to check 17 | dist, 18 | bak, 19 | data, 20 | outputs, 21 | debug.py 22 | --------------------------------------------------------------------------------
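A minimal usage sketch (not a file from the repository) for readers who want to try a converted LLaMA-MoE checkpoint with the classes collected above. It assumes that `smoe.models.llama_moe` exports `LlamaMoEForCausalLM` (by analogy with the `llama_moe_residual` package's `__init__` shown earlier) and uses a placeholder checkpoint path; the repository's own runnable examples live under `smoe/entrypoint/examples/` and `scripts/examples/`.

```python
import torch
from transformers import AutoTokenizer

# Assumption: LlamaMoEForCausalLM is exported by smoe.models.llama_moe,
# mirroring the llama_moe_residual __init__ shown above.
from smoe.models.llama_moe import LlamaMoEForCausalLM

model_dir = "/path/to/converted/llama-moe"  # placeholder path to a converted checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = LlamaMoEForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)
model.eval()

# generate a short continuation to confirm the MoE forward pass works end to end
inputs = tokenizer("Suzhou is famous for", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```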