├── .env.example ├── .gitattributes ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── Makefile ├── README.md ├── VERSION ├── conf ├── cpt │ └── lora.yaml └── deepspeed │ ├── bf16.json │ ├── bf16_zero1.json │ ├── bf16_zero1_default.json │ ├── bf16_zero2_default.json │ ├── bf16_zero3.json │ └── fp16.json ├── docs ├── Contribution.md ├── Installation.md ├── LLaMA_MoE.pdf ├── Notification.md ├── continual_pretraining │ └── README.md ├── expert_construction │ └── README.md ├── imgs │ ├── MoE-Routing.gif │ ├── title-favicon.png │ ├── wechat-notification-config.png │ └── wechat-notification.jpg └── supervised_fine_tuning │ └── SFT.md ├── example.py ├── requirements.txt ├── scripts ├── analysis │ └── get_layer_wise_score_scale_factor.sh ├── cpt │ ├── 16_2 │ │ ├── baseline_112gpus_sheared_llama_portion_fluency_sf4.sh │ │ └── baseline_112gpus_sheared_llama_portion_fluency_sf8.sh │ ├── 8_2 │ │ ├── baseline_112gpus_8_2_sheared_llama_portion_fluency_sf4.sh │ │ └── mixtral_112gpus_8_2_sheared_llama_portion_fluency_sf4.sh │ ├── README.md │ ├── dynamic_data_selection │ │ ├── baseline.sh │ │ ├── baseline_112gpus.sh │ │ ├── baseline_112gpus_linear_gate.sh │ │ ├── baseline_112gpus_scale2.0.sh │ │ ├── baseline_112gpus_sheared_llama_portion.sh │ │ ├── baseline_112gpus_sheared_llama_portion_fluency.sh │ │ ├── baseline_112gpus_sheared_llama_portion_gate_balance_loss0.1.sh │ │ ├── baseline_112gpus_sheared_llama_portion_no_ad.sh │ │ ├── baseline_32gpus.sh │ │ ├── sheared_llama_112gpus.sh │ │ ├── sheared_llama_112gpus_100B.sh │ │ ├── sheared_llama_32gpus.sh │ │ └── sheared_llama_paper.sh │ ├── fpt.sh │ ├── fpt_13b.sh │ ├── fpt_7b.sh │ ├── fpt_7b_residual.sh │ ├── fpt_7b_weight_norm.sh │ ├── fpt_llama2_7b_moefication.sh │ ├── fpt_llama2_7b_share.sh │ ├── fpt_resume.sh │ ├── fpt_switch.sh │ ├── fpt_test_lr.sh │ ├── gate_loss.sh │ ├── lora.sh │ ├── multi_jobs.sh │ ├── test │ │ ├── fpt_7b_residual_test.sh │ │ ├── fpt_7b_test.sh │ │ └── test_conn.sh │ └── vs_upcycle_dense │ │ └── fpt_3b_total_10b.sh ├── eval │ ├── eval_mmlu_moe.sh │ ├── ref_loss.sh │ └── ref_loss_random_split.sh ├── examples │ ├── create_noise_llama_moe.sh │ ├── create_noise_llama_moe_residual.sh │ ├── create_switch_llama_moe.sh │ ├── load_llama_moe.sh │ ├── load_llama_moe_hf.sh │ └── load_llama_moe_residual.sh ├── expert_construction │ ├── convert │ │ ├── run_convert.sh │ │ ├── run_convert_gradient.sh │ │ ├── run_convert_gradient_residual.sh │ │ └── run_convert_mistral.sh │ ├── get_hidden_features │ │ ├── run_get_hidden_features.sh │ │ └── run_prepare_datasets.sh │ ├── prune │ │ ├── run_prune_gradient.sh │ │ ├── run_prune_gradient_convert.sh │ │ ├── run_prune_gradient_convert_one4all.sh │ │ ├── run_prune_gradient_one4all.sh │ │ ├── run_prune_random.sh │ │ ├── run_prune_random_convert.sh │ │ ├── run_prune_random_convert_one4all.sh │ │ └── run_prune_random_one4all.sh │ ├── select │ │ ├── run_select.sh │ │ └── run_select_multiprocess.sh │ └── split │ │ ├── run_split_clustering.sh │ │ ├── run_split_gradient.sh │ │ ├── run_split_gradient_get_grads.sh │ │ ├── run_split_gradient_one4all.sh │ │ ├── run_split_gradient_residual.sh │ │ ├── run_split_gradient_residual_one4all.sh │ │ ├── run_split_graph.sh │ │ ├── run_split_random.sh │ │ └── run_split_random_one4all.sh ├── sft │ ├── 2_16.sh │ ├── 2_8.sh │ └── 4_16.sh ├── test │ ├── test_args.sh │ └── test_conn.sh ├── tokenize │ ├── clustering.sh │ ├── lines.sh │ ├── redpajama.sh │ └── slimpajama_convert.sh └── visualization │ ├── 
run_visualize_expert_load_one4all.sh │ ├── run_visualize_expert_neuron_overlap.sh │ ├── run_visualize_expert_neuron_overlap_one4all.sh │ ├── run_visualize_expert_neuron_overlap_overview.sh │ ├── run_visualize_expert_select_mlp.sh │ ├── run_visualize_expert_select_mlp_one4all.sh │ ├── run_visualize_mlp_output_scale.sh │ ├── run_visualize_swiglu_output.sh │ └── run_visualize_swiglu_output_one4all.sh ├── setup.py ├── smoe ├── __init__.py ├── callbacks │ ├── __init__.py │ ├── save_model.py │ └── tensorboard.py ├── data │ ├── __init__.py │ ├── aggregation.py │ ├── collate_fn.py │ ├── datasets_moe.py │ ├── dynamic_selection.py │ ├── redpajama.py │ ├── single_file.py │ └── streaming.py ├── entrypoint │ ├── __init__.py │ ├── analysis │ │ ├── __init__.py │ │ ├── act_scale.py │ │ ├── clustering_distribution.py │ │ ├── gate_load_vis.py │ │ ├── get_layer_wise_score_scale_factor.py │ │ ├── hidden_before_gate_vis.py │ │ └── scale_factor_simulation.py │ ├── compress_png_images.py │ ├── cpt │ │ ├── __init__.py │ │ ├── cpt_fpt.py │ │ └── cpt_lora.py │ ├── download_llama.py │ ├── eval │ │ ├── __init__.py │ │ ├── eval_mmlu_moe_0.py │ │ ├── eval_mmlu_moe_1.py │ │ ├── eval_mmlu_moe_2.py │ │ └── eval_mmlu_moe_3.py │ ├── examples │ │ ├── __init__.py │ │ ├── create_noise_llama_moe.py │ │ ├── create_noise_llama_moe_residual.py │ │ ├── create_switch_llama_moe.py │ │ ├── load_llama_moe.py │ │ ├── load_llama_moe_hf.py │ │ ├── load_llama_moe_residual.py │ │ └── load_relu_llama.py │ ├── expert_construction │ │ ├── __init__.py │ │ ├── llama_convert.py │ │ ├── llama_convert_neuron_index.py │ │ ├── llama_convert_neuron_index_residual.py │ │ ├── llama_get_hidden_features.py │ │ ├── llama_prepare_datasets.py │ │ ├── llama_prune_gradient.py │ │ ├── llama_prune_random.py │ │ ├── llama_select_mlp.py │ │ ├── llama_select_mlp_multiprocess.py │ │ ├── llama_split_clustering.py │ │ ├── llama_split_gradient.py │ │ ├── llama_split_gradient_get_grads.py │ │ ├── llama_split_gradient_residual.py │ │ ├── llama_split_graph.py │ │ ├── llama_split_graph_trans_gp.py │ │ ├── llama_split_random.py │ │ └── mistral_convert.py │ ├── sft │ │ ├── __init__.py │ │ └── train_sft.py │ ├── text_clustering.py │ └── visualization │ │ ├── __init__.py │ │ ├── visualize_expert_load.py │ │ ├── visualize_expert_neuron_overlap.py │ │ ├── visualize_expert_neuron_overlap_overview.py │ │ ├── visualize_expert_select_mlp.py │ │ ├── visualize_gate_loss.py │ │ ├── visualize_mlp_output_scale.py │ │ └── visualize_swiglu_output.py ├── metrics │ ├── __init__.py │ ├── accuracy.py │ └── preprocess.py ├── models │ ├── __init__.py │ ├── llama_moe │ │ ├── __init__.py │ │ ├── configuration_llama_moe.py │ │ ├── modeling_llama_moe.py │ │ └── modeling_llama_moe_hf.py │ ├── llama_moe_residual │ │ ├── __init__.py │ │ ├── configuration_llama_moe_residual.py │ │ └── modeling_llama_moe_residual.py │ ├── mistral │ │ ├── __init__.py │ │ ├── configuration_mistral.py │ │ └── modeling_mistral.py │ └── mixtral │ │ ├── __init__.py │ │ ├── configuration_mixtral.py │ │ └── modeling_mixtral.py ├── modules │ ├── __init__.py │ ├── flash_attn.py │ ├── moe │ │ ├── __init__.py │ │ ├── moe_calculators.py │ │ ├── moe_experts.py │ │ ├── moe_gates.py │ │ └── moe_layers.py │ ├── moe_residual │ │ ├── __init__.py │ │ ├── moe_residual_layers.py │ │ └── residual_blocks.py │ └── norm.py ├── trainer │ ├── __init__.py │ ├── llama_lr_scheduling.py │ └── moefy │ │ ├── __init__.py │ │ └── expert_split_gradient.py └── utils │ ├── __init__.py │ ├── cache_utils.py │ ├── config.py │ ├── conversation.py │ ├── 
convert_moe_to_dense.py │ ├── debugging.py │ ├── eval │ ├── __init__.py │ ├── crop.py │ └── gather_results.py │ ├── expert_construction │ ├── __init__.py │ ├── convert_llama_moe.py │ ├── convert_llama_moe_neuron_index.py │ ├── convert_llama_moe_neuron_index_residual.py │ ├── expert_select.py │ ├── expert_split.py │ ├── expert_split_residual.py │ ├── k_means_constrained_cos.py │ └── prune_llama.py │ ├── extract_text_from_jsonl.py │ ├── io.py │ ├── kernel_function.py │ ├── logging.py │ ├── merge_llama_with_lora.py │ ├── model_operation │ ├── __init__.py │ ├── change_llama_forward.py │ ├── change_llama_moe_forward.py │ ├── modify_llama_model.py │ └── modify_llama_moe_model.py │ ├── modeling_attn_mask_utils.py │ ├── notification.py │ ├── operations │ ├── __init__.py │ ├── operation_list.py │ ├── operation_string.py │ └── operation_tensor.py │ ├── param.py │ ├── param_estimation.py │ ├── random_utils.py │ ├── seed.py │ ├── split_files.py │ ├── text_clustering.py │ ├── tokenize.py │ ├── vars.py │ └── visualization │ ├── __init__.py │ ├── bar.py │ ├── convert_gif.py │ ├── line.py │ ├── plotter.py │ ├── tsne_torch_model.py │ └── visualize.py ├── tests ├── __init__.py ├── data │ ├── __init__.py │ ├── test_aggregation.py │ ├── test_redpajama.py │ └── test_streaming.py ├── entrypoint │ ├── __init__.py │ └── test_conn.py ├── models │ ├── __init__.py │ ├── test_noise_moe.py │ ├── test_noise_moe_residual.py │ ├── test_switch_moe.py │ └── test_switch_moe_residual.py ├── modules │ ├── __init__.py │ ├── test_hook.py │ └── test_hook_llama_mlp.py └── utils │ ├── __init__.py │ ├── test_gumble.py │ ├── test_logging.py │ └── visualization │ ├── __init__.py │ └── test_expert_load.py ├── tools ├── check_killed.py ├── cp_files.py ├── listen.py ├── queue_submit.py └── scl_jobs.sh └── tox.ini /.env.example: -------------------------------------------------------------------------------- 1 | WECHAT_ROBOT_WEBHOOK="https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=blahblah" 2 | WECHAT_ROBOT_MENTIONS="wechat_user1,user2" 3 | WECHAT_ROBOT_MENTIONS_MOBILE="15600000000,16700000000" 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | docs/imgs/title-favicon.png filter=lfs diff=lfs merge=lfs -text 2 | docs/imgs/MoE-Routing.gif filter=lfs diff=lfs merge=lfs -text 3 | docs/LLaMA_MoE.pdf filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | args: ["--profile", "black", "--filter-files"] 8 | - repo: https://github.com/psf/black 9 | rev: 22.12.0 10 | hooks: 11 | - id: black 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v4.4.0 14 | hooks: 15 | - id: trailing-whitespace 16 | - id: end-of-file-fixer 17 | - id: check-yaml 18 | - id: check-added-large-files 19 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "tokenize", 9 | "type": "python", 10 | "request": "launch", 11 | "module": "smoe.utils.tokenize", 12 | "justMyCode": true 13 | }, 14 | { 15 | "name": "Python: Remote Attach", 16 | "type": "python", 17 | "request": "attach", 18 | "connect": { 19 | "host": "x.x.x.x", 20 | "port": 5678 21 | }, 22 | "pathMappings": [ 23 | { 24 | "localRoot": "${workspaceFolder}", 25 | "remoteRoot": "." 26 | } 27 | ], 28 | "justMyCode": false 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true 7 | } 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: format clean pre test 2 | echo 'finished' 3 | 4 | .PHONY: format 5 | format: 6 | isort --profile black --filter-files . 7 | black . 8 | 9 | .PHONY: test 10 | test: 11 | coverage run --source smoe -m pytest -vv . 12 | coverage report -m 13 | flake8 14 | 15 | .PHONY: pre 16 | pre: 17 | pre-commit run --all-files 18 | 19 | .PHONY: debug 20 | debug: 21 | pytest -vv tests/utils/test_logging.py 22 | 23 | .PHONY: clean 24 | clean: 25 | rm -rf build/ 26 | rm -rf dist/ 27 | rm -rf *.egg-info/ 28 | rm -f .coverage 29 | rm -f coverage.xml 30 | find . | grep -E '(__pycache__|\.pyc|\.pyo$$)' | xargs rm -rf 31 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.0.0 2 | -------------------------------------------------------------------------------- /conf/cpt/lora.yaml: -------------------------------------------------------------------------------- 1 | # this config file is for demonstration usage only 2 | deepspeed: conf/deepspeed/bf16.json 3 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 1, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 1e8, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 1e8, 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_accumulation_steps": "auto", 15 | "gradient_clipping": "auto", 16 | "steps_per_print": 2000, 17 | "train_batch_size": "auto", 18 | "train_micro_batch_size_per_gpu": "auto", 19 | "wall_clock_breakdown": false 20 | } 21 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero1.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 1, 7 | "allgather_partitions": true, 8 | "allgather_bucket_size": 1e8, 9 | "overlap_comm": true, 10 | "reduce_scatter": true, 11 | "reduce_bucket_size": 1e8, 12 | "contiguous_gradients": true 13 | }, 14 | "gradient_accumulation_steps": "auto", 15 | "gradient_clipping": "auto", 16 | "steps_per_print": 2000, 17 | "train_batch_size": "auto", 18 | 
"train_micro_batch_size_per_gpu": "auto", 19 | "wall_clock_breakdown": false 20 | } 21 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero1_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 1 7 | }, 8 | "gradient_accumulation_steps": "auto", 9 | "gradient_clipping": "auto", 10 | "steps_per_print": 2000, 11 | "train_batch_size": "auto", 12 | "train_micro_batch_size_per_gpu": "auto", 13 | "wall_clock_breakdown": false, 14 | "reduce_bucket_size": 536870912 15 | } 16 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero2_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 2 7 | }, 8 | "gradient_accumulation_steps": "auto", 9 | "gradient_clipping": "auto", 10 | "steps_per_print": 2000, 11 | "train_batch_size": "auto", 12 | "train_micro_batch_size_per_gpu": "auto", 13 | "wall_clock_breakdown": false 14 | } 15 | -------------------------------------------------------------------------------- /conf/deepspeed/bf16_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": true 4 | }, 5 | "zero_optimization": { 6 | "stage": 3 7 | }, 8 | "gradient_accumulation_steps": "auto", 9 | "gradient_clipping": "auto", 10 | "steps_per_print": 2000, 11 | "train_batch_size": "auto", 12 | "train_micro_batch_size_per_gpu": "auto", 13 | "wall_clock_breakdown": false, 14 | "reduce_bucket_size": 536870912 15 | } 16 | -------------------------------------------------------------------------------- /conf/deepspeed/fp16.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 100, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1e-10 9 | }, 10 | "zero_optimization": { 11 | "stage": 2, 12 | "allgather_partitions": true, 13 | "allgather_bucket_size": 1e8, 14 | "overlap_comm": true, 15 | "reduce_scatter": true, 16 | "reduce_bucket_size": 1e8, 17 | "contiguous_gradients": true 18 | }, 19 | 20 | "gradient_accumulation_steps": "auto", 21 | "gradient_clipping": "auto", 22 | "steps_per_print": 2000, 23 | "train_batch_size": "auto", 24 | "train_micro_batch_size_per_gpu": "auto", 25 | "wall_clock_breakdown": false 26 | } 27 | -------------------------------------------------------------------------------- /docs/Contribution.md: -------------------------------------------------------------------------------- 1 | # 🤝 Contribution 2 | 3 | - Make sure the Python version `>=3.10` (a strict version contraint for better type hinting) 4 | 5 | ```bash 6 | $ conda install git # upgrade git 7 | $ git clone git@github.com:pjlab-sys4nlp/llama-moe.git 8 | $ cd llama-moe 9 | $ pip install -e .[dev] 10 | $ pre-commit install 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/Installation.md: -------------------------------------------------------------------------------- 1 | # 🌴 Installation 2 | 3 | 1. Prepare conda environment: `conda create -n smoe python=3.11` (If your environment name is not `smoe`, you may need to change environment in launching scripts) 4 | 2. 
Add the correct environment variables to `~/.bashrc` (`gcc` is set to a newer version for installing `flash-attn`), e.g.: 5 | ```bash 6 | export PATH=/mnt/petrelfs/share/cuda-11.8/bin:$PATH 7 | export LD_LIBRARY_PATH=/mnt/petrelfs/share/cuda-11.8/lib64:$LD_LIBRARY_PATH 8 | export PATH=/mnt/petrelfs/share/gcc-10.1.0/bin:$PATH 9 | export LD_LIBRARY_PATH=/mnt/petrelfs/share/gcc-10.1.0/lib64:$LD_LIBRARY_PATH 10 | ``` 11 | 3. Apply the variables: `source ~/.bashrc` 12 | 4. Install PyTorch (CUDA-11.8): `pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118` 13 | 5. Install dependencies: `pip install -r requirements.txt` 14 | 6. Install `flash-attn`: `pip install flash-attn==2.0.1 --no-build-isolation`. You may need to follow the [flash-attn installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) to avoid some errors. 15 | 7. Install the latest Git: `conda install git` 16 | 8. Clone the repo: `git clone git@github.com:pjlab-sys4nlp/llama-moe.git` (If you haven't set up an SSH key on GitHub, you may not be able to clone through SSH. Check the [docs](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/adding-a-new-ssh-key-to-your-github-account) for details.) 17 | 9. Change the current directory: `cd llama-moe` 18 | 10. Install `smoe` in [editable mode](https://pip.pypa.io/en/stable/cli/pip_install/#cmdoption-e): `pip install -e .[dev]` 19 | 11. Set up `pre-commit` hooks: `pre-commit install` 20 | -------------------------------------------------------------------------------- /docs/LLaMA_MoE.pdf: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a00df5a444cd37a7ea94f28062c73e0cd3f77ecdb5c6bdb163f57f18d22acc7f 3 | size 914313 4 | -------------------------------------------------------------------------------- /docs/Notification.md: -------------------------------------------------------------------------------- 1 | # 💬 Notification 2 | 3 | ## WeChatWork Notification 4 | 5 | ![WeChatWork Notification Example](imgs/wechat-notification.jpg) 6 | 7 | 1. You should create a WeChat Work group if you don't have one. 8 | 2. Add a group robot from the group settings panel, and get the **webhook url**. 9 | 3. Create an env file: `cp .env.example .env` 10 | 4. Update the content in the `.env` file and make sure the webhook URL is correctly configured. If you'd like to @-mention someone in the group, update the mobile phone numbers accordingly. 11 | ![Configuration](imgs/wechat-notification-config.png) 12 | 5. 
Add the notification decorator to your code, and you will see notification messages in the chat group: 13 | ```python 14 | from smoe.utils.notification import wechat_sender 15 | 16 | @wechat_sender() 17 | def main(): 18 | raise RuntimeError("error testing") 19 | 20 | if __name__ == "__main__": 21 | main() 22 | ``` 23 | -------------------------------------------------------------------------------- /docs/imgs/MoE-Routing.gif: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d0a31562b85a1ad8d7e62c58dcb3f60bdf19ed70b4becf3f3b0ae51ae1ec19bd 3 | size 608200 4 | -------------------------------------------------------------------------------- /docs/imgs/title-favicon.png: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:656e5f3de4440b469d9b7bb928a14872dfb69329b7d539bce620cdef782d804c 3 | size 1167538 4 | -------------------------------------------------------------------------------- /docs/imgs/wechat-notification-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/docs/imgs/wechat-notification-config.png -------------------------------------------------------------------------------- /docs/imgs/wechat-notification.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/docs/imgs/wechat-notification.jpg -------------------------------------------------------------------------------- /docs/supervised_fine_tuning/SFT.md: -------------------------------------------------------------------------------- 1 | # Supervised Fine-Tuning (SFT) 2 | 3 | ## Data Preparation 4 | 5 | Download [Deita 6K](https://huggingface.co/datasets/hkust-nlp/deita-6k-v0) to `data/deita/deita_6k.jsonl`. 6 | 7 | ## Training 8 | 9 | Start training on a Slurm cluster: `sbatch scripts/sft/2_8.sh`. 
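The repository also provides `scripts/sft/2_16.sh` and `scripts/sft/4_16.sh`. Judging from the released model names used below (e.g. `LLaMA-MoE-v1-3_5B-2_8`), the script names appear to encode the routing configuration (number of selected experts and total experts), so pick the one matching your base checkpoint. The mapping below is only an assumption inferred from the file and model names, not something stated in this document:
```bash
# Assumed mapping between SFT scripts and base MoE checkpoints
# (inferred from the names; check the model path configured inside each script).
sbatch scripts/sft/2_8.sh    # e.g. a 2-of-8 base model such as LLaMA-MoE-v1-3_5B-2_8
sbatch scripts/sft/2_16.sh   # e.g. a 2-of-16 base model such as LLaMA-MoE-v1-3_0B-2_16
sbatch scripts/sft/4_16.sh   # e.g. a 4-of-16 base model
```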
10 | 11 | ## Inference 12 | 13 | ```python 14 | from transformers import AutoModelForCausalLM 15 | from transformers import AutoTokenizer 16 | 17 | from smoe.utils.conversation import Conversation 18 | 19 | conv = Conversation() 20 | conv.append_message("human", "Give me a three-day plan in Suzhou.") 21 | conv.append_message("gpt", None) 22 | prompt = conv.get_prompt() 23 | print(prompt) 24 | print(prompt[-1] == " ") 25 | 26 | model_dir = "llama-moe/LLaMA-MoE-v1-3_5B-2_8-sft" 27 | 28 | tok = AutoTokenizer.from_pretrained(model_dir) 29 | m = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True) 30 | m.eval() 31 | m.cuda() 32 | 33 | inputs = tok(prompt, return_tensors="pt") 34 | input_ids = inputs["input_ids"].cuda() 35 | 36 | output = m.generate(input_ids, max_length=100, temperature=1.0, do_sample=True, use_cache=True) 37 | response = tok.decode(output[0], skip_special_tokens=True) 38 | print(response) 39 | ``` 40 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | # python>=3.10 2 | 3 | import torch 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | 6 | model_dir = "llama-moe/LLaMA-MoE-v1-3_5B-2_8" 7 | tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) 8 | model = AutoModelForCausalLM.from_pretrained( 9 | model_dir, torch_dtype=torch.bfloat16, trust_remote_code=True 10 | ) 11 | model.eval() 12 | model.to("cuda:0") 13 | 14 | input_text = "Suzhou is famous of" 15 | inputs = tokenizer(input_text, return_tensors="pt") 16 | inputs = inputs.to("cuda:0") 17 | 18 | pred = model.generate(**inputs, max_length=50, temperature=0.0) 19 | print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)) 20 | # Suzhou is famous of its beautiful gardens. The most famous one is the Humble Administrator's Garden. It is a classical Chinese garden with a history of more than 600 years. 
The garden is divided into three 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.21.0 2 | black==23.7.0 3 | coverage==7.2.7 4 | datasets==2.14.1 5 | debugpy==1.6.7 6 | deepspeed==0.10.0 7 | flake8==6.0.0 8 | huggingface-hub==0.16.4 9 | isort==5.12.0 10 | k-means-constrained==0.7.3 11 | nltk==3.8.1 12 | ninja==1.11.1 13 | omegaconf==2.0.6 14 | packaging==23.1 15 | peft==0.4.0 16 | pre-commit==3.3.3 17 | pytest==7.4.0 18 | safetensors==0.3.1 19 | scikit-learn==1.3.0 20 | sentencepiece==0.1.99 21 | tensorboard==2.13.0 22 | tokenizers==0.13.3 23 | torch==2.0.1 24 | torchaudio==2.0.2 25 | torchvision==0.15.2 26 | tqdm==4.65.0 27 | transformers==4.31.0 28 | triton==2.0.0 29 | trl==0.4.7 30 | wandb==0.15.6 31 | xformers==0.0.20 32 | pebble==5.0.3 33 | matplotlib==3.7.2 34 | python-dotenv==1.0.0 35 | sentence-transformers==2.2.2 36 | Pillow==9.4.0 37 | numpy==1.25.0 38 | opencv-python==4.8.1.78 39 | pynvml==11.5.0 40 | PyYaml==6.0.1 41 | pandas<2.1.0 42 | -------------------------------------------------------------------------------- /scripts/analysis/get_layer_wise_score_scale_factor.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | # model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons 7 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama2_7B 8 | 9 | data_begin_index=0 10 | data_end_index=500 11 | batch_size=8 12 | # block_size=2048 13 | block_size=4096 14 | 15 | #save_folder=${llama_size}_dense 16 | save_folder=${llama_size}_moe_trained 17 | 18 | share_path=/mnt/petrelfs/share_data/quxiaoye 19 | tokenizer_path=${share_path}/models/${llama_size} 20 | data_path=${share_path}/data/vis_data/head30_shuffled_output/shuffled_20.txt 21 | target_scale_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-outputs-scale/${save_folder} 22 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/${save_folder} 23 | 24 | gpus=1 25 | cpus=16 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 27 | python -m smoe.entrypoint.analysis.get_layer_wise_score_scale_factor \ 28 | --tokenizer_path ${tokenizer_path} \ 29 | --model_path ${model_path} \ 30 | --target_scale_file_path ${target_scale_file_path} \ 31 | --data_path ${data_path} \ 32 | --save_path ${save_path} \ 33 | --data_begin_index ${data_begin_index} \ 34 | --data_end_index ${data_end_index} \ 35 | --batch_size ${batch_size} \ 36 | --block_size ${block_size} 37 | -------------------------------------------------------------------------------- /scripts/cpt/README.md: -------------------------------------------------------------------------------- 1 | # Scripts for Continual Pre-training 2 | 3 | - `lora.sh`: Parameter-efficient tuning 4 | - `fpt.sh`: Full-parameter pretraining 5 | -------------------------------------------------------------------------------- /scripts/cpt/multi_jobs.sh: -------------------------------------------------------------------------------- 1 | models=( 2 | 
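# Converted MoE checkpoints to continually pre-train; the loop below submits
# one `sbatch scripts/cpt/fpt_13b.sh <model>` job per listed path.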
/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Clustering-l2/llama_13B-16Select4-up_proj 3 | /mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Graph-l2_norm/llama_13B-16Select4-up_proj 4 | /mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM/Random/llama_13B-16Select4-up_proj 5 | ) 6 | 7 | for model in "${models[@]}" 8 | do 9 | sbatch scripts/cpt/fpt_13b.sh $model 10 | done 11 | 12 | 13 | # Submitted batch job 1904066 14 | # Submitted batch job 1904067 15 | # Submitted batch job 1904068 16 | -------------------------------------------------------------------------------- /scripts/cpt/test/test_conn.sh: -------------------------------------------------------------------------------- 1 | # !/usr/bin/bash 2 | 3 | # SBATCH --job-name=test_conn 4 | # SBATCH --output=logs/test_conn.log 5 | # SBATCH --error=logs/test_conn.log 6 | 7 | # SBATCH --partition=MoE_T 8 | # SBATCH --ntasks-per-node=1 9 | # SBATCH --cpus-per-task=26 10 | # SBATCH --mem=0 11 | 12 | # SBATCH --nodes=8 13 | # SBATCH --gres=gpu:1 14 | # SBATCH --quotatype=reserved 15 | 16 | # srun -p MoE_T -N8 -n8 --gres=gpu:1 -w HOST-10-140-60-[134,141,163,180-181,184] torchrun --nnodes 8 --nproc_per_node 1 tests/entrypoint/test_conn.py 17 | # $ srun -p MoE_T -N8 -n8 --gres=gpu:1 -w HOST-10-140-60-[134,141,163,180-181,184] bash scripts/cpt/test/test_conn.sh 18 | 19 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIST)) 20 | nodes_array=($nodes) 21 | head_node=${nodes_array[0]} 22 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 23 | echo "Node: $head_node" 24 | echo "Node IP: $head_node_ip" 25 | echo "Node list: $nodes" 26 | 27 | torchrun \ 28 | --nnodes ${num_nodes} \ 29 | --nproc_per_node ${num_gpu_per_node} \ 30 | --node_rank $SLURM_NODEID \ 31 | --rdzv_id $RANDOM \ 32 | --rdzv_backend c10d \ 33 | --rdzv_endpoint $head_node:29519 \ 34 | tests/entrypoint/test_conn.py 35 | -------------------------------------------------------------------------------- /scripts/eval/eval_mmlu_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | num_experts=8 # 8 16 8 | num_selects=2 # 2 4 9 | split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 10 | select_type=l2_norm # plain positive l1_norm l2_norm 11 | proj_type=up_proj # gate_proj up_proj 12 | 13 | set_num_selects=2 # manually set the number of experts selected during evaluation 14 | 15 | data_path=/mnt/petrelfs/share_data/quxiaoye 16 | tokenizer_path=${data_path}/models/${llama_size} 17 | data_dir=${data_path}/llama_data/mmlu_data/ 18 | model_path=${data_path}/models/LlamaMoEForCausalLM/${split_type}-${select_type}/${llama_size}_${num_experts}Select${num_selects}-${proj_type} 19 | save_path=${data_path}/eval_mmlu_outputs/${split_type}-${select_type}/${llama_size}_${num_experts}Select${num_selects}-${proj_type}-S${set_num_selects} 20 | 21 | # model_path=${data_path}/models/llama_7B-16Select4-up_proj 22 | # save_path=${data_path}/eval_mmlu_outputs/16select4_16card_bs16_checkpoint15000 23 | 24 | gpus=1 25 | cpus=$((gpus * 16)) 26 | for i in '0' '1' '2' '3'; do 27 | OMP_NUM_THREADS=16 srun --partition=MoE --mpi=pmi2 --gres=gpu:${gpus} -n1 -c ${cpus} --ntasks-per-node=1 --job-name=test --kill-on-bad-exit=1 \ 28 | python -m smoe.entrypoint.eval.eval_mmlu_moe_${i} \ 29 | --data_dir ${data_dir} \ 30 | --save_dir ${save_path} \ 31 | --tokenizer_path ${tokenizer_path} \ 
32 | --model_path ${model_path} \ 33 | --select_num ${set_num_selects} & 34 | sleep 0.5s 35 | done 36 | 37 | wait 38 | chmod -R 755 ${save_path} 39 | -------------------------------------------------------------------------------- /scripts/examples/create_noise_llama_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/ 6 | 7 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 8 | 9 | gpus=1 10 | cpus=16 11 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 12 | python -m smoe.entrypoint.examples.create_noise_llama_moe \ 13 | --tokenizer_path ${tokenizer_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/create_noise_llama_moe_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B/ 6 | 7 | model_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification 8 | 9 | gpus=1 10 | cpus=16 11 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 12 | python -m smoe.entrypoint.examples.create_noise_llama_moe_residual \ 13 | --tokenizer_path ${tokenizer_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/create_switch_llama_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 6 | 7 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 8 | 9 | gpus=1 10 | cpus=16 11 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 12 | python -m smoe.entrypoint.examples.create_switch_llama_moe \ 13 | --tokenizer_path ${tokenizer_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/load_llama_moe.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | base_model=llama_7B 6 | 7 | num_experts=16 # 8 16 8 | num_selects=4 # 2 4 9 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 10 | split_type=Random # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 11 | select_type="" # plain positive l2_norm 12 | proj_type=up_proj # gate_proj up_proj 13 | 14 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/${base_model}/ 15 | 16 | if [ "${select_type}" = "" ]; then 17 | 
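# No select_type given: the converted checkpoint directory carries no -<select_type> suffix.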
model_path=/mnt/petrelfs/share_data/quxiaoye/models/${model_type}/${split_type}/${base_model}-${num_experts}Select${num_selects}-${proj_type}/ 18 | else 19 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/${model_type}/${split_type}-${select_type}/${base_model}-${num_experts}Select${num_selects}-${proj_type}/ 20 | fi 21 | 22 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-Prune/Gradient-max-l1_norm-total-feature_grad/llama2_7B-0-0.20Percent-2201Neurons 23 | 24 | gpus=1 25 | cpus=8 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 27 | python -m smoe.entrypoint.examples.load_llama_moe \ 28 | --tokenizer_path ${tokenizer_path} \ 29 | --model_path ${model_path} \ 30 | --model_type ${model_type} 31 | -------------------------------------------------------------------------------- /scripts/examples/load_llama_moe_hf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/llama-moe-models/LLaMA-MoE-v1-3_0B-2_16 4 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama-moe-models/LLaMA-MoE-v1-3_0B-2_16 5 | 6 | model_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM 7 | 8 | gpus=1 9 | cpus=8 10 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name="☝☝☝" --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 11 | python -m smoe.entrypoint.examples.load_llama_moe_hf \ 12 | --tokenizer_path ${tokenizer_path} \ 13 | --model_path ${model_path} \ 14 | --model_type ${model_type} 15 | -------------------------------------------------------------------------------- /scripts/examples/load_llama_moe_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | base_model=llama_13B 6 | 7 | num_experts=15 # 13 14 15 8 | num_experts_residual=1 # 1 2 3 9 | num_selects=3 # 1 2 3 10 | expert_size=864 11 | # 540 1080 2160 4320 8640 12 | # 688 1376 2752 5504 11008 13 | # 864 1728 3456 6912 13824 14 | model_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification 15 | 16 | kernel=l1_norm 17 | criterion=max # min max 18 | accumulate_level=sample # sample total 19 | importance_type=feature_change # feature_grad feature_change 20 | 21 | tokenizer_path=/mnt/petrelfs/share_data/quxiaoye/models/${base_model}/ 22 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/${model_type}/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${base_model}-${num_experts}Select${num_selects}-${num_experts_residual}Residuals-${expert_size}Neurons-Share 23 | 24 | gpus=1 25 | cpus=8 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=test --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=spot \ 27 | python -m smoe.entrypoint.examples.load_llama_moe_residual \ 28 | --tokenizer_path ${tokenizer_path} \ 29 | --model_path ${model_path} \ 30 | --model_type ${model_type} 31 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B 
llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # open_llama_7b 6 | # ReluLLaMA-7B 7 | llama_size="ReluLLaMA-7B" 8 | 9 | num_experts=16 # 4 8 16 32 10 | num_selects=4 # 1 2 4 8 11 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 12 | proj_type=gate_proj # gate_proj up_proj 13 | select_type=positive # plain positive l1_norm l2_norm 14 | 15 | use_random_gate="False" # True False 16 | gate_type="mlp" # mlp linear 17 | use_softmax="False" 18 | multiply_gate_scores="False" 19 | 20 | score_scale_factor=1.0 # 1.0 2.0 4.0 8.0 16.0 21 | score_scale_factor_file_path="" 22 | #score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense 23 | 24 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 25 | 26 | data_path=/mnt/petrelfs/share_data/quxiaoye 27 | model_path=${data_path}/models/${llama_size} 28 | split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type} 29 | 30 | if [ ${use_random_gate} = "True" ]; then 31 | select_file_path="" 32 | save_path=${data_path}/models/${convert_type}/${split_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-Scale${score_scale_factor} 33 | else 34 | select_file_path="/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/Clustering-l2/ReluLLaMA-7B-16Expert-Select-MLP-positive-random" 35 | save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type}-HardBCE 36 | # select_file_path=${data_path}/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type} 37 | # save_path=${data_path}/models/${convert_type}/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 38 | fi 39 | 40 | gpus=0 41 | cpus=8 42 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 43 | python -m smoe.entrypoint.expert_construction.llama_convert \ 44 | --model_path ${model_path} \ 45 | --split_file_path ${split_file_path} \ 46 | --select_file_path "${select_file_path}" \ 47 | --save_path ${save_path} \ 48 | --template layers.{}.mlp.${proj_type}.weight \ 49 | --num_experts ${num_experts} \ 50 | --num_selects ${num_selects} \ 51 | --use_random_gate ${use_random_gate} \ 52 | --gate_type ${gate_type} \ 53 | --use_softmax ${use_softmax} \ 54 | --multiply_gate_scores ${multiply_gate_scores} \ 55 | --score_scale_factor ${score_scale_factor} \ 56 | --score_scale_factor_file_path "${score_scale_factor_file_path}" \ 57 | --convert_type ${convert_type} 58 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert_gradient.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | share_neurons=False # True False 8 | num_experts=8 # 2 4 8 16 32 9 | num_selects=2 # 1 2 4 10 | expert_size=1376 11 | # 540 1080 2160 4320 8640 12 | # 688 1376 2752 5504 11008 13 | # 864 1728 3456 6912 13824 14 | 15 | score_scale_factor=8.0 # 1.0 2.0 4.0 8.0 16.0 16 | score_scale_factor_file_path="" 17 | 
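# Optionally point this to a file of layer-wise score scale factors (apparently
# produced by scripts/analysis/get_layer_wise_score_scale_factor.sh; see the
# commented examples below) instead of the single score_scale_factor above.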
#score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_dense 18 | #score_scale_factor_file_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-layer-wise-scale-factors/llama_13B_moe_trained 19 | 20 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 21 | 22 | kernel=l1_norm 23 | criterion=max # min max 24 | accumulate_level=sample # sample total 25 | importance_type=feature_change # feature_grad feature_change 26 | proj_type=up_proj # gate_proj up_proj 27 | 28 | data_path=/mnt/petrelfs/share_data/quxiaoye 29 | model_path=${data_path}/models/${llama_size} 30 | split_file_path=${data_path}/moefication_results/split/${llama_size}-Split-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${num_experts}Experts-${expert_size}Neurons 31 | save_path=${data_path}/models/${convert_type}/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${num_experts}Select${num_selects}-${expert_size}Neurons 32 | 33 | gpus=0 34 | cpus=8 35 | if [ ${share_neurons} = "True" ]; then 36 | split_file_path=${split_file_path}-Share 37 | save_path=${save_path}-Share 38 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 39 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 40 | --model_path ${model_path} \ 41 | --split_file_path ${split_file_path} \ 42 | --select_file_path "" \ 43 | --save_path ${save_path} \ 44 | --template layers.{}.mlp.${proj_type}.weight \ 45 | --num_experts ${num_experts} \ 46 | --num_selects ${num_selects} \ 47 | --score_scale_factor ${score_scale_factor} \ 48 | --score_scale_factor_file_path "${score_scale_factor_file_path}" \ 49 | --convert_type ${convert_type} \ 50 | --use_random_gate True 51 | else 52 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 53 | python -m smoe.entrypoint.expert_construction.llama_convert \ 54 | --model_path ${model_path} \ 55 | --split_file_path ${split_file_path} \ 56 | --select_file_path "" \ 57 | --save_path ${save_path} \ 58 | --template layers.{}.mlp.${proj_type}.weight \ 59 | --num_experts ${num_experts} \ 60 | --num_selects ${num_selects} \ 61 | --score_scale_factor ${score_scale_factor} \ 62 | --score_scale_factor_file_path "${score_scale_factor_file_path}" \ 63 | --convert_type ${convert_type} \ 64 | --use_random_gate True 65 | fi 66 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert_gradient_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | share_neurons=True # True False 8 | num_experts=7 # 7 14 28 9 | num_experts_residual=1 # 1 2 3 4 10 | num_selects=1 # 1 2 3 4 11 | expert_size=1376 12 | # 540 1080 2160 4320 8640 13 | # 688 1376 2752 5504 11008 14 | # 864 1728 3456 6912 13824 15 | 16 | score_scale_factor_residual=1.0 # 4.0 8.0 12.0 16.0 17 | score_scale_factor=4.0 # 4.0 8.0 12.0 16.0 18 | 19 | convert_type=LlamaMoEResidualForCausalLM # LlamaMoEResidualModel LlamaMoEResidualForCausalLM LlamaMoEResidualForSequenceClassification 20 | 
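# These split settings must match the ones used for the gradient split, since
# split_file_path below is derived from them.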
21 | kernel=l1_norm 22 | criterion=max # min max 23 | accumulate_level=sample # sample total 24 | importance_type=feature_change # feature_grad feature_change 25 | proj_type=up_proj # gate_proj up_proj 26 | 27 | data_path=/mnt/petrelfs/share_data/quxiaoye 28 | model_path=${data_path}/models/${llama_size} 29 | split_file_path=${data_path}/moefication_results/split/${llama_size}-Split-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${num_experts}Experts-${num_experts_residual}Residuals-${expert_size}Neurons 30 | save_path=${data_path}/models/${convert_type}/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${num_experts}Select${num_selects}-${num_experts_residual}Residuals-${expert_size}Neurons 31 | if [ ${share_neurons} = "True" ]; then 32 | split_file_path=${split_file_path}-Share 33 | save_path=${save_path}-Share 34 | fi 35 | 36 | gpus=0 37 | cpus=8 38 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 39 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index_residual \ 40 | --model_path ${model_path} \ 41 | --split_file_path ${split_file_path} \ 42 | --select_file_path "" \ 43 | --save_path ${save_path} \ 44 | --template layers.{}.mlp.${proj_type}.weight \ 45 | --num_experts ${num_experts} \ 46 | --num_experts_residual ${num_experts_residual} \ 47 | --num_selects ${num_selects} \ 48 | --score_scale_factor ${score_scale_factor} \ 49 | --score_scale_factor_residual ${score_scale_factor_residual} \ 50 | --convert_type ${convert_type} \ 51 | --use_random_gate True 52 | -------------------------------------------------------------------------------- /scripts/expert_construction/convert/run_convert_mistral.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | model_path=/mnt/petrelfs/share_data/quxiaoye/models/Mistral-7B-v0.1 4 | moe_config_path=/mnt/petrelfs/share_data/quxiaoye/models/Mixtral-8x7B-v0.1 5 | split_file_path=/mnt/petrelfs/share_data/quxiaoye/moefication_results/split/Mistral-7B-v0.1-8Expert-Split-Random 6 | save_path=/mnt/petrelfs/share_data/quxiaoye/models/Mixtral-8x7B-v0.1-Random-8Select2 7 | 8 | template=layers.{}.mlp.up_proj.weight 9 | num_experts=8 10 | num_selects=2 11 | 12 | gpus=0 13 | cpus=8 14 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 15 | python -m smoe.entrypoint.expert_construction.mistral_convert \ 16 | --model_path ${model_path} \ 17 | --moe_config_path ${moe_config_path} \ 18 | --split_file_path ${split_file_path} \ 19 | --save_path ${save_path} \ 20 | --template ${template} \ 21 | --num_experts ${num_experts} \ 22 | --num_selects ${num_selects} 23 | 24 | chmod -R 755 ${save_path} >/dev/null 2>&1 25 | -------------------------------------------------------------------------------- /scripts/expert_construction/get_hidden_features/run_get_hidden_features.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # ReluLLaMA-7B 6 | llama_size="ReluLLaMA-7B" 7 | save_interval=1 8 | batch_size=4 9 | block_size=2048 10 | data_use_percent=0.002 11 | 12 | proj_type=gate_proj # gate_proj up_proj 13 | 14 | data_path=/mnt/petrelfs/share_data/quxiaoye 15 | 
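# Cluster-specific shared storage root; the model, training data, cache, and
# output feature paths below are all derived from it.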
model_path=${data_path}/models/${llama_size} 16 | train_data_path=${data_path}/data/moefication_LLAMA_data 17 | train_data_cache_path=${data_path}/data/moefication_LLAMA_data_cache 18 | save_path=${data_path}/moefication_results/features 19 | 20 | gpus=4 21 | cpus=$((gpus * 16)) 22 | quotatype=auto # auto spot reserved 23 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=get_features --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=${quotatype} \ 24 | torchrun --nproc_per_node=${gpus} -m smoe.entrypoint.expert_construction.llama_get_hidden_features \ 25 | --model_path ${model_path} \ 26 | --train_data_path ${train_data_path} \ 27 | --train_data_cache_path ${train_data_cache_path} \ 28 | --save_path ${save_path} \ 29 | --template layers.{}.mlp.${proj_type}.weight \ 30 | --data_use_percent ${data_use_percent} \ 31 | --save_interval ${save_interval} \ 32 | --batch_size ${batch_size} \ 33 | --block_size ${block_size} 34 | 35 | wait 36 | -------------------------------------------------------------------------------- /scripts/expert_construction/get_hidden_features/run_prepare_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | data_path=/mnt/petrelfs/share_data/quxiaoye 8 | model_path=${data_path}/models/${llama_size} 9 | train_data_path=${data_path}/data/moefication_LLAMA_data 10 | train_data_cache_path=${data_path}/data/moefication_LLAMA_data_cache 11 | 12 | gpus=0 13 | cpus=16 14 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=datasets --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 15 | python -m smoe.entrypoint.expert_construction.llama_prepare_datasets \ 16 | --model_path ${model_path} \ 17 | --train_data_path ${train_data_path} \ 18 | --train_data_cache_path ${train_data_cache_path} 19 | 20 | wait 21 | chmod -R 755 ${train_data_cache_path} >/dev/null 2>&1 22 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | retain_percent=0.20 # 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05 8 | use_grad_sum=True # True False 9 | 10 | if [ ${use_grad_sum} = "True" ]; then 11 | expert_index=All 12 | else 13 | expert_index=0 14 | fi 15 | 16 | criterion=max # min max 17 | kernel=l1_norm # plain l1_norm l2_norm 18 | accumulate_level=sample # sample total 19 | importance_type=feature_change # feature_grad feature_change 20 | proj_type=up_proj # gate_proj up_proj 21 | 22 | data_path=/mnt/petrelfs/share_data/quxiaoye 23 | model_path=${data_path}/models/${llama_size} 24 | grad_file_path=${data_path}/moefication_results/split/Gradients/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 25 | save_path=${data_path}/moefication_results/prune 26 | 27 | gpus=0 28 | cpus=8 29 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 30 | python -m smoe.entrypoint.expert_construction.llama_prune_gradient \ 31 | --model_path 
${model_path} \ 32 | --grad_file_path ${grad_file_path} \ 33 | --save_path ${save_path} \ 34 | --retain_percent ${retain_percent} \ 35 | --expert_index ${expert_index} \ 36 | --template layers.{}.mlp.${proj_type}.weight \ 37 | --kernel ${kernel} \ 38 | --accumulate_level ${accumulate_level} \ 39 | --importance_type ${importance_type} \ 40 | --criterion ${criterion} \ 41 | --use_grad_sum ${use_grad_sum} 42 | 43 | chmod -R 755 ${save_path} >/dev/null 2>&1 44 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | retain_percent=99 # 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05 9 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 10 | echo ${expert_size} 11 | 12 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 13 | use_grad_sum=True # True False 14 | 15 | if [ ${use_grad_sum} = "True" ]; then 16 | expert_index=All 17 | else 18 | expert_index=0 19 | fi 20 | 21 | criterion=min # min max 22 | kernel=l1_norm # plain l1_norm l2_norm 23 | accumulate_level=sample # sample total 24 | importance_type=feature_change # feature_grad feature_change 25 | proj_type=up_proj # gate_proj up_proj 26 | 27 | data_path=/mnt/petrelfs/share_data/quxiaoye 28 | model_path=${data_path}/models/${llama_size} 29 | split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 30 | save_path=${data_path}/models/${convert_type}-Prune/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 31 | 32 | gpus=0 33 | cpus=16 34 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 35 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 36 | --model_path ${model_path} \ 37 | --split_file_path ${split_file_path} \ 38 | --select_file_path "" \ 39 | --save_path ${save_path} \ 40 | --template layers.{}.mlp.${proj_type}.weight \ 41 | --num_experts 1 \ 42 | --num_selects 1 \ 43 | --convert_type ${convert_type} \ 44 | --use_random_gate True 45 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient_convert_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | 9 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 10 | use_grad_sum=True # True False 11 | 12 | if [ ${use_grad_sum} = "True" ]; then 13 | expert_index=All 14 | else 15 | expert_index=0 16 | fi 17 | 18 | criterion=max # min max 19 | kernel=l1_norm # plain l1_norm l2_norm 20 | accumulate_level=sample # sample total 21 | importance_type=feature_change # feature_grad feature_change 22 | proj_type=up_proj # gate_proj up_proj 23 | 24 | 
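# "one4all": the loop below converts a pruned checkpoint for every retain_percent value.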
data_path=/mnt/petrelfs/share_data/quxiaoye 25 | model_path=${data_path}/models/${llama_size} 26 | 27 | gpus=0 28 | cpus=16 29 | for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do 30 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 31 | echo ${expert_size} 32 | split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 33 | save_path=${data_path}/models/${convert_type}-Prune/Gradient-${criterion}-${kernel}-${accumulate_level}-${importance_type}/${llama_size}-${expert_index}-0.${retain_percent}Percent-${expert_size}Neurons 34 | 35 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 36 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 37 | --model_path ${model_path} \ 38 | --split_file_path ${split_file_path} \ 39 | --select_file_path "" \ 40 | --save_path ${save_path} \ 41 | --template layers.{}.mlp.${proj_type}.weight \ 42 | --num_experts 1 \ 43 | --num_selects 1 \ 44 | --convert_type ${convert_type} \ 45 | --use_random_gate True & 46 | sleep 1 47 | done 48 | 49 | wait 50 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_gradient_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | kernel=l1_norm 8 | proj_type=up_proj 9 | 10 | use_grad_sum=True # True False 11 | 12 | if [ ${use_grad_sum} = "True" ]; then 13 | expert_index=All 14 | else 15 | expert_index=0 16 | fi 17 | 18 | data_path=/mnt/petrelfs/share_data/quxiaoye 19 | model_path=${data_path}/models/${llama_size} 20 | save_path=${data_path}/moefication_results/prune 21 | 22 | gpus=0 23 | cpus=8 24 | for retain_percent in 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05; do 25 | for criterion in min max; do 26 | for accumulate_level in sample total; do 27 | for importance_type in feature_grad feature_change; do 28 | grad_file_path=${data_path}/moefication_results/split/Gradients/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 29 | 30 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 31 | python -m smoe.entrypoint.expert_construction.llama_prune_gradient \ 32 | --model_path ${model_path} \ 33 | --grad_file_path ${grad_file_path} \ 34 | --save_path ${save_path} \ 35 | --retain_percent ${retain_percent} \ 36 | --expert_index ${expert_index} \ 37 | --template layers.{}.mlp.${proj_type}.weight \ 38 | --kernel ${kernel} \ 39 | --accumulate_level ${accumulate_level} \ 40 | --importance_type ${importance_type} \ 41 | --criterion ${criterion} \ 42 | --use_grad_sum ${use_grad_sum} & 43 | sleep 1 44 | 45 | done 46 | done 47 | done 48 | done 49 | 50 | wait 51 | chmod -R 755 ${save_path} >/dev/null 2>&1 52 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B 
llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | retain_percent=0.05 # 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05 8 | proj_type=up_proj # gate_proj up_proj 9 | 10 | data_path=/mnt/petrelfs/share_data/quxiaoye 11 | model_path=${data_path}/models/${llama_size} 12 | save_path=${data_path}/moefication_results/prune 13 | 14 | gpus=0 15 | cpus=8 16 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 17 | python -m smoe.entrypoint.expert_construction.llama_prune_random \ 18 | --model_path ${model_path} \ 19 | --save_path ${save_path} \ 20 | --retain_percent ${retain_percent} \ 21 | --template layers.{}.mlp.${proj_type}.weight 22 | 23 | chmod -R 755 ${save_path} >/dev/null 2>&1 24 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | retain_percent=99 # 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05 9 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 10 | echo ${expert_size} 11 | 12 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 13 | proj_type=up_proj # gate_proj up_proj 14 | 15 | data_path=/mnt/petrelfs/share_data/quxiaoye 16 | model_path=${data_path}/models/${llama_size} 17 | split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Random/0.${retain_percent}Percent-${expert_size}Neurons 18 | save_path=${data_path}/models/${convert_type}-Prune/Random/${llama_size}-0.${retain_percent}Percent-${expert_size}Neurons 19 | 20 | gpus=0 21 | cpus=16 22 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 23 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 24 | --model_path ${model_path} \ 25 | --split_file_path ${split_file_path} \ 26 | --select_file_path "" \ 27 | --save_path ${save_path} \ 28 | --template layers.{}.mlp.${proj_type}.weight \ 29 | --num_experts 1 \ 30 | --num_selects 1 \ 31 | --convert_type ${convert_type} \ 32 | --use_random_gate True 33 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random_convert_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | intermediate_size=11008 8 | 9 | convert_type=LlamaMoEForCausalLM # LlamaMoEModel LlamaMoEForCausalLM LlamaMoEForSequenceClassification 10 | proj_type=up_proj # gate_proj up_proj 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | model_path=${data_path}/models/${llama_size} 14 | 15 | gpus=0 16 | cpus=16 17 | for retain_percent in 99 98 95 90 80 75 70 60 50 40 30 25 20 13 10 06 05; do 18 | expert_size=$((${retain_percent} * ${intermediate_size} / 100)) 19 | echo ${expert_size} 20 | 
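# For reference (values taken from this script's own settings): bash $(( )) uses integer arithmetic,
# so with intermediate_size=11008 and retain_percent=25 the computation above gives
# expert_size = 25 * 11008 / 100 = 2752, i.e. 2752 of the 11008 MLP neurons are kept per layer
# (hence directory suffixes like "0.25Percent-2752Neurons" below).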
split_file_path=${data_path}/moefication_results/prune/${llama_size}-Prune-Random/0.${retain_percent}Percent-${expert_size}Neurons 21 | save_path=${data_path}/models/${convert_type}-Prune/Random/${llama_size}-0.${retain_percent}Percent-${expert_size}Neurons 22 | 23 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=prune-convert --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 24 | python -m smoe.entrypoint.expert_construction.llama_convert_neuron_index \ 25 | --model_path ${model_path} \ 26 | --split_file_path ${split_file_path} \ 27 | --select_file_path "" \ 28 | --save_path ${save_path} \ 29 | --template layers.{}.mlp.${proj_type}.weight \ 30 | --num_experts 1 \ 31 | --num_selects 1 \ 32 | --convert_type ${convert_type} \ 33 | --use_random_gate True & 34 | sleep 1 35 | done 36 | 37 | wait 38 | -------------------------------------------------------------------------------- /scripts/expert_construction/prune/run_prune_random_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | proj_type=up_proj # gate_proj up_proj 8 | 9 | data_path=/mnt/petrelfs/share_data/quxiaoye 10 | model_path=${data_path}/models/${llama_size} 11 | save_path=${data_path}/moefication_results/prune 12 | 13 | gpus=0 14 | cpus=8 15 | for retain_percent in 0.99 0.98 0.95 0.90 0.80 0.75 0.70 0.60 0.50 0.40 0.30 0.25 0.20 0.13 0.10 0.06 0.05; do 16 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 17 | python -m smoe.entrypoint.expert_construction.llama_prune_random \ 18 | --model_path ${model_path} \ 19 | --save_path ${save_path} \ 20 | --retain_percent ${retain_percent} \ 21 | --template layers.{}.mlp.${proj_type}.weight & 22 | sleep 1 23 | done 24 | 25 | wait 26 | chmod -R 755 ${save_path} >/dev/null 2>&1 27 | -------------------------------------------------------------------------------- /scripts/expert_construction/select/run_select.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # ReluLLaMA-7B 6 | llama_size="ReluLLaMA-7B" 7 | 8 | num_experts=16 # 8 16 9 | num_selects=4 # 2 4 10 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 11 | select_type=positive # plain positive l1_norm l2_norm 12 | mlp_init_criterion=random # weight random 13 | proj_type=gate_proj # gate_proj up_proj 14 | 15 | use_balance="False" 16 | balance_loss_lambda=0.0 # 0.0001 17 | add_noise="False" 18 | use_softmax="False" 19 | 20 | data_use_percent=1.0 # 1.0 0.71 0.43 21 | train_percent=0.97 22 | batch_size=1024 23 | epochs=800 24 | lr=0.5 25 | 26 | data_path=/mnt/petrelfs/share_data/quxiaoye 27 | model_path=${data_path}/models/${llama_size} 28 | split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type} 29 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 30 | save_path=${data_path}/moefication_results/select/${split_type} 31 | 32 | 
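# Note: judging by the arguments passed below, llama_select_mlp trains a small per-layer gate
# (selector) MLP on the cached hidden features in ${hidden_features_path}, learning to pick
# ${num_selects} of the ${num_experts} experts defined by ${split_file_path}; the batch_size /
# epochs / lr values above are hyperparameters of this gate training only, not of full-model training.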
save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}-${mlp_init_criterion}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 33 | 34 | gpus=1 35 | cpus=16 36 | for specify_layer in "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31"; do # 并行启用任务 37 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=select --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 38 | python -m smoe.entrypoint.expert_construction.llama_select_mlp \ 39 | --model_path ${model_path} \ 40 | --split_file_path ${split_file_path} \ 41 | --hidden_features_path ${hidden_features_path} \ 42 | --save_path ${save_path} \ 43 | --save_visualization_path ${save_visualization_path} \ 44 | --specify_layer ${specify_layer} \ 45 | --template layers.{}.mlp.${proj_type}.weight \ 46 | --num_experts ${num_experts} \ 47 | --num_selects ${num_selects} \ 48 | --select_criterion ${select_type} \ 49 | --mlp_init_criterion ${mlp_init_criterion} \ 50 | --use_balance ${use_balance} \ 51 | --balance_loss_lambda ${balance_loss_lambda} \ 52 | --add_noise ${add_noise} \ 53 | --use_softmax ${use_softmax} \ 54 | --data_use_percent ${data_use_percent} \ 55 | --train_percent ${train_percent} \ 56 | --batch_size ${batch_size} \ 57 | --epochs ${epochs} \ 58 | --lr ${lr} & # 并行运行下一命令 59 | sleep 1 60 | done 61 | # "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31" 62 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 63 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 64 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 65 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 66 | wait 67 | -------------------------------------------------------------------------------- /scripts/expert_construction/select/run_select_multiprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | num_experts=16 # 8 16 8 | num_selects=4 # 2 4 9 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 10 | select_type=l2_norm # plain positive l1_norm l2_norm 11 | proj_type=gate_proj # gate_proj up_proj 12 | 13 | data_use_percent=1.0 14 | train_percent=0.95 15 | batch_size=1024 16 | epochs=200 17 | lr=0.01 18 | 19 | data_path=/mnt/petrelfs/share_data/quxiaoye 20 | model_path=${data_path}/models/${llama_size} 21 | split_file_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-${split_type} 22 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 23 | save_path=${data_path}/moefication_results/select/${split_type} 24 | 25 | save_visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 26 | 27 | gpus=4 28 | cpus=$((gpus * 16)) 29 | for specify_layer in "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31"; do # 
并行启用任务 30 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=select --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 31 | python -m smoe.entrypoint.expert_construction.llama_select_mlp_multiprocess \ 32 | --model_path ${model_path} \ 33 | --split_file_path ${split_file_path} \ 34 | --hidden_features_path ${hidden_features_path} \ 35 | --save_path ${save_path} \ 36 | --save_visualization_path ${save_visualization_path} \ 37 | --specify_layer ${specify_layer} \ 38 | --template layers.{}.mlp.${proj_type}.weight \ 39 | --num_experts ${num_experts} \ 40 | --num_selects ${num_selects} \ 41 | --select_criterion ${select_type} \ 42 | --num_threads ${gpus} \ 43 | --use_softmax \ 44 | --data_use_percent ${data_use_percent} \ 45 | --train_percent ${train_percent} \ 46 | --batch_size ${batch_size} \ 47 | --epochs ${epochs} \ 48 | --lr ${lr} & # 并行运行下一命令 49 | sleep 0.5 # 等待0.5s 50 | done 51 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 52 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 53 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 54 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 55 | 56 | wait 57 | chmod -R 755 ${save_path} >/dev/null 2>&1 58 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_clustering.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # ReluLLaMA-7B 6 | llama_size="ReluLLaMA-7B" 7 | 8 | num_experts=16 # 8 16 9 | metric=l2 # l2 cos 10 | proj_type=up_proj # gate_proj up_proj 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | model_path=${data_path}/models/${llama_size} 14 | save_path=${data_path}/moefication_results/split 15 | 16 | gpus=0 17 | cpus=16 18 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 19 | python -m smoe.entrypoint.expert_construction.llama_split_clustering \ 20 | --model_path ${model_path} \ 21 | --save_path ${save_path} \ 22 | --template layers.{}.mlp.${proj_type}.weight \ 23 | --num_experts ${num_experts} \ 24 | --metric ${metric} \ 25 | --cpu_threads ${cpus} 26 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_3B" 6 | 7 | share_neurons=True # True False 8 | expert_num=4 9 | 10 | #intermediate_size=8640 # 8640 11008 13824 11 | #scale_factor=4 12 | #expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${expert_num}) 13 | 14 | expert_size=8640 15 | # 540 1080 2160 4320 8640 16 | # 688 1376 2752 5504 11008 17 | # 864 1728 3456 6912 13824 18 | 19 | echo ${expert_num} ${expert_size} ${share_neurons} 20 | 21 | kernel=l1_norm 22 | accumulate_level=sample # sample total 23 | importance_type=feature_change # feature_grad feature_change 24 | criterion=max # min max 25 | proj_type=up_proj 26 | 27 | 
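# Note on the expert_size candidates listed above: 540/1080/2160/4320/8640, 688/1376/2752/5504/11008
# and 864/1728/3456/6912/13824 are 1/16, 1/8, 1/4, 1/2 and the full value of the three intermediate
# sizes (8640, 11008, 13824) named in the commented-out intermediate_size line, so expert_size
# directly sets how many MLP neurons each of the ${expert_num} experts receives.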
data_path=/mnt/petrelfs/share_data/quxiaoye 28 | model_path=${data_path}/models/${llama_size} 29 | score_file_path=${data_path}/moefication_results/split/Gradients${expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 30 | save_path=${data_path}/moefication_results/split 31 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${expert_num}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 32 | 33 | gpus=0 34 | cpus=8 35 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 36 | python -m smoe.entrypoint.expert_construction.llama_split_gradient \ 37 | --model_path ${model_path} \ 38 | --score_file_path ${score_file_path} \ 39 | --save_path ${save_path} \ 40 | --visualization_path ${visualization_path} \ 41 | --expert_num ${expert_num} \ 42 | --expert_size ${expert_size} \ 43 | --template layers.{}.mlp.${proj_type}.weight \ 44 | --kernel ${kernel} \ 45 | --accumulate_level ${accumulate_level} \ 46 | --importance_type ${importance_type} \ 47 | --criterion ${criterion} \ 48 | --share_neurons ${share_neurons} 49 | 50 | chmod -R 755 ${save_path} >/dev/null 2>&1 51 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | echo ${llama_size} 7 | 8 | intermediate_size=11008 # 8640 11008 13824 9 | expert_num_list=(8) 10 | expert_size_list=(1376 2752 5504 11008) 11 | # 540 1080 2160 4320 8640 12 | # 688 1376 2752 5504 11008 13 | # 864 1728 3456 6912 13824 14 | 15 | kernel=l1_norm 16 | accumulate_level=sample # sample total 17 | importance_type=feature_change # feature_grad feature_change 18 | criterion=max # min max 19 | proj_type=up_proj 20 | 21 | data_path=/mnt/petrelfs/share_data/quxiaoye 22 | model_path=${data_path}/models/${llama_size} 23 | save_path=${data_path}/moefication_results/split 24 | 25 | gpus=0 26 | cpus=8 27 | 28 | share_neurons=True 29 | for expert_num in "${expert_num_list[@]}"; do 30 | for expert_size in "${expert_size_list[@]}"; do 31 | echo ${expert_num} ${expert_size} ${share_neurons} 32 | score_file_path=${data_path}/moefication_results/split/Gradients${expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 33 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${expert_num}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 34 | 35 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 36 | python -m smoe.entrypoint.expert_construction.llama_split_gradient \ 37 | --model_path ${model_path} \ 38 | --score_file_path ${score_file_path} \ 39 | --save_path ${save_path} \ 40 | --visualization_path ${visualization_path} \ 41 | --expert_num ${expert_num} \ 42 | --expert_size ${expert_size} \ 43 | --template layers.{}.mlp.${proj_type}.weight \ 44 | --kernel ${kernel} \ 45 | --accumulate_level ${accumulate_level} \ 46 | --importance_type ${importance_type} \ 47 | 
--criterion ${criterion} \ 48 | --share_neurons ${share_neurons} & 49 | sleep 1 50 | done 51 | done 52 | 53 | scale_factor=1 54 | share_neurons=False 55 | for expert_num in "${expert_num_list[@]}"; do 56 | expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${expert_num}) 57 | echo ${expert_num} ${expert_size} ${share_neurons} 58 | 59 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 60 | python -m smoe.entrypoint.expert_construction.llama_split_gradient \ 61 | --model_path ${model_path} \ 62 | --score_file_path ${score_file_path} \ 63 | --save_path ${save_path} \ 64 | --expert_num ${expert_num} \ 65 | --expert_size ${expert_size} \ 66 | --template layers.{}.mlp.${proj_type}.weight \ 67 | --kernel ${kernel} \ 68 | --accumulate_level ${accumulate_level} \ 69 | --importance_type ${importance_type} \ 70 | --criterion ${criterion} \ 71 | --share_neurons ${share_neurons} 72 | done 73 | 74 | wait 75 | chmod -R 755 ${save_path} >/dev/null 2>&1 76 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient_residual.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | share_neurons=False # True False 8 | expert_num_moe=7 9 | expert_num_residual=1 10 | total_expert_num=$((${expert_num_moe} + ${expert_num_residual})) 11 | 12 | #intermediate_size=8640 # 8640 11008 13824 13 | #scale_factor=1 14 | #expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${total_expert_num}) 15 | 16 | expert_size=1376 17 | # 540 1080 2160 4320 8640 18 | # 688 1376 2752 5504 11008 19 | # 864 1728 3456 6912 13824 20 | 21 | echo ${total_expert_num}\(${expert_num_moe}+${expert_num_residual}\) ${expert_size} ${share_neurons} 22 | 23 | kernel=l1_norm 24 | accumulate_level=sample # sample total 25 | importance_type=feature_change # feature_grad feature_change 26 | criterion=max # min max 27 | proj_type=up_proj 28 | 29 | data_path=/mnt/petrelfs/share_data/quxiaoye 30 | model_path=${data_path}/models/${llama_size} 31 | score_file_path=${data_path}/moefication_results/split/Gradients${total_expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 32 | save_path=${data_path}/moefication_results/split 33 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_expert_num}-${expert_num_residual}residual-${expert_num_moe}moe/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 34 | 35 | gpus=0 36 | cpus=8 37 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 38 | python -m smoe.entrypoint.expert_construction.llama_split_gradient_residual \ 39 | --model_path ${model_path} \ 40 | --score_file_path ${score_file_path} \ 41 | --save_path ${save_path} \ 42 | --visualization_path ${visualization_path} \ 43 | --expert_num_moe ${expert_num_moe} \ 44 | --expert_num_residual ${expert_num_residual} \ 45 | --expert_size ${expert_size} \ 46 | --template layers.{}.mlp.${proj_type}.weight \ 47 | --kernel ${kernel} \ 48 | --accumulate_level ${accumulate_level} \ 49 | --importance_type ${importance_type} \ 50 | 
--criterion ${criterion} \ 51 | --share_neurons ${share_neurons} 52 | 53 | chmod -R 755 ${save_path} >/dev/null 2>&1 54 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_gradient_residual_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | echo ${llama_size} 7 | 8 | expert_num_moe_list=(13 14 15) 9 | expert_num_residual_list=(3 2 1) 10 | 11 | #intermediate_size=8640 # 8640 11008 13824 12 | #scale_factor=1 13 | #expert_size=$(expr ${scale_factor} \* ${intermediate_size} / ${total_expert_num}) 14 | 15 | expert_size=864 16 | # 540 1080 2160 4320 8640 17 | # 688 1376 2752 5504 11008 18 | # 864 1728 3456 6912 13824 19 | 20 | kernel=l1_norm 21 | accumulate_level=sample # sample total 22 | importance_type=feature_change # feature_grad feature_change 23 | criterion=max # min max 24 | proj_type=up_proj 25 | 26 | data_path=/mnt/petrelfs/share_data/quxiaoye 27 | model_path=${data_path}/models/${llama_size} 28 | save_path=${data_path}/moefication_results/split 29 | 30 | gpus=0 31 | cpus=8 32 | for idx in "${!expert_num_moe_list[@]}"; do 33 | expert_num_moe=${expert_num_moe_list[${idx}]} 34 | expert_num_residual=${expert_num_residual_list[${idx}]} 35 | total_expert_num=$((${expert_num_moe} + ${expert_num_residual})) 36 | score_file_path=${data_path}/moefication_results/split/Gradients${total_expert_num}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 37 | visualization_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_expert_num}-${expert_num_residual}residual-${expert_num_moe}moe/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 38 | 39 | for share_neurons in "True" "False"; do 40 | echo ${total_expert_num}\(${expert_num_moe}+${expert_num_residual}\) ${expert_size} ${share_neurons} 41 | 42 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 43 | python -m smoe.entrypoint.expert_construction.llama_split_gradient_residual \ 44 | --model_path ${model_path} \ 45 | --score_file_path ${score_file_path} \ 46 | --save_path ${save_path} \ 47 | --visualization_path ${visualization_path} \ 48 | --expert_num_moe ${expert_num_moe} \ 49 | --expert_num_residual ${expert_num_residual} \ 50 | --expert_size ${expert_size} \ 51 | --template layers.{}.mlp.${proj_type}.weight \ 52 | --kernel ${kernel} \ 53 | --accumulate_level ${accumulate_level} \ 54 | --importance_type ${importance_type} \ 55 | --criterion ${criterion} \ 56 | --share_neurons ${share_neurons} & 57 | sleep 1.0 58 | done 59 | done 60 | 61 | chmod -R 755 ${save_path} >/dev/null 2>&1 62 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_graph.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size=llama_13B 6 | 7 | num_experts=16 # 8 16 8 | metric=l1_norm # l1_norm l2_norm plain 9 | proj_type=up_proj # gate_proj up_proj 10 | threshold=1 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | 
model_path=${data_path}/models/${llama_size} 14 | save_path=${data_path}/moefication_results/split/${llama_size}-${num_experts}Expert-Split-Graph-${metric}/ 15 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 16 | 17 | gpus=0 18 | cpus=16 19 | 20 | # STEP1 21 | 22 | for specify_layer in {0..39}; do 23 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 24 | python -m smoe.entrypoint.expert_construction.llama_split_graph \ 25 | --model_path ${model_path} \ 26 | --save_path ${save_path} \ 27 | --specify_layer ${specify_layer} \ 28 | --template layers.{}.mlp.${proj_type}.weight \ 29 | --num_experts ${num_experts} \ 30 | --threshold ${threshold} \ 31 | --metric ${metric} \ 32 | --hidden_features_path ${hidden_features_path} & 33 | sleep 0.7 34 | done 35 | wait 36 | 37 | # STEP2 38 | 39 | gpmetis_run=/mnt/petrelfs/share_data/quxiaoye/metis_for_graph_split/bin/gpmetis 40 | template1=layers. 41 | template2=.mlp.${proj_type}.weight 42 | 43 | for layer in {0..39}; do 44 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 45 | ${gpmetis_run} ${save_path}/${template1}${layer}${template2} ${num_experts} & 46 | sleep 0.7 47 | done 48 | wait 49 | 50 | # STEP3 51 | 52 | template3=.part.${num_experts} 53 | 54 | for layer in {0..39}; do 55 | OMP_NUM_THREADS=8 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 56 | python -m smoe.entrypoint.expert_construction.llama_split_graph_trans_gp \ 57 | --gpmetised_file_path ${save_path}/${template1}${layer}${template2}${template3} & 58 | sleep 0.7 59 | done 60 | wait 61 | 62 | chmod -R 755 ${save_path} >/dev/null 2>&1 63 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_random.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # open_llama_7b 6 | # Mistral-7B-v0.1 7 | # ReluLLaMA-7B 8 | llama_size="ReluLLaMA-7B" 9 | 10 | num_experts=16 # 8 16 11 | 12 | data_path=/mnt/petrelfs/share_data/quxiaoye 13 | model_path=${data_path}/models/${llama_size} 14 | save_path=${data_path}/moefication_results/split 15 | 16 | gpus=0 17 | cpus=8 18 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 19 | python -m smoe.entrypoint.expert_construction.llama_split_random \ 20 | --model_path ${model_path} \ 21 | --save_path ${save_path} \ 22 | --template layers.{}.mlp.gate_proj.weight \ 23 | --num_experts ${num_experts} 24 | -------------------------------------------------------------------------------- /scripts/expert_construction/split/run_split_random_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | # open_llama_7b 6 | llama_size="open_llama_7b" 7 | 8 | data_path=/mnt/petrelfs/share_data/quxiaoye 9 | model_path=${data_path}/models/${llama_size} 10 | save_path=${data_path}/moefication_results/split 11 | 12 | gpus=0 13 | cpus=8 
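# The nested loops below launch one background srun job per (num_experts, proj_type) combination,
# i.e. random splits into 4/8/16/32 experts for both the gate_proj and up_proj templates; the
# trailing `wait` keeps the script alive until every job finishes before permissions on save_path
# are relaxed.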
14 | for num_experts in 4 8 16 32; do 15 | for proj_type in "gate_proj" "up_proj"; do 16 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 17 | python -m smoe.entrypoint.expert_construction.llama_split_random \ 18 | --model_path ${model_path} \ 19 | --save_path ${save_path} \ 20 | --template layers.{}.mlp.${proj_type}.weight \ 21 | --num_experts ${num_experts} & 22 | sleep 0.7 23 | done 24 | done 25 | 26 | wait 27 | chmod -R 755 ${save_path} >/dev/null 2>&1 28 | -------------------------------------------------------------------------------- /scripts/sft/2_16.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=llama_moe_2_16_deita 4 | #SBATCH --output=logs/%x-%j.log 5 | #SBATCH --error=logs/%x-%j.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=16 10 | #SBATCH --mem=64G 11 | 12 | #SBATCH --nodes=1 13 | #SBATCH --gres=gpu:4 14 | #SBATCH --quotatype=auto 15 | 16 | export WANDB_PROJECT="llama_moe_sft" 17 | num_gpus=4 18 | 19 | { 20 | task_name="llama_moe_2_16_deita" 21 | model_type="auto" 22 | model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_0B-2_16" 23 | dataset_dir_or_path="data/deita/deita_6k.jsonl" 24 | 25 | comment="llama-moe 2/16, deita, w/ balance loss, w/ freeze gate, w/ gate noise" 26 | base_dir="outputs/llama_moe_sft" 27 | output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" 28 | mkdir -p $output_dir 29 | scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh 30 | git diff > $output_dir/diff.patch 31 | env > $output_dir/env 32 | echo -e "Job ID: ${SLURM_JOB_ID}\n\nLog: logs/llama_moe_2_16_deita-$SLURM_JOB_ID.log\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt 33 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $output_dir/log.log 34 | echo "$SLURM_JOB_ID" > $base_dir/latest.jobid 35 | ln -snf $output_dir $base_dir/latest.dir 36 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log 37 | 38 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 39 | nodes_array=($nodes) 40 | head_node=${nodes_array[0]} 41 | echo "Node: $head_node" 42 | 43 | torchrun \ 44 | --nnodes 1 \ 45 | --nproc_per_node $num_gpus \ 46 | --node_rank $SLURM_NODEID \ 47 | --rdzv_id $RANDOM \ 48 | --rdzv_backend c10d \ 49 | --rdzv_endpoint $head_node:29522 \ 50 | -m smoe.entrypoint.sft.train_sft \ 51 | --do_train \ 52 | --freeze_gate True \ 53 | --evaluation_strategy no \ 54 | --run_name $task_name \ 55 | --model_type $model_type \ 56 | --model_name_or_path $model_name_or_path \ 57 | --dataset_dir_or_path $dataset_dir_or_path \ 58 | --output_dir $output_dir \ 59 | --deepspeed conf/ds_bf16_zero1.json \ 60 | --seed 12306 \ 61 | --bf16 True \ 62 | --tf32 True \ 63 | --torch_dtype bfloat16 \ 64 | --per_device_train_batch_size 4 \ 65 | --per_device_eval_batch_size 4 \ 66 | --gradient_accumulation_steps 8 \ 67 | --num_train_epochs 2 \ 68 | --save_strategy steps \ 69 | --save_steps 9999999999999 \ 70 | --save_total_limit 1 \ 71 | --learning_rate 2e-5 \ 72 | --weight_decay 0. 
\ 73 | --warmup_ratio 0.03 \ 74 | --lr_scheduler_type cosine \ 75 | --logging_steps 1 \ 76 | --model_max_length 2048 \ 77 | --gradient_checkpointing True \ 78 | --report_to wandb 79 | 80 | } 81 | -------------------------------------------------------------------------------- /scripts/sft/2_8.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=llama_moe_2_8_deita 4 | #SBATCH --output=logs/%x-%j.log 5 | #SBATCH --error=logs/%x-%j.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=16 10 | #SBATCH --mem=64G 11 | 12 | #SBATCH --nodes=1 13 | #SBATCH --gres=gpu:4 14 | #SBATCH --quotatype=auto 15 | 16 | export WANDB_PROJECT="llama_moe_sft" 17 | num_gpus=4 18 | 19 | { 20 | task_name="llama_moe_2_8_deita" 21 | model_type="auto" 22 | model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-2_8-new" 23 | dataset_dir_or_path="data/deita/deita_6k.jsonl" 24 | 25 | comment="llama-moe 2/8, deita, w/ balance loss, w/ freeze gate, w/ gate noise" 26 | base_dir="outputs/llama_moe_sft" 27 | output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" 28 | mkdir -p $output_dir 29 | scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh 30 | git diff > $output_dir/diff.patch 31 | env > $output_dir/env 32 | echo -e "Job ID: ${SLURM_JOB_ID}\n\nLog: logs/llama_moe_2_8_deita-$SLURM_JOB_ID.log\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt 33 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $output_dir/log.log 34 | echo "$SLURM_JOB_ID" > $base_dir/latest.jobid 35 | ln -snf $output_dir $base_dir/latest.dir 36 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log 37 | 38 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 39 | nodes_array=($nodes) 40 | head_node=${nodes_array[0]} 41 | echo "Node: $head_node" 42 | 43 | torchrun \ 44 | --nnodes 1 \ 45 | --nproc_per_node $num_gpus \ 46 | --node_rank $SLURM_NODEID \ 47 | --rdzv_id $RANDOM \ 48 | --rdzv_backend c10d \ 49 | --rdzv_endpoint $head_node:29522 \ 50 | -m smoe.entrypoint.sft.train_sft \ 51 | --do_train \ 52 | --freeze_gate True \ 53 | --evaluation_strategy no \ 54 | --run_name $task_name \ 55 | --model_type $model_type \ 56 | --model_name_or_path $model_name_or_path \ 57 | --dataset_dir_or_path $dataset_dir_or_path \ 58 | --output_dir $output_dir \ 59 | --deepspeed conf/deepspeed/bf16_zero1.json \ 60 | --seed 12306 \ 61 | --bf16 True \ 62 | --tf32 True \ 63 | --torch_dtype bfloat16 \ 64 | --per_device_train_batch_size 4 \ 65 | --per_device_eval_batch_size 4 \ 66 | --gradient_accumulation_steps 8 \ 67 | --num_train_epochs 2 \ 68 | --save_strategy steps \ 69 | --save_steps 9999999999999 \ 70 | --save_total_limit 1 \ 71 | --learning_rate 2e-5 \ 72 | --weight_decay 0. 
\ 73 | --warmup_ratio 0.03 \ 74 | --lr_scheduler_type cosine \ 75 | --logging_steps 1 \ 76 | --model_max_length 2048 \ 77 | --gradient_checkpointing True \ 78 | --report_to wandb 79 | 80 | } 81 | -------------------------------------------------------------------------------- /scripts/sft/4_16.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=llama_moe_4_16_deita 4 | #SBATCH --output=logs/%x-%j.log 5 | #SBATCH --error=logs/%x-%j.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=16 10 | #SBATCH --mem=64G 11 | 12 | #SBATCH --nodes=1 13 | #SBATCH --gres=gpu:4 14 | #SBATCH --quotatype=auto 15 | 16 | export WANDB_PROJECT="llama_moe_sft" 17 | num_gpus=4 18 | 19 | { 20 | task_name="llama_moe_4_16_deita" 21 | model_type="auto" 22 | model_name_or_path="/mnt/petrelfs/zhutong/llama-moe-models/LLaMA-MoE-v1-3_5B-4_16-new" 23 | dataset_dir_or_path="data/deita/deita_6k.jsonl" 24 | 25 | comment="llama-moe 4/16, deita, w/ balance loss, w/ freeze gate, w/ gate noise" 26 | base_dir="outputs/llama_moe_sft" 27 | output_dir="${base_dir}/${task_name}/$SLURM_JOB_NAME-$SLURM_JOB_ID" 28 | mkdir -p $output_dir 29 | scontrol write batch_script $SLURM_JOBID $output_dir/sbatch.sh 30 | git diff > $output_dir/diff.patch 31 | env > $output_dir/env 32 | echo -e "Job ID: ${SLURM_JOB_ID}\n\nLog: logs/llama_moe_4_16_deita-$SLURM_JOB_ID.log\n\nGit commit: $(git log -1 --oneline)\n\nGit branch: $(git branch | grep "*")\n\nComment: ${comment}" > $output_dir/comment.txt 33 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $output_dir/log.log 34 | echo "$SLURM_JOB_ID" > $base_dir/latest.jobid 35 | ln -snf $output_dir $base_dir/latest.dir 36 | ln -snf $(scontrol show job $SLURM_JOB_ID | grep "StdOut=" | cut -d '=' -f 2) $base_dir/latest.log 37 | 38 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 39 | nodes_array=($nodes) 40 | head_node=${nodes_array[0]} 41 | echo "Node: $head_node" 42 | 43 | torchrun \ 44 | --nnodes 1 \ 45 | --nproc_per_node $num_gpus \ 46 | --node_rank $SLURM_NODEID \ 47 | --rdzv_id $RANDOM \ 48 | --rdzv_backend c10d \ 49 | --rdzv_endpoint $head_node:29522 \ 50 | -m smoe.entrypoint.sft.train_sft \ 51 | --do_train \ 52 | --freeze_gate True \ 53 | --evaluation_strategy no \ 54 | --run_name $task_name \ 55 | --model_type $model_type \ 56 | --model_name_or_path $model_name_or_path \ 57 | --dataset_dir_or_path $dataset_dir_or_path \ 58 | --output_dir $output_dir \ 59 | --deepspeed conf/ds_bf16_zero1.json \ 60 | --seed 12306 \ 61 | --bf16 True \ 62 | --tf32 True \ 63 | --torch_dtype bfloat16 \ 64 | --per_device_train_batch_size 4 \ 65 | --per_device_eval_batch_size 4 \ 66 | --gradient_accumulation_steps 8 \ 67 | --num_train_epochs 2 \ 68 | --save_strategy steps \ 69 | --save_steps 9999999999999 \ 70 | --save_total_limit 1 \ 71 | --learning_rate 2e-5 \ 72 | --weight_decay 0. 
\ 73 | --warmup_ratio 0.03 \ 74 | --lr_scheduler_type cosine \ 75 | --logging_steps 1 \ 76 | --model_max_length 2048 \ 77 | --gradient_checkpointing True \ 78 | --report_to wandb 79 | 80 | } 81 | -------------------------------------------------------------------------------- /scripts/test/test_args.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | templates="1234567" 4 | 5 | gpus=0 6 | cpus=1 7 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=split --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 8 | python -m tests.utils.test_args \ 9 | --t ${templates} 10 | -------------------------------------------------------------------------------- /scripts/test/test_conn.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | #SBATCH --job-name=test_conn 4 | #SBATCH --output=logs/%x.log 5 | #SBATCH --error=logs/%x.log 6 | 7 | #SBATCH --partition=MoE 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=64 10 | #SBATCH --mem=0 11 | 12 | #SBATCH --nodes=3 13 | #SBATCH --gres=gpu:8 14 | #SBATCH --quotatype=reserved 15 | 16 | export OMP_NUM_THREADS=4 17 | 18 | nodes=($(scontrol show hostnames $SLURM_JOB_NODELIS)) 19 | nodes_array=($nodes) 20 | head_node=${nodes_array[0]} 21 | 22 | srun torchrun \ 23 | --nnodes 3 \ 24 | --nproc_per_node 8 \ 25 | --node_rank $SLURM_NODEID \ 26 | --rdzv_id $RANDOM \ 27 | --rdzv_backend c10d \ 28 | --rdzv_endpoint $head_node:29520 \ 29 | tests/entrypoint/test_conn.py 30 | -------------------------------------------------------------------------------- /scripts/tokenize/clustering.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -vx 4 | 5 | tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 6 | data_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32 7 | out_dir=/mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32_tokenized 8 | logs_dir=logs 9 | 10 | mkdir -p $out_dir 11 | mkdir -p $logs_dir 12 | 13 | # for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github 14 | for data_type in $(ls $data_dir) 15 | do 16 | log_path=logs/tokenize_${data_type}_32clusters.log 17 | nohup srun -p MoE -N1 -n1 --cpus-per-task=32 \ 18 | python -m smoe.utils.tokenize \ 19 | -f jsonl \ 20 | -t $tokenizer_dir \ 21 | -i $data_dir/$data_type \ 22 | -o $out_dir/$data_type \ 23 | 1>${log_path} 2>&1 & 24 | echo "$data_type > $log_path" 25 | done 26 | -------------------------------------------------------------------------------- /scripts/tokenize/lines.sh: -------------------------------------------------------------------------------- 1 | # srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_8/3.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/8clusters/3.jsonl 2 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/5.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/5.jsonl 1>logs/tokenize_32_5.log 2>&1 & 3 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/7.jsonl -o 
/mnt/petrelfs/share_data/quxiaoye/data/32clusters/7.jsonl 1>logs/tokenize_32_7.log 2>&1 & 4 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/8.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/8.jsonl 1>logs/tokenize_32_8.log 2>&1 & 5 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/12.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/12.jsonl 1>logs/tokenize_32_12.log 2>&1 & 6 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/26.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/26.jsonl 1>logs/tokenize_32_26.log 2>&1 & 7 | nohup srun -p MoE -N1 -n1 --cpus-per-task=8 python -m smoe.utils.tokenize -f jsonl -t /mnt/petrelfs/share_data/quxiaoye/models/llama_7B -i /mnt/petrelfs/zhutong/smoe/resources/clustering_samples_32/31.jsonl -o /mnt/petrelfs/share_data/quxiaoye/data/32clusters/31.jsonl 1>logs/tokenize_32_31.log 2>&1 & 8 | -------------------------------------------------------------------------------- /scripts/tokenize/redpajama.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | set -vx 4 | 5 | tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 6 | data_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data 7 | out_dir=/mnt/petrelfs/share_data/quxiaoye/pretrain_LLAMA_all_data_processed 8 | 9 | # tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_3B 10 | # data_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples 11 | # out_dir=/mnt/petrelfs/zhutong/smoe/resources/slimpajama_samples_openllama3B_tokenized 12 | 13 | # tokenizer_dir=/mnt/petrelfs/share_data/quxiaoye/models/llama_7B 14 | # data_dir=/mnt/petrelfs/zhutong/lm-evaluation-harness-b281b0921b636bc36ad05c0b0b0763bd6dd43463/val_set/final 15 | # out_dir=/mnt/petrelfs/share_data/quxiaoye/data/llama1_7B_val_set_tokenized 16 | 17 | logs_dir=logs 18 | 19 | mkdir -p $logs_dir 20 | 21 | # for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github 22 | for data_type in $(ls $data_dir) 23 | do 24 | log_path=logs/tokenize_$data_type.log 25 | nohup srun -p MoE -N1 -n1 --cpus-per-task=32 \ 26 | python -m smoe.utils.tokenize \ 27 | -f jsonl \ 28 | -t $tokenizer_dir \ 29 | -i $data_dir/$data_type \ 30 | -o $out_dir/$data_type \ 31 | 1>$logs_dir/tokenize_$data_type.log 2>&1 & 32 | echo "$data_type > $logs_dir/tokenize_$data_type.log" 33 | done 34 | -------------------------------------------------------------------------------- /scripts/tokenize/slimpajama_convert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # set -vx 4 | 5 | content_column=input_ids 6 | src_tokenizer_dir=/mnt/petrelfs/share_data/zhutong/models/llama2_7B 7 | tokenizer_dir=/mnt/petrelfs/share_data/zhutong/models/Mistral-7B-v0.1 8 | 9 | data_dir=/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_llama_middle_parts 10 | out_dir=/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_mistral_middle_parts 11 | # data_dir=/mnt/petrelfs/share_data/zhutong/data/llama1_7B_val_set_tokenized 12 | # 
out_dir=/mnt/petrelfs/share_data/zhutong/data/mixtral_val_set_tokenized 13 | 14 | 15 | logs_dir=logs 16 | 17 | mkdir -p $logs_dir 18 | 19 | # for loop in: en_arxiv, en_book, en_c4, en_cc, en_stack, en_wikipedia, github 20 | # for data_type in $(ls $data_dir) 21 | for data_type in "en_arxiv" "en_book" "en_c4" "en_stack" "en_wikipedia" "github" 22 | do 23 | # get all parts from source data dir 24 | for part in $(ls $data_dir/$data_type) 25 | do 26 | echo "tokenizing $data_dir/$data_type/$part - $(ls $data_dir/$data_type/$part | wc -l)" 27 | log_path=logs/tokenize-$data_type-$part.log 28 | nohup srun -p MoE_T -N1 -n1 --cpus-per-task=32 \ 29 | python -m smoe.utils.tokenize \ 30 | -f jsonl \ 31 | -c $content_column \ 32 | -s $src_tokenizer_dir \ 33 | -t $tokenizer_dir \ 34 | -i $data_dir/$data_type/$part \ 35 | -o $out_dir/$data_type/$part \ 36 | 1>$log_path 2>&1 & 37 | # echo "$data_type/$part > $log_path" 38 | sleep 3 39 | done 40 | 41 | # log_path=logs/tokenize_$data_type.log 42 | # nohup srun -p MoE_T -N1 -n1 --cpus-per-task=32 \ 43 | # python -m smoe.utils.tokenize \ 44 | # -f jsonl \ 45 | # -s $src_tokenizer_dir \ 46 | # -c $content_column \ 47 | # -t $tokenizer_dir \ 48 | # -i $data_dir/$data_type \ 49 | # -o $out_dir/$data_type \ 50 | # 1>$logs_dir/tokenize_$data_type.log 2>&1 & 51 | # echo "$data_type > $logs_dir/tokenize_$data_type.log" 52 | done 53 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_neuron_overlap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_13B" 6 | total_clusters=16 7 | expert_size=864 8 | # 540 1080 2160 4320 8640 9 | # 688 1376 2752 5504 11008 10 | # 864 1728 3456 6912 13824 11 | 12 | criterion=max # min max 13 | kernel=l1_norm # plain l1_norm l2_norm 14 | accumulate_level=sample # sample total 15 | importance_type=feature_change # feature_grad feature_change 16 | proj_type=up_proj # gate_proj up_proj 17 | 18 | if [ ${importance_type} = "feature_grad" ]; then 19 | template_postfix=grad 20 | else 21 | template_postfix=change 22 | fi 23 | 24 | data_path=/mnt/petrelfs/share_data/quxiaoye 25 | model_path=${data_path}/models/${llama_size} 26 | score_file_path=${data_path}/moefication_results/split/Gradients${total_clusters}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 27 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_clusters}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 28 | 29 | gpus=0 30 | cpus=4 31 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 32 | python -m smoe.entrypoint.visualization.visualize_expert_neuron_overlap \ 33 | --model_path ${model_path} \ 34 | --score_file_path ${score_file_path} \ 35 | --save_path ${save_path} \ 36 | --expert_size ${expert_size} \ 37 | --score_file_template layers.{}.mlp.${proj_type}.weight.${template_postfix} \ 38 | --criterion ${criterion} 39 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_neuron_overlap_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # 
llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_3B" 6 | total_clusters=16 7 | 8 | criterion=max # min max 9 | kernel=l1_norm # plain l1_norm l2_norm 10 | accumulate_level=sample # sample total 11 | importance_type=feature_change # feature_grad feature_change 12 | proj_type=up_proj # gate_proj up_proj 13 | 14 | if [ ${importance_type} = "feature_grad" ]; then 15 | template_postfix=grad 16 | else 17 | template_postfix=change 18 | fi 19 | 20 | data_path=/mnt/petrelfs/share_data/quxiaoye 21 | model_path=${data_path}/models/${llama_size} 22 | score_file_path=${data_path}/moefication_results/split/Gradients${total_clusters}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 23 | 24 | gpus=0 25 | cpus=4 26 | for expert_size in 540 1080 2160 4320; do 27 | # 540 1080 2160 4320 8640 28 | # 688 1376 2752 5504 11008 29 | # 864 1728 3456 6912 13824 30 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap/cluster${total_clusters}/${llama_size}-${expert_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 31 | 32 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 33 | python -m smoe.entrypoint.visualization.visualize_expert_neuron_overlap \ 34 | --model_path ${model_path} \ 35 | --score_file_path ${score_file_path} \ 36 | --save_path ${save_path} \ 37 | --expert_size ${expert_size} \ 38 | --score_file_template layers.{}.mlp.${proj_type}.weight.${template_postfix} \ 39 | --criterion ${criterion} & 40 | sleep 0.7 41 | done 42 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_neuron_overlap_overview.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base llama_3B 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_3B" 6 | total_clusters=16 7 | 8 | criterion=max # min max 9 | kernel=l1_norm # plain l1_norm l2_norm 10 | accumulate_level=sample # sample total 11 | importance_type=feature_change # feature_grad feature_change 12 | proj_type=up_proj # gate_proj up_proj 13 | 14 | if [ ${importance_type} = "feature_grad" ]; then 15 | template_postfix=grad 16 | else 17 | template_postfix=change 18 | fi 19 | 20 | data_path=/mnt/petrelfs/share_data/quxiaoye 21 | model_path=${data_path}/models/${llama_size} 22 | score_file_path=${data_path}/moefication_results/split/Gradients${total_clusters}/${llama_size}-Gradients-${kernel}-${accumulate_level}-${importance_type} 23 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-neuron-overlap-overview/cluster${total_clusters}/${llama_size}-${accumulate_level}-${importance_type}-${kernel}-${criterion}-${proj_type} 24 | 25 | gpus=0 26 | cpus=4 27 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 28 | python -m smoe.entrypoint.visualization.visualize_expert_neuron_overlap_overview \ 29 | --model_path ${model_path} \ 30 | --score_file_path ${score_file_path} \ 31 | --save_path ${save_path} \ 32 | --score_file_template layers.{}.mlp.${proj_type}.weight.${template_postfix} \ 33 | --criterion ${criterion} 34 | 
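# Relationship between the three overlap-visualization scripts, as far as can be read from their
# arguments: run_visualize_expert_neuron_overlap.sh plots one fixed expert_size, the *_one4all.sh
# variant sweeps several expert_size values as parallel background jobs, and this overview script
# passes no --expert_size at all and calls the *_overview entrypoint for a model-level summary.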
-------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_select_mlp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama2_7B" 6 | 7 | num_experts=8 # 8 16 8 | num_selects=2 # 2 4 9 | split_type=Clustering-l2 # Graph-l1_norm Graph-l2_norm Clustering-l2 Clustering-cos Random 10 | select_type=l2_norm # plain positive l1_norm l2_norm 11 | proj_type=gate_proj # up_proj gate_proj 12 | 13 | result_path=/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type} 14 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 15 | 16 | gpus=0 17 | cpus=4 18 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 19 | python -m smoe.entrypoint.visualization.visualize_expert_select_mlp \ 20 | --result_path ${result_path} \ 21 | --save_path ${save_path} \ 22 | --proj_type ${proj_type} 23 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_expert_select_mlp_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | # 定义 num_selects 数组,与 num_experts 一一对应 8 | declare -a num_experts_array=(8 16) 9 | declare -a num_selects_array=(2 4) 10 | 11 | # 可视化所有可能的结果组合,无效进程会自动报错退出 12 | gpus=0 13 | cpus=4 14 | for idx in "${!num_selects_array[@]}"; do 15 | num_experts="${num_experts_array[$idx]}" 16 | num_selects="${num_selects_array[$idx]}" 17 | for split_type in "Graph-l1_norm" "Graph-l2_norm" "Clustering-l2" "Clustering-cos" "Random"; do 18 | for select_type in "plain" "positive" "l1_norm" "l2_norm"; do 19 | for proj_type in "gate_proj" "up_proj"; do 20 | 21 | result_path=/mnt/petrelfs/share_data/quxiaoye/moefication_results/select/${split_type}/${llama_size}-${num_experts}Expert-Select-MLP-${select_type} 22 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/expert-select/${split_type}-${select_type}/${llama_size}-${num_experts}Select${num_selects}-${proj_type} 23 | 24 | # 若result_path存在,则执行可视化 25 | if [ -d "$result_path" ]; then 26 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 27 | python -m smoe.entrypoint.visualization.visualize_expert_select_mlp \ 28 | --result_path ${result_path} \ 29 | --save_path ${save_path} \ 30 | --proj_type ${proj_type} & # 并行运行下一命令 31 | sleep 0.5 # 等待0.5s 32 | else 33 | echo "Directory does not exist: $result_path" 34 | fi 35 | 36 | done 37 | done 38 | done 39 | done 40 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_mlp_output_scale.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_13B" 6 | 7 | data_begin_index=0 8 
| data_end_index=500 9 | batch_size=8 10 | block_size=2048 11 | moe_score_scale_factor=1 12 | 13 | #save_folder=${llama_size}_dense 14 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/llama_13B 15 | #is_moe="False" 16 | 17 | #save_folder=${llama_size}_moe 18 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons 19 | #is_moe="True" 20 | 21 | #moe_score_scale_factor=5 22 | #save_folder=${llama_size}_moe_scale${moe_score_scale_factor} 23 | #model_path=/mnt/petrelfs/share_data/quxiaoye/models/LlamaMoEForCausalLM-copy/Gradient-max-l1_norm-sample-feature_change/llama_13B-16Select4-864Neurons 24 | #is_moe="True" 25 | 26 | save_folder=${llama_size}_moe_trained 27 | model_path=/mnt/petrelfs/share_data/quxiaoye/checkpoint-18000 28 | is_moe="True" 29 | 30 | share_path=/mnt/petrelfs/share_data/quxiaoye 31 | tokenizer_path=${share_path}/models/${llama_size} 32 | data_path=${share_path}/data/vis_data/head30_shuffled_output/shuffled_20.txt 33 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/mlp-outputs-scale/${save_folder} 34 | 35 | gpus=1 36 | cpus=16 37 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 38 | python -m smoe.entrypoint.visualization.visualize_mlp_output_scale \ 39 | --tokenizer_path ${tokenizer_path} \ 40 | --model_path ${model_path} \ 41 | --data_path ${data_path} \ 42 | --save_path ${save_path} \ 43 | --data_begin_index ${data_begin_index} \ 44 | --data_end_index ${data_end_index} \ 45 | --batch_size ${batch_size} \ 46 | --block_size ${block_size} \ 47 | --is_moe ${is_moe} \ 48 | --moe_score_scale_factor ${moe_score_scale_factor} 49 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_swiglu_output.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | proj_type=up_proj # gate_proj up_proj 8 | visualize_criterion=l2_norm # plain l1_norm l2_norm 9 | 10 | data_path=/mnt/petrelfs/share_data/quxiaoye 11 | model_path=${data_path}/models/${llama_size} 12 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 13 | 14 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/swiglu-output/${llama_size}/${proj_type}-${visualize_criterion} 15 | 16 | gpus=1 17 | cpus=16 18 | for specify_layer in "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31"; do # 并行启用任务 19 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 20 | python -m smoe.entrypoint.visualization.visualize_swiglu_output \ 21 | --model_path ${model_path} \ 22 | --hidden_features_path ${hidden_features_path} \ 23 | --save_path ${save_path} \ 24 | --template layers.{}.mlp.${proj_type}.weight \ 25 | --specify_layer ${specify_layer} \ 26 | --visualize_criterion ${visualize_criterion} & # 并行运行下一命令 27 | sleep 0.5 # 等待0.5s 28 | done 29 | # "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31" 30 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 
15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 31 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 32 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 33 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 34 | -------------------------------------------------------------------------------- /scripts/visualization/run_visualize_swiglu_output_one4all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | # llama_7B llama_13B llama_30B llama_base 4 | # llama2_7B llama2_13B llama2_30B llama2_base 5 | llama_size="llama_7B" 6 | 7 | data_path=/mnt/petrelfs/share_data/quxiaoye 8 | model_path=${data_path}/models/${llama_size} 9 | hidden_features_path=${data_path}/moefication_results/features/${llama_size}-Hidden-Features 10 | 11 | # visualize all possible result combinations; invalid runs will error out and exit on their own 12 | gpus=1 13 | cpus=16 14 | for visualize_criterion in "plain" "l1_norm" "l2_norm"; do 15 | for proj_type in "gate_proj" "up_proj"; do 16 | 17 | save_path=/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization/swiglu-output/${llama_size}/${proj_type}-${visualize_criterion} 18 | 19 | for specify_layer in "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31"; do # launch jobs in parallel 20 | OMP_NUM_THREADS=2 srun --partition=MoE --job-name=visualize --mpi=pmi2 --gres=gpu:${gpus} -n1 --ntasks-per-node=1 -c ${cpus} --kill-on-bad-exit=1 --quotatype=auto \ 21 | python -m smoe.entrypoint.visualization.visualize_swiglu_output \ 22 | --model_path ${model_path} \ 23 | --hidden_features_path ${hidden_features_path} \ 24 | --save_path ${save_path} \ 25 | --template layers.{}.mlp.${proj_type}.weight \ 26 | --specify_layer ${specify_layer} \ 27 | --visualize_criterion ${visualize_criterion} & # run in the background so the next job starts in parallel 28 | sleep 0.5 # wait 0.5s 29 | 30 | done 31 | done 32 | done 33 | # "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" "25" "26" "27" "28" "29" "30" "31" 34 | # "0 1" "2 3" "4 5" "6 7" "8 9" "10 11" "12 13" "14 15" "16 17" "18 19" "20 21" "22 23" "24 25" "26 27" "28 29" "30 31" 35 | # "0 1 2 3" "4 5 6 7" "8 9 10 11" "12 13 14 15" "16 17 18 19" "20 21 22 23" "24 25 26 27" "28 29 30 31" 36 | # "0 1 2 3 4 5 6 7" "8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23" "24 25 26 27 28 29 30 31" 37 | # "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15" "16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31" 38 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import setuptools 4 | 5 | readme_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md") 6 | with open(readme_filepath, "r", encoding="utf8") as fh: 7 | long_description = fh.read() 8 | 9 | version_filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "VERSION") 10 | with open(version_filepath, "r", encoding="utf8") as fh: 11 | version = fh.read().strip() 12 | 13 | setuptools.setup( 14 | name="smoe", 15 | version=version, 16 | author="MoE Group", 17 | author_email="tzhu1997@outlook.com", 18 | description="A toolkit for LLM MoE and continual pretraining.", 19 | long_description_content_type="text/markdown", 20 | long_description=long_description, 21 | url="https://github.com/Spico197/smoe", 22 | 
packages=setuptools.find_packages(exclude=["tests", "tests.*", "docs", "docs.*"]), 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: Apache Software License", 26 | "Operating System :: OS Independent", 27 | ], 28 | python_requires=">=3.10", 29 | install_requires=[ 30 | "scikit-learn==1.3.0", 31 | "omegaconf==2.0.6", 32 | "tqdm==4.65.0", 33 | "datasets==2.14.1", 34 | "transformers==4.31.0", 35 | "peft==0.4.0", 36 | "tensorboard==2.13.0", 37 | ], 38 | extras_require={ 39 | "dev": [ 40 | "pytest==7.4.0", 41 | "coverage==7.2.7", 42 | "black==23.7.0", 43 | "isort==5.12.0", 44 | "flake8==6.0.0", 45 | "pre-commit==3.3.3", 46 | ] 47 | }, 48 | include_package_data=True, 49 | entry_points={}, 50 | ) 51 | -------------------------------------------------------------------------------- /smoe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/__init__.py -------------------------------------------------------------------------------- /smoe/callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/callbacks/__init__.py -------------------------------------------------------------------------------- /smoe/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/data/__init__.py -------------------------------------------------------------------------------- /smoe/data/aggregation.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | 3 | 4 | def group_texts(examples: dict, block_size: int = 1024): 5 | # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. 6 | # Concatenate all texts. 7 | concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} 8 | total_length = len(concatenated_examples[list(examples.keys())[0]]) 9 | # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can 10 | # customize this part to your needs. 11 | if total_length >= block_size: 12 | total_length = (total_length // block_size) * block_size 13 | # Split by chunks of max_len. 14 | result = { 15 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 16 | for k, t in concatenated_examples.items() 17 | } 18 | result["labels"] = result["input_ids"].copy() 19 | return result 20 | 21 | 22 | def group_instances(examples: list[dict], block_size: int = 2048) -> list[dict]: 23 | """ 24 | Concate examples to a length of block size. 25 | 26 | Args: 27 | examples: a list of dict instances that have multiple keys 28 | block_size: the length of the concatenated examples 29 | """ 30 | 31 | def _concat(examples: list[dict]) -> dict: 32 | """ 33 | Concatenate the values of each key in the examples. 
34 | 35 | Args: 36 | examples: a list of dict instances that have multiple keys 37 | """ 38 | concatenated_examples = {} 39 | keys = examples[0].keys() 40 | for k in keys: 41 | concatenated_examples[k] = list(chain(*[e[k] for e in examples])) 42 | if "labels" not in keys and "input_ids" in keys: 43 | concatenated_examples["labels"] = concatenated_examples["input_ids"] 44 | return concatenated_examples 45 | 46 | def _chunk(examples: dict, block_size: int) -> list[dict]: 47 | """ 48 | Split the concatenated examples into chunks of block_size. 49 | 50 | Args: 51 | examples: a dict instance that has multiple keys 52 | block_size: the length of the concatenated examples 53 | """ 54 | total_length = len(examples[list(examples.keys())[0]]) 55 | if total_length >= block_size: 56 | total_length = (total_length // block_size) * block_size 57 | result = { 58 | k: [t[i : i + block_size] for i in range(0, total_length, block_size)] 59 | for k, t in examples.items() 60 | } 61 | return result 62 | 63 | def _decompose(example: dict) -> list[dict]: 64 | """ 65 | Decompose the example into a list of dict instances. 66 | 67 | Args: 68 | example: a dict instance that has multiple keys 69 | """ 70 | num_chunks = len(example[list(example.keys())[0]]) 71 | return [{k: example[k][i] for k in example.keys()} for i in range(num_chunks)] 72 | 73 | concatenated_examples = _concat(examples) 74 | chunk = _chunk(concatenated_examples, block_size) 75 | return _decompose(chunk) 76 | -------------------------------------------------------------------------------- /smoe/data/redpajama.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | from functools import partial 4 | from pathlib import Path 5 | 6 | from datasets import IterableDataset, load_dataset 7 | from datasets.combine import interleave_datasets 8 | from tqdm import tqdm 9 | 10 | from smoe.data.aggregation import group_texts 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def load_streaming_datasets( 16 | data_dir: str, 17 | prob_map: dict[str, float] = None, 18 | num_proc: int = None, 19 | debug_mode: bool = False, 20 | block_size: int = 1024, 21 | split: str = "train", 22 | verbose: bool = True, 23 | ) -> IterableDataset: 24 | dataset_dir = Path(data_dir) 25 | files = list(dataset_dir.glob("**/*.jsonl")) 26 | if debug_mode is True: 27 | files = [files[0]] 28 | 29 | fbar = files 30 | if verbose: 31 | fbar = tqdm(files, desc="Loading files") 32 | 33 | data_type_to_filepaths = defaultdict(list) 34 | for filepath in fbar: 35 | data_type = filepath.parent.stem 36 | assert ( 37 | data_type in prob_map if prob_map else True 38 | ), f"{data_type} not in {prob_map.keys()}" 39 | data_type_to_filepaths[data_type].append(str(filepath)) 40 | 41 | data_type_to_dataset_list = {} 42 | grouping_func = partial(group_texts, block_size=block_size) 43 | 44 | fbar = None 45 | if verbose: 46 | fbar = tqdm(total=len(data_type_to_filepaths), desc="Indexing files") 47 | for data_type, filepaths in data_type_to_filepaths.items(): 48 | ds = load_dataset( 49 | "json", 50 | data_files=filepaths, 51 | num_proc=num_proc, 52 | streaming=True, 53 | split=split, 54 | ) 55 | grouped_datasets = ds.map( 56 | grouping_func, 57 | batched=True, 58 | ) 59 | data_type_to_dataset_list[data_type] = grouped_datasets 60 | 61 | datasets_in_diff_types = [] 62 | probs = [] 63 | dbar = None 64 | if verbose: 65 | dbar = tqdm( 66 | total=len(data_type_to_dataset_list), desc="Mapping datasets with probs" 67 | 
) 68 | for data_type, dataset in data_type_to_dataset_list.items(): 69 | prob = None 70 | if prob_map: 71 | prob = prob_map[data_type] 72 | probs.append(prob) 73 | datasets_in_diff_types.append(dataset) 74 | if dbar: 75 | dbar.update(1) 76 | dbar.set_postfix({data_type: f"{prob:.3%}%"}) 77 | 78 | if len(probs) == 0: 79 | probs = None 80 | else: 81 | sum_probs = sum(probs) 82 | if sum_probs != 1.0: 83 | logger.warn(f"Summation of prob_map is {sum_probs}, scaling to 1.0") 84 | probs = [p / sum_probs for p in probs] 85 | 86 | if verbose: 87 | logger.info("Grouping datasets") 88 | lm_datasets = interleave_datasets(datasets_in_diff_types, probs) 89 | 90 | return lm_datasets 91 | -------------------------------------------------------------------------------- /smoe/data/single_file.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from datasets import IterableDataset, load_dataset 4 | 5 | from smoe.data.aggregation import group_texts 6 | 7 | 8 | def load_cached_dataset( 9 | filepath: str, 10 | num_proc: int = None, 11 | block_size: int = 2048, 12 | split: str = "train", 13 | ) -> IterableDataset: 14 | grouping_func = partial(group_texts, block_size=block_size) 15 | ds = load_dataset( 16 | "json", 17 | data_files=filepath, 18 | num_proc=num_proc, 19 | split=split, 20 | ) 21 | grouped_datasets = ds.map( 22 | grouping_func, 23 | batched=True, 24 | ) 25 | return grouped_datasets 26 | -------------------------------------------------------------------------------- /smoe/entrypoint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/analysis/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/analysis/clustering_distribution.py: -------------------------------------------------------------------------------- 1 | """ 2 | python -m smoe.entrypoint.analysis.clustering_distribution -d resources/clustering_7_samples -o results/analysis_clustering7 3 | """ 4 | 5 | import argparse 6 | from pathlib import Path 7 | 8 | from smoe.utils.io import load_jsonlines 9 | from smoe.utils.visualization.bar import barh 10 | 11 | 12 | def main(args): 13 | data_dir = Path(args.data_dir) 14 | 15 | for file in data_dir.glob("*.jsonl"): 16 | cluster_idx = file.stem 17 | source_to_num = { 18 | "arxiv": 0, 19 | "books": 0, 20 | "c4": 0, 21 | "commoncrawl": 0, 22 | "github": 0, 23 | "stackexchange": 0, 24 | "wikipedia": 0, 25 | } 26 | data = load_jsonlines(file) 27 | for ins in data: 28 | source = ins["file"].split("-")[0] 29 | source_to_num[source] += 1 30 | barh( 31 | source_to_num, 32 | title=f"Cluster {cluster_idx}", 33 | save_filepath=f"{args.out_dir}/cluster_{cluster_idx}.png", 34 | ) 35 | print(f"Done: {file}") 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("-d", "--data_dir", required=True) 41 | parser.add_argument("-o", "--out_dir", required=True) 42 | args = parser.parse_args() 43 | main(args) 44 | 
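# Illustrative note (an assumption for clarity, not taken from the repository): judging from
# `ins["file"].split("-")[0]` above, each record in the per-cluster *.jsonl files is expected
# to carry a "file" field whose prefix names the RedPajama source, e.g.
#
#     ins = {"file": "github-0003.jsonl", "content": "..."}   # hypothetical record
#     ins["file"].split("-")[0]  # -> "github", counted into source_to_num["github"]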
-------------------------------------------------------------------------------- /smoe/entrypoint/compress_png_images.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from smoe.utils.io import compress_png_image 5 | 6 | 7 | def main(args): 8 | for dir_path, dir_names, file_names in os.walk(args.root_path): 9 | for name in file_names: 10 | if name.endswith(".png"): 11 | compress_png_image(os.path.join(dir_path, name), print_info=True) 12 | print("All done.") 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--root_path", type=str) 18 | args = parser.parse_args() 19 | 20 | args.root_path = "/mnt/petrelfs/dongdaize.d/workspace/train-moe/visualization" 21 | 22 | main(args) 23 | -------------------------------------------------------------------------------- /smoe/entrypoint/cpt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/cpt/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/download_llama.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from huggingface_hub import snapshot_download 3 | from transformers.models.llama.modeling_llama import LlamaForCausalLM 4 | from transformers.models.llama.tokenization_llama_fast import LlamaTokenizer 5 | 6 | repo_to_download = "openlm-research/open_llama_3b" 7 | target_dir = "/mnt/petrelfs/share_data/quxiaoye/models/llama_3B" 8 | 9 | snapshot_download( 10 | repo_id=repo_to_download, local_dir=target_dir, local_dir_use_symlinks=False 11 | ) 12 | 13 | tokenizer = LlamaTokenizer.from_pretrained(target_dir) 14 | model = LlamaForCausalLM.from_pretrained( 15 | target_dir, 16 | torch_dtype=torch.float16, 17 | device_map="cpu", 18 | ) 19 | 20 | prompt = "Q: What is the largest animal?\nA:" 21 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids 22 | 23 | generation_output = model.generate(input_ids=input_ids, max_new_tokens=32) 24 | print(tokenizer.decode(generation_output[0])) 25 | -------------------------------------------------------------------------------- /smoe/entrypoint/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/eval/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/examples/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/examples/load_llama_moe_hf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load LLaMA MoE model from file. 
3 | """ 4 | 5 | import argparse 6 | 7 | import torch.cuda 8 | from transformers import LlamaTokenizer 9 | 10 | from smoe.models.llama_moe.modeling_llama_moe_hf import LlamaMoEModel, LlamaMoEForCausalLM 11 | 12 | 13 | def main(args): 14 | device = "cuda" if torch.cuda.is_available() else "cpu" 15 | print("Loading model...") 16 | 17 | if args.model_type == "LlamaMoEModel": 18 | model = LlamaMoEModel.from_pretrained(args.model_path) 19 | elif args.model_type == "LlamaMoEForCausalLM": 20 | model = LlamaMoEForCausalLM.from_pretrained(args.model_path) 21 | else: 22 | raise ValueError 23 | 24 | model.config.use_cache = False 25 | 26 | # set moe configs 27 | model.set_moe_num_selects(1) # change the number of selected experts 28 | 29 | # set gate configs 30 | model.set_moe_gate_use_softmax(True) # whether to apply softmax to the gate outputs 31 | model.set_moe_gate_use_balance(True) # whether to use a balance loss during training to even out the number of samples routed to each expert 32 | model.set_moe_gate_balance_loss_weight(0.02) # weight of the balance loss 33 | model.set_moe_gate_add_noise(True) # whether to add random noise to the gate outputs during training 34 | model.set_moe_gate_noise_epsilon(0.02) # magnitude of the noise 35 | 36 | # set calculator configs 37 | model.set_moe_calculator_multiply_gate_scores(True) # whether to weight the expert outputs by the gate scores 38 | model.set_moe_calculator_score_scale_factor(16.0) # scale factor applied to the expert outputs 39 | 40 | # reset 41 | model.reset_gate_network() # randomly re-initialize the gate network 42 | model.reset_experts() # re-initialize the expert parameters 43 | 44 | """prepare data""" 45 | sentence_list = [ 46 | "hi hi hi hi hi, hi hi hi hi hi, hi hi hi hi hi", 47 | "How are you? I'm fine, and you?", 48 | " ", 49 | "I am stupid. Are you sure?", 50 | "The past is never dead. It is not even past.", 51 | ] 52 | 53 | tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) 54 | tokenizer.pad_token = tokenizer.eos_token 55 | tokens = tokenizer(sentence_list, padding=True, return_tensors="pt") 56 | print(tokens) 57 | 58 | """forward test""" 59 | print("Forwarding inputs...") 60 | model.half() 61 | model.to(device) 62 | tokens.to(device) 63 | result = model.generate(**tokens, repetition_penalty=1.3, max_length=256) 64 | print(result) 65 | 66 | for i in range(result.shape[0]): 67 | print(result[i]) 68 | decoded_text = tokenizer.decode(result[i], skip_special_tokens=True) 69 | print(decoded_text) 70 | 71 | print("Done!") 72 | 73 | 74 | if __name__ == "__main__": 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("--tokenizer_path", type=str) 77 | parser.add_argument("--model_path", type=str) 78 | parser.add_argument( 79 | "--model_type", 80 | type=str, 81 | choices=( 82 | "LlamaMoEModel", 83 | "LlamaMoEForCausalLM", 84 | "LlamaMoEForSequenceClassification", 85 | ), 86 | ) 87 | args = parser.parse_args() 88 | main(args) 89 | -------------------------------------------------------------------------------- /smoe/entrypoint/examples/load_relu_llama.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load ReLU LLaMA model from file. 3 | """ 4 | 5 | import argparse 6 | 7 | import torch.cuda 8 | from transformers import LlamaForCausalLM, LlamaTokenizer 9 | 10 | from smoe.utils.model_operation.modify_llama_model import llama_with_relu_activation 11 | 12 | 13 | def main(args): 14 | device = "cuda" if torch.cuda.is_available() else "cpu" 15 | print("Loading model...") 16 | 17 | model = LlamaForCausalLM.from_pretrained(args.model_path) 18 | model.model = llama_with_relu_activation(model.model) 19 | model.config.use_cache = True 20 | 21 | """prepare data""" 22 | sentence_list = [ 23 | "hi hi hi hi hi, hi hi hi hi hi, hi hi hi hi hi", 24 | "How are you? 
I'm fine, and you?", 25 | " ", 26 | "I am stupid. Are you sure?", 27 | "The past is never dead. It is not even past.", 28 | ] 29 | 30 | tokenizer = LlamaTokenizer.from_pretrained(args.tokenizer_path) 31 | tokenizer.pad_token = tokenizer.eos_token 32 | tokens = tokenizer(sentence_list, padding=True, return_tensors="pt") 33 | print(tokens) 34 | 35 | """forward test""" 36 | print("Forwarding inputs...") 37 | model.half() 38 | model.to(device) 39 | tokens.to(device) 40 | result = model.generate(**tokens, repetition_penalty=2.0, max_length=256) 41 | print(result) 42 | 43 | for i in range(result.shape[0]): 44 | print(result[i]) 45 | decoded_text = tokenizer.decode(result[i], skip_special_tokens=True) 46 | print(decoded_text) 47 | 48 | print("Done!") 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument( 54 | "--tokenizer_path", 55 | type=str, 56 | default="/mnt/petrelfs/share_data/quxiaoye/models/ReluLLaMA-7B", 57 | ) 58 | parser.add_argument( 59 | "--model_path", 60 | type=str, 61 | default="/mnt/petrelfs/share_data/quxiaoye/models/ReluLLaMA-7B", 62 | ) 63 | args = parser.parse_args() 64 | main(args) 65 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/expert_construction/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_prepare_datasets.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import multiprocessing 3 | import os 4 | import pickle 5 | 6 | import torch 7 | from transformers import LlamaTokenizer 8 | 9 | from smoe.data.datasets_moe import LineByLineJsonlTextDataset 10 | 11 | 12 | # fmt: off 13 | def process_dataset(args, tokenizer, key, file_name): 14 | raw_file_path = os.path.join(args.train_data_path, file_name) 15 | print("\nReading dataset \"" + key + "\" from raw file \"" + raw_file_path + "\"...") 16 | 17 | datasets = LineByLineJsonlTextDataset(tokenizer, file_path=raw_file_path, block_size=2048) 18 | 19 | if not os.path.exists(args.train_data_cache_path): 20 | os.makedirs(args.train_data_cache_path) 21 | 22 | cached_file_path = os.path.join(args.train_data_cache_path, key + "_cached.pth") 23 | torch.save(datasets.examples, cached_file_path, pickle_protocol=pickle.HIGHEST_PROTOCOL) 24 | print(f"Dataset {key}: {sum([torch.sum(datasets[i]['attention_mask']).item() for i in range(len(datasets))])} total tokens.") # count non-special tokens 25 | 26 | 27 | if __name__ == "__main__": 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B") 30 | parser.add_argument('--train_data_path', type=str, default="/home/dongdz/workspace/moefication/llama_data/") 31 | parser.add_argument('--train_data_cache_path', type=str, default="/home/dongdz/workspace/moefication/llama_data_cache/") 32 | 33 | args = parser.parse_args() 34 | print(args, "\n") 35 | 36 | """load tokenizer""" 37 | tokenizer = LlamaTokenizer.from_pretrained(args.model_path) 38 | tokenizer.pad_token = tokenizer.eos_token 39 | 40 | """prepare datasets""" 41 | dataset_names = [ 42 | "commoncrawl", 43 | "c4", 44 | "github", 45 | "wikipedia", 46 | "books", 47 | "arxiv", 48 | "stackexchange" 
49 | ] 50 | 51 | # read datasets 52 | pool = multiprocessing.Pool(processes=len(dataset_names)) 53 | for key in dataset_names: 54 | for file_name in os.listdir(args.train_data_path): 55 | if key in file_name and file_name.endswith(".jsonl"): 56 | pool.apply_async(process_dataset, args=(args, tokenizer, key, file_name)) 57 | pool.close() 58 | pool.join() 59 | 60 | print("Done.") 61 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_prune_gradient.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from tqdm import tqdm 6 | from transformers import LlamaConfig 7 | 8 | from smoe.utils.expert_construction.prune_llama import GradientPrune 9 | from smoe.utils.io import torch_load_template_score_file 10 | from smoe.utils.operations.operation_string import str2bool 11 | 12 | if __name__ == "__main__": 13 | # fmt: off 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--model_path', type=str) 16 | parser.add_argument('--grad_file_path', type=str) 17 | parser.add_argument('--save_path', type=str) 18 | parser.add_argument('--expert_index', type=str) 19 | parser.add_argument('--retain_percent', type=float) 20 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 21 | 22 | parser.add_argument('--kernel', type=str, default="plain", choices=("plain", "l1_norm", "l2_norm")) 23 | parser.add_argument('--accumulate_level', type=str, default="sample", choices=("sample", "total")) 24 | parser.add_argument('--importance_type', type=str, default="feature_grad", choices=("feature_grad", "feature_change")) 25 | parser.add_argument('--criterion', type=str, default="min", choices=("min", "max")) 26 | 27 | parser.add_argument('--use_grad_sum', type=str, default="False") 28 | 29 | args = parser.parse_args() 30 | args.use_grad_sum = str2bool(args.use_grad_sum) 31 | if args.expert_index != "All": 32 | args.expert_index = int(args.expert_index) 33 | print(args, "\n") 34 | 35 | print("Loading llama config...") 36 | config = LlamaConfig.from_pretrained(args.model_path) 37 | expert_size = int(config.intermediate_size * args.retain_percent) 38 | 39 | print("Processing layers...") 40 | save_root_path = args.save_path 41 | 42 | if args.importance_type == "feature_grad": 43 | file_postfix = ".grad" 44 | elif args.importance_type == "feature_change": 45 | file_postfix = ".change" 46 | else: 47 | raise NotImplementedError 48 | 49 | for i in tqdm(range(config.num_hidden_layers)): 50 | grad_list = torch_load_template_score_file(args.grad_file_path, args.template + file_postfix, i) 51 | 52 | if args.use_grad_sum: 53 | grad_list = torch.stack(grad_list, dim=0).sum(0) 54 | else: 55 | grad_list = grad_list[args.expert_index] 56 | 57 | args.save_path = os.path.join( 58 | save_root_path, 59 | f"{os.path.split(args.model_path)[1]}-Prune-Gradient-{args.criterion}-{args.kernel}-{args.accumulate_level}-{args.importance_type}", 60 | f"{args.expert_index}-{format(args.retain_percent, '.2f')}Percent-{expert_size}Neurons" 61 | ) 62 | 63 | split = GradientPrune(args, args.template, i, grad_list) 64 | split.prune(expert_size, criterion=args.criterion) 65 | split.save() 66 | print("Done.") 67 | # fmt: on 68 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_prune_random.py: -------------------------------------------------------------------------------- 1 | 
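# Random pruning: for every layer, keep expert_size = int(intermediate_size * retain_percent)
# randomly chosen intermediate neurons (RandomPrune, fixed seed) and save the resulting
# selection under save_path -- the random counterpart of the gradient-based pruning script above.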
import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.expert_construction.prune_llama import RandomPrune 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str) 13 | parser.add_argument('--save_path', type=str) 14 | parser.add_argument('--retain_percent', type=float) 15 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 16 | 17 | args = parser.parse_args() 18 | print(args, "\n") 19 | 20 | print("Loading llama config...") 21 | config = LlamaConfig.from_pretrained(args.model_path) 22 | expert_size = int(config.intermediate_size * args.retain_percent) 23 | 24 | args.save_path = os.path.join( 25 | args.save_path, 26 | f"{os.path.split(args.model_path)[1]}-Prune-Random", 27 | f"{format(args.retain_percent, '.2f')}Percent-{expert_size}Neurons" 28 | ) 29 | 30 | for i in tqdm(range(config.num_hidden_layers)): 31 | split = RandomPrune(args, args.template, i, config.intermediate_size) 32 | split.prune(expert_size, seed=0) 33 | split.save() 34 | print("Done.") 35 | # fmt: on 36 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_clustering.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaForCausalLM 6 | 7 | from smoe.utils.expert_construction.expert_split import ClusteringSplit 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B") 13 | parser.add_argument('--save_path', type=str, default="/home/dongdz/workspace/moefication/llama_moe_temp_files/") 14 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 15 | parser.add_argument('--num_experts', type=int, default=8, help='number of experts') 16 | parser.add_argument('--metric', type=str, default="l2", choices=("l2", "cos")) 17 | parser.add_argument('--cpu_threads', type=int, default=-1) 18 | 19 | args = parser.parse_args() 20 | args.save_path = os.path.join(args.save_path, os.path.split(args.model_path)[1] + "-" + str(args.num_experts) + "Expert-Split-Clustering-" + args.metric) 21 | 22 | print("Loading llama model...") 23 | model = LlamaForCausalLM.from_pretrained(args.model_path).model 24 | 25 | for i in tqdm(range(model.config.num_hidden_layers)): 26 | split = ClusteringSplit(args, model, args.template, i) 27 | split.split(cpu_threads=args.cpu_threads) 28 | split.cnt() 29 | split.save() 30 | print("Done.") 31 | # fmt: on 32 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_gradient.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.expert_construction.expert_split import GradientSplit 8 | from smoe.utils.io import delete_file_or_dir, torch_load_template_score_file 9 | from smoe.utils.operations.operation_string import str2bool 10 | 11 | if __name__ == "__main__": 12 | # fmt: off 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--model_path', type=str) 15 | parser.add_argument('--score_file_path', type=str) 16 | 
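# Note: --save_path is treated as a root directory; inside the per-layer loop below it is
# re-joined with a model- and criterion-specific subfolder (see save_root_path) before saving.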
parser.add_argument('--save_path', type=str) 17 | parser.add_argument('--visualization_path', type=str, default=None) 18 | parser.add_argument('--expert_num', type=int, default=None) 19 | parser.add_argument('--expert_size', type=int) 20 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 21 | 22 | parser.add_argument('--kernel', type=str, default="plain", choices=("plain", "l1_norm", "l2_norm")) 23 | parser.add_argument('--accumulate_level', type=str, default="sample", choices=("sample", "total")) 24 | parser.add_argument('--criterion', type=str, default="min", choices=("min", "max")) 25 | parser.add_argument('--importance_type', type=str, default="feature_grad", choices=("feature_grad", "feature_change")) 26 | parser.add_argument('--share_neurons', type=str, default="False") 27 | 28 | args = parser.parse_args() 29 | args.share_neurons = str2bool(args.share_neurons) 30 | print(args, "\n") 31 | 32 | print("Loading llama config...") 33 | config = LlamaConfig.from_pretrained(args.model_path) 34 | 35 | print("Processing layers...") 36 | save_root_path = args.save_path 37 | 38 | if args.importance_type == "feature_grad": 39 | file_postfix = ".grad" 40 | elif args.importance_type == "feature_change": 41 | file_postfix = ".change" 42 | else: 43 | raise NotImplementedError 44 | 45 | if args.visualization_path is not None: 46 | delete_file_or_dir(os.path.join(args.save_path, "total_neurons.txt")) 47 | 48 | for i in tqdm(range(config.num_hidden_layers)): 49 | score_list = torch_load_template_score_file(args.score_file_path, args.template + file_postfix, i) 50 | 51 | if args.expert_num is None: 52 | args.expert_num = len(score_list) 53 | else: 54 | assert args.expert_num <= len(score_list) 55 | 56 | args.save_path = os.path.join( 57 | save_root_path, 58 | f"{os.path.split(args.model_path)[1]}-Split-Gradient-{args.criterion}-{args.kernel}-{args.accumulate_level}-{args.importance_type}", 59 | f"{args.expert_num}Experts-{args.expert_size}Neurons{'-Share' if args.share_neurons else ''}" 60 | ) 61 | 62 | split = GradientSplit(args, args.template, i, score_list) 63 | split.split(args.expert_num, args.expert_size, criterion=args.criterion, share_neurons=args.share_neurons) 64 | if not args.share_neurons: 65 | split.cnt() 66 | split.save() 67 | 68 | if args.visualization_path is not None: 69 | split.visualize(args.visualization_path, share_neurons=args.share_neurons) 70 | print("Done.") 71 | # fmt: on 72 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_gradient_residual.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.expert_construction.expert_split_residual import GradientSplitResidual 8 | from smoe.utils.io import delete_file_or_dir, torch_load_template_score_file 9 | from smoe.utils.operations.operation_string import str2bool 10 | 11 | if __name__ == "__main__": 12 | # fmt: off 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--model_path', type=str) 15 | parser.add_argument('--score_file_path', type=str) 16 | parser.add_argument('--save_path', type=str) 17 | parser.add_argument('--visualization_path', type=str, default=None) 18 | parser.add_argument('--expert_num_moe', type=int) 19 | parser.add_argument('--expert_num_residual', type=int) 20 | parser.add_argument('--expert_size', type=int) 21 | 
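# The "{}" in --template is filled with the layer index when the per-layer score files are
# loaded below (torch_load_template_score_file), with ".grad" or ".change" appended
# according to --importance_type.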
parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 22 | 23 | parser.add_argument('--kernel', type=str, default="plain", choices=("plain", "l1_norm", "l2_norm")) 24 | parser.add_argument('--accumulate_level', type=str, default="sample", choices=("sample", "total")) 25 | parser.add_argument('--criterion', type=str, default="min", choices=("min", "max")) 26 | parser.add_argument('--importance_type', type=str, default="feature_grad", choices=("feature_grad", "feature_change")) 27 | parser.add_argument('--share_neurons', type=str, default="False") 28 | 29 | args = parser.parse_args() 30 | args.share_neurons = str2bool(args.share_neurons) 31 | print(args, "\n") 32 | 33 | print("Loading llama config...") 34 | config = LlamaConfig.from_pretrained(args.model_path) 35 | 36 | print("Processing layers...") 37 | save_root_path = args.save_path 38 | 39 | if args.importance_type == "feature_grad": 40 | file_postfix = ".grad" 41 | elif args.importance_type == "feature_change": 42 | file_postfix = ".change" 43 | else: 44 | raise NotImplementedError 45 | 46 | if args.visualization_path is not None: 47 | delete_file_or_dir(os.path.join(args.save_path, "total_neurons.txt")) 48 | 49 | for i in tqdm(range(config.num_hidden_layers)): 50 | score_list = torch_load_template_score_file(args.score_file_path, args.template + file_postfix, i) 51 | 52 | args.save_path = os.path.join( 53 | save_root_path, 54 | f"{os.path.split(args.model_path)[1]}-Split-Gradient-{args.criterion}-{args.kernel}-{args.accumulate_level}-{args.importance_type}", 55 | f"{args.expert_num_moe}Experts-{args.expert_num_residual}Residuals-{args.expert_size}Neurons{'-Share' if args.share_neurons else ''}" 56 | ) 57 | 58 | split = GradientSplitResidual(args, args.template, i, score_list) 59 | split.split(args.expert_num_moe, args.expert_num_residual, args.expert_size, criterion=args.criterion, share_neurons=args.share_neurons) 60 | split.save() 61 | 62 | if args.visualization_path is not None: 63 | split.visualize(args.visualization_path, share_neurons=args.share_neurons) 64 | print("Done.") 65 | # fmt: on 66 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_graph.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import LlamaForCausalLM 6 | 7 | from smoe.utils.expert_construction.expert_split import GraphSplit 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--model_path", type=str, default="./model_path") 13 | parser.add_argument("--hidden_features_path", type=str, default="./hidden_features_path", ) 14 | parser.add_argument("--save_path", type=str, default="./save_path", ) 15 | parser.add_argument('--specify_layer', nargs='+', help='used to specify train layers, example \"--specify_layer 0 1 2 3\"') 16 | 17 | parser.add_argument("--template", type=str, default="layers.{}.mlp.gate_proj.weight") 18 | parser.add_argument("--num_experts", type=int, default=8, help="number of experts") 19 | parser.add_argument("--metric", type=str, default="l1_norm") 20 | parser.add_argument("--threshold", type=int, default=1) 21 | 22 | args = parser.parse_args() 23 | # args.save_path = os.path.join( 24 | # args.save_path, 25 | # os.path.split(args.model_path)[1] + "-" + str(args.num_experts) + "Expert-Split-Graph-" + str(args.metric), 26 | # ) 27 | 28 | if not 
os.path.exists(args.save_path): 29 | os.makedirs(args.save_path) 30 | 31 | print("Loading llama model...") 32 | model = LlamaForCausalLM.from_pretrained(args.model_path).model 33 | 34 | if "specify_layer" in args: 35 | train_layers = [int(layer) for layer in args.specify_layer] 36 | else: 37 | train_layers = range(model.config.num_hidden_layers) 38 | 39 | for layer_idx in train_layers: 40 | print(f"Creating co-activation matrix for layer {layer_idx}...") 41 | split = GraphSplit(args, model, args.template, layer_idx) 42 | split.split_and_save() 43 | print("Done.") 44 | # fmt: on 45 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_graph_trans_gp.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import sys 5 | from collections import defaultdict 6 | 7 | import torch 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--gpmetised_file_path", type=str, default="./file_path") 12 | 13 | args = parser.parse_args() 14 | 15 | labels = [] 16 | 17 | with open(args.gpmetised_file_path) as fin: 18 | d = defaultdict(list) 19 | for i, line in enumerate(fin): 20 | labels.append(int(line.strip())) 21 | d[labels[-1]].append(i) 22 | 23 | need_move = [] 24 | 25 | for i in range(max(d.keys()) + 1): 26 | if i not in d: 27 | d[i] = [] 28 | print(len(labels), len(d.keys())) 29 | 30 | num = len(labels) // len(d.keys()) 31 | for k, v in d.items(): 32 | if len(v) > num: 33 | random.shuffle(v) 34 | for i in range(num, len(v)): 35 | need_move.append(v[i]) 36 | d[k] = v[:num] 37 | 38 | print("need_move", need_move) 39 | 40 | random.shuffle(need_move) 41 | for k, v in d.items(): 42 | if len(v) < num: 43 | pos = num - len(v) 44 | v += need_move[:pos] 45 | need_move = need_move[pos:] 46 | for x in v: 47 | labels[x] = k 48 | 49 | vec = os.path.basename(args.gpmetised_file_path).split(".")[:-2] 50 | target = ".".join(vec) 51 | 52 | save_folder = os.path.join(os.path.dirname(args.gpmetised_file_path), "gp_split") 53 | 54 | if not os.path.exists(save_folder): 55 | os.makedirs(save_folder) 56 | 57 | torch.save(labels, os.path.join(save_folder, target)) 58 | 59 | from collections import Counter 60 | 61 | print(Counter(labels)) 62 | -------------------------------------------------------------------------------- /smoe/entrypoint/expert_construction/llama_split_random.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from tqdm import tqdm 5 | from transformers import AutoConfig 6 | 7 | from smoe.utils.expert_construction.expert_split import RandomSplit 8 | 9 | if __name__ == "__main__": 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str, default="/home/data/models/llama-transformers/7B") 13 | parser.add_argument('--save_path', type=str, default="/home/dongdz/workspace/moefication/llama_moe_temp_files/") 14 | parser.add_argument('--template', type=str, default='layers.{}.mlp.up_proj.weight') 15 | parser.add_argument('--num_experts', type=int, default=8, help='number of experts') 16 | 17 | args = parser.parse_args() 18 | args.save_path = os.path.join(args.save_path, os.path.split(args.model_path)[1] + "-" + str(args.num_experts) + "Expert-Split-Random") 19 | print(args, "\n") 20 | 21 | print("Loading llama config...") 22 | config = AutoConfig.from_pretrained(args.model_path) 23 | 24 | 
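# For every decoder layer: randomly partition the MLP intermediate neurons addressed by
# --template into --num_experts groups (RandomSplit), report the per-expert counts (cnt),
# and save the neuron-to-expert assignment under save_path.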
for i in tqdm(range(config.num_hidden_layers)): 25 | split = RandomSplit(args, config, args.template, i) 26 | split.split() 27 | split.cnt() 28 | split.save() 29 | print("Done.") 30 | -------------------------------------------------------------------------------- /smoe/entrypoint/sft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/sft/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/entrypoint/visualization/__init__.py -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_expert_neuron_overlap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization of pair-wise overlap rate & overlap neuron count for moe models constructed by importance criterion (Share=True). 3 | """ 4 | import argparse 5 | import os 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import LlamaConfig 10 | 11 | from smoe.utils.io import delete_file_or_dir, torch_load_template_score_file 12 | from smoe.utils.visualization.visualize import visualize_expert_neuron_overlap 13 | 14 | # fmt: off 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--model_path', type=str) 18 | parser.add_argument('--score_file_path', type=str) 19 | parser.add_argument('--save_path', type=str) 20 | parser.add_argument('--expert_size', type=int) 21 | parser.add_argument('--score_file_template', type=str, default="layers.{}.mlp.up_proj.weight.change") 22 | parser.add_argument('--criterion', type=str, default="max", choices=("min", "max")) 23 | 24 | args = parser.parse_args() 25 | print("\n", args) 26 | 27 | print("Loading llama config...") 28 | config = LlamaConfig.from_pretrained(args.model_path) 29 | 30 | delete_file_or_dir(os.path.join(args.save_path, "total_neurons.txt")) 31 | 32 | for layer_id in tqdm(range(config.num_hidden_layers)): 33 | """read scores from files""" 34 | score_list = torch_load_template_score_file(args.score_file_path, args.score_file_template, layer_id) 35 | num_experts = len(score_list) 36 | scores = torch.stack(score_list, dim=0) 37 | 38 | """get selected mask""" 39 | selected_mask_list = [] 40 | for j, score in enumerate(score_list): 41 | if args.criterion == "min": 42 | sorted_score, index = torch.sort(score) 43 | elif args.criterion == "max": 44 | sorted_score, index = torch.sort(score, descending=True) 45 | else: 46 | raise NotImplementedError 47 | selected_mask = torch.zeros_like(score, dtype=torch.int) 48 | selected_mask[index[:args.expert_size]] += 1 49 | selected_mask_list.append(selected_mask) 50 | selected_masks = torch.stack(selected_mask_list, dim=0) # shape(num_experts, intermediate_size) 51 | 52 | """visualize""" 53 | visualize_expert_neuron_overlap(selected_masks, num_experts, config.intermediate_size, args.expert_size, layer_id, save_dir=args.save_path) 54 | 55 | print("done.") 56 | -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_expert_select_mlp.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from smoe.utils.visualization.visualize import visualize_expert_select_mlp 4 | 5 | # fmt: off 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--result_path', type=str) 9 | parser.add_argument('--save_path', type=str) 10 | parser.add_argument('--proj_type', type=str) 11 | 12 | args = parser.parse_args() 13 | print(args, "\n") 14 | 15 | visualize_expert_select_mlp(args.result_path, args.save_path, args.proj_type) 16 | -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_gate_loss.py: -------------------------------------------------------------------------------- 1 | import re 2 | import statistics as sts 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | from smoe.utils.io import load_nums_from_txt 7 | from smoe.utils.visualization.line import line_plot 8 | 9 | if __name__ == "__main__": 10 | folders = [ 11 | ["L2", "/mnt/petrelfs/zhutong/smoe/results/llama_7B_MoE_16Select4-l2_norm"], 12 | ["Random Params", "/mnt/petrelfs/zhutong/smoe/results/random_16select4_moe"], 13 | [ 14 | "Random Split", 15 | "/mnt/petrelfs/zhutong/smoe/results/RandomSplit-l2_norm-llama_7B-16Select4-up_proj", 16 | ], 17 | ] 18 | output_fig_file = "results/gate_loss.png" 19 | 20 | label_to_nums = defaultdict(list) 21 | for name, folder in folders: 22 | folder_path = Path(folder) 23 | txt_files = list(folder_path.glob("gate_loss_R*_L*.txt")) 24 | regex = re.compile(r"gate_loss_R(\d+)_L(\d+).txt") 25 | layer_to_loss = defaultdict(list) 26 | for txt_file in txt_files: 27 | rank, layer = regex.search(str(txt_file)).groups() 28 | rank, layer = int(rank), int(layer) 29 | layer_to_loss[layer].extend(load_nums_from_txt(txt_file)) 30 | 31 | layers = [] 32 | for layer, losses in sorted(layer_to_loss.items(), key=lambda item: item[0]): 33 | layers.append(layer) 34 | label_to_nums[name].append(sts.mean(losses)) 35 | 36 | line_plot( 37 | layers, 38 | label_to_nums, 39 | title="gate loss", 40 | xlabel="layer", 41 | ylabel="loss", 42 | save_path=output_fig_file, 43 | ) 44 | -------------------------------------------------------------------------------- /smoe/entrypoint/visualization/visualize_swiglu_output.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch.cuda 5 | from transformers import LlamaConfig 6 | 7 | from smoe.utils.visualization.visualize import visualize_swiglu_output 8 | 9 | # fmt: off 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--model_path', type=str) 13 | parser.add_argument('--hidden_features_path', type=str) 14 | parser.add_argument('--save_path', type=str) 15 | parser.add_argument('--template', type=str, default='layers.{}.mlp.gate_proj.weight') 16 | parser.add_argument('--specify_layer', nargs='+', help='used to specify layers for visualization, example \"--specify_layer 0 1 2 3\"') 17 | parser.add_argument('--visualize_criterion', default='plain', choices=["plain", "l1_norm", "l2_norm"]) 18 | 19 | args = parser.parse_args() 20 | print(args, "\n") 21 | 22 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 23 | print(device) 24 | 25 | print("Loading llama config...") 26 | config = LlamaConfig.from_pretrained(args.model_path) 27 | 28 | if "specify_layer" in args: 29 | visualize_layers = [int(layer) for layer in 
args.specify_layer] 30 | else: 31 | visualize_layers = range(config.num_hidden_layers) 32 | print(visualize_layers) 33 | 34 | for layer_idx in visualize_layers: 35 | print(f"Visualizing SiwGLU output for layer {layer_idx}...") 36 | 37 | if "gate_proj" in args.template: 38 | hidden_outputs_path = os.path.join(args.hidden_features_path, "hidden_gate_outputs", "layer" + str(layer_idx)) 39 | neuron_type = "gate_proj" 40 | elif "up_proj" in args.template: 41 | hidden_outputs_path = os.path.join(args.hidden_features_path, "hidden_up_outputs", "layer" + str(layer_idx)) 42 | neuron_type = "up_proj" 43 | else: 44 | raise ValueError 45 | 46 | if args.visualize_criterion == "plain": 47 | edge = (-0.5, 0.5) 48 | elif args.visualize_criterion == "l1_norm": 49 | edge = (0, 0.5) 50 | elif args.visualize_criterion == "l2_norm": 51 | edge = (0, 0.1) 52 | else: 53 | raise ValueError 54 | 55 | visualize_swiglu_output(hidden_outputs_path, args.save_path, neuron_type, layer_idx, criterion=args.visualize_criterion, 56 | num_bins=1000, edge=edge, device=device) 57 | -------------------------------------------------------------------------------- /smoe/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/metrics/__init__.py -------------------------------------------------------------------------------- /smoe/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | 3 | 4 | def accuracy(predictions, references, normalize=True, sample_weight=None): 5 | return { 6 | "accuracy": float( 7 | accuracy_score( 8 | references, 9 | predictions, 10 | normalize=normalize, 11 | sample_weight=sample_weight, 12 | ) 13 | ) 14 | } 15 | 16 | 17 | def compute_metrics(eval_preds): 18 | preds, labels = eval_preds 19 | # preds have the same shape as the labels, after the argmax(-1) has been calculated 20 | # by preprocess_logits_for_metrics but we need to shift the labels 21 | labels = labels[:, 1:].reshape(-1) 22 | preds = preds[:, :-1].reshape(-1) 23 | return accuracy(predictions=preds, references=labels) 24 | -------------------------------------------------------------------------------- /smoe/metrics/preprocess.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def logits_argmax(logits: torch.Tensor | tuple[torch.Tensor], labels): 5 | if isinstance(logits, tuple): 6 | # Depending on the model and config, logits may contain extra tensors, 7 | # like past_key_values, but logits always come first 8 | logits = logits[0] 9 | return logits.argmax(dim=-1) 10 | -------------------------------------------------------------------------------- /smoe/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/models/__init__.py -------------------------------------------------------------------------------- /smoe/models/llama_moe/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_llama_moe import LlamaMoEConfig # noqa: F401 2 | from .modeling_llama_moe import ( # noqa: F401 3 | BaseMoEModelOutputWithPast, 4 | LlamaMoEDecoderLayer, 5 | LlamaMoEForCausalLM, 6 | LlamaMoEForSequenceClassification, 7 | LlamaMoEModel, 8 | 
LlamaMoEPreTrainedModel, 9 | ) 10 | -------------------------------------------------------------------------------- /smoe/models/llama_moe_residual/__init__.py: -------------------------------------------------------------------------------- 1 | from .configuration_llama_moe_residual import LlamaMoEResidualConfig # noqa: F401 2 | from .modeling_llama_moe_residual import ( # noqa: F401 3 | LlamaMoEResidualDecoderLayer, 4 | LlamaMoEResidualForCausalLM, 5 | LlamaMoEResidualForSequenceClassification, 6 | LlamaMoEResidualModel, 7 | LlamaMoEResidualPreTrainedModel, 8 | ) 9 | -------------------------------------------------------------------------------- /smoe/models/mistral/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_torch_available, 20 | ) 21 | 22 | _import_structure = { 23 | "configuration_mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"], 24 | } 25 | 26 | 27 | try: 28 | if not is_torch_available(): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | pass 32 | else: 33 | _import_structure["modeling_mistral"] = [ 34 | "MistralForCausalLM", 35 | "MistralModel", 36 | "MistralPreTrainedModel", 37 | "MistralForSequenceClassification", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_mistral import ( 43 | MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, 44 | MistralConfig, 45 | ) 46 | 47 | try: 48 | if not is_torch_available(): 49 | raise OptionalDependencyNotAvailable() 50 | except OptionalDependencyNotAvailable: 51 | pass 52 | else: 53 | from .modeling_mistral import ( 54 | MistralForCausalLM, 55 | MistralForSequenceClassification, 56 | MistralModel, 57 | MistralPreTrainedModel, 58 | ) 59 | 60 | 61 | else: 62 | import sys 63 | 64 | sys.modules[__name__] = _LazyModule( 65 | __name__, globals()["__file__"], _import_structure, module_spec=__spec__ 66 | ) 67 | -------------------------------------------------------------------------------- /smoe/models/mixtral/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 Mixtral AI and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import ( 17 | OptionalDependencyNotAvailable, 18 | _LazyModule, 19 | is_torch_available, 20 | ) 21 | 22 | _import_structure = { 23 | "configuration_mixtral": ["MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MixtralConfig"], 24 | } 25 | 26 | 27 | try: 28 | if not is_torch_available(): 29 | raise OptionalDependencyNotAvailable() 30 | except OptionalDependencyNotAvailable: 31 | pass 32 | else: 33 | _import_structure["modeling_mixtral"] = [ 34 | "MixtralForCausalLM", 35 | "MixtralModel", 36 | "MixtralPreTrainedModel", 37 | "MixtralForSequenceClassification", 38 | ] 39 | 40 | 41 | if TYPE_CHECKING: 42 | from .configuration_mixtral import ( 43 | MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, 44 | MixtralConfig, 45 | ) 46 | 47 | try: 48 | if not is_torch_available(): 49 | raise OptionalDependencyNotAvailable() 50 | except OptionalDependencyNotAvailable: 51 | pass 52 | else: 53 | from .modeling_mixtral import ( 54 | MixtralForCausalLM, 55 | MixtralForSequenceClassification, 56 | MixtralModel, 57 | MixtralPreTrainedModel, 58 | ) 59 | 60 | 61 | else: 62 | import sys 63 | 64 | sys.modules[__name__] = _LazyModule( 65 | __name__, globals()["__file__"], _import_structure, module_spec=__spec__ 66 | ) 67 | -------------------------------------------------------------------------------- /smoe/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/modules/__init__.py -------------------------------------------------------------------------------- /smoe/modules/moe/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/modules/moe/__init__.py -------------------------------------------------------------------------------- /smoe/modules/moe_residual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/modules/moe_residual/__init__.py -------------------------------------------------------------------------------- /smoe/modules/norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | 5 | 6 | class WeightNorm(nn.Module): 7 | def __init__( 8 | self, hidden_size: int, scale: float = 1.0, device=None, dtype=None 9 | ) -> None: 10 | super().__init__() 11 | 12 | self.hsz = hidden_size 13 | self.scale = scale 14 | 15 | self.weight = nn.Parameter(torch.empty(hidden_size, device=device, dtype=dtype)) 16 | 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | # init.ones_(self.weight) 21 | init.constant_(self.weight, self.scale) 22 | 23 | def forward(self, hidden: torch.Tensor) -> torch.Tensor: 24 | # if torch.isnan(self.weight).any(): 25 | # remote_breakpoint() 26 | # return hidden * (self.scale * F.sigmoid(self.weight) + 1.0) 27 | return hidden * self.weight 28 | 29 | def extra_repr(self) -> str: 30 | return "hsz={}, scale={}".format(self.hsz, self.scale) 31 | -------------------------------------------------------------------------------- /smoe/trainer/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/trainer/__init__.py -------------------------------------------------------------------------------- /smoe/trainer/moefy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/trainer/moefy/__init__.py -------------------------------------------------------------------------------- /smoe/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/__init__.py -------------------------------------------------------------------------------- /smoe/utils/debugging.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import debugpy 4 | import torch.distributed as dist 5 | 6 | 7 | def remote_breakpoint(host: str = "0.0.0.0", port: int = 5678, rank: int = 0): 8 | """ 9 | This function helps debug programs running on a remote computing node. 10 | 11 | In VSCode, add a configuration like the following to `.vscode/launch.json` 👇 12 | ```json 13 | { 14 | // Use IntelliSense to learn about possible attributes. 15 | // Hover to view descriptions of existing attributes. 16 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 17 | "version": "0.2.0", 18 | "configurations": [ 19 | { 20 | "name": "Python: Remote Attach", 21 | "type": "python", 22 | "request": "attach", 23 | "connect": { 24 | "host": "", 25 | "port": 5678 26 | }, 27 | "pathMappings": [ 28 | { 29 | "localRoot": "${workspaceFolder}", 30 | "remoteRoot": "." 31 | } 32 | ], 33 | "justMyCode": false 34 | } 35 | ] 36 | } 37 | ``` 38 | 39 | Then, insert this line at the position you want to debug: 40 | ```python 41 | from smoe.utils.debugging import remote_breakpoint; remote_breakpoint() 42 | ``` 43 | 44 | After the program starts and hits the breakpoint, you can attach the debugger remotely. 45 | """ 46 | 47 | def _dp(): 48 | print( 49 | f"Waiting for debugger to attach on {host}:{port}, server: {socket.gethostname()}..."
50 | ) 51 | debugpy.listen((host, port)) 52 | debugpy.wait_for_client() 53 | breakpoint() 54 | 55 | if dist.is_available() and dist.is_initialized(): 56 | if dist.get_rank() == rank: 57 | _dp() 58 | dist.barrier() 59 | else: 60 | _dp() 61 | -------------------------------------------------------------------------------- /smoe/utils/eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/eval/__init__.py -------------------------------------------------------------------------------- /smoe/utils/eval/gather_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import pandas as pd 5 | 6 | 7 | def gather_results(args): 8 | df = pd.DataFrame(columns=["dataset", "accuracy"]) 9 | for dir_path, dir_names, file_names in os.walk(args.save_dir): 10 | print(dir_path) 11 | for name in sorted(file_names): 12 | print(name) 13 | if name == "all_datasets_0.txt": 14 | file_path = os.path.join(dir_path, name) 15 | with open(file_path, "r") as file: 16 | for i, line in enumerate(file.readlines()): 17 | acc = float(line[17:22]) 18 | dataset = line[25:-2] 19 | if not dataset in df["dataset"].values.tolist(): 20 | df.loc[i] = [dataset, acc] 21 | 22 | if name == "all_datasets_1.txt": 23 | file_path = os.path.join(dir_path, name) 24 | with open(file_path, "r") as file: 25 | for i, line in enumerate(file.readlines()): 26 | acc = float(line[17:22]) 27 | dataset = line[25:-2] 28 | if not dataset in df["dataset"].values.tolist(): 29 | df.loc[i + 28] = [dataset, acc] 30 | 31 | if name == "all_datasets_2.txt": 32 | file_path = os.path.join(dir_path, name) 33 | with open(file_path, "r") as file: 34 | for i, line in enumerate(file.readlines()): 35 | acc = float(line[17:22]) 36 | dataset = line[25:-2] 37 | if not dataset in df["dataset"].values.tolist(): 38 | df.loc[i + 44] = [dataset, acc] 39 | 40 | if name == "all_datasets_3.txt": 41 | file_path = os.path.join(dir_path, name) 42 | with open(file_path, "r") as file: 43 | for i, line in enumerate(file.readlines()): 44 | acc = float(line[17:22]) 45 | dataset = line[25:-2] 46 | if not dataset in df["dataset"].values.tolist(): 47 | df.loc[i + 57] = [dataset, acc] 48 | 49 | avg_value = float(df["accuracy"].mean()) 50 | df.loc[60] = ["avg_value", avg_value] 51 | df.to_csv(os.path.join(dir_path, "all_datasets.csv"), index=None) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("--save_dir", "-s", type=str, default="results_moe") 57 | args = parser.parse_args() 58 | gather_results(args) 59 | -------------------------------------------------------------------------------- /smoe/utils/expert_construction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/expert_construction/__init__.py -------------------------------------------------------------------------------- /smoe/utils/expert_construction/prune_llama.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from smoe.utils.seed import set_seed 8 | 9 | 10 | class LayerPrune: 11 | def __init__(self, config, template, layer): 12 | self.config = config 13 | self.template = template 
= template 14 | self.layer = layer 15 | 16 | def save(self): 17 | if not os.path.exists(self.config.save_path): 18 | os.makedirs(self.config.save_path) 19 | 20 | filename = os.path.join(self.config.save_path, self.template.format(self.layer)) 21 | torch.save(self.labels, filename, pickle_protocol=pickle.HIGHEST_PROTOCOL) 22 | print(f'Expert indices for layer {self.layer} saved to "{filename}".') 23 | 24 | 25 | class GradientPrune(LayerPrune): 26 | # fmt: off 27 | def __init__(self, config, template, layer, score): 28 | super().__init__(config, template, layer) 29 | self.score = score 30 | self.num_experts = 1 31 | self.neuron_num = score.size(0) 32 | 33 | def sort_by_criterion(self, criterion): 34 | if criterion == "min": 35 | sorted_score, sorted_index = self.score.sort(0) 36 | elif criterion == "max": 37 | sorted_score, sorted_index = self.score.sort(0, descending=True) 38 | else: 39 | raise NotImplementedError 40 | return sorted_score.tolist(), sorted_index.tolist() 41 | 42 | def prune(self, expert_size, criterion="min"): 43 | sorted_score, sorted_index = self.sort_by_criterion(criterion) 44 | self.labels = [sorted_index[:expert_size]] # unlike the other `labels`, these are neuron indices rather than expert indices 45 | # print(self.labels) 46 | # fmt: on 47 | 48 | 49 | class RandomPrune(LayerPrune): 50 | # fmt: off 51 | def __init__(self, config, template, layer, neuron_num): 52 | super().__init__(config, template, layer) 53 | self.num_experts = 1 54 | self.neuron_num = neuron_num 55 | 56 | def prune(self, expert_size, seed=None): 57 | if seed is not None: 58 | set_seed(seed) 59 | index = torch.randperm(self.neuron_num).tolist() 60 | self.labels = [index[:expert_size]] # unlike the other `labels`, these are neuron indices rather than expert indices 61 | # print(self.labels) 62 | # fmt: on 63 | -------------------------------------------------------------------------------- /smoe/utils/extract_text_from_jsonl.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extract texts from a jsonlines file.
3 | 4 | Example: 5 | $ python -m smoe.utils.extract_text_from_jsonl -c content -i resources/redpajama/commoncrawl.jsonl -o resources/redpajama-processed/commoncrawl.txt 6 | """ 7 | 8 | import argparse 9 | import json 10 | 11 | 12 | def get_parser(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "-c", "--column_name", default="content", help="text column name" 16 | ) 17 | parser.add_argument("-i", "--input_filepath", help="filepath with text to tokenize") 18 | parser.add_argument("-o", "--output_filepath", help="output filepath") 19 | args = parser.parse_args() 20 | return args 21 | 22 | 23 | def extract_text(): 24 | args = get_parser() 25 | 26 | with open(args.input_filepath, "r", encoding="utf8") as fin: 27 | with open(args.output_filepath, "w", encoding="utf8") as fout: 28 | for line in fin: 29 | ins = json.loads(line) 30 | text = ins[args.column_name] 31 | fout.write(f"{text.strip()}\n") 32 | 33 | 34 | if __name__ == "__main__": 35 | extract_text() 36 | -------------------------------------------------------------------------------- /smoe/utils/io.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import lzma 4 | import os 5 | import pickle 6 | import shutil 7 | 8 | import cv2 9 | import torch 10 | 11 | 12 | def delete_file_or_dir(dir): 13 | if os.path.isfile(dir): 14 | os.remove(dir) 15 | elif os.path.exists(dir): 16 | shutil.rmtree(dir) 17 | else: 18 | pass 19 | 20 | 21 | def torch_load_template_file(path, template, layer): 22 | target = os.path.join(path, template.format(layer)) 23 | return torch.load(target) 24 | 25 | 26 | def torch_load_template_score_file(path, template, layer): 27 | score_list = [] 28 | for expert_folder_name in sorted(os.listdir(path)): 29 | score_file = os.path.join(path, expert_folder_name, template.format(layer)) 30 | score = torch.load(score_file, map_location="cpu") 31 | score_list.append(score) 32 | return score_list 33 | 34 | 35 | def save_compressed_file_7z(tensor, path): # 7z 36 | with lzma.open(path, "wb") as file: 37 | pickle.dump(tensor, file) 38 | 39 | 40 | def load_compressed_file_7z(path): # 7z 41 | with lzma.open(path, "rb") as file: 42 | data = pickle.load(file) 43 | return data 44 | 45 | 46 | def save_compressed_file_gz(tensor, path, compresslevel=6): # gz 47 | with gzip.open(path, "wb", compresslevel=compresslevel) as file: 48 | pickle.dump(tensor, file) 49 | 50 | 51 | def load_compressed_file_gz(path): # gz 52 | with gzip.open(path, "rb") as file: 53 | data = pickle.load(file) 54 | return data 55 | 56 | 57 | class load_jsonlines_iter: 58 | def __init__(self, filepath, start_from: int = None) -> None: 59 | self.fin = open(filepath, "r", encoding="utf8") 60 | if start_from: 61 | self.fin.seek(start_from, os.SEEK_SET) 62 | 63 | def skip_lines(self, num_skip_lines: int): 64 | for i, _ in enumerate(self.fin, 1): 65 | if i == num_skip_lines: 66 | break 67 | 68 | def tell(self): 69 | return self.fin.tell() 70 | 71 | def __iter__(self): 72 | for line in self.fin: 73 | try: 74 | yield json.loads(line) 75 | except json.JSONDecodeError: 76 | pass 77 | self.fin.close() 78 | 79 | 80 | def load_json(filepath): 81 | with open(filepath, "r", encoding="utf8") as fin: 82 | return json.load(fin) 83 | 84 | 85 | def dump_json(obj, filepath, **kwargs): 86 | with open(filepath, "w", encoding="utf8") as fout: 87 | json.dump(obj, fout, ensure_ascii=False, **kwargs) 88 | 89 | 90 | def load_jsonlines(filepath): 91 | data = [] 92 | with open(filepath, "r", encoding="utf8") as 
fin: 93 | for line in fin: 94 | data.append(json.loads(line)) 95 | return data 96 | 97 | 98 | def dump_jsonlines(obj, filepath, **kwargs): 99 | with open(filepath, "w", encoding="utf8") as fout: 100 | for ins in obj: 101 | fout.write(f"{json.dumps(ins, ensure_ascii=False, **kwargs)}\n") 102 | 103 | 104 | def compress_png_image(image_path, print_info=False): 105 | img = cv2.imread(image_path, cv2.IMREAD_COLOR) 106 | cv2.imwrite(image_path, img, [cv2.IMWRITE_PNG_COMPRESSION, 9]) 107 | if print_info: 108 | print(f'Done for "{image_path}".') 109 | -------------------------------------------------------------------------------- /smoe/utils/kernel_function.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pass_kernel_function(tensor, criterion): 5 | if criterion == "plain": 6 | return tensor 7 | elif criterion == "l1_norm": 8 | return torch.abs(tensor) 9 | elif criterion == "l2_norm": 10 | return tensor * tensor 11 | else: 12 | raise NotImplementedError 13 | -------------------------------------------------------------------------------- /smoe/utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | import datasets 5 | import transformers 6 | from transformers import TrainingArguments 7 | 8 | # Setup logging 9 | logging.basicConfig( 10 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 11 | datefmt="%m/%d/%Y %H:%M:%S", 12 | level=logging.INFO, 13 | handlers=[logging.StreamHandler(sys.stdout)], 14 | ) 15 | 16 | transformers.utils.logging.enable_default_handler() 17 | transformers.utils.logging.enable_explicit_format() 18 | transformers.tokenization_utils.logging.set_verbosity_warning() 19 | 20 | 21 | def set_logging(should_log, log_level): 22 | if should_log: 23 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
24 | transformers.utils.logging.set_verbosity_info() 25 | 26 | datasets.utils.logging.set_verbosity(log_level) 27 | transformers.utils.logging.set_verbosity(log_level) 28 | 29 | 30 | def get_logger(name, log_level=None): 31 | logger = logging.getLogger(name) 32 | if log_level: 33 | logger.setLevel(log_level) 34 | return logger 35 | 36 | 37 | def get_logger_from_training_args(name: str, training_args: TrainingArguments): 38 | should_log = training_args.should_log 39 | log_level = training_args.get_process_log_level() 40 | set_logging(should_log, log_level) 41 | logger = get_logger(name, log_level=log_level) 42 | return logger 43 | -------------------------------------------------------------------------------- /smoe/utils/model_operation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/model_operation/__init__.py -------------------------------------------------------------------------------- /smoe/utils/operations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/operations/__init__.py -------------------------------------------------------------------------------- /smoe/utils/operations/operation_string.py: -------------------------------------------------------------------------------- 1 | import re 2 | from argparse import ArgumentTypeError 3 | 4 | 5 | def str2bool(v): 6 | if isinstance(v, bool): 7 | return v 8 | if v.lower() in ("yes", "true", "t", "y", "1"): 9 | return True 10 | elif v.lower() in ("no", "false", "f", "n", "0"): 11 | return False 12 | else: 13 | raise ArgumentTypeError("Boolean value expected.") 14 | 15 | 16 | def string2list(string, sep=","): 17 | if isinstance(string, list) or string is None: 18 | return string 19 | else: 20 | split_string = string.split(sep) 21 | return [int(num) for num in split_string] 22 | 23 | 24 | def extract_numbers(string): 25 | """Extract numbers (int, float) from a given string.""" 26 | pattern = r"[-+]?\d*\.\d+|\d+" 27 | matches = re.findall(pattern, string) 28 | numbers = [float(match) if "." in match else int(match) for match in matches] 29 | return numbers 30 | 31 | 32 | def calculate_non_ascii_ratio(string): 33 | """Calculate the non-ASCII ratio of a given string.""" 34 | if len(string) == 0: 35 | non_ascii_ratio = 0.0 36 | else: 37 | non_ascii_count = sum(1 for char in string if ord(char) >= 128) 38 | non_ascii_ratio = non_ascii_count / len(string) 39 | return non_ascii_ratio 40 | 41 | 42 | def remove_non_ascii_code(string): 43 | """Use a regular expression to remove all non-ASCII characters""" 44 | string = re.sub(r"[^\x00-\x7F]+", "", string) 45 | return string 46 | 47 | 48 | def replace_non_ascii_code(string): 49 | """ 50 | Replace common non-ASCII characters with their ASCII counterparts in the given string. 51 | 52 | :param string: Input string with non-ASCII characters. 53 | :return: String with non-ASCII characters replaced. 
54 | """ 55 | string = re.sub(r"“|”", '"', string) 56 | string = re.sub(r"‘|’", "'", string) 57 | string = re.sub(r"—|–", "-", string) 58 | string = re.sub(r"…", "...", string) 59 | 60 | return string 61 | -------------------------------------------------------------------------------- /smoe/utils/operations/operation_tensor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def move_tensors_to_device(input, device): 5 | if isinstance(input, dict): 6 | for key, value in input.items(): 7 | if isinstance(value, torch.Tensor): 8 | input[key] = value.to(device) 9 | return input 10 | 11 | elif isinstance(input, list): 12 | for i in range(len(input)): 13 | if isinstance(input[i], torch.Tensor): 14 | input[i] = input[i].to(device) 15 | return input 16 | 17 | elif isinstance(input, torch.Tensor): 18 | return input.to(device) 19 | 20 | else: 21 | raise TypeError(input) 22 | 23 | 24 | def tensor2numbers(input): 25 | if isinstance(input, dict): 26 | for key, value in input.items(): 27 | if isinstance(value, torch.Tensor): 28 | input[key] = value.tolist() 29 | return input 30 | 31 | elif isinstance(input, list): 32 | for i in range(len(input)): 33 | if isinstance(input[i], torch.Tensor): 34 | input[i] = input[i].tolist() 35 | return input 36 | 37 | elif isinstance(input, torch.Tensor): 38 | return input.tolist() 39 | 40 | else: 41 | raise TypeError(input) 42 | 43 | 44 | def turn_last_true_mask_to_false(mask, true_mask_cnt=None): 45 | """Turn the last true value to false for each row in a mask matrix.""" 46 | # mask: shape(batch_size, seq_len) 47 | if true_mask_cnt is None: 48 | true_mask_cnt = torch.sum(mask, dim=1).unsqueeze(1) 49 | turn_position_indices = mask.cumsum(dim=1) == true_mask_cnt 50 | converted_mask = mask.clone() 51 | converted_mask[turn_position_indices] = False 52 | return converted_mask 53 | 54 | 55 | def turn_first_true_mask_to_false(mask): 56 | """Turn the first true value to false for each row in a mask matrix.""" 57 | # mask: shape(batch_size, seq_len) 58 | turn_position_indices = mask.cumsum(dim=1) == 1 59 | converted_mask = mask.clone() 60 | converted_mask[turn_position_indices] = False 61 | return converted_mask 62 | 63 | 64 | def last_true_position(mask): 65 | """Return the index of the last true value in each row in a mask matrix.""" 66 | # mask: shape(batch_size, seq_len) 67 | true_mask_cnt = torch.sum(mask, dim=1).unsqueeze(1) 68 | last_true_mask = (mask.cumsum(dim=1) == true_mask_cnt) & mask 69 | last_true_position = last_true_mask.nonzero()[:, 1].unsqueeze(1) 70 | return last_true_position 71 | 72 | 73 | def pass_kernel_function(tensor, criterion): 74 | if criterion == "plain": 75 | return tensor 76 | elif criterion == "l1_norm": 77 | return torch.abs(tensor) 78 | elif criterion == "l2_norm": 79 | return tensor * tensor 80 | else: 81 | raise NotImplementedError 82 | -------------------------------------------------------------------------------- /smoe/utils/param.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | from smoe.utils.logging import get_logger 4 | 5 | logger = get_logger(__name__) 6 | 7 | 8 | def get_trainable_parameters(model: nn.Module, verbose: bool = True): 9 | """ 10 | Prints the number of trainable parameters in the model. 
11 | 12 | Credit to https://github.com/huggingface/peft/blob/main/src/peft/peft_model.py 13 | """ 14 | trainable_params = 0 15 | all_param = 0 16 | for _, param in model.named_parameters(): 17 | num_params = param.numel() 18 | # if using DS Zero 3 and the weights are initialized empty 19 | if num_params == 0 and hasattr(param, "ds_numel"): 20 | num_params = param.ds_numel 21 | 22 | # Due to the design of 4bit linear layers from bitsandbytes 23 | # one needs to multiply the number of parameters by 2 to get 24 | # the correct number of parameters 25 | if param.__class__.__name__ == "Params4bit": 26 | num_params = num_params * 2 27 | 28 | all_param += num_params 29 | if param.requires_grad: 30 | trainable_params += num_params 31 | if verbose: 32 | logger.info( 33 | f"trainable params: {trainable_params:,d}" 34 | f" || all params: {all_param:,d}" 35 | f" || trainable%: {100 * trainable_params / all_param}" 36 | ) 37 | 38 | return trainable_params, all_param 39 | -------------------------------------------------------------------------------- /smoe/utils/random_utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | 5 | def get_random_string(length: int = 8) -> str: 6 | """Generate a random alphanumeric string. 7 | 8 | Args: 9 | length (int, optional): Length of the random string. Defaults to 8. 10 | 11 | Returns: 12 | str: A random alphanumeric string. 13 | """ 14 | return "".join(random.choices(string.ascii_letters + string.digits, k=length)) 15 | -------------------------------------------------------------------------------- /smoe/utils/seed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_seed(seed: Optional[int] = 1227, set_cudnn: Optional[bool] = False): 10 | os.environ["PYTHONHASHSEED"] = str(seed) 11 | random.seed(seed) 12 | np.random.seed(seed) 13 | torch.manual_seed(seed) 14 | torch.cuda.manual_seed(seed) 15 | 16 | if set_cudnn: 17 | torch.backends.cudnn.deterministic = True 18 | torch.backends.cudnn.benchmark = False 19 | -------------------------------------------------------------------------------- /smoe/utils/split_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | Split files in a folder into separate part folders 3 | 4 | src: en_arxiv/* 5 | tgt: output/part0, output/part1, ...
6 | """ 7 | 8 | from pathlib import Path 9 | 10 | 11 | def split_files(src_dir, tgt_dir, num_parts): 12 | src_dir = Path(src_dir) 13 | tgt_dir = Path(tgt_dir) 14 | tgt_dir.mkdir(parents=True, exist_ok=True) 15 | 16 | filepaths = sorted(src_dir.glob("*.jsonl")) 17 | num_files = len(filepaths) 18 | num_files_per_part = num_files // num_parts 19 | print(f"{src_dir} --> {tgt_dir}") 20 | print(f"num_files_per_part: {num_files_per_part}") 21 | 22 | for i in range(num_parts): 23 | start = i * num_files_per_part 24 | end = (i + 1) * num_files_per_part 25 | if i == num_parts - 1: 26 | end = num_files 27 | print(f"part-{i}, start: {start}, end: {end}") 28 | 29 | part_dir = tgt_dir / f"part-{i:06d}" 30 | part_dir.mkdir(parents=True, exist_ok=True) 31 | for j in range(start, end): 32 | filepath = filepaths[j] 33 | tgt_filepath = part_dir / filepath.name 34 | tgt_filepath.symlink_to(filepath) 35 | 36 | 37 | if __name__ == "__main__": 38 | for data_type in [ 39 | # "en_arxiv", 40 | # "en_book", 41 | # "en_c4", 42 | "en_cc", 43 | # "en_stack", 44 | # "en_wikipedia", 45 | # "github", 46 | ]: 47 | split_files( 48 | f"/mnt/hwfile/share_data/zhutong/slimpajama_fluency_llama/{data_type}", 49 | f"/mnt/hwfile/share_data/zhutong/data/slimpajama_fluency_llama_middle_parts/{data_type}", 50 | 30, 51 | ) 52 | # split_files( 53 | # "/mnt/hwfile/share_data/zhutong/slimpajama_fluency_llama/en_arxiv", 54 | # "/mnt/hwfile/share_data/zhutong/data/slimpajama_fluency_llama_middle_parts/en_arxiv", 55 | # 30, 56 | # ) 57 | -------------------------------------------------------------------------------- /smoe/utils/text_clustering.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import joblib 4 | import numpy as np 5 | from sentence_transformers import SentenceTransformer 6 | from sklearn.cluster import KMeans 7 | 8 | from smoe.utils.vars import CLUSTERING_MODEL_NAME 9 | 10 | 11 | class TextClustering: 12 | def __init__( 13 | self, num_clusters: int = 16, encoder: str = "all-mpnet-base-v2" 14 | ) -> None: 15 | self.kmeans = KMeans(n_clusters=num_clusters) 16 | self.emb = SentenceTransformer(encoder) 17 | 18 | @property 19 | def num_clusters(self) -> int: 20 | return self.kmeans.n_clusters 21 | 22 | def encode_emb(self, sentences: list[str]) -> np.ndarray: 23 | arr: np.ndarray = self.emb.encode(sentences=sentences, show_progress_bar=False) 24 | return arr 25 | 26 | def fit_emb(self, emb: np.ndarray): 27 | self.kmeans.fit(emb) 28 | 29 | def fit(self, sentences: list[str]): 30 | emb_arr = self.encode_emb(sentences) 31 | self.kmeans.fit(emb_arr) 32 | 33 | def predict_emb(self, emb: np.ndarray) -> list[int]: 34 | return self.kmeans.predict(emb).tolist() 35 | 36 | def predict(self, sentences: list[str]) -> list[int]: 37 | emb_arr = self.encode_emb(sentences) 38 | return self.predict_emb(emb_arr) 39 | 40 | def save_pretrained(self, folder: str): 41 | model_path = Path(folder) / CLUSTERING_MODEL_NAME 42 | model_path.parent.mkdir(exist_ok=True, parents=True) 43 | joblib.dump(self.kmeans, model_path) 44 | 45 | @classmethod 46 | def from_pretrained(cls, folder: str): 47 | model_path = Path(folder) / CLUSTERING_MODEL_NAME 48 | kmeans = joblib.load(model_path) 49 | model = cls() 50 | model.kmeans = kmeans 51 | return model 52 | -------------------------------------------------------------------------------- /smoe/utils/vars.py: -------------------------------------------------------------------------------- 1 | IGNORE_INDEX = -100 2 | BEST_MODEL_CKPT_DIR = "best" 3 | 
MIDDLE_MODEL_CKPT_DIR = "middle" 4 | CLUSTERING_MODEL_NAME = "clustering.model" 5 | JSONL_DATASET_CACHE_NAME = "jsonl_dataset-{}.bin" 6 | META_SUFFIX = ".meta" 7 | -------------------------------------------------------------------------------- /smoe/utils/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/smoe/utils/visualization/__init__.py -------------------------------------------------------------------------------- /smoe/utils/visualization/bar.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def barh( 7 | label2num: dict, 8 | title: str = "No Title", 9 | save_filepath=None, 10 | sort_type="label", 11 | limit=None, 12 | ): 13 | """ 14 | Refers to https://gist.github.com/Spico197/40f0224f9202ef645ac86637a958eaff 15 | 16 | Args: 17 | sort_type: label or num 18 | """ 19 | assert sort_type in ["label", "num"] 20 | if sort_type == "label": 21 | label2num_sorted = sorted(label2num.items(), key=lambda x: x[0]) 22 | else: 23 | label2num_sorted = sorted(label2num.items(), key=lambda x: x[1]) 24 | if limit: 25 | label2num_sorted = label2num_sorted[:limit] 26 | tot = sum([x[1] for x in label2num_sorted]) 27 | fig = plt.figure(figsize=(16, 9), dpi=350) 28 | ax = fig.add_subplot(111) 29 | ax.barh(range(len(label2num_sorted)), [x[1] for x in label2num_sorted], zorder=3) 30 | ax.set_yticks(range(len(label2num_sorted))) 31 | ax.set_yticklabels( 32 | [ 33 | "{} - {} ({:.2f}%)".format(x[0], x[1], float(x[1]) / tot * 100) 34 | for x in label2num_sorted 35 | ], 36 | fontsize=16, 37 | ) 38 | ax.set_xlabel("Total: {}".format(tot), fontsize=16) 39 | ax.set_title(title) 40 | ax.grid(zorder=0) 41 | plt.rc("axes", axisbelow=True) 42 | plt.rc("ytick", labelsize=16) 43 | plt.tight_layout() 44 | # plt.show() 45 | if save_filepath: 46 | Path(save_filepath).parent.mkdir(exist_ok=True, parents=True) 47 | plt.savefig(save_filepath) 48 | -------------------------------------------------------------------------------- /smoe/utils/visualization/convert_gif.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | 3 | 4 | def save_images_as_gif(image_paths, output_path, duration=200): 5 | """ 6 | Save the images at the given file paths as a GIF animation. 7 | 8 | :param image_paths: list of image file paths 9 | :param output_path: file path to save the GIF animation to 10 | :param duration: interval between frames in milliseconds 11 | """ 12 | if not image_paths: 13 | print("Error: No image paths provided.") 14 | return 15 | 16 | try: 17 | # open the image files and collect them into a list 18 | images = [Image.open(image_path) for image_path in image_paths] 19 | 20 | # save the GIF animation 21 | images[0].save( 22 | output_path, 23 | save_all=True, 24 | append_images=images[1:], 25 | loop=0, 26 | duration=duration, 27 | ) 28 | print(f"GIF animation saved as {output_path}") 29 | except Exception as e: 30 | print(f"Error: {e}") 31 | 32 | 33 | # usage example (guarded so that importing this module has no side effects) 34 | if __name__ == "__main__": 35 | image_paths = ["image1.png", "image2.png", "image3.png"] 36 | output_gif = "output.gif" 37 | save_images_as_gif(image_paths, output_gif) 38 | -------------------------------------------------------------------------------- /smoe/utils/visualization/line.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from smoe.utils.io import compress_png_image 5 | 6 | 7 | def line_plot( 8 | xs,
9 | label_to_nums, 10 | title: str = None, 11 | xlabel: str = None, 12 | ylabel: str = None, 13 | save_path: str = None, 14 | ): 15 | fig = plt.figure(figsize=(16, 9)) 16 | ax = fig.add_subplot(111) 17 | for label, nums in label_to_nums.items(): 18 | ax.plot(xs, nums, label=label) 19 | ax.set_xticks(xs) 20 | if title: 21 | ax.set_title(title) 22 | if xlabel: 23 | ax.set_xlabel(xlabel) 24 | if ylabel: 25 | ax.set_ylabel(ylabel) 26 | ax.legend() 27 | ax.grid(True) 28 | ax.set_axisbelow(True) 29 | plt.tight_layout() 30 | if save_path: 31 | plt.savefig(save_path, dpi=320) 32 | compress_png_image(save_path, print_info=False) 33 | plt.close() 34 | 35 | 36 | def line_plot_with_highlight( 37 | xs, 38 | label_to_nums, 39 | highlight_label_to_nums: dict = None, 40 | highlight_linewidth: int = 4, 41 | highlight_color: str = "black", 42 | cmap: str = "viridis", 43 | legend_columns: int = 1, 44 | title: str = None, 45 | xlabel: str = None, 46 | ylabel: str = None, 47 | save_path: str = None, 48 | ): 49 | fig = plt.figure(figsize=(16, 9)) 50 | ax = fig.add_subplot(111) 51 | 52 | cmap = plt.get_cmap(cmap) 53 | colors = np.linspace(0, 1, len(label_to_nums)) 54 | 55 | for i, (label, nums) in enumerate(label_to_nums.items()): 56 | ax.plot(xs, nums, label=label, c=cmap(colors)[i, :3]) 57 | 58 | if highlight_label_to_nums is not None: 59 | for i, (label, nums) in enumerate(highlight_label_to_nums.items()): 60 | ax.plot( 61 | xs, nums, label=label, linewidth=highlight_linewidth, c=highlight_color 62 | ) 63 | 64 | ax.set_xticks(xs) 65 | if title: 66 | ax.set_title(title) 67 | if xlabel: 68 | ax.set_xlabel(xlabel) 69 | if ylabel: 70 | ax.set_ylabel(ylabel) 71 | 72 | ax.legend(ncols=legend_columns) 73 | ax.grid(True) 74 | ax.set_axisbelow(True) 75 | plt.tight_layout() 76 | if save_path: 77 | plt.savefig(save_path, dpi=320) 78 | compress_png_image(save_path, print_info=False) 79 | plt.close() 80 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/data/__init__.py -------------------------------------------------------------------------------- /tests/data/test_aggregation.py: -------------------------------------------------------------------------------- 1 | from smoe.data.aggregation import group_instances 2 | 3 | 4 | def test_group_instances(): 5 | instances = [ 6 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 7 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 8 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 9 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 10 | {"input_ids": [1, 2, 3], "labels": [4, 5, 6]}, 11 | ] 12 | results = group_instances(instances, block_size=4) 13 | assert results == [ 14 | {"input_ids": [1, 2, 3, 1], "labels": [4, 5, 6, 4]}, 15 | {"input_ids": [2, 3, 1, 2], "labels": [5, 6, 4, 5]}, 16 | {"input_ids": [3, 1, 2, 3], "labels": [6, 4, 5, 6]}, 17 | ] 18 | 19 | 20 | if __name__ == "__main__": 21 | test_group_instances() 22 | -------------------------------------------------------------------------------- /tests/data/test_redpajama.py: 
-------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | from torch.utils.data import DataLoader 6 | 7 | from smoe.data.redpajama import load_streaming_datasets 8 | from smoe.utils.io import dump_jsonlines, load_jsonlines 9 | 10 | 11 | def test_load_streaming_datasets(): 12 | output_dir = Path("/mnt/petrelfs/zhutong/smoe/resources/data_test_with_task_type") 13 | output_dir.mkdir(parents=True, exist_ok=True) 14 | # dataset_dir = Path("resources/data_test") 15 | dataset_dir = Path("resources/data_test_with_task_type") 16 | 17 | # # update new dataset with task type 18 | # for subtask_dir in dataset_dir.glob("*"): 19 | # task_type = subtask_dir.stem 20 | # subtask_out_dir = output_dir.joinpath(task_type) 21 | # subtask_out_dir.mkdir(parents=True, exist_ok=True) 22 | # for file in subtask_dir.glob("*.jsonl"): 23 | # data = load_jsonlines(file) 24 | # for ins in data: 25 | # ins["src"] = task_type 26 | # dump_jsonlines(data, subtask_out_dir.joinpath(file.name)) 27 | 28 | dataset = load_streaming_datasets( 29 | str(dataset_dir), 30 | prob_map={"en_arxiv": 0.5, "en_book": 0.2, "en_c4": 0.3}, 31 | block_size=2048, 32 | ) 33 | num_ds = 0 34 | num_src = defaultdict(lambda: 0) 35 | 36 | start = time.time() 37 | for ds in iter(dataset): 38 | num_ds += 1 39 | # print(num_ds, ds["src"]) 40 | # num_src[ds["src"]] += 1 41 | time_span = time.time() - start 42 | print(num_ds) 43 | print(dict(num_src)) 44 | print(f"Time (ins/s): {num_ds / time_span:.2f}" "") 45 | 46 | """ 47 | block_size: -1 48 | {'en_arxiv': 400, 'en_c4': 214} 49 | Time (ins/s): 64.05 50 | 51 | block_size: 2048 52 | Time (ins/s): 59.94 53 | """ 54 | 55 | 56 | if __name__ == "__main__": 57 | test_load_streaming_datasets() 58 | -------------------------------------------------------------------------------- /tests/entrypoint/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/entrypoint/__init__.py -------------------------------------------------------------------------------- /tests/entrypoint/test_conn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | 4 | import torch 5 | import torch.distributed as dist 6 | import torch.nn as nn 7 | 8 | # from accelerate import Accelerator 9 | 10 | 11 | def test_connection(): 12 | string = f"{socket.gethostname()} - MASTER_ADDR: {os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']} - WORLD_SIZE: {os.environ['WORLD_SIZE']} - RANK: {os.environ['RANK']}" 13 | print(string) 14 | dist.init_process_group("nccl") 15 | # ac = Accelerator() 16 | m = nn.Linear(5, 10) 17 | m = nn.parallel.DistributedDataParallel(m, device_ids=[dist.get_rank()]) 18 | # m = ac.prepare_model(m) 19 | x = torch.randn(3, 5, device=m.device) 20 | y = m(x) 21 | # dist.all_reduce(y, op=dist.ReduceOp.SUM) 22 | assert y.shape == (3, 10) 23 | # print(f"Done - local: {ac.local_process_index} - rank: {ac.process_index} - world: {ac.num_processes}") 24 | print( 25 | f"Done - {socket.gethostname()} - local: {os.environ['LOCAL_RANK']} - rank: {dist.get_rank()} - world: {dist.get_world_size()}" 26 | ) 27 | 28 | 29 | if __name__ == "__main__": 30 | test_connection() 31 | -------------------------------------------------------------------------------- /tests/models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/test_noise_moe.py: -------------------------------------------------------------------------------- 1 | import types 2 | 3 | import torch 4 | 5 | from smoe.modules.moe.moe_layers import LinearGLUMoELayer 6 | from smoe.utils.model_operation.change_llama_moe_forward import ( 7 | forward_topk_balanced_noisy_gate_with_random_expert_selection, 8 | ) 9 | from smoe.utils.seed import set_seed 10 | 11 | 12 | def test_noise_moe(): 13 | input_size = 128 14 | hidden_size = 4096 15 | output_size = 128 16 | hidden_act = "silu" 17 | num_experts = 16 18 | num_selects = 16 19 | size_experts = None 20 | bias = True 21 | 22 | gating_config = { 23 | "gate_type": "TopKBalancedNoisyGate", 24 | "gate_network": "mlp", 25 | "gate_use_softmax": True, 26 | "gate_use_balance": True, 27 | "gate_balance_loss_weight": 0.01, 28 | "gate_add_noise": True, 29 | "gate_noise_epsilon": 1e-2, 30 | } 31 | 32 | calculator_config = { 33 | "calculator_type": "UniversalCalculator", 34 | "multiply_gate_scores": False, 35 | "score_scale_factor": 1.0, 36 | } 37 | 38 | layer = LinearGLUMoELayer( 39 | input_size=input_size, 40 | hidden_size=hidden_size, 41 | output_size=output_size, 42 | hidden_act=hidden_act, 43 | num_experts=num_experts, 44 | num_selects=num_selects, 45 | size_experts=size_experts, 46 | bias=bias, 47 | **gating_config, 48 | **calculator_config, 49 | ) 50 | 51 | batch_size = 64 52 | 53 | layer.gate.forward = types.MethodType( 54 | forward_topk_balanced_noisy_gate_with_random_expert_selection, layer.gate 55 | ) 56 | set_seed(0) 57 | 58 | input = torch.rand((batch_size, input_size)) 59 | output = layer(input) 60 | 61 | 62 | if __name__ == "__main__": 63 | test_noise_moe() 64 | -------------------------------------------------------------------------------- /tests/models/test_noise_moe_residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from smoe.modules.moe_residual.moe_residual_layers import LinearGLUMoEResidualLayer 4 | 5 | 6 | def test_noise_moe_residual(): 7 | input_size = 4096 8 | hidden_size = 688 * 14 9 | output_size = 4096 10 | hidden_act = "silu" 11 | num_experts = 14 12 | num_selects = 2 13 | size_experts = None 14 | bias = True 15 | 16 | num_experts_residual = 2 17 | size_experts_residual = None # 688 18 | score_scale_factor_residual = 8.0 19 | use_weighting = False 20 | 21 | gating_config = { 22 | "gate_type": "TopKBalancedNoisyGate", 23 | "gate_network": "mlp", 24 | "gate_use_softmax": True, 25 | "gate_use_balance": True, 26 | "gate_balance_loss_weight": 0.01, 27 | "gate_add_noise": True, 28 | "gate_noise_epsilon": 0.01, 29 | } 30 | 31 | calculator_config = { 32 | "calculator_type": "UniversalCalculator", 33 | "multiply_gate_scores": True, 34 | "score_scale_factor": 8.0, 35 | } 36 | 37 | layer = LinearGLUMoEResidualLayer( 38 | input_size=input_size, 39 | hidden_size=hidden_size, 40 | output_size=output_size, 41 | hidden_act=hidden_act, 42 | num_experts=num_experts, 43 | num_selects=num_selects, 44 | size_experts=size_experts, 45 | bias=bias, 46 | num_experts_residual=num_experts_residual, 47 | size_experts_residual=size_experts_residual, 48 | score_scale_factor_residual=score_scale_factor_residual, 49 | use_weighting=use_weighting, 50 | 
**gating_config, 51 | **calculator_config, 52 | ) 53 | 54 | batch_size = 64 55 | 56 | input = torch.rand((batch_size, input_size)) 57 | output = layer(input) 58 | 59 | 60 | if __name__ == "__main__": 61 | test_noise_moe_residual() 62 | -------------------------------------------------------------------------------- /tests/models/test_switch_moe.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from smoe.modules.moe.moe_layers import LinearGLUMoELayer 4 | 5 | 6 | def test_switch_moe(): 7 | input_size = 128 8 | hidden_size = 4096 9 | output_size = 128 10 | hidden_act = "silu" 11 | num_experts = 16 12 | num_selects = 1 13 | size_experts = None 14 | bias = True 15 | 16 | gating_config = { 17 | "gate_type": "SwitchBalancedGate", 18 | "gate_network": "mlp", 19 | "gate_use_softmax": True, 20 | "gate_use_balance": True, 21 | "gate_balance_loss_weight": 0.01, 22 | "gate_add_noise": True, 23 | } 24 | 25 | calculator_config = { 26 | "calculator_type": "SwitchDropTokenCalculator", 27 | "multiply_gate_scores": True, 28 | "score_scale_factor": 1.0, 29 | "drop_tokens": True, 30 | "capacity_factor": 1.25, 31 | } 32 | 33 | layer = LinearGLUMoELayer( 34 | input_size=input_size, 35 | hidden_size=hidden_size, 36 | output_size=output_size, 37 | hidden_act=hidden_act, 38 | num_experts=num_experts, 39 | num_selects=num_selects, 40 | size_experts=size_experts, 41 | bias=bias, 42 | **gating_config, 43 | **calculator_config, 44 | ) 45 | 46 | batch_size = 64 47 | 48 | input = torch.rand((batch_size, input_size)) 49 | output = layer(input) 50 | 51 | 52 | if __name__ == "__main__": 53 | test_switch_moe() 54 | -------------------------------------------------------------------------------- /tests/models/test_switch_moe_residual.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from smoe.modules.moe_residual.moe_residual_layers import LinearGLUMoEResidualLayer 4 | 5 | 6 | def test_switch_moe_residual(): 7 | input_size = 4096 8 | hidden_size = 688 * 13 9 | output_size = 4096 10 | hidden_act = "silu" 11 | num_experts = 13 12 | num_selects = 1 13 | size_experts = None 14 | bias = True 15 | 16 | num_experts_residual = 3 17 | size_experts_residual = None # 688 18 | score_scale_factor_residual = 12.0 19 | use_weighting = False 20 | 21 | gating_config = { 22 | "gate_type": "SwitchBalancedGate", 23 | "gate_network": "mlp", 24 | "gate_use_softmax": True, 25 | "gate_use_balance": True, 26 | "gate_balance_loss_weight": 0.01, 27 | "gate_add_noise": False, 28 | } 29 | 30 | calculator_config = { 31 | "calculator_type": "SwitchDropTokenCalculator", 32 | "multiply_gate_scores": True, 33 | "score_scale_factor": 4.0, 34 | "drop_tokens": True, 35 | "capacity_factor": 1.25, 36 | } 37 | 38 | layer = LinearGLUMoEResidualLayer( 39 | input_size=input_size, 40 | hidden_size=hidden_size, 41 | output_size=output_size, 42 | hidden_act=hidden_act, 43 | num_experts=num_experts, 44 | num_selects=num_selects, 45 | size_experts=size_experts, 46 | bias=bias, 47 | num_experts_residual=num_experts_residual, 48 | size_experts_residual=size_experts_residual, 49 | score_scale_factor_residual=score_scale_factor_residual, 50 | use_weighting=use_weighting, 51 | **gating_config, 52 | **calculator_config, 53 | ) 54 | 55 | batch_size = 64 56 | 57 | input = torch.rand((batch_size, input_size)) 58 | output = layer(input) 59 | 60 | 61 | if __name__ == "__main__": 62 | test_switch_moe_residual() 63 | 
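# NOTE (added remark, hedged): the MoE-layer tests above (noise / switch, plain / residual)
# only verify that layer construction and a forward pass run without raising; nothing is
# asserted about `output`. Assuming the layer returns the combined hidden states as its
# first element, a minimal sanity check could look like:
#     out = output[0] if isinstance(output, (tuple, list)) else output
#     assert out.shape == (batch_size, output_size)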
-------------------------------------------------------------------------------- /tests/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/modules/__init__.py -------------------------------------------------------------------------------- /tests/modules/test_hook.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | 5 | # fmt: off 6 | def backward_hook(module, grad_in, grad_out): 7 | print(module.name, "grad_in", len(grad_in), [grad_in[i].shape if grad_in[i] is not None else None for i in range(len(grad_in))], grad_in, sep='\n') 8 | print(module.name, "grad_out", len(grad_out), [grad_out[i].shape if grad_out[i] is not None else None for i in range(len(grad_out))], grad_out, sep='\n') 9 | 10 | 11 | class Model(nn.Module): 12 | def __init__(self, input_dim, hidden_dim, output_dim): 13 | super().__init__() 14 | self.layer1 = nn.Linear(input_dim, hidden_dim, bias=False) 15 | self.layer2 = nn.Linear(hidden_dim, output_dim, bias=False) 16 | self.activation = nn.Sigmoid() 17 | 18 | self.layer1.name = "layer1" 19 | self.layer2.name = "layer2" 20 | 21 | self.layer1.register_backward_hook(backward_hook) 22 | self.layer2.register_backward_hook(backward_hook) 23 | 24 | def forward(self, x): 25 | z1 = self.layer1(x) 26 | z2 = self.layer2(z1) 27 | a2 = self.activation(z2) 28 | return a2 29 | 30 | 31 | def test_hook(): 32 | batch_size = 4 33 | input_dim = 128 34 | hidden_dim = 1024 35 | output_dim = 64 36 | 37 | model = Model(input_dim, hidden_dim, output_dim) 38 | loss_func = nn.MSELoss() 39 | 40 | x = torch.rand((batch_size, input_dim)) 41 | target = torch.rand((batch_size, output_dim)) 42 | 43 | y = model(x) 44 | loss = loss_func(y, target) 45 | loss.backward() 46 | 47 | print(model.layer1.weight.grad, model.layer1.weight.grad.shape) 48 | print(model.layer2.weight.grad, model.layer2.weight.grad.shape) 49 | 50 | 51 | if __name__ == "__main__": 52 | test_hook() 53 | -------------------------------------------------------------------------------- /tests/modules/test_hook_llama_mlp.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from transformers.models.llama.modeling_llama import LlamaMLP 4 | 5 | 6 | # fmt: off 7 | def backward_hook(module, grad_in, grad_out): 8 | print(module.name, "grad_in", len(grad_in), [grad_in[i].shape if grad_in[i] is not None else None for i in range(len(grad_in))], grad_in, sep='\n') 9 | print(module.name, "grad_out", len(grad_out), [grad_out[i].shape if grad_out[i] is not None else None for i in range(len(grad_out))], grad_out, sep='\n') 10 | 11 | 12 | class Config: 13 | def __init__(self): 14 | self.pretraining_tp = 1 15 | self.hidden_size = 128 16 | self.intermediate_size = 1024 17 | self.hidden_act = "silu" 18 | 19 | 20 | def test_hook_llama_mlp(): 21 | batch_size = 2 22 | seq_len = 4 23 | 24 | config = Config() 25 | model = LlamaMLP(config) 26 | 27 | model.up_proj.name = "up_proj" 28 | model.gate_proj.name = "gate_proj" 29 | model.down_proj.name = "down_proj" 30 | 31 | model.up_proj.register_backward_hook(backward_hook) 32 | model.gate_proj.register_backward_hook(backward_hook) 33 | model.down_proj.register_backward_hook(backward_hook) 34 | 35 | loss_func = nn.MSELoss() 36 | 37 | x = torch.rand((batch_size * seq_len, config.hidden_size)) 38 | target = 
torch.rand((batch_size * seq_len, config.hidden_size)) 39 | 40 | # Wrong "grad_in" and "grad_out" will be captured when using inputs with (batch_size, seq_len, *) format ! 41 | ################################################################# 42 | # x = torch.rand((batch_size, seq_len, config.hidden_size)) 43 | # target = torch.rand((batch_size, seq_len, config.hidden_size)) 44 | ################################################################# 45 | 46 | y = model(x) 47 | loss = loss_func(y, target) 48 | loss.backward() 49 | 50 | print(model.up_proj.name, "grad", model.up_proj.weight.grad, model.up_proj.weight.grad.shape, sep='\n') 51 | print(model.gate_proj.name, "grad", model.gate_proj.weight.grad, model.gate_proj.weight.grad.shape, sep='\n') 52 | print(model.down_proj.name, "grad", model.down_proj.weight.grad, model.down_proj.weight.grad.shape, sep='\n') 53 | 54 | 55 | if __name__ == "__main__": 56 | test_hook_llama_mlp() 57 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/test_gumble.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def gumbel_rsample(shape): 5 | one = torch.tensor(1.0) 6 | zero = torch.tensor(0.0) 7 | gumbel = torch.distributions.gumbel.Gumbel(zero, one).rsample 8 | return gumbel(shape) 9 | 10 | 11 | def test_gumble(): 12 | shape = (16, 16) 13 | gumbel = gumbel_rsample(shape) * 0.01 14 | print(gumbel) 15 | 16 | normal = torch.randn(shape) * 0.01 17 | print(normal) 18 | 19 | 20 | if __name__ == "__main__": 21 | test_gumble() 22 | -------------------------------------------------------------------------------- /tests/utils/test_logging.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from smoe.utils.logging import get_logger 4 | 5 | 6 | def err_func(): 7 | return 1 / 0 8 | 9 | 10 | def test_log(): 11 | logger = get_logger("test") # noqa: F841 12 | 13 | 14 | def test_err_func(): 15 | with pytest.raises(ZeroDivisionError): 16 | res = err_func() # noqa: F841 17 | -------------------------------------------------------------------------------- /tests/utils/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pjlab-sys4nlp/llama-moe/b17aff436cce58e2fcd7327789c7fadafe15d19e/tests/utils/visualization/__init__.py -------------------------------------------------------------------------------- /tests/utils/visualization/test_expert_load.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from smoe.utils.visualization.visualize import ( 7 | visualize_expert_load_barv, 8 | visualize_expert_load_heatmap, 9 | ) 10 | 11 | 12 | def test_visualization_expert_load_heatmap(): 13 | load_sum = np.random.rand(16) 14 | visualize_expert_load_heatmap( 15 | load_sum, 16 | layer_idx=0, 17 | dataset_name="test", 18 | shape=(4, 4), 19 | save_dir=tempfile.mktemp(), 20 | ) 21 | load_sum = np.random.randint(0, 5, size=(16,)) 22 | visualize_expert_load_heatmap( 23 | load_sum, 24 | layer_idx=0, 25 | dataset_name="test", 26 | shape=(4, 
4), 27 | save_dir=tempfile.mktemp(), 28 | ) 29 | with pytest.raises(ValueError): 30 | visualize_expert_load_heatmap( 31 | load_sum, 32 | layer_idx=0, 33 | dataset_name="test", 34 | shape=(4, 4), 35 | save_dir=".gitignore", 36 | ) 37 | 38 | 39 | def test_visualization_expert_load_barv(): 40 | load_sum = np.random.rand(16) 41 | visualize_expert_load_barv( 42 | load_sum, 43 | layer_idx=0, 44 | dataset_name="test", 45 | y_max=10, 46 | x_label="experts", 47 | save_dir=tempfile.mktemp(), 48 | ) 49 | with pytest.raises(ValueError): 50 | visualize_expert_load_barv( 51 | load_sum, 52 | layer_idx=0, 53 | dataset_name="test", 54 | y_max=10, 55 | x_label="experts", 56 | save_dir=".gitignore", 57 | ) 58 | -------------------------------------------------------------------------------- /tools/check_killed.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | from collections import Counter, defaultdict 4 | from pathlib import Path 5 | 6 | 7 | def get_jobstate(job_id): 8 | cmd = f"sacct -j {job_id} -o state -n" 9 | p = subprocess.Popen( 10 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT 11 | ) 12 | ret = p.stdout.read().decode("utf8").strip() 13 | return ret 14 | 15 | 16 | def get_data_type_and_part_id(filepath): 17 | path = Path(filepath) 18 | obj = re.search(r"tokenize-(.*?)-part-(\d+).log", path.name) 19 | if obj is None: 20 | return None 21 | data_type, part_id = obj.groups() 22 | return data_type, part_id 23 | 24 | 25 | def check_result(filepath): 26 | path = Path(filepath) 27 | ret = get_data_type_and_part_id(filepath) 28 | if ret is None: 29 | return None 30 | data_type, part_id = ret 31 | content = path.read_text(encoding="utf8") 32 | 33 | if ( 34 | "srun: error: Unable to allocate resources: Reach max user active rpc limit" 35 | in content 36 | or "srun: error: Unable to allocate resources: Socket timed out on send/recv operation" 37 | in content 38 | ): 39 | print(f"Error: {data_type}/{part_id}") 40 | return "error" 41 | 42 | obj = re.search(r"srun: job (\d+) queued and waiting for resources", content) 43 | if obj is None: 44 | print(f"Unknown: {data_type}/{part_id}") 45 | return "unknown" 46 | 47 | job_id = obj.group(1) 48 | jobstate = get_jobstate(job_id) 49 | obj = re.search(r"Tokenization Progress:\s*100%\s*\|.*\|\s*(\d+)/(\d+)", content) 50 | if obj is not None: 51 | progress, total = obj.groups() 52 | if ( 53 | progress == total 54 | and progress is not None 55 | and total is not None 56 | and jobstate != "COMPLETED" 57 | ): 58 | print(f"DEAD_COMPLETED: {data_type}/{part_id} - job: {job_id}") 59 | return "DEAD_COMPLETED" 60 | 61 | print(f"{jobstate}: {data_type}/{part_id}") 62 | return jobstate 63 | 64 | 65 | if __name__ == "__main__": 66 | status = defaultdict(list) 67 | for filepath in Path("logs").glob("tokenize-*.log"): 68 | s = check_result(filepath) 69 | res = get_data_type_and_part_id(filepath) 70 | status[s].append(res) 71 | 72 | print(Counter({k: len(v) for k, v in status.items()}).most_common()) 73 | 74 | def print_val(v, k): 75 | print(f"# {k} = {len(v[k])}") 76 | for path in v[k]: 77 | print(path) 78 | 79 | for key in ["CANCELLED+", "DEAD_COMPLETED", "error", None]: 80 | print_val(status, key) 81 | -------------------------------------------------------------------------------- /tools/cp_files.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | 5 | from tqdm import tqdm 6 | 7 | 8 | def 
copy_files(src_folder: str, dest_folder: str): 9 | src_folder = Path(src_folder) 10 | dest_folder = Path(dest_folder) 11 | dest_folder.mkdir(parents=True, exist_ok=True) 12 | files = src_folder.glob("**/*.jsonl") 13 | for file in tqdm(files): 14 | dest_file = dest_folder / file.name 15 | if not dest_file.exists(): 16 | # print(str(file), str(dest_file)) 17 | # shutil.copy2(str(file), str(dest_file)) 18 | # link the file to dest_folder 19 | # os.link(str(file), str(dest_file)) 20 | os.symlink(str(file), str(dest_file)) 21 | 22 | 23 | if __name__ == "__main__": 24 | # copy_files( 25 | # "/mnt/petrelfs/share_data/quxiaoye/SlimPajama-fluency-processed/c4_split_fluency/", 26 | # "/mnt/petrelfs/share_data/quxiaoye/SlimPajama-fluency-processed-agg/en_c4/" 27 | # ) 28 | for domain in [ 29 | "en_book", 30 | "en_c4", 31 | "en_cc", 32 | "en_arxiv", 33 | "en_wikipedia", 34 | "en_stack", 35 | "github", 36 | ]: 37 | copy_files( 38 | f"/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_mistral_middle_parts/{domain}", 39 | f"/mnt/petrelfs/share_data/zhutong/data/slimpajama_fluency_mistral/{domain}", 40 | ) 41 | -------------------------------------------------------------------------------- /tools/listen.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | 4 | from smoe.utils.notification import send_to_wechat 5 | 6 | 7 | def check_sme_pending(): 8 | # run sme | grep "normal PD" | wc -l, if the returned value is 0, then send a notification 9 | cmd = "squeue --me | grep 'normal PD' | wc -l" 10 | p = subprocess.Popen( 11 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT 12 | ) 13 | for line in p.stdout.readlines(): 14 | line = line.decode("utf-8") 15 | if int(line) == 0: 16 | send_to_wechat("pending jobs all clear!!!") 17 | return True 18 | return False 19 | 20 | 21 | def check_sme_running(): 22 | # run sme | grep "normal R" | wc -l, if the returned value is 0, then send a notification 23 | cmd = "squeue --me | grep 'normal R' | wc -l" 24 | p = subprocess.Popen( 25 | cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT 26 | ) 27 | for line in p.stdout.readlines(): 28 | line = line.decode("utf-8") 29 | if int(line) == 0: 30 | send_to_wechat("running jobs all clear!!!") 31 | return True 32 | return False 33 | 34 | 35 | def listen(): 36 | # check pending jobs every 10 seconds, if all pending jobs are done, send a notification 37 | no_pending = False 38 | no_running = False 39 | while True: 40 | if not no_pending: 41 | no_pending = check_sme_pending() 42 | time.sleep(10) 43 | if not no_running: 44 | no_running = check_sme_running() 45 | time.sleep(10) 46 | 47 | 48 | if __name__ == "__main__": 49 | listen() 50 | -------------------------------------------------------------------------------- /tools/scl_jobs.sh: -------------------------------------------------------------------------------- 1 | # scancel from the list below 2 | 3 | list=( 4 | "2384204" 5 | "2384206" 6 | "2384207" 7 | "2384208" 8 | "2384209" 9 | "2384210" 10 | "2384211" 11 | "2384213" 12 | "2384215" 13 | "2384216" 14 | "2384217" 15 | "2384218" 16 | "2384220" 17 | "2384221" 18 | "2384222" 19 | "2384223" 20 | "2384226" 21 | "2384228" 22 | "2384230" 23 | "2384231" 24 | "2384233" 25 | "2384234" 26 | "2384264" 27 | "2384262" 28 | "2384261" 29 | "2384259" 30 | "2384257" 31 | "2384255" 32 | "2384253" 33 | "2384251" 34 | "2384249" 35 | "2384244" 36 | "2384242" 37 | "2384240" 38 | "2384238" 39 | "2384236" 40 | ) 41 | 42 | for i in "${list[@]}" 43 
| do 44 | scancel $i 45 | done 46 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore= 3 | # line length 4 | E501, 5 | # whitespace before ':' 6 | E203, 7 | # line break before binary operator 8 | W503 9 | exclude = 10 | # No need to traverse our git directory 11 | .git, 12 | # There's no value in checking cache directories 13 | __pycache__, 14 | # This contains our built documentation 15 | build, 16 | # This contains builds of flake8 that we don't want to check 17 | dist, 18 | bak, 19 | data, 20 | outputs, 21 | debug.py 22 | --------------------------------------------------------------------------------
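A minimal usage sketch (not a file from the repository) for readers who want to try a converted LLaMA-MoE checkpoint with the classes collected above. It assumes that `smoe.models.llama_moe` exports `LlamaMoEForCausalLM` (by analogy with the `llama_moe_residual` package's `__init__` shown earlier) and uses a placeholder checkpoint path; the repository's own runnable examples live under `smoe/entrypoint/examples/` and `scripts/examples/`.

```python
import torch
from transformers import AutoTokenizer

# Assumption: LlamaMoEForCausalLM is exported by smoe.models.llama_moe,
# mirroring the llama_moe_residual __init__ shown above.
from smoe.models.llama_moe import LlamaMoEForCausalLM

model_dir = "/path/to/converted/llama-moe"  # placeholder path to a converted checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = LlamaMoEForCausalLM.from_pretrained(model_dir, torch_dtype=torch.bfloat16)
model.eval()

# generate a short continuation to confirm the MoE forward pass works end to end
inputs = tokenizer("Suzhou is famous for", return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```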