├── mlfs ├── mlfstest │ ├── .gitignore │ ├── go.work │ ├── go.mod │ ├── Makefile │ ├── cmd │ │ └── mlfstest-tf-imagenet │ │ │ └── lib.go │ └── go.sum ├── docker │ ├── ubuntu │ │ ├── 1804 │ │ │ ├── tf.tag.txt │ │ │ └── sources.list │ │ ├── 2004 │ │ │ ├── tag.txt │ │ │ ├── tf.tag.txt │ │ │ └── Dockerfile │ │ └── 2204 │ │ │ ├── tag.txt │ │ │ └── Dockerfile │ ├── test-tf.py │ ├── docker-compose.yaml │ └── start.sh ├── vfs │ ├── note_test.go │ ├── dir_test.go │ ├── fiile_test.go │ ├── example.go │ ├── vfile │ │ ├── shard.go │ │ ├── link.go │ │ ├── buffer.go │ │ └── range.go │ ├── ufs │ │ ├── ufs_darwin.go │ │ └── ufs.go │ ├── utils_test.go │ ├── node.go │ ├── dir.go │ ├── path.go │ ├── tree_debug.go │ └── file.go ├── benchmarks │ ├── bench-http.sh │ ├── bench-fuse.sh │ ├── bench-tf-read.sh │ └── run.sh ├── .gitignore ├── test-daemon.sh ├── state │ └── state.go ├── tests │ └── data │ │ ├── imagenet.json │ │ └── squad1.json ├── etc │ ├── apt │ │ └── sources.list.d │ │ │ └── tenplex.list │ ├── mlfs │ │ ├── stop.sh │ │ └── mlfs.sh │ └── os │ │ └── linux │ │ └── mlfs.service ├── utils │ ├── error.go │ ├── text.go │ └── log.go ├── test-client.sh ├── scripts │ ├── get-go.sh │ ├── cache-squad1.sh │ ├── system-install.sh │ └── upload.sh ├── build-imagenet-index.sh ├── notes ├── ds │ ├── trds │ │ ├── trds_test.go │ │ └── example.go │ ├── imagenet.go │ ├── dataset.go │ ├── squad1.go │ └── mnist.go ├── build-squad-index.sh ├── uri │ ├── path.go │ ├── sas.go │ ├── monitor.go │ └── stat.go ├── cmd │ ├── mlfsd │ │ └── mlfsd.go │ ├── tests │ │ ├── cmd │ │ │ ├── test-fuse │ │ │ │ └── test-fuse.go │ │ │ ├── test-md5 │ │ │ │ └── test-md5.go │ │ │ ├── mlfs-test-dist │ │ │ │ └── mlfs-test-dist.go │ │ │ └── mlfs-debug │ │ │ │ └── mlfs-debug.go │ │ └── mlfs-test-upload │ │ │ └── mlfs-test-upload.go │ ├── mlfs-build-tf-index │ │ └── mlfs-build-tf-index.go │ ├── mlfs-edit-index │ │ └── mlfs-edit-index.go │ ├── mlfs-download │ │ └── mlfs-download.go │ └── mlfs-check-index │ │ └── 
mlfs-check-index.go ├── mlfs │ ├── app.go │ ├── bitmap.go │ ├── tensorfile.go │ ├── dsidx.go │ └── replicate.go ├── build-cloud-index.sh ├── add-imagenet.sh ├── build-imagenet-md5sum.sh ├── add-enwiki-numpy.sh ├── closer │ └── closer.go ├── test-numpy.sh ├── par │ └── par.go ├── .vscode │ └── tasks.json ├── debug-p2p.sh ├── cache │ ├── stat.go │ └── memory.go ├── local-serve.sh ├── convert_index.py ├── .github │ └── workflows │ │ └── docker.yml ├── fuse │ └── fuse.go ├── local-ci.sh ├── www │ └── js │ │ └── bmp.js ├── bimap │ └── bimap.go ├── iotrace │ ├── io.go │ ├── report.go │ └── counter.go ├── hash │ ├── file.go │ └── md5.go ├── pid │ └── peer.go ├── iseq │ └── iseq.go ├── prefetch.sh ├── fsutil │ └── fsutil.go ├── README └── buildinfo │ └── buildinfo.go ├── tenplex-run ├── README.md ├── debug-ssh.sh ├── local_prepare.sh ├── local_clean.sh ├── scripts │ ├── install │ │ └── torch │ │ │ └── cpu.sh │ ├── read-zero-model-state.py │ └── read-zero-optimizer-state.py ├── create-vnet.sh ├── clean.sh ├── pull.sh ├── dbg │ └── dgb.go ├── listflag │ ├── listflag_test.go │ └── listflag.go ├── counter │ └── id.go ├── job │ ├── lib.go │ ├── hosts.go │ ├── params.go │ └── params_bert.go ├── timeout │ └── timeout.go ├── cancelgroup │ └── cancelgroup.go ├── web │ └── web.go ├── docker │ └── lib.go ├── cluster │ ├── cluster_test.go │ └── cluster.go ├── .github │ └── workflows │ │ └── go.yml ├── structflag │ └── structflag_test.go ├── runop │ ├── redundancy.go │ ├── dataset.go │ └── failure.go └── para_config │ └── schedule.go ├── benchmark ├── reconfiguration_horovod │ ├── tag.txt │ ├── add-imagenet.sh │ ├── README.md │ ├── Dockerfile │ ├── run.sh │ ├── train-imagenet.sh │ ├── with-docker │ └── logger.py ├── README.md ├── convergence_impact │ ├── requirements.txt │ ├── run.sh │ └── README.md ├── dynamic_resources │ ├── hosts.txt │ ├── pytorch-para-config.json │ ├── tenplex-para-config.json │ ├── README.md │ ├── pytorch-schedule.json │ └── tenplex-schedule.json ├── 
reconfiguration_cluster_size │ ├── README.md │ ├── tasks │ │ ├── add_group.yml │ │ └── pull_image.yml │ ├── scale-cluster.sh │ ├── config.sh │ ├── para-config-tp-4to8.json │ ├── para-config-tp-16to32.json │ ├── para-config-tp-8to16.json │ ├── docker.yml │ ├── list-ips.sh │ ├── schedule_16.json │ ├── schedule_8.json │ ├── schedule_32.json │ ├── tenplex.yml │ ├── para-config-dp.json │ ├── para-config-pp.json │ ├── upgrade.sh │ └── recreate-vmss.sh ├── redeployment │ ├── para-config.json │ ├── schedule.json │ ├── README.md │ └── run.sh ├── model_convergence │ ├── schedule-static.json │ ├── schedule-up.json │ ├── para-config-dp.json │ ├── para-config-pp.json │ ├── para-config-tp.json │ ├── schedule-down.json │ └── README.md ├── failure │ ├── schedule.json │ ├── para-config.json │ ├── README.md │ ├── run.sh │ └── plot.py ├── reconfiguration_parallelization │ ├── para-config-dp.json │ ├── para-config-pp.json │ ├── para-config-tp.json │ ├── schedule.json │ ├── README.md │ └── run.sh ├── reconfiguration │ ├── run.sh │ ├── stop.py │ └── README.md ├── common-cloud.sh ├── common.sh └── performance_impact │ ├── README.md │ └── run.sh ├── show-go-mod.sh ├── tests ├── requirements.txt ├── test_delete.py ├── test_load_http.py ├── test_save.py ├── dataset.py ├── test-tensor-file.py └── test_load.py ├── para_config ├── deepspeed │ ├── README.md │ └── layer_map.py └── megatron_lm │ ├── util.py │ ├── gen_para_config.sh │ ├── README.md │ └── rank_map.py ├── scheduler ├── .gitignore ├── scalepoint │ └── scalepoint.go ├── etc │ ├── tenplex │ │ ├── stop-scheduler.sh │ │ └── scheduler.sh │ └── os │ │ └── linux │ │ └── tenplex-scheduler.service ├── README ├── scripts │ ├── config.sh │ ├── scale-cluster.sh │ ├── list-ips.sh │ ├── gen-log-index.py │ ├── build-deb.sh │ ├── collect-logs.sh │ ├── list-ips-komodo.sh │ ├── upload.sh │ ├── install-mlfs.sh │ ├── plot.gp │ ├── upload-logs.sh │ └── recreate-vmss.sh ├── data │ ├── plan-1.json │ ├── plan-2.json │ ├── plan-3.json │ ├── trace.json │ ├── 
plan-komodo.json │ └── single-job-time.json ├── azure │ ├── run_scheduler.sh │ └── run_user.sh ├── logging │ └── logging.go ├── run_scheduler.sh ├── experiments │ ├── mlfs.go │ ├── lib.go │ └── experiments.go ├── run_user.sh ├── job │ └── job.go ├── scheduler │ └── scheduler_test.go ├── CMakeLists.txt ├── configserver │ └── configserver.go └── cmd │ ├── tenplex-user │ └── tenplex-user.go │ └── tenplex-scheduler │ └── tenplex-scheduler.go ├── state_transformer ├── build_docker.sh ├── statetransform │ ├── padding.go │ ├── padding_test.go │ ├── iter.go │ ├── lib.go │ ├── repartition_test.go │ └── replicate.go ├── meta │ ├── metadata.go │ ├── path.go │ ├── modelkeys.go │ ├── struct_test.go │ └── rankmap.go ├── lib │ └── lib.go ├── Dockerfile ├── mapslice │ └── mapslice_test.go ├── test_state_migrator.sh ├── run_state_migrator.sh ├── cmd │ └── tenplex-state-transformer │ │ └── tenplex-state-transformer.go └── search │ └── file-system.go ├── man └── man1 │ ├── mlfsd.1 │ └── mlfs.1 ├── .gitignore ├── run_test_load.sh ├── Dockerfile ├── tenplex ├── __init__.py ├── mlfs_path.py ├── arguments.py ├── stop.py └── save.py ├── Dockerfile-deepspeed ├── ansible ├── tenplex.yml ├── uninstall.yml └── install.yml ├── .github └── workflows │ ├── go.yml │ └── deb.yml ├── scripts └── pack.sh ├── go.mod ├── .azure └── release-pip.yml ├── azure-pipelines.yml ├── setup.py ├── tensor ├── tensor_test.go ├── dtypes.go └── concat.go ├── ipv4 └── detect.go ├── CMakeLists.txt ├── Makefile └── go.sum /mlfs/mlfstest/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /tenplex-run/README.md: -------------------------------------------------------------------------------- 1 | # elastique-controller -------------------------------------------------------------------------------- /mlfs/docker/ubuntu/2004/tag.txt: 
-------------------------------------------------------------------------------- 1 | kungfu.azurecr.io/mlfs-focal:snapshot 2 | -------------------------------------------------------------------------------- /mlfs/docker/ubuntu/2204/tag.txt: -------------------------------------------------------------------------------- 1 | kungfu.azurecr.io/mlfs-jammy:snapshot 2 | -------------------------------------------------------------------------------- /mlfs/mlfstest/go.work: -------------------------------------------------------------------------------- 1 | go 1.19 2 | 3 | use ( 4 | . 5 | .. 6 | ) 7 | -------------------------------------------------------------------------------- /mlfs/vfs/note_test.go: -------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | func isNode(i Node) {} 4 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_horovod/tag.txt: -------------------------------------------------------------------------------- 1 | reconfiguration_horovod:snapshot 2 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Artifact evaluation 2 | 3 | Run `run.sh` in every directory. 
4 | -------------------------------------------------------------------------------- /mlfs/docker/ubuntu/1804/tf.tag.txt: -------------------------------------------------------------------------------- 1 | kungfu.azurecr.io/mlfs-bionic-tf1.13.2:snapshot 2 | -------------------------------------------------------------------------------- /mlfs/docker/ubuntu/2004/tf.tag.txt: -------------------------------------------------------------------------------- 1 | kungfu.azurecr.io/mlfs-focal-tf1.13.2:snapshot 2 | -------------------------------------------------------------------------------- /show-go-mod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cat go.mod | head -n 1 | awk '{print $2}' 5 | -------------------------------------------------------------------------------- /tenplex-run/debug-ssh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | ./bin/elastique-test-ssh 6 | -------------------------------------------------------------------------------- /benchmark/convergence_impact/requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | matplotlib 3 | torch 4 | torchvision 5 | -------------------------------------------------------------------------------- /benchmark/dynamic_resources/hosts.txt: -------------------------------------------------------------------------------- 1 | 10.10.10.1 2 | 10.10.10.2 3 | 10.10.10.3 4 | 10.10.10.4 5 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'python3 -m pip install -r $0' 2 | pip 3 | torch 4 | -------------------------------------------------------------------------------- /mlfs/benchmarks/bench-http.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | ./bin/mlfs-test -port 19999 7 | -------------------------------------------------------------------------------- /mlfs/docker/test-tf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import tensorflow as tf 3 | 4 | 5 | print(tf.__version__) 6 | -------------------------------------------------------------------------------- /mlfs/.gitignore: -------------------------------------------------------------------------------- 1 | *.idx.txt 2 | *.log 3 | *.md5.txt 4 | *.tf_record 5 | /bin 6 | /build 7 | /tmp 8 | __pycache__ 9 | -------------------------------------------------------------------------------- /para_config/deepspeed/README.md: -------------------------------------------------------------------------------- 1 | # Generate parallelisation configuration 2 | 3 | Old files. Might not work anymore 4 | -------------------------------------------------------------------------------- /tenplex-run/local_prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | mkdir -p ~/.tenplex/bin 5 | sudo systemctl restart mlfs 6 | -------------------------------------------------------------------------------- /mlfs/benchmarks/bench-fuse.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | root=$HOME/mnt/efs 7 | ./bin/mlfs-test -mnt $root 8 | -------------------------------------------------------------------------------- /mlfs/test-daemon.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | ./bin/mlfs daemon -ctrl-port 9999 -http-port 9998 -mnt ./tmp 7 | -------------------------------------------------------------------------------- /mlfs/vfs/dir_test.go: 
-------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | import "testing" 4 | 5 | func Test_2(t *testing.T) { 6 | d := &dir{} 7 | isNode(d) 8 | } 9 | -------------------------------------------------------------------------------- /mlfs/vfs/fiile_test.go: -------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | import "testing" 4 | 5 | func Test_1(t *testing.T) { 6 | f := &file{} 7 | isNode(f) 8 | } 9 | -------------------------------------------------------------------------------- /scheduler/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.log 3 | *.pdf 4 | *.ps 5 | /build 6 | /logs 7 | /transformer-checkpoint 8 | bin 9 | run-id.txt 10 | -------------------------------------------------------------------------------- /tenplex-run/local_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker ps -f "name=trainer" -q | xargs docker stop 4 | sudo rm -r ~/.tenplex/training/* 5 | -------------------------------------------------------------------------------- /mlfs/state/state.go: -------------------------------------------------------------------------------- 1 | package state 2 | 3 | type ElasticState struct { 4 | InitProgres int64 5 | ClusterSize int 6 | Rank int 7 | } 8 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/README.md: -------------------------------------------------------------------------------- 1 | # Reconfiguration cluster sizes 2 | _Fig. 15. 
Reconfiguration time with different cluster sizes_ 3 | -------------------------------------------------------------------------------- /benchmark/redeployment/para-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "8": { 3 | "dp_size": 1, 4 | "pp_size": 2, 5 | "mp_size": 4 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /mlfs/tests/data/imagenet.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "imagenet", 3 | "index-url": "https://tenplex.blob.core.windows.net/data/imagenet.idx.txt" 4 | } 5 | -------------------------------------------------------------------------------- /mlfs/tests/data/squad1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "squad1", 3 | "index-url": "https://tenplex.blob.core.windows.net/data/squad1/squad1.idx.txt" 4 | } 5 | -------------------------------------------------------------------------------- /scheduler/scalepoint/scalepoint.go: -------------------------------------------------------------------------------- 1 | package scalepoint 2 | 3 | type ScalePoint struct { 4 | Time int `json:"time"` 5 | Size int `json:"size"` 6 | } 7 | -------------------------------------------------------------------------------- /scheduler/etc/tenplex/stop-scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | pid=$(pgrep -f /usr/bin/tenplex-scheduler) 5 | kill -9 $pid 6 | 7 | echo "killed $pid" 8 | -------------------------------------------------------------------------------- /state_transformer/build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | make 5 | 6 | ./Dockerfile 7 | 8 | docker push kungfu.azurecr.io/mw-megatron-lm-go:latest 9 | 
-------------------------------------------------------------------------------- /para_config/megatron_lm/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | def remove_dir(path: str): 5 | if os.path.exists(path): 6 | shutil.rmtree(path) 7 | -------------------------------------------------------------------------------- /scheduler/README: -------------------------------------------------------------------------------- 1 | tenplex scheduler 2 | 3 | Run local 4 | mlfs serve -index-url /data/megatron-lm/bert/openwebtext/npzs_seq1024/indices.txt -self 155.198.152.18 5 | -------------------------------------------------------------------------------- /man/man1/mlfsd.1: -------------------------------------------------------------------------------- 1 | .TH mlfsd 2 | 3 | .SH SYNOPSIS 4 | .B mlfsd 5 | 6 | .SY 7 | The mlfs daemon. 8 | .YS 9 | 10 | .SH SEE ALSO 11 | .BR mlfs-build-tf-index 12 | -------------------------------------------------------------------------------- /mlfs/etc/apt/sources.list.d/tenplex.list: -------------------------------------------------------------------------------- 1 | # deb https://tenplex.blob.core.windows.net/public/deb ./ 2 | 3 | deb https://europe-west2-apt.pkg.dev/projects/tenplex tenplex main 4 | -------------------------------------------------------------------------------- /mlfs/utils/error.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "log" 5 | "os" 6 | ) 7 | 8 | func ExitErr(err error) { 9 | log.Printf("%v", err) 10 | os.Exit(1) 11 | } 12 | -------------------------------------------------------------------------------- /mlfs/test-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | ./bin/mlfs mount -index-url http://155.198.152.18:20110/ -idx-name a -ctrl-port 9999 7 | ./bin/mlfs bench -mnt 
./tmp 8 | -------------------------------------------------------------------------------- /mlfs/vfs/example.go: -------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | func InitExample(r *Tree) { 4 | r.Mkdir(`/`) 5 | r.Mkdir(`/a`) 6 | r.Mkdir(`/a/b`) 7 | r.TouchText(`/a/b/c.txt`, "hello world\n") 8 | } 9 | -------------------------------------------------------------------------------- /man/man1/mlfs.1: -------------------------------------------------------------------------------- 1 | .TH mlfs 2 | 3 | .SH SYNOPSIS 4 | .B mlfs 5 | 6 | .SY 7 | The mlfs command line tool. 8 | .YS 9 | 10 | .SH SEE ALSO 11 | .BR mlfsd 12 | .BR mlfs-build-tf-index 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.log 3 | .DS_Store 4 | .mypy_cache 5 | training 6 | __pycache__ 7 | bin 8 | build 9 | events.out.tfevents.* 10 | *.npz 11 | *.pdf 12 | *.csv 13 | data 14 | *.pt 15 | -------------------------------------------------------------------------------- /mlfs/scripts/get-go.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | mkdir -p $HOME/local 5 | cd $HOME/local 6 | 7 | wget https://dl.google.com/go/go1.18.linux-amd64.tar.gz 8 | tar -xf go1.18.linux-amd64.tar.gz 9 | -------------------------------------------------------------------------------- /benchmark/model_convergence/schedule-static.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 8 5 | }, 6 | { 7 | "step": 200, 8 | "size": 0 9 | } 10 | ] 11 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/tasks/add_group.yml: -------------------------------------------------------------------------------- 1 | - name: add docker 
group 2 | become: true 3 | ansible.builtin.user: 4 | name: sospreviewer01 5 | groups: [docker] 6 | append: yes 7 | -------------------------------------------------------------------------------- /scheduler/scripts/config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | v100x1=Standard_NC6s_v3 4 | v100x2=Standard_NC12s_v3 5 | v100x4=Standard_NC24s_v3 6 | 7 | group=kungfu 8 | size=$v100x4 9 | name=tenplex-mw-v100x4 10 | -------------------------------------------------------------------------------- /mlfs/scripts/cache-squad1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | ./bin/mlfs-fetch -ctrl-port 20000 -file 'https://minddata.blob.core.windows.net/data/squad1/train.tf_record' -md5 67eb6da21920dda01ec75cd6e1a5b8d7 5 | -------------------------------------------------------------------------------- /tenplex-run/scripts/install/torch/cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python3 -m pip install torch==1.10.2+cpu torchvision==0.11.3+cpu torchaudio==0.10.2+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html 4 | -------------------------------------------------------------------------------- /mlfs/build-imagenet-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | list_tf_records() { 5 | ls /data/imagenet/records/train* | sort 6 | } 7 | 8 | mlfs-build-tf-index -m 16 -output imagenet.idx.txt $(list_tf_records) 9 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/scale-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | . 
./config.sh 5 | 6 | n="$1" 7 | az vmss scale -g $group -n $name --new-capacity $n -o table 8 | 9 | echo "scaled to $n" 10 | -------------------------------------------------------------------------------- /run_test_load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | export PYTHONPATH="$HOME/Elasticity/Repo/Megatron-LM" 6 | 7 | python test_load.py \ 8 | --device-rank 0 \ 9 | --mlfs-path "/mnt/mlfs/job/job-single" 10 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | v100x1=Standard_NC6s_v3 4 | v100x2=Standard_NC12s_v3 5 | v100x4=Standard_NC24s_v3 6 | 7 | group=kungfu 8 | size=$v100x4 9 | name=tenplex-mw-v100x4 10 | -------------------------------------------------------------------------------- /mlfs/etc/mlfs/stop.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | # https://superuser.com/questions/1146388/systemd-state-stop-sigterm-timed-out 5 | 6 | pid=$(pgrep -f /usr/bin/mlfsd) 7 | kill -9 $pid 8 | 9 | echo "killed $pid" 10 | -------------------------------------------------------------------------------- /para_config/megatron_lm/gen_para_config.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | python gen_para_config.py \ 5 | --model gpt \ 6 | --size large \ 7 | --precision fp16 \ 8 | --pp 1 \ 9 | --tp 2 \ 10 | --dp 2 11 | -------------------------------------------------------------------------------- /scheduler/scripts/scale-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0)/.. 5 | . 
./scripts/config.sh 6 | 7 | n="$1" 8 | az vmss scale -g $group -n $name --new-capacity $n -o table 9 | 10 | echo "scaled to $n" 11 | -------------------------------------------------------------------------------- /scheduler/scripts/list-ips.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0)/.. 5 | . ./scripts/config.sh 6 | 7 | az vmss nic list -g kungfu --vmss-name $name --query '[].ipConfigurations[0].privateIpAddress' -o table -o table | sed 1,2d 8 | -------------------------------------------------------------------------------- /tenplex-run/create-vnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | name=elastique 5 | 6 | docker network create --driver overlay --scope swarm --attachable elastique 7 | 8 | # TODO: extract Subnet from JSON 9 | docker network inspect $name 10 | -------------------------------------------------------------------------------- /scheduler/data/plan-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "steps": 100, 5 | "delay": 0 6 | }, 7 | { 8 | "steps": 100, 9 | "delay": 6 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/failure/schedule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 16 5 | }, 6 | { 7 | "step": 50, 8 | "size": 8 9 | }, 10 | { 11 | "step": 60, 12 | "size": 0 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /benchmark/redeployment/schedule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 8 5 | }, 6 | { 7 | "step": 50, 8 | "size": 8 9 | }, 10 | { 11 | "step": 60, 12 | "size": 0 13 | } 14 | ] 15 | 
-------------------------------------------------------------------------------- /mlfs/benchmarks/bench-tf-read.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64 5 | cd $(dirname $0)/.. 6 | export CUDA_VISIBLE_DEVICES=0 7 | 8 | ./benchmarks/tf_read.py --fake-data 1 9 | # ./benchmarks/tf_read.py 10 | -------------------------------------------------------------------------------- /benchmark/failure/para-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "16": { 3 | "dp_size": 2, 4 | "pp_size": 2, 5 | "mp_size": 4 6 | }, 7 | "8":{ 8 | "dp_size": 1, 9 | "pp_size": 2, 10 | "mp_size": 4 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/model_convergence/schedule-up.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 4 5 | }, 6 | { 7 | "step": 100, 8 | "size": 8 9 | }, 10 | { 11 | "step": 200, 12 | "size": 0 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /benchmark/model_convergence/para-config-dp.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 4, 4 | "pp_size": 1, 5 | "mp_size": 1 6 | }, 7 | "8": { 8 | "dp_size": 8, 9 | "pp_size": 1, 10 | "mp_size": 1 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/model_convergence/para-config-pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 1, 4 | "pp_size": 4, 5 | "mp_size": 1 6 | }, 7 | "8": { 8 | "dp_size": 1, 9 | "pp_size": 8, 10 | "mp_size": 1 11 | } 12 | } 13 | -------------------------------------------------------------------------------- 
/benchmark/model_convergence/para-config-tp.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 1, 4 | "pp_size": 1, 5 | "mp_size": 4 6 | }, 7 | "8": { 8 | "dp_size": 1, 9 | "pp_size": 1, 10 | "mp_size": 8 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/model_convergence/schedule-down.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 8 5 | }, 6 | { 7 | "step": 100, 8 | "size": 4 9 | }, 10 | { 11 | "step": 200, 12 | "size": 0 13 | } 14 | ] 15 | -------------------------------------------------------------------------------- /scheduler/scripts/gen-log-index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # import glob 3 | import sys 4 | 5 | 6 | def main(args): 7 | for f in args: 8 | a = "
  • %s
  • " % (f, f) 9 | print(a) 10 | 11 | 12 | main(sys.argv[1:]) 13 | -------------------------------------------------------------------------------- /benchmark/dynamic_resources/pytorch-para-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "8": { 3 | "dp_size": 1, 4 | "pp_size": 4, 5 | "mp_size": 2 6 | }, 7 | "16": { 8 | "dp_size": 2, 9 | "pp_size": 4, 10 | "mp_size": 2 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /para_config/megatron_lm/README.md: -------------------------------------------------------------------------------- 1 | # Generate parallelisation configuration 2 | 3 | ## Example 4 | ```py 5 | python gen_para_config.py \ 6 | --model gpt \ 7 | --size large \ 8 | --precision fp16 \ 9 | --pp 1 \ 10 | --tp 2 \ 11 | --dp 2 12 | ``` 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'docker build --rm -t kungfu.azurecr.io/mw-megatron-lm-23.06-tenplex:latest -f $0 .' 2 | 3 | FROM kungfu.azurecr.io/mw-megatron-lm-23.06:latest 4 | 5 | # Tenplex 6 | ADD . /workspace/tenplex 7 | RUN cd /workspace/tenplex && \ 8 | pip install . 
9 | -------------------------------------------------------------------------------- /tenplex-run/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | HOSTS="155.198.152.18 155.198.152.19 155.198.152.23" 4 | 5 | for host in $HOSTS; do 6 | ssh $host "docker ps -f \"name=trainer\" -q | xargs docker stop" & 7 | ssh $host "sudo rm -r ~/.tenplex/training/*" & 8 | done 9 | 10 | wait 11 | -------------------------------------------------------------------------------- /benchmark/convergence_impact/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | python -u mnist.py 2>&1 | tee mnist.log 5 | python -u mnist.py --inconsistent-dataset 2>&1 | tee inconsistent_dataset.log 6 | python -u mnist_batch_size.py 2>&1 | tee inconsistent_batch_size.log 7 | 8 | python plot.py 9 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/para-config-tp-4to8.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 1, 4 | "pp_size": 1, 5 | "mp_size": 4 6 | }, 7 | "8": { 8 | "dp_size": 1, 9 | "pp_size": 1, 10 | "mp_size": 8 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_parallelization/para-config-dp.json: -------------------------------------------------------------------------------- 1 | { 2 | "8": { 3 | "dp_size": 1, 4 | "pp_size": 2, 5 | "mp_size": 4 6 | }, 7 | "16": { 8 | "dp_size": 2, 9 | "pp_size": 2, 10 | "mp_size": 4 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_parallelization/para-config-pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "8": { 3 | "dp_size": 1, 4 | "pp_size": 2, 5 | "mp_size": 4 6 | }, 7 | "16": { 8 | 
"dp_size": 1, 9 | "pp_size": 4, 10 | "mp_size": 4 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_parallelization/para-config-tp.json: -------------------------------------------------------------------------------- 1 | { 2 | "8": { 3 | "dp_size": 1, 4 | "pp_size": 2, 5 | "mp_size": 4 6 | }, 7 | "16": { 8 | "dp_size": 1, 9 | "pp_size": 2, 10 | "mp_size": 8 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /mlfs/notes: -------------------------------------------------------------------------------- 1 | 7.68TB Samsung PM883 2.5" Enterprise SSD, SATA 3 (RAID 6) 2 | 3 | 64 parallel read: 8.57 GiB/s 4 | 5 | 6 | sequential read speed, 550MB/s 7 | 8 | 9 | tmpfs 10 | 11 | 17.07 GiB/s 12 | 13 | Samsung 980 PRO 500GB M.2 PCIe 4.0 NVMe SSD (Mirrored) 14 | 16.92 GiB/s 15 | -------------------------------------------------------------------------------- /tenplex/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | """ 3 | 4 | try: 5 | from .load import load, load_http 6 | from .save import save 7 | from .stop import check_stop 8 | except: 9 | # When torch is not installed 10 | # ModuleNotFoundError: No module named 'torch' 11 | pass 12 | -------------------------------------------------------------------------------- /benchmark/reconfiguration/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | sudo rm -rf /mnt/k1d2/ckpt/* 5 | python -u training.py 2>&1 | tee run_scale_down.log 6 | 7 | # sudo rm -rf /mnt/k1d2/ckpt/* 8 | # python -u training.py --scale-up 2>&1 | tee run_scale_up.log 9 | 10 | # python plot.py 11 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/para-config-tp-16to32.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "16": { 3 | "dp_size": 2, 4 | "pp_size": 4, 5 | "mp_size": 2 6 | }, 7 | "32": { 8 | "dp_size": 2, 9 | "pp_size": 4, 10 | "mp_size": 4 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/para-config-tp-8to16.json: -------------------------------------------------------------------------------- 1 | { 2 | "8": { 3 | "dp_size": 1, 4 | "pp_size": 4, 5 | "mp_size": 2 6 | }, 7 | "16": { 8 | "dp_size": 1, 9 | "pp_size": 4, 10 | "mp_size": 4 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /mlfs/etc/mlfs/mlfs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | flags() { 5 | echo -http-port 19999 6 | echo -ctrl-port 20010 7 | echo -mnt /mnt/mlfs 8 | echo -tmp /tmp/mlfs 9 | echo -su 10 | echo -log-req 11 | } 12 | 13 | /usr/bin/mlfsd $(flags) 14 | 15 | echo "$0 stopped" 16 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/docker.yml: -------------------------------------------------------------------------------- 1 | # ansible-playbook -i hosts.txt ansible/tenplex.yml 2 | 3 | - name: pull image 4 | hosts: all 5 | 6 | tasks: 7 | - import_tasks: ./tasks/pull_image.yml 8 | 9 | #- debug: msg="{{ log.stdout }}" 10 | #- debug: msg="{{ log.stderr }}" 11 | -------------------------------------------------------------------------------- /Dockerfile-deepspeed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'docker build --rm -t kungfu.azurecr.io/mw-deepspeed-tenplex:latest -f $0 .' 2 | 3 | FROM kungfu.azurecr.io/mw-deepspeed:latest 4 | 5 | USER root 6 | 7 | # Tenplex 8 | ADD . /workspace/tenplex 9 | RUN cd /workspace/tenplex && \ 10 | pip install . 
11 | -------------------------------------------------------------------------------- /mlfs/mlfstest/go.mod: -------------------------------------------------------------------------------- 1 | module mlfstest 2 | 3 | go 1.19 4 | 5 | require github.com/lgarithm/proc v0.3.2-0.20221205141105-3ebbaa57acfd 6 | 7 | require ( 8 | golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9 // indirect 9 | golang.org/x/sys v0.0.0-20190412213103-97732733099d // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /tenplex-run/pull.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | HOSTS="10.0.0.5 10.0.0.6 10.0.0.8 10.0.0.9" 5 | JOBID="cd1e6f634c" 6 | 7 | mkdir -p ~/.tenplex/training/$JOBID 8 | 9 | for host in $HOSTS; do 10 | scp -r $host:~/.tenplex/training/$JOBID/* ~/.tenplex/training/$JOBID & 11 | done 12 | 13 | wait 14 | -------------------------------------------------------------------------------- /mlfs/vfs/vfile/shard.go: -------------------------------------------------------------------------------- 1 | package vfile 2 | 3 | import "github.com/kungfu-team/tenplex/mlfs/iseq" 4 | 5 | func (f IndexedFiles) Shard(i, n int) *vfile { 6 | seq := iseq.Seq(iseq.Iota(f.NumRange())) 7 | seq = seq.Shard(i, n) 8 | return &vfile{ 9 | ranges: f.NamedRanges().Select(seq.Get()), 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /mlfs/ds/trds/trds_test.go: -------------------------------------------------------------------------------- 1 | package trds 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func Test_1(t *testing.T) { 9 | xs := []int{2, 2, 2, 2, 2} 10 | ps := groupIntList(xs) 11 | fmt.Printf("%v\n", ps) 12 | if len(ps) != 1 { 13 | t.Fail() 14 | } 15 | if ps[0].Second != 5 { 16 | t.Fail() 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /ansible/tenplex.yml: 
-------------------------------------------------------------------------------- 1 | # ansible-playbook -i hosts.txt ansible/tenplex.yml 2 | 3 | - name: install tenplex 4 | hosts: all 5 | remote_user: kungfu 6 | 7 | tasks: 8 | # - import_tasks: ./uninstall.yml 9 | - import_tasks: ./install.yml 10 | 11 | - debug: msg="{{ log.stdout }}" 12 | - debug: msg="{{ log.stderr }}" 13 | -------------------------------------------------------------------------------- /mlfs/build-squad-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0) 5 | 6 | list_squad_records() { 7 | if [ $(uname) = "Darwin" ]; then 8 | echo $HOME/squad1_train.tf_record 9 | else 10 | echo /data/squad1/train.tf_record 11 | fi 12 | } 13 | 14 | ./bin/mlfs-build-tf-index $(list_squad_records) 15 | -------------------------------------------------------------------------------- /scheduler/data/plan-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "steps": 300, 5 | "delay": 0 6 | }, 7 | { 8 | "steps": 200, 9 | "delay": 10 10 | }, 11 | { 12 | "steps": 200, 13 | "delay": 10 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /scheduler/data/plan-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "steps": 300, 5 | "delay": 0 6 | }, 7 | { 8 | "steps": 200, 9 | "delay": 10 10 | }, 11 | { 12 | "steps": 200, 13 | "delay": 10 14 | } 15 | ] 16 | } 17 | -------------------------------------------------------------------------------- /scheduler/etc/tenplex/scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | flags() { 5 | echo -detect-self-ip eth0 6 | 7 | # echo -reinstall 8 | echo -u kungfu 9 | 10 | echo -tenplex-state-transformer /usr/bin/tenplex-state-transformer 11 | } 
12 | 13 | /usr/bin/tenplex-scheduler $(flags) 14 | 15 | echo "$0 stopped" 16 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | test: 9 | # https://help.github.com/en/articles/virtual-environments-for-github-actions#supported-virtual-environments 10 | runs-on: ubuntu-20.04 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - run: make 15 | -------------------------------------------------------------------------------- /mlfs/uri/path.go: -------------------------------------------------------------------------------- 1 | package uri 2 | 3 | import ( 4 | "net/url" 5 | "path" 6 | "strings" 7 | ) 8 | 9 | func AppendPath(a, b string) string { 10 | b = strings.TrimLeft(b, `/`) 11 | u, err := url.Parse(a) 12 | if err != nil { 13 | return path.Join(a, b) 14 | } 15 | u.Path = path.Join(u.Path, b) 16 | return u.String() 17 | } 18 | -------------------------------------------------------------------------------- /scheduler/azure/run_scheduler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | export GO=/usr/local/go/bin/go 5 | 6 | echo "Building scheduler ..." 7 | make all 8 | mkdir -p $HOME/.tenplex/scheduler/bin 9 | cp -v ./vendors/tenplex-run/mlfs/bin/mlfsd $HOME/.tenplex/scheduler/bin 10 | 11 | echo "Running scheduler ..." 12 | ./bin/tenplex-scheduler 13 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/list-ips.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | cd $(dirname $0) 5 | 6 | . 
./config.sh 7 | echo $name 8 | 9 | list_hosts() { 10 | az vmss nic list -g kungfu --vmss-name $name --query '[].ipConfigurations[0].privateIPAddress' -o table | sed 1,2d 11 | } 12 | 13 | list_hosts | tee hosts.txt 14 | 15 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/schedule_16.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 8 5 | }, 6 | { 7 | "step": 50, 8 | "size": 16 9 | }, 10 | { 11 | "step": 60, 12 | "size": 8 13 | }, 14 | { 15 | "step": 70, 16 | "size": 0 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/schedule_8.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 4 5 | }, 6 | { 7 | "step": 50, 8 | "size": 8 9 | }, 10 | { 11 | "step": 60, 12 | "size": 4 13 | }, 14 | { 15 | "step": 70, 16 | "size": 0 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_parallelization/schedule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 8 5 | }, 6 | { 7 | "step": 50, 8 | "size": 16 9 | }, 10 | { 11 | "step": 60, 12 | "size": 8 13 | }, 14 | { 15 | "step": 70, 16 | "size": 0 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/schedule_32.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "step": 0, 4 | "size": 16 5 | }, 6 | { 7 | "step": 50, 8 | "size": 32 9 | }, 10 | { 11 | "step": 60, 12 | "size": 16 13 | }, 14 | { 15 | "step": 70, 16 | "size": 0 17 | } 18 | ] 19 | -------------------------------------------------------------------------------- 
/mlfs/vfs/ufs/ufs_darwin.go: -------------------------------------------------------------------------------- 1 | package ufs 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/vfs" 7 | ) 8 | 9 | func Umount(mnt string) { 10 | log.Printf("TODO: support FUSE Umount on darwin") 11 | } 12 | 13 | func Start(mnt string, r *vfs.Tree, super bool) { 14 | log.Printf("TODO: support FUSE Mount on darwin") 15 | } 16 | -------------------------------------------------------------------------------- /mlfs/docker/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'docker compose -f $0 up' 2 | 3 | services: 4 | mlfs: 5 | # build: . 6 | image: kungfu.azurecr.io/mlfs:snapshot 7 | command: /sbin/init 8 | privileged: true 9 | deploy: 10 | replicas: 4 11 | 12 | networks: 13 | default: 14 | name: mlfs 15 | # external: true 16 | -------------------------------------------------------------------------------- /mlfs/etc/os/linux/mlfs.service: -------------------------------------------------------------------------------- 1 | # /lib/systemd/system/mlfs.service 2 | 3 | [Unit] 4 | Description=MLFS 5 | After=network.target 6 | StartLimitIntervalSec=0 7 | 8 | [Service] 9 | ExecStart=/etc/mlfs/mlfs.sh 10 | ExecStop=-/etc/mlfs/stop.sh 11 | 12 | Restart=always 13 | RestartSec=1 14 | Type=simple 15 | 16 | [Install] 17 | WantedBy=multi-user.target 18 | -------------------------------------------------------------------------------- /mlfs/cmd/mlfsd/mlfsd.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "context" 5 | "flag" 6 | 7 | "github.com/kungfu-team/tenplex/mlfs/mlfs" 8 | "github.com/kungfu-team/tenplex/mlfs/utils" 9 | ) 10 | 11 | func main() { 12 | var d mlfs.Daemon 13 | d.RegisterFlags(flag.CommandLine) 14 | flag.Parse() 15 | utils.LogArgs() 16 | d.RunCtx(context.Background()) 17 | } 18 | 
-------------------------------------------------------------------------------- /mlfs/mlfs/app.go: -------------------------------------------------------------------------------- 1 | package mlfs 2 | 3 | import ( 4 | "flag" 5 | "os" 6 | "path" 7 | "time" 8 | ) 9 | 10 | func Main(main func() error) { 11 | flag.Parse() 12 | t0 := time.Now() 13 | prog := path.Base(os.Args[0]) 14 | defer func() { log.Printf("%s took %s", prog, time.Since(t0)) }() 15 | if err := main(); err != nil { 16 | log.Fatal(err) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tenplex-run/dbg/dgb.go: -------------------------------------------------------------------------------- 1 | package dbg 2 | 3 | import "github.com/lgarithm/proc" 4 | 5 | func SSH(p proc.Proc) proc.P { 6 | target := p.Host 7 | if len(p.User) > 0 { 8 | target = p.User + `@` + p.Host 9 | } 10 | args := []string{ 11 | `-v`, 12 | target, 13 | p.Prog, 14 | } 15 | args = append(args, p.Args...) 16 | return proc.PC(`ssh`, args...) 
17 | } 18 | -------------------------------------------------------------------------------- /mlfs/mlfs/bitmap.go: -------------------------------------------------------------------------------- 1 | package mlfs 2 | 3 | import ( 4 | "image" 5 | "image/color" 6 | ) 7 | 8 | type BitVec struct{} 9 | 10 | func makeBitmap(h, w int) *image.RGBA { 11 | r := image.Rect(0, 0, w, h) 12 | img := image.NewRGBA(r) 13 | for i := 0; i < h; i++ { 14 | for j := 0; j < w; j++ { 15 | img.Set(j, i, color.Black) 16 | } 17 | } 18 | return img 19 | } 20 | -------------------------------------------------------------------------------- /benchmark/dynamic_resources/tenplex-para-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 1, 4 | "pp_size": 2, 5 | "mp_size": 2 6 | }, 7 | "8": { 8 | "dp_size": 1, 9 | "pp_size": 4, 10 | "mp_size": 2 11 | }, 12 | "16": { 13 | "dp_size": 2, 14 | "pp_size": 4, 15 | "mp_size": 2 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /benchmark/model_convergence/README.md: -------------------------------------------------------------------------------- 1 | # Model convergence 2 | _Fig. 16. Model convergence with reconfiguration_ 3 | 4 | We evaluate Tenplex’s impact on model convergence. For this, we use the BERT-large model with the OpenWeb-Text dataset deployed on the on-premise cluster. At training step 100, we either increase or decrease the resources and compare them to a baseline without change. 
5 | -------------------------------------------------------------------------------- /state_transformer/statetransform/padding.go: -------------------------------------------------------------------------------- 1 | package statetransform 2 | 3 | func VocabSizePadding(vocabSize int, mpSize int) int { 4 | makeVocabSizeDivisibleBy := 128 5 | after := vocabSize 6 | multiple := makeVocabSizeDivisibleBy * mpSize 7 | for { 8 | if after%multiple != 0 { 9 | after += 1 10 | } else { 11 | break 12 | } 13 | } 14 | return after 15 | } 16 | -------------------------------------------------------------------------------- /state_transformer/statetransform/padding_test.go: -------------------------------------------------------------------------------- 1 | package statetransform 2 | 3 | import "testing" 4 | 5 | func TestVocabSizePadding(t *testing.T) { 6 | mp := 2 7 | s := VocabSizePadding(30524, mp) 8 | t.Logf("vocab size with padding with MP %d: %d", mp, s) 9 | mp = 4 10 | s = VocabSizePadding(30524, mp) 11 | t.Logf("vocab size with padding with MP %d: %d", mp, s) 12 | } 13 | -------------------------------------------------------------------------------- /mlfs/build-cloud-index.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0) 5 | 6 | host=minddata.blob.core.windows.net 7 | 8 | list_tf_records() { 9 | for i in $(seq 1024); do 10 | echo https://$host/data/imagenet/records/train-$(printf "%05d" $((i - 1)))-of-01024 11 | done 12 | } 13 | 14 | ./bin/mlfs-build-tf-index -m 8 -output imagenet.idx.txt $(list_tf_records) 15 | -------------------------------------------------------------------------------- /scheduler/data/trace.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "time": 300, 4 | "size": 4 5 | }, 6 | { 7 | "time": 141, 8 | "size": 2 9 | }, 10 | { 11 | "time": 437, 12 | "size": 8 13 | }, 14 | { 15 | "time": 11, 16 | "size": 16 17 | }, 18 
| { 19 | "time": 0, 20 | "size": 0 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /scripts/pack.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0)/.. 5 | rm -rf build 6 | mkdir -p build 7 | cd build 8 | 9 | branch=$(git rev-parse --abbrev-ref HEAD) 10 | rev=$(git rev-list --count HEAD) 11 | commit=$(git rev-parse --short HEAD) 12 | 13 | export VERSION="0.0.${rev}-git-${branch}-rev${rev}-${commit}" 14 | 15 | cmake .. 16 | make package 17 | 18 | dpkg -c *.deb 19 | -------------------------------------------------------------------------------- /mlfs/add-imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | flags() { 5 | echo -idx-name imagenet 6 | echo -index-url https://tenplex.blob.core.windows.net/data/imagenet.idx.txt 7 | echo -ctrl-port 20000 8 | 9 | echo -progress 0 10 | echo -global-batch-size 23 11 | echo -dp-size 4 12 | 13 | # echo -fetch 14 | # echo -m 64 15 | } 16 | 17 | mlfs mount $(flags) 18 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/tenplex.yml: -------------------------------------------------------------------------------- 1 | # ansible-playbook -i hosts.txt ansible/tenplex.yml 2 | 3 | - name: install tenplex 4 | hosts: all 5 | 6 | tasks: 7 | - import_tasks: ../../ansible/uninstall.yml 8 | - import_tasks: ../../ansible/install.yml 9 | - import_tasks: ./tasks/add_group.yml 10 | 11 | #- debug: msg="{{ log.stdout }}" 12 | #- debug: msg="{{ log.stderr }}" 13 | -------------------------------------------------------------------------------- /mlfs/build-imagenet-md5sum.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | cd $(dirname $0) 7 | 8 | list_tf_records() { 9 | # SSD/HDD 10 | ls 
/data/imagenet/records/train* | sort 11 | 12 | # tmpfs 13 | # ls $HOME/mnt/tmp/train* | sort 14 | 15 | # NVMe 16 | #ls $HOME/data/train* | sort 17 | } 18 | 19 | ./bin/mlfs-md5sum -m 64 -output imagenet.md5.txt $(list_tf_records) 20 | -------------------------------------------------------------------------------- /mlfs/cmd/tests/cmd/test-fuse/test-fuse.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/fuse" 7 | "github.com/kungfu-team/tenplex/mlfs/utils" 8 | ) 9 | 10 | var ( 11 | mnt = flag.String("mnt", "", "") 12 | ) 13 | 14 | func main() { 15 | flag.Parse() 16 | f, err := fuse.New(*mnt) 17 | if err != nil { 18 | utils.ExitErr(err) 19 | } 20 | f.Run() 21 | } 22 | -------------------------------------------------------------------------------- /tenplex-run/listflag/listflag_test.go: -------------------------------------------------------------------------------- 1 | package listflag_test 2 | 3 | import ( 4 | "flag" 5 | "testing" 6 | 7 | "github.com/kungfu-team/tenplex/tenplex-run/listflag" 8 | ) 9 | 10 | func isFlagValue(flag.Value) {} 11 | 12 | func Test_1(t *testing.T) { 13 | var x listflag.Strings 14 | isFlagValue(&x) 15 | } 16 | 17 | func Test_2(t *testing.T) { 18 | var x listflag.Ints 19 | isFlagValue(&x) 20 | } 21 | -------------------------------------------------------------------------------- /mlfs/add-enwiki-numpy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | flags() { 7 | echo -idx-name enwiki 8 | echo -idx-file /data/megatron-lm/bert/enwiki/npzs_seq512/indices.txt 9 | echo -ctrl-port 20010 10 | 11 | echo -progress 0 12 | echo -global-batch-size 32 13 | echo -cluster-size 4 14 | 15 | # echo -fetch 16 | 17 | echo -m 64 18 | } 19 | 20 | ./bin/mlfs mount $(flags) 21 | -------------------------------------------------------------------------------- 
/scheduler/azure/run_user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | export GO=/usr/local/go/bin/go 5 | 6 | make 7 | 8 | join_() { 9 | local IFS="," 10 | echo "$*" 11 | } 12 | 13 | w1=10.0.0.9 14 | w2=10.0.0.8 15 | 16 | ips() { 17 | echo $w1 18 | echo $w2 19 | } 20 | 21 | flags() { 22 | echo -gpu-per-host 4 23 | echo -hosts "$(join_ $(ips))" 24 | } 25 | 26 | # echo 27 | ./bin/tenplex-user $(flags) 28 | -------------------------------------------------------------------------------- /state_transformer/meta/metadata.go: -------------------------------------------------------------------------------- 1 | package meta 2 | 3 | type Metadata struct { 4 | SourceRankMap *RankMap 5 | TargetRankMap *RankMap 6 | SourceStructs map[int]map[string]interface{} 7 | TargetStructs map[int]map[string]interface{} 8 | SourceGroupSizes map[int]map[int]int 9 | TargetGroupSizes map[int]map[int]int 10 | SourceModelKeys map[int][][]string 11 | TargetModelKeys map[int][][]string 12 | } 13 | -------------------------------------------------------------------------------- /tenplex-run/counter/id.go: -------------------------------------------------------------------------------- 1 | package counter 2 | 3 | func NewCounterFunc() func() int { 4 | var id int 5 | return func() int { x := id; id++; return x } 6 | } 7 | 8 | func New() *Counter { 9 | return &Counter{} 10 | } 11 | 12 | type Counter struct { 13 | n int 14 | } 15 | 16 | func (c *Counter) Next() int { 17 | id := c.n 18 | c.n++ 19 | return id 20 | } 21 | 22 | func (c *Counter) Reset() { 23 | c.n = 0 24 | } 25 | -------------------------------------------------------------------------------- /mlfs/closer/closer.go: -------------------------------------------------------------------------------- 1 | package closer 2 | 3 | import "io" 4 | 5 | type closer struct { 6 | r io.Reader 7 | close func() error 8 | } 9 | 10 | func ReadClose(r io.Reader, close func() error) 
io.ReadCloser { 11 | return &closer{r: r, close: close} 12 | } 13 | 14 | func (c *closer) Read(buf []byte) (int, error) { 15 | return c.r.Read(buf) 16 | } 17 | 18 | func (c *closer) Close() error { 19 | return c.close() 20 | } 21 | -------------------------------------------------------------------------------- /mlfs/test-numpy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | # DATA_DIR=/data/megatron-lm/bert/npz_concat 7 | DATA_DIR=/data/megatron-lm/bert/test 8 | 9 | flags() { 10 | # echo -index-file $DATA_DIR/indices.txt 11 | echo -index-file $DATA_DIR/old-format.txt 12 | # echo -data-file $DATA_DIR/samples.npzs 13 | 14 | echo -dp-size 4 15 | echo -global-batch-size 32 16 | } 17 | 18 | ./bin/mlfs-gen-numpy $(flags) 19 | -------------------------------------------------------------------------------- /mlfs/scripts/system-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | cd $(dirname $0)/.. 
6 | 7 | export PATH=/usr/local/go/bin:$PATH 8 | 9 | make 10 | 11 | rm -fr build 12 | ./scripts/pack.sh 13 | 14 | set +e 15 | echo "stopping mlfsd" 16 | sudo systemctl stop mlfs 17 | echo "stopped mlfsd" 18 | set -e 19 | 20 | sudo dpkg -i ./build/*.deb 21 | sudo systemctl daemon-reload 22 | 23 | sudo systemctl start mlfs 24 | 25 | echo "done $0" 26 | -------------------------------------------------------------------------------- /scheduler/etc/os/linux/tenplex-scheduler.service: -------------------------------------------------------------------------------- 1 | # /lib/systemd/system/tenplex-scheduler.service 2 | 3 | [Unit] 4 | Description=Tenplex Scheduler 5 | After=network.target 6 | StartLimitIntervalSec=0 7 | 8 | [Service] 9 | ExecStart=/etc/tenplex/scheduler.sh 10 | ExecStop=-/etc/tenplex/stop-scheduler.sh 11 | User=kungfu 12 | 13 | Restart=always 14 | RestartSec=1 15 | Type=simple 16 | 17 | [Install] 18 | WantedBy=multi-user.target 19 | -------------------------------------------------------------------------------- /scheduler/scripts/build-deb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0)/.. 5 | 6 | GOBIN=$PWD/bin go install -v ./... 7 | 8 | rm -rf build 9 | mkdir -p build 10 | cd build 11 | 12 | branch=$(git rev-parse --abbrev-ref HEAD) 13 | rev=$(git rev-list --count HEAD) 14 | commit=$(git rev-parse --short HEAD) 15 | export VERSION="0.0.1-git-${branch}-rev${rev}-${commit}" 16 | 17 | cmake .. 
18 | make package 19 | 20 | dpkg -c *.deb 21 | -------------------------------------------------------------------------------- /mlfs/ds/trds/example.go: -------------------------------------------------------------------------------- 1 | package trds 2 | 3 | import ( 4 | "bytes" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/vfs" 7 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile" 8 | ) 9 | 10 | func InitExample(r *vfs.Tree) { 11 | r.Mkdir(`/`) 12 | idx, err := vfile.LoadIdxFile(`a.idx.txt`) 13 | if err != nil { 14 | return 15 | } 16 | bs := &bytes.Buffer{} 17 | vfile.SaveIdx(bs, idx) 18 | r.TouchText(`/index.txt`, bs.String()) 19 | } 20 | -------------------------------------------------------------------------------- /benchmark/reconfiguration/stop.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import subprocess 3 | 4 | 5 | def main(): 6 | # hosts = ["komodo01", "komodo02", "komodo03", "komodo04"] 7 | hosts = ["komodo01", "komodo02"] 8 | 9 | for ho in hosts: 10 | subprocess.run( 11 | f"ssh {ho} docker ps -q -f name='worker' | xargs docker stop".split(" "), 12 | check=False, 13 | ) 14 | 15 | 16 | if __name__ == "__main__": 17 | main() 18 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_horovod/add-imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | flags() { 5 | echo -idx-name imagenet 6 | echo -index-url /data/imagenet/imagenet.idx.txt 7 | 8 | echo -ctrl-port 20010 9 | 10 | echo -progress 0 11 | echo -global-batch-size 64 12 | echo -dp-size 1 13 | 14 | echo -job fig-13 15 | } 16 | 17 | sudo systemctl stop mlfs 18 | sudo systemctl start mlfs 19 | 20 | mlfs info 21 | mlfs mount $(flags) 22 | -------------------------------------------------------------------------------- /mlfs/par/par.go: -------------------------------------------------------------------------------- 1 | 
package par 2 | 3 | import "sync" 4 | 5 | type par struct { 6 | wg sync.WaitGroup 7 | ch chan struct{} 8 | } 9 | 10 | func New(m int) *par { 11 | p := &par{ 12 | ch: make(chan struct{}, m), 13 | } 14 | return p 15 | } 16 | 17 | func (p *par) Do(f func()) { 18 | p.wg.Add(1) 19 | p.ch <- struct{}{} 20 | go func() { 21 | f() 22 | <-p.ch 23 | p.wg.Done() 24 | }() 25 | } 26 | 27 | func (p *par) Wait() { 28 | p.wg.Wait() 29 | } 30 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_horovod/README.md: -------------------------------------------------------------------------------- 1 | # Reconfiguration Horovod 2 | _Fig. 13. Reconfiguration time against Horovod_ 3 | 4 | We also compare Tenplex’s overhead to Horovod, a distributed training library without elasticity support, and Horovod-Elastic, which also supports scaling under data parallelism only by periodically checkpointing the model state. We deploy a ResNet50 model with the ImageNet dataset in the on-premise cluster, and measure throughput when training on 2 GPUs. 5 | -------------------------------------------------------------------------------- /benchmark/redeployment/README.md: -------------------------------------------------------------------------------- 1 | # Redeployment 2 | _Fig. 10. Redeployment time of DL job_ 3 | 4 | We evaluate how long Tenplex takes to redeploy DL jobs with different model sizes onto a new set of GPU resources. As a baseline, we compare against Tenplex-Central, which follows the approach of PyTorch Elastic or DeepSpeed: it holds all DL job state at a single central worker. In this experiment, we therefore specifically explore the benefit of Tenplex’s distributed state management. 
5 | -------------------------------------------------------------------------------- /scheduler/scripts/collect-logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0)/.. 5 | 6 | collect_logs() { 7 | mkdir -p logs 8 | for ip in $(./scripts/list-ips.sh); do 9 | echo $ip 10 | scp -r $ip:.tenplex/training logs 11 | done 12 | } 13 | 14 | main() { 15 | for h in $(./scripts/list-ips.sh); do 16 | echo $h 17 | ssh $h find /mnt/mlfs | tee logs/$h.mlfs.log 18 | done 19 | 20 | collect_logs 21 | } 22 | 23 | main 24 | -------------------------------------------------------------------------------- /scheduler/scripts/list-ips-komodo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | echo "10.10.10.1" 5 | echo "10.10.10.2" 6 | echo "10.10.10.3" 7 | echo "10.10.10.4" 8 | 9 | # cd $(dirname $0)/.. 10 | 11 | # for i in $(seq 4); do 12 | # if [ $i -eq 1 ]; then # hack 13 | # ip -o -4 addr list eth0 | awk -F ' *|/' '{print $4}' 14 | # else 15 | # domain=komodo$(printf "%02d" $i).doc.res.ic.ac.uk 16 | # host $domain | awk '{print $4}' 17 | # fi 18 | # done 19 | -------------------------------------------------------------------------------- /state_transformer/lib/lib.go: -------------------------------------------------------------------------------- 1 | package lib 2 | 3 | import "golang.org/x/exp/slices" 4 | 5 | func InSlice(ele string, sl []string) bool { 6 | return slices.Contains(sl, ele) 7 | } 8 | 9 | // IsSubSlice reports whether subSl occurs as a contiguous sub-slice of sl. 10 | // An empty subSl is contained in every slice. Every candidate start position 11 | // is checked, and indexing can never run past the end of sl. 12 | func IsSubSlice(subSl []string, sl []string) bool { 13 | if len(subSl) == 0 { 14 | return true 15 | } 16 | for i := 0; i+len(subSl) <= len(sl); i++ { 17 | if slices.Equal(sl[i:i+len(subSl)], subSl) { 18 | return true 19 | } 20 | } 21 | return false 22 | } 23 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/tasks/pull_image.yml: 
-------------------------------------------------------------------------------- 1 | - name: check UID 2 | ignore_errors: yes 3 | ansible.builtin.shell: 4 | cmd: id 5 | register: log 6 | 7 | - name: docker ps 8 | ignore_errors: yes 9 | ansible.builtin.shell: 10 | cmd: docker ps 11 | register: log 12 | 13 | - name: pull docker image 14 | ignore_errors: yes 15 | ansible.builtin.shell: 16 | cmd: docker pull kungfu.azurecr.io/mw-megatron-lm-23.06-update-v100 17 | register: log 18 | -------------------------------------------------------------------------------- /mlfs/ds/imagenet.go: -------------------------------------------------------------------------------- 1 | package ds 2 | 3 | import "github.com/kungfu-team/tenplex/mlfs/hash" 4 | 5 | var ( 6 | ImagenetIndex = hash.HashedFile{ 7 | MD5: `dfe57e9541f8cb7affedefd3c633326e`, 8 | URLs: []string{`https://minddata.blob.core.windows.net/data/imagenet.idx.txt`}, 9 | } 10 | 11 | ImagenetMd5 = hash.HashedFile{ 12 | MD5: `91d0846314a61c32f42726aaa05ea9e7`, 13 | URLs: []string{`https://minddata.blob.core.windows.net/data/imagenet/md5sum.txt`}, 14 | } 15 | ) 16 | -------------------------------------------------------------------------------- /state_transformer/Dockerfile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'docker build --rm -t kungfu.azurecr.io/mw-megatron-lm-go:latest -f $0 .' 2 | FROM kungfu.azurecr.io/mw-megatron-lm-data-commit:latest 3 | 4 | ADD go /usr/local 5 | ENV PATH=$PATH:/usr/local/go/bin 6 | RUN whereis go 7 | RUN go version 8 | ENV GO=/usr/local/bin/go 9 | 10 | # State Transformer 11 | ADD . 
/workspace/state_transformer 12 | WORKDIR /workspace/state_transformer 13 | RUN make 14 | WORKDIR /workspace/Megatron-LM 15 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/para-config-dp.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 1, 4 | "pp_size": 1, 5 | "mp_size": 4 6 | }, 7 | "8": { 8 | "dp_size": 2, 9 | "pp_size": 1, 10 | "mp_size": 4 11 | }, 12 | "16": { 13 | "dp_size": 4, 14 | "pp_size": 1, 15 | "mp_size": 4 16 | }, 17 | "32": { 18 | "dp_size": 8, 19 | "pp_size": 1, 20 | "mp_size": 4 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/para-config-pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "4": { 3 | "dp_size": 1, 4 | "pp_size": 1, 5 | "mp_size": 4 6 | }, 7 | "8": { 8 | "dp_size": 1, 9 | "pp_size": 2, 10 | "mp_size": 4 11 | }, 12 | "16": { 13 | "dp_size": 1, 14 | "pp_size": 4, 15 | "mp_size": 4 16 | }, 17 | "32": { 18 | "dp_size": 1, 19 | "pp_size": 8, 20 | "mp_size": 4 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mlfs/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "echo", 6 | "type": "shell", 7 | "command": "./build", 8 | "group": { 9 | "kind": "build", 10 | "isDefault": true 11 | } 12 | }, 13 | { 14 | "label": "run", 15 | "type": "shell", 16 | "command": "./x" 17 | }, 18 | { 19 | "label": "debug", 20 | "type": "shell", 21 | "command": "./x" 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /ansible/uninstall.yml: -------------------------------------------------------------------------------- 1 | # - name: cleanup 2 | # become: true 3 | # 
ansible.builtin.shell: 4 | # cmd: killall apt 5 | # register: log 6 | 7 | - name: stop 8 | become: true 9 | ignore_errors: yes 10 | ansible.builtin.shell: 11 | cmd: systemctl stop mlfs 12 | register: log 13 | 14 | - name: uninstall 15 | become: true 16 | ansible.builtin.apt: 17 | state: absent 18 | pkg: 19 | - mlfs 20 | 21 | - debug: msg="{{ log.stdout }}" 22 | - debug: msg="{{ log.stderr }}" 23 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kungfu-team/tenplex 2 | 3 | go 1.18 4 | 5 | require ( 6 | bazil.org/fuse v0.0.0-20230120002735-62a210ff1fd5 7 | github.com/lgarithm/go v0.0.0-20230108194319-abf8008ecd81 8 | github.com/lgarithm/proc v0.4.5-0.20240417004737-9b169ad5c322 9 | github.com/lsds/KungFu v0.2.5 10 | golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f 11 | golang.org/x/sync v0.7.0 12 | ) 13 | 14 | require ( 15 | golang.org/x/crypto v0.22.0 // indirect 16 | golang.org/x/sys v0.19.0 // indirect 17 | ) 18 | -------------------------------------------------------------------------------- /benchmark/common-cloud.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | join_() { 4 | local IFS=$1 5 | shift 6 | echo "$*" 7 | } 8 | 9 | join() { join_ , $@; } 10 | 11 | logfile="$(basename $0).log" 12 | 13 | base_flags() { 14 | echo -image kungfu.azurecr.io/mw-megatron-lm-23.06-update-v100 15 | 16 | echo -user $USER 17 | 18 | echo -mlfs-port 20010 19 | echo -tenplex-prefix "$HOME/.tenplex" 20 | 21 | # echo -logfile 22 | } 23 | 24 | tenplex_run_with() { 25 | tenplex-run $($1) 2>&1 | tee $logfile 26 | } 27 | -------------------------------------------------------------------------------- /benchmark/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | join_() { 4 | local IFS=$1 5 | shift 6 | echo "$*" 7 | } 8 | 9 
| join() { join_ , $@; } 10 | 11 | logfile="$(basename $0).log" 12 | 13 | base_flags() { 14 | echo -image "kungfu.azurecr.io/mw-megatron-lm-23.06-update:v0.0.3" 15 | 16 | echo -user $USER 17 | 18 | echo -mlfs-port 20010 19 | echo -tenplex-prefix "$HOME/.tenplex" 20 | 21 | # echo -logfile 22 | } 23 | 24 | tenplex_run_with() { 25 | tenplex-run $($1) 2>&1 | tee $logfile 26 | } 27 | -------------------------------------------------------------------------------- /mlfs/cmd/tests/cmd/test-md5/test-md5.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "time" 6 | 7 | "github.com/kungfu-team/tenplex/mlfs/ds" 8 | ) 9 | 10 | func main() { 11 | t0 := time.Now() 12 | defer func() { log.Printf("took %s", time.Since(t0)) }() 13 | ds.ImagenetIndex.Check() 14 | ds.ImagenetMd5.Check() 15 | ds.Squad1Index.Check() 16 | ds.Squad1MD5.Check() 17 | 18 | ds.MnistTrainImages.Check() 19 | ds.MnistTrainLabels.Check() 20 | ds.MnistTestImages.Check() 21 | ds.MnistTestLabels.Check() 22 | } 23 | -------------------------------------------------------------------------------- /mlfs/debug-p2p.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | # ./bin/mlfs mount 6 | 7 | peer1="127.0.0.1:8080" 8 | peer2="127.0.0.1:8081" 9 | 10 | peer_flags() { 11 | echo -r 1 12 | echo -peers "$peer1,$peer2" 13 | echo -log-req 14 | } 15 | 16 | localhost="127.0.0.1" 17 | 18 | ./bin/mlfs daemon $(peer_flags) -host $localhost -ctrl-port 8080 -http-port 10000 & 19 | pid1=$! 20 | echo $p1 21 | 22 | ./bin/mlfs daemon $(peer_flags) -host $localhost -ctrl-port 8081 -http-port 10001 & 23 | pid2=$! 
24 | echo $pid2 25 | 26 | wait 27 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_horovod/Dockerfile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'docker build --rm -t $(cat $(dirname $0)/tag.txt) -f $0 $(dirname $0)' 2 | 3 | FROM nvcr.io/nvidia/tensorflow:23.03-tf1-py3 4 | 5 | RUN python3 -m pip install tenplex -i https://pkgs.dev.azure.com/gli7/releases/_packaging/nightly/pypi/simple -U 6 | 7 | WORKDIR /work 8 | ADD logger.py . 9 | ADD imagenet.py . 10 | ADD imagenet_resnet.py . 11 | ADD imagenet_resnet_horovod_elastic.py . 12 | ADD train-imagenet.sh . 13 | 14 | # ENV OMPI_ALLOW_RUN_AS_ROOT=1 15 | ENTRYPOINT [] 16 | -------------------------------------------------------------------------------- /mlfs/cache/stat.go: -------------------------------------------------------------------------------- 1 | package cache 2 | 3 | import ( 4 | "log" 5 | "sync/atomic" 6 | ) 7 | 8 | type Stat struct { 9 | miss int64 10 | hit int64 11 | } 12 | 13 | func (s *Stat) Hit() { 14 | atomic.AddInt64(&s.hit, 1) 15 | } 16 | 17 | func (s *Stat) Miss() { 18 | atomic.AddInt64(&s.miss, 1) 19 | } 20 | 21 | // Log prints the cache miss rate. It is a no-op before any Hit or Miss has 22 | // been recorded, which avoids printing a NaN rate from 0/0. 23 | func (s *Stat) Log() { 24 | h := atomic.LoadInt64(&s.hit) 25 | m := atomic.LoadInt64(&s.miss) 26 | if h+m == 0 { 27 | return 28 | } 29 | r := float32(m) / float32(h+m) 30 | log.Printf("miss rate: %.2f%% (%d / %d)", r*100.0, m, m+h) 31 | } 32 | 33 | var LogCache = false 34 | -------------------------------------------------------------------------------- /scheduler/data/plan-komodo.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "steps": 1000, 5 | "delay": 0, 6 | "dataset": { 7 | "Name": "openwebtext", 8 | "IndexURL": "http://155.198.152.18:20110/" 9 | } 10 | }, 11 | { 12 | "steps": 1000, 13 | "delay": 4, 14 | "dataset": { 15 | "Name": "openwebtext", 16 | "IndexURL": "http://155.198.152.18:20110/" 17 | } 18 | } 19 | ] 20 | } 21 | 
-------------------------------------------------------------------------------- /benchmark/reconfiguration/README.md: -------------------------------------------------------------------------------- 1 | # Reconfiguration 2 | _Fig. 12. Reconfiguration time against DeepSpeed and Singularity_ 3 | 4 | This experiment compares the reconfiguration approach of Tenplex with (i) a model library of an elastic DL system (DeepSpeed) and (ii) a virtual device approach that performs full GPU state migration (Singularity). 5 | 6 | ## Megatron-Deespeed 7 | [Repo](https://github.com/kungfu-team/Megatron-DeepSpeed/tree/mw-before-rebase) 8 | 9 | ## Note 10 | For the roundevouz to work `/etc/hosts` must include the host's domain name. 11 | -------------------------------------------------------------------------------- /mlfs/cmd/tests/mlfs-test-upload/mlfs-test-upload.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | 7 | "github.com/kungfu-team/tenplex/mlfs/mlfs" 8 | "github.com/kungfu-team/tenplex/tensor" 9 | ) 10 | 11 | var ( 12 | port = flag.Int("p", 8080, ``) 13 | ) 14 | 15 | func main() { 16 | flag.Parse() 17 | c, err := mlfs.NewClient(*port) 18 | if err != nil { 19 | log.Panic(err) 20 | } 21 | x := tensor.New(`f32`, 2, 2, 2) 22 | if err := c.Upload(`/a/b/c`, x.Data); err != nil { 23 | log.Panic(err) 24 | } 25 | log.Printf("done") 26 | } 27 | -------------------------------------------------------------------------------- /mlfs/ds/dataset.go: -------------------------------------------------------------------------------- 1 | package ds 2 | 3 | type Dataset struct { 4 | Name string `json:"name" flag:"dataset"` 5 | IndexURL string `json:"index-url" flag:"index-url"` 6 | Size int `json:"size"` // Total number of samples 7 | } 8 | 9 | var ( 10 | SQuAD1Test = Dataset{ 11 | Name: `squad1-test`, 12 | IndexURL: `https://minddata.blob.core.windows.net/data/squad1/squad1.idx.txt`, 13 | } 14 | 15 
| Imagenet = Dataset{ 16 | Name: `imagenet`, 17 | IndexURL: `https://minddata.blob.core.windows.net/data/imagenet.idx.txt`, 18 | } 19 | ) 20 | -------------------------------------------------------------------------------- /mlfs/ds/squad1.go: -------------------------------------------------------------------------------- 1 | package ds 2 | 3 | import "github.com/kungfu-team/tenplex/mlfs/hash" 4 | 5 | // https://minddata.blob.core.windows.net/data/squad1/squad1.md5.txt 6 | 7 | var ( 8 | Squad1Index = hash.HashedFile{ 9 | MD5: `57015fef3d187f14a57a55ff04166e0c`, 10 | URLs: []string{`https://minddata.blob.core.windows.net/data/squad1/squad1.idx.txt`}, 11 | } 12 | 13 | Squad1MD5 = hash.HashedFile{ 14 | MD5: `9e1ed608ed476e8fed2fbf84ff378884`, 15 | URLs: []string{`https://minddata.blob.core.windows.net/data/squad1/squad1.md5.txt`}, 16 | } 17 | ) 18 | -------------------------------------------------------------------------------- /.azure/release-pip.yml: -------------------------------------------------------------------------------- 1 | parameters: 2 | - name: feed # name of the parameter; required 3 | type: string # data type of the parameter; required 4 | 5 | steps: 6 | - script: python3 -m pip install wheel twine 7 | displayName: 'install deps' 8 | 9 | - script: python3 -m pip wheel -v . 
10 | displayName: 'build whl' 11 | 12 | - task: TwineAuthenticate@1 13 | inputs: 14 | artifactFeed: 'releases/${{ parameters.feed }}' 15 | 16 | - script: python3 -m twine upload -r ${{ parameters.feed }} --config-file $(PYPIRC_PATH) ./*.whl 17 | displayName: Publish 18 | -------------------------------------------------------------------------------- /state_transformer/mapslice/mapslice_test.go: -------------------------------------------------------------------------------- 1 | package mapslice 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "log" 7 | "testing" 8 | ) 9 | 10 | func Test_1(t *testing.T) { 11 | ms := MapSlice{ 12 | MapItem{"abc", 123, 0}, 13 | MapItem{"def", 456, 0}, 14 | MapItem{"ghi", 789, 0}, 15 | } 16 | 17 | b, err := json.Marshal(ms) 18 | if err != nil { 19 | log.Fatal(err) 20 | } 21 | fmt.Println(string(b)) 22 | 23 | ms = MapSlice{} 24 | if err := json.Unmarshal(b, &ms); err != nil { 25 | log.Fatal(err) 26 | } 27 | 28 | fmt.Println(ms) 29 | } 30 | -------------------------------------------------------------------------------- /mlfs/local-serve.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | # ./bin/mlfs-check-index $(cat tests/data/*.json | jq -r '."index-url"') 7 | 8 | # ./bin/mlfs-debug -ds ./tests/data/squad1.json 9 | # ./bin/mlfs-debug -ds ./tests/data/imagenet.json 10 | 11 | ./bin/mlfs-edit-index \ 12 | -index-url $(cat tests/data/imagenet.json | jq -r '."index-url"') \ 13 | -o a.index.txt \ 14 | -localize 15 | 16 | ./bin/mlfs-check-index ./a.index.txt 17 | ./bin/mlfs serve -index-url ./a.index.txt -self 155.198.152.18 18 | # ./bin/mlfs daemon -ctrl-port 9999 -http-port 9998 -mnt ./tmp 19 | -------------------------------------------------------------------------------- /tenplex/mlfs_path.py: -------------------------------------------------------------------------------- 1 | class MLFSPath(object): 2 | 3 | def __init__(self, mnt='/mnt/mlfs') -> None: 4 
| self.mnt = mnt 5 | 6 | def _path(self, p): 7 | return self.mnt + p 8 | 9 | def _read_lines(self, filename): 10 | return [line.strip() for line in open(self._path(filename))] 11 | 12 | def filenames(self, job, rank): 13 | lines = self._read_lines 14 | head = lines(f'/job/{job}/head.txt')[0] 15 | part = lines(head)[rank] 16 | names = lines(f'{part}/list.txt') 17 | return [self._path(n) for n in names] 18 | -------------------------------------------------------------------------------- /state_transformer/statetransform/iter.go: -------------------------------------------------------------------------------- 1 | package statetransform 2 | 3 | import ( 4 | "strconv" 5 | 6 | "github.com/kungfu-team/tenplex/state_transformer/client" 7 | "github.com/kungfu-team/tenplex/state_transformer/meta" 8 | ) 9 | 10 | func setIter(conf *meta.Config, targetDevice int, cl client.CheckpointClient) error { 11 | if targetDevice%conf.GpusPerHost != 0 { // only once per host 12 | return nil 13 | } 14 | 15 | err := cl.UploadValue([]byte(strconv.Itoa(conf.Step)), "iter", targetDevice, true) 16 | if err != nil { 17 | return err 18 | } 19 | 20 | return nil 21 | } 22 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | # Starter pipeline 2 | # Start with a minimal pipeline that you can customize to build and deploy your code. 
3 | # Add steps that build, run tests, deploy, and more: 4 | # https://aka.ms/yaml 5 | 6 | trigger: 7 | - main 8 | 9 | pool: 10 | # ERROR InvalidDistribution: Invalid distribution metadata: unrecognized or 11 | # malformed field 'license-file' 12 | # vmImage: ubuntu-24.04 13 | 14 | vmImage: ubuntu-22.04 # FIXME: deprecate setup.py 15 | 16 | steps: 17 | 18 | - template: ./.azure/release-pip.yml 19 | parameters: 20 | feed: nightly 21 | -------------------------------------------------------------------------------- /mlfs/scripts/upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | if [ -z "$SAS" ]; then 5 | echo "SAS NOT set" 6 | exit 1 7 | fi 8 | 9 | upload() { 10 | local filename=$1 11 | local path=$2 12 | SA=tenplex 13 | URI="https://$SA.blob.core.windows.net/$path" 14 | 15 | echo "uploading $filename to $URI" 16 | curl -v -X PUT \ 17 | -H 'x-ms-blob-type: BlockBlob' \ 18 | -H 'x-ms-version: 2015-02-21' \ 19 | -H "Content-Type: $ContentType" \ 20 | "$URI?$SAS" --data-binary @$filename 21 | echo "uploaded $URI" 22 | } 23 | 24 | upload "$1" "$2" 25 | -------------------------------------------------------------------------------- /mlfs/utils/text.go: -------------------------------------------------------------------------------- 1 | package utils 2 | 3 | import ( 4 | "io" 5 | "strings" 6 | 7 | "github.com/kungfu-team/tenplex/mlfs/uri" 8 | ) 9 | 10 | func Readlines(filename string) ([]string, error) { 11 | f, err := uri.Open(filename) 12 | if err != nil { 13 | return nil, err 14 | } 15 | defer f.Close() 16 | bs, err := io.ReadAll(f) 17 | if err != nil { 18 | return nil, err 19 | } 20 | var ls []string 21 | for _, l := range strings.Split(string(bs), "\n") { 22 | l = strings.TrimSpace(l) 23 | if len(l) > 0 { 24 | ls = append(ls, l) 25 | } 26 | } 27 | return ls, nil 28 | } 29 | -------------------------------------------------------------------------------- /mlfs/convert_index.py: 
-------------------------------------------------------------------------------- 1 | def main(): 2 | old_index_file_path = '/data/megatron-lm/bert/test/indices.txt' 3 | new_index_file_path = './new_indices.txt' 4 | 5 | with open(old_index_file_path, 'r') as old_index_file: 6 | old_index_lines = old_index_file.readlines() 7 | 8 | old_indices = [int(l) for l in old_index_lines] 9 | 10 | with open(new_index_file_path, 'w') as new_index_file: 11 | for i in range(len(old_indices) - 1): 12 | new_index_file.write(f'{old_indices[i]} {old_index_lines[i+1]}') 13 | 14 | 15 | if __name__ == '__main__': 16 | main() 17 | -------------------------------------------------------------------------------- /tenplex-run/job/lib.go: -------------------------------------------------------------------------------- 1 | package job 2 | 3 | import ( 4 | "github.com/lgarithm/proc" 5 | ) 6 | 7 | var ( 8 | Par = proc.Par // (P, P, ...) -> P 9 | Seq = proc.Seq // (P, ...) -> P 10 | Term = proc.Term 11 | Echo = proc.Echo 12 | Shell = proc.Shell 13 | Ignore = proc.Ignore 14 | Run = proc.Run 15 | Ssh = proc.SSH 16 | // Ssh = dbg.SSH 17 | ) 18 | 19 | type ( 20 | P = proc.P 21 | Proc = proc.Proc 22 | ) 23 | 24 | func Pmap(f func(string) P, hs ...string) []P { 25 | var ps []P 26 | for _, h := range hs { 27 | ps = append(ps, f(h)) 28 | } 29 | return ps 30 | } 31 | -------------------------------------------------------------------------------- /mlfs/.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Docker 3 | 'on': 4 | # - push 5 | # - pull_request 6 | - workflow_dispatch 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-20.04 11 | steps: 12 | - run: docker images 13 | - uses: actions/checkout@v2 14 | 15 | - run: ./docker/ubuntu/2004/Dockerfile 16 | env: 17 | SSH_KEY: ${{ secrets.KUNGFU_RSA }} 18 | 19 | # - run: ./docker/ubuntu/2004/Dockerfile.tf1.13.2-gpu 20 | - run: ./docker/ubuntu/1804/Dockerfile.tf1.13.2-gpu 21 | env: 22 | 
SSH_KEY: ${{ secrets.KUNGFU_RSA }} 23 | 24 | - run: docker images 25 | -------------------------------------------------------------------------------- /mlfs/fuse/fuse.go: -------------------------------------------------------------------------------- 1 | package fuse 2 | 3 | import ( 4 | "log" 5 | "os" 6 | ) 7 | 8 | type FUSE struct { 9 | mnt string 10 | ch chan struct{} 11 | dev *os.File 12 | } 13 | 14 | func New(mnt string) (*FUSE, error) { 15 | dev, err := os.Open(`/dev/fuse`) 16 | if err != nil { 17 | return nil, err 18 | } 19 | f := &FUSE{ 20 | mnt: mnt, 21 | ch: make(chan struct{}), 22 | dev: dev, 23 | } 24 | return f, nil 25 | } 26 | 27 | func (f *FUSE) Run() { 28 | for { 29 | buf := make([]byte, 1024) 30 | n, err := f.dev.Read(buf) 31 | log.Printf("%d,%v", n, err) 32 | _ = <-f.ch 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # pip3 install --no-index --user -U . 
2 | 3 | import os, time 4 | 5 | 6 | def auto_version(): 7 | major = 0 8 | minor = 0 9 | t = os.getenv('GIT_COMMIT_TIMESTAMP') 10 | patch = str(t) if t else int(time.time()) 11 | return '.'.join([str(x) for x in [major, minor, patch]]) 12 | 13 | 14 | from setuptools import find_packages, setup 15 | 16 | setup( 17 | name='tenplex', 18 | version=auto_version(), 19 | packages=find_packages(), 20 | description='', 21 | url='', 22 | ext_modules=[], 23 | setup_requires=[], 24 | install_requires=[], 25 | ) 26 | 27 | -------------------------------------------------------------------------------- /tenplex-run/timeout/timeout.go: -------------------------------------------------------------------------------- 1 | package timeout 2 | 3 | import ( 4 | "context" 5 | "log" 6 | "sync/atomic" 7 | "time" 8 | ) 9 | 10 | type timeout struct { 11 | done int32 12 | } 13 | 14 | func New(d time.Duration, cancel context.CancelFunc) *timeout { 15 | t := &timeout{} 16 | go func() { 17 | time.Sleep(d) 18 | done := atomic.LoadInt32(&t.done) 19 | if done != 0 { 20 | return 21 | } 22 | log.Printf("timeout adter %s", d) 23 | if cancel != nil { 24 | cancel() 25 | } 26 | }() 27 | return t 28 | } 29 | 30 | func (t *timeout) Done() { 31 | atomic.StoreInt32(&t.done, 1) 32 | } 33 | -------------------------------------------------------------------------------- /tensor/tensor_test.go: -------------------------------------------------------------------------------- 1 | package tensor_test 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | 8 | "github.com/kungfu-team/tenplex/tensor" 9 | ) 10 | 11 | func Test_1(t *testing.T) { 12 | /* 13 | 0 1 2 3 14 | 4 5 6 7 15 | 8 9 10 11 16 | 12 13 14 15 17 | */ 18 | x := tensor.New(`i32`, 4, 4) 19 | { 20 | x := tensor.I32(x) 21 | for i := range x { 22 | x[i] = int32(i) 23 | } 24 | } 25 | 26 | y := x.Range(tensor.Slice(1, 3), tensor.Slice(1, 3)) 27 | { 28 | y := tensor.I32(y) 29 | for _, e := range y { 30 | fmt.Fprintf(os.Stderr, "%d\n", e) 31 | } 32 | } 33 | } 34 
| -------------------------------------------------------------------------------- /benchmark/reconfiguration_parallelization/README.md: -------------------------------------------------------------------------------- 1 | # Reconfiguration Parallelizations 2 | _Fig. 14. Reconfiguration time with different parallelizations_ 3 | 4 | We examine the impact of the parallelization configuration on reconfiguration time for different model sizes. We deploy Tenplex and Tenplex-Central, which manages the state in a single node, with the different GPT-3 models on the on-premise cluster. For data parallelism (D), we change the configuration from (M, P, D) = (4, 2, 1) to (4, 2, 2); for pipeline parallelism (P) from (4, 2, 1) to (4, 4, 1); and for model parallelism (M) from (4, 2, 1) to (8, 2, 1). 5 | -------------------------------------------------------------------------------- /mlfs/vfs/utils_test.go: -------------------------------------------------------------------------------- 1 | package vfs_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/vfs" 7 | ) 8 | 9 | func Test_RmRecursive(t *testing.T) { 10 | r := vfs.New() 11 | script := ` 12 | mkdir / 13 | mkdir /a 14 | mkdir /a/b 15 | mkdir /a/c 16 | touch /a/b/x.txt 17 | touch /a/b/y.txt 18 | touch /a/c/z.txt 19 | ` 20 | if err := runScript(r, script); err != nil { 21 | t.Fail() 22 | } 23 | nf, nd, err := vfs.RmRecursive(r, `/a`) 24 | if err != nil { 25 | t.Fail() 26 | } 27 | if nf != 3 { 28 | t.Fail() 29 | } 30 | if nd != 3 { 31 | t.Fail() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scheduler/logging/logging.go: -------------------------------------------------------------------------------- 1 | package logging 2 | 3 | import ( 4 | "io" 5 | "log" 6 | "os" 7 | "path" 8 | ) 9 | 10 | func SetupLogger(name string) { 11 | log.SetPrefix(`[` + name + `] `) 12 | log.SetFlags(0) 13 | r, w := io.Pipe() 14 | log.SetOutput(w) 15 | go func(r 
io.Reader) { 16 | logfile := path.Join(`logs`, name+`.log`) 17 | if err := os.MkdirAll(path.Dir(logfile), os.ModePerm); err != nil { 18 | log.Printf("create logdir failed: %v", err) 19 | } 20 | if lf, err := os.Create(logfile); err == nil { 21 | r = io.TeeReader(r, lf) 22 | } 23 | io.Copy(os.Stderr, r) 24 | }(r) 25 | } 26 | -------------------------------------------------------------------------------- /mlfs/cmd/tests/cmd/mlfs-test-dist/mlfs-test-dist.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "os" 6 | "path" 7 | "time" 8 | 9 | "github.com/kungfu-team/tenplex/mlfs/ds" 10 | "github.com/kungfu-team/tenplex/mlfs/mlfs/t" 11 | ) 12 | 13 | var pwd, _ = os.Getwd() 14 | 15 | func main() { 16 | t0 := time.Now() 17 | defer func() { log.Printf("took %s", time.Since(t0)) }() 18 | dt := t.DistTest{ 19 | HTTPPort: 30000, 20 | CtrlPort: 40000, 21 | // Mount: path.Join(pwd, `mnt`), 22 | Tmp: path.Join(pwd, `tmp`), 23 | JobID: `A`, 24 | DP: 4, 25 | DS: ds.Imagenet, 26 | } 27 | dt.Run() 28 | } 29 | -------------------------------------------------------------------------------- /scheduler/scripts/upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | if [ -z "$SAS" ]; then 5 | echo "SAS NOT set" 6 | exit 1 7 | fi 8 | 9 | upload() { 10 | local filename=$1 11 | local path=$2 12 | SA=tenplex 13 | URI="https://$SA.blob.core.windows.net/$path" 14 | 15 | echo "uploading $filename to $URI" 16 | curl -s -w "\n%{http_code}\n" -X PUT \ 17 | -H 'x-ms-blob-type: BlockBlob' \ 18 | -H 'x-ms-version: 2015-02-21' \ 19 | -H "Content-Type: $ContentType" \ 20 | "$URI?$SAS" --data-binary @$filename 21 | echo "uploaded $URI" 22 | } 23 | 24 | upload "$1" "$2" 25 | -------------------------------------------------------------------------------- /state_transformer/meta/path.go: 
-------------------------------------------------------------------------------- 1 | package meta 2 | 3 | import ( 4 | "fmt" 5 | "path" 6 | ) 7 | 8 | func GetStructPath(c *Config, before bool) string { 9 | suffix := func(pp, mp, dp int) string { 10 | return fmt.Sprintf("pp%02d/mp%02d/dp%02d", pp, mp, dp) 11 | } 12 | var sfx string 13 | if before { 14 | sfx = suffix(c.SourcePPDegree, c.SourceMPDegree, c.SourceDPDegree) 15 | } else { 16 | sfx = suffix(c.TargetPPDegree, c.TargetMPDegree, c.TargetDPDegree) 17 | } 18 | return path.Join( 19 | c.CkptStructDir, 20 | c.MdpLibrary, 21 | c.Precision, 22 | c.Model, 23 | c.ModelSize, 24 | sfx, 25 | ) 26 | } 27 | -------------------------------------------------------------------------------- /tensor/dtypes.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import "fmt" 4 | 5 | func eq(a, b string) { 6 | if a != b { 7 | panic(fmt.Errorf("%s != %s", a, b)) 8 | } 9 | } 10 | 11 | func to[R any](name string, t *Tensor) []R { 12 | eq(t.Dtype, name) 13 | return *(*[]R)(t.sliceHeader()) 14 | } 15 | 16 | func to_[R any](name string) func(*Tensor) []R { return func(t *Tensor) []R { return to[R](name, t) } } 17 | 18 | var ( 19 | U8 = to_[uint8](`u8`) 20 | U32 = to_[uint32](`u32`) 21 | I8 = to_[int8](`i8`) 22 | I32 = to_[int32](`i32`) 23 | F32 = to_[float32](`f32`) 24 | ) 25 | 26 | func Raw(t *Tensor) []byte { return t.Data } 27 | -------------------------------------------------------------------------------- /state_transformer/test_state_migrator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | make 5 | 6 | ./bin/tests \ 7 | --ckpt-dir "/data/$USER/mlfs" \ 8 | --ckpt-struct-dir "$HOME/Elasticity/Repo/tenplex-run/transformer-checkpoint" \ 9 | --source-pp-degree 2 \ 10 | --target-pp-degree 3 \ 11 | --source-mp-degree 2 \ 12 | --target-mp-degree 4 \ 13 | --source-size 8 \ 14 | --target-size 12 \ 15 | 
--precision "fp16" \ 16 | --input-timestamp "a" \ 17 | --output-timestamp "b" \ 18 | --hosts "10.10.10.1" \ 19 | --mdp-library "megatron-lm" \ 20 | --sequence-length 1024 \ 21 | --target-rank 0 22 | -------------------------------------------------------------------------------- /benchmark/performance_impact/README.md: -------------------------------------------------------------------------------- 1 | # Performance impact 2 | _Fig. 3. Performance impact of different parallelization configurations on 16 GPUs_ 3 | 4 | When the GPU resources of a DL job change at runtime, a parallelization configuration that was optimal at deployment time may no longer be optimal with the new GPUs. We demonstrate this empirically in Fig. 3, which shows the training throughput (in samples/second) when training BERT and GPT-3 models using Megatron-LM on 16 GPUs under a range of parallelization configurations. Each parallelization configuration varies the degree of model, pipeline and data parallelism, and thus alters the GPU allocation. 
#!/bin/sh
set -e

# Run relative to the directory containing this script.
# Quoting "$(dirname "$0")" keeps paths with spaces intact.
cd "$(dirname "$0")"

# with_log_file LOGFILE CMD [ARGS...]
# Run CMD, teeing its output into LOGFILE.
# "$@" (quoted) preserves argument boundaries; the original unquoted
# $@ would re-split arguments containing spaces.
with_log_file() {
    local filename="$1"
    shift
    "$@" | tee "$filename"
    echo "logged to $filename $ $*"
}

# Make the ImageNet records available before training.
./add-imagenet.sh

with_log_file 1.log ./with-docker horovodrun -np 2 python3 ./imagenet_resnet.py --data-dir /data/imagenet/records
with_log_file 2.log ./with-docker horovodrun -np 2 python3 ./imagenet_resnet_horovod_elastic.py --data-dir /data/imagenet/records
with_log_file 3.log ./with-docker horovodrun -np 2 python3 ./imagenet_resnet.py --mlfs-dir /mnt/mlfs --job fig-13

python3 plot.py
// FS adapts a vfs.Tree to a FUSE filesystem. The allowWrite flag and
// errReadOnly suggest mutating operations are rejected unless writes
// are enabled — confirm against the platform-specific method files.
type FS struct {
	r          *vfs.Tree
	allowWrite bool // presumably gates write operations; New leaves it false
	log        *log.Logger
}

// New wraps the given tree in an FS (writes disabled) that logs to
// stderr with a `[fuse] ` prefix.
func New(r *vfs.Tree) *FS {
	return &FS{
		r:   r,
		log: log.New(os.Stderr, `[fuse] `, 0),
	}
}

// Dir is the handle for a directory node in the tree.
type Dir struct {
	fs *FS
	r  *vfs.Tree
	id int // node id within the tree
	n  vfs.DirNode
}

// File is the handle for a file node in the tree.
type File struct {
	fs *FS
	id int // node id within the tree
	n  vfs.FileNode

	// debug
	name string // retained for debug output only
}

// errReadOnly is the error used when a write is attempted on a
// read-only filesystem.
var errReadOnly = errors.New(`readonly`)
client.delete(path) 19 | print(f"num files {num_files}") 20 | print(f"num dirs {num_dirs}") 21 | assert num_files == 3 22 | assert num_dirs == 2 23 | 24 | 25 | if __name__ == "__main__": 26 | test() 27 | -------------------------------------------------------------------------------- /mlfs/vfs/vfile/link.go: -------------------------------------------------------------------------------- 1 | package vfile 2 | 3 | import ( 4 | "io" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/uri" 7 | ) 8 | 9 | type link struct { 10 | string 11 | int64 12 | } 13 | 14 | func Link(url string, size int64) link { return link{url, size} } 15 | 16 | func (f link) Size() int64 { return f.int64 } 17 | 18 | func (f link) Open() io.ReadCloser { 19 | r := io.NewSectionReader(f, 0, f.Size()) 20 | return io.NopCloser(r) 21 | } 22 | 23 | func (f link) ReadAt(buf []byte, pos int64) (int, error) { 24 | r, err := uri.OpenRange(f.string, pos, f.int64) 25 | if err != nil { 26 | return 0, err 27 | } 28 | defer r.Close() 29 | return r.Read(buf) 30 | } 31 | -------------------------------------------------------------------------------- /mlfs/local-ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | make 5 | 6 | make sys-install 7 | make reload 8 | 9 | ./bin/mlfs info 10 | 11 | # ./bin/mlfs-cli -sas "minddata:$(cat $HOME/.az/minddata.sas)" 12 | 13 | ./bin/mlfs mount -global-batch-size 23 -dp-size 4 \ 14 | -idx-name squad1 \ 15 | -index-url https://minddata.blob.core.windows.net/data/squad1/squad1.idx.txt 16 | 17 | ./bin/mlfs fetch -file 'https://minddata.blob.core.windows.net/data/squad1/train.tf_record' -md5 67eb6da21920dda01ec75cd6e1a5b8d7 18 | 19 | sleep 1 # 2023/01/16 10:00:56 open /mnt/mlfs/job/0/head.txt: transport endpoint is not connected 20 | ./bin/mlfs bench -mnt /mnt/mlfs 21 | 22 | tree /mnt/mlfs/ 23 | -------------------------------------------------------------------------------- /tenplex-run/cancelgroup/cancelgroup.go: 
// CancelGroup runs all processes in ps in parallel. Each process is
// wrapped so that, when it finishes without clearing its error, the
// shared cancel function is invoked (tearing down the siblings) and
// defaultErr is reported for that process; a process whose sequence
// completes resets its error to nil via FnOk (FnOk presumably runs
// only after a successful predecessor — confirm in the proc docs).
func CancelGroup(ps []P, defaultErr error, cancel context.CancelFunc) P {
	var qs []P
	for _, p := range ps {
		// err is captured per loop iteration by the closures below:
		// it starts as defaultErr and is only reset on success.
		var err error = defaultErr
		qs = append(qs,
			proc.Seq(
				// Ignore lets the outer Seq proceed even when p fails,
				// so the reporting Fn below always runs.
				proc.Ignore(
					proc.Seq(
						p,
						proc.FnOk(func() { err = nil }),
					),
				),
				proc.Fn(func() error {
					if err != nil {
						cancel()
					}
					return err
				}),
			))
	}
	return proc.Par(qs...)
}
// Node is a tree entry that is either a file or a directory.
// The file implementation below returns nil from AsDir, and the dir
// implementation returns nil from AsFile, so callers switch on IsDir.
type Node interface {
	IsDir() bool

	AsFile() FileNode
	AsDir() DirNode
}

// DirNode is a mutable directory: an ordered list of named children.
type DirNode interface {
	Items() []Item
	// Add appends a child entry: name, node id, and whether it is a directory.
	Add(string, int, bool)
	// Del removes the child entries with the given node id.
	Del(id int)
}

// FileNode is a random-access readable file of known size.
type FileNode interface {
	io.ReaderAt

	Open() io.ReadCloser
	Size() int64
}

// FileMode exposes optional mode information for a file.
type FileMode interface {
	IsExecutable() bool
}

// fileNode adapts a FileNode to the Node interface.
type fileNode struct {
	f FileNode
}

func (f *fileNode) IsDir() bool { return false }

func (f *fileNode) AsFile() FileNode { return f.f }

func (f *fileNode) AsDir() DirNode { return nil }
// BiMap is a bijective map between strings: each key maps to exactly
// one value and each value maps back to exactly one key.
type BiMap struct {
	fwd, rev map[string]string
}

// New returns an empty BiMap.
func New() *BiMap {
	return &BiMap{
		fwd: map[string]string{},
		rev: map[string]string{},
	}
}

// Add inserts the pair (k, v). It reports false and leaves the map
// unchanged when either k or v is already present, preserving the
// bijection.
func (m *BiMap) Add(k, v string) bool {
	// log.Printf("adding %s: %s", k, v)
	if _, taken := m.fwd[k]; taken {
		return false
	}
	if _, taken := m.rev[v]; taken {
		return false
	}
	m.fwd[k] = v
	m.rev[v] = k
	return true
}

// Get looks up the value stored under key k.
func (m *BiMap) Get(k string) (string, bool) {
	v, ok := m.fwd[k]
	return v, ok
}

// RGet performs the reverse lookup: the key stored for value v.
func (m *BiMap) RGet(v string) (string, bool) {
	// log.Printf("RGet %s", v)
	k, ok := m.rev[v]
	return k, ok
}
// fmap applies f to each element of xs and returns the results in
// order. It returns nil for empty input (matching the original
// append-based behavior).
func fmap[X any, Y any](f func(X) Y, xs ...X) []Y {
	if len(xs) == 0 {
		return nil
	}
	// Pre-size the result to avoid repeated append growth.
	ys := make([]Y, len(xs))
	for i, x := range xs {
		ys[i] = f(x)
	}
	return ys
}
def add_tenplex_args(parser: argparse.ArgumentParser):
    """Register the Tenplex command-line options on ``parser``.

    Adds an argument group named "Tenplex" containing the flags that
    control elastic training (job id, MLFS endpoint, scheduler
    address, ...) and returns the same parser for chaining.
    """
    # (flag, add_argument keyword args), in registration order.
    specs = [
        ("--tenplex", dict(action="store_true")),
        ("--mlfs-path", dict(type=str, default=None)),
        ("--jobid", dict(type=str, default=None)),
        ("--host-ip", dict(type=str, default=None)),
        ("--mlfs-port", dict(type=int, default=None)),
        ("--scheduler-addr", dict(type=str, default=None)),
        ("--tenplex-train-iters", dict(type=int, default=None)),
        ("--gen-para-config", dict(action="store_true")),
    ]

    group = parser.add_argument_group(title="Tenplex")
    for flag, kwargs in specs:
        group.add_argument(flag, **kwargs)

    return parser
6 | "github.com/lgarithm/proc" 7 | ) 8 | 9 | type ( 10 | P = proc.P 11 | ) 12 | 13 | var ( 14 | seq = proc.Seq 15 | pc = proc.PC 16 | ignore = proc.Ignore 17 | echo = proc.Echo 18 | try = proc.Try 19 | 20 | str = strconv.Itoa 21 | ) 22 | 23 | func dockerExec(cmd string, args ...string) P { 24 | ss := []string{ 25 | `exec`, `-t`, name, 26 | cmd, 27 | } 28 | ss = append(ss, args...) 29 | return pc(`docker`, ss...) 30 | } 31 | 32 | func dockerCp(a, b string) P { 33 | return pc(`docker`, `cp`, a, name+`:`+b) 34 | } 35 | 36 | func If(ok bool, p P) P { 37 | if ok { 38 | return p 39 | } 40 | return seq() 41 | } 42 | -------------------------------------------------------------------------------- /mlfs/vfs/dir.go: -------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | type Item struct { 4 | IsDir bool 5 | Name string 6 | Id int 7 | } 8 | 9 | type dir struct { 10 | items []Item 11 | } 12 | 13 | func (d *dir) IsDir() bool { return true } 14 | 15 | func (d *dir) AsFile() FileNode { return nil } 16 | 17 | func (d *dir) AsDir() DirNode { return d } 18 | 19 | func (d *dir) Items() []Item { return d.items } 20 | 21 | func (d *dir) Add(name string, id int, isdir bool) { 22 | d.items = append(d.items, Item{IsDir: isdir, Id: id, Name: name}) 23 | } 24 | 25 | func (d *dir) Del(id int) { 26 | var j int 27 | for i := range d.items { 28 | if d.items[i].Id != id { 29 | d.items[j] = d.items[i] 30 | j++ 31 | } 32 | } 33 | d.items = d.items[:j] 34 | } 35 | -------------------------------------------------------------------------------- /tests/test_load_http.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from tenplex.load import load_http 4 | 5 | 6 | def main(): 7 | # parser = argparse.ArgumentParser(description='Write checkpoint') 8 | # parser.add_argument('--device-rank', type=int) 9 | # parser.add_argument('--mlfs-path', type=str) 10 | # args = parser.parse_args() 11 | 12 | 
#!/bin/sh
set -e

# Seconds since the Unix epoch.
now_sec() {
    date +%s
}

# Format a duration (in seconds) for display.
_show_duration() {
    echo "$1s"
}

# measure CMD [ARGS...]: run CMD and append its wall-clock duration
# to measure.log. "$@" (quoted) preserves argument boundaries; the
# original unquoted $@ re-split arguments containing spaces.
# begin/end/duration stay local so the nested measure calls below do
# not clobber the outer timer.
measure() {
    echo "BEGIN $*"
    local begin=$(now_sec)
    "$@"
    local end=$(now_sec)
    local duration=$((end - begin))
    echo "END $*, took $(_show_duration "$duration")" | tee -a measure.log
}

# Wait until Docker is back up on all hosts.
wait_docker() {
    measure ansible-playbook -i hosts.txt ./docker.yml # took 269s
}

# Upgrade Tenplex on the whole cluster, then poll Docker readiness.
upgrade_cluster() {
    measure ansible-playbook -i hosts.txt ./tenplex.yml

    for i in $(seq 10); do
        wait_docker
        sleep 2
    done
}

measure upgrade_cluster
| ` 17 | return seq( 18 | runScript(a, script, `install-mlfs.sh`, false), 19 | ) 20 | } 21 | 22 | var runScript = experimental.RunScript 23 | -------------------------------------------------------------------------------- /tenplex-run/cluster/cluster_test.go: -------------------------------------------------------------------------------- 1 | package cluster_test 2 | 3 | import ( 4 | "flag" 5 | "testing" 6 | 7 | "github.com/kungfu-team/tenplex/tenplex-run/cluster" 8 | ) 9 | 10 | func Test_1(t *testing.T) { 11 | var c cluster.Cluster 12 | f := flag.NewFlagSet(`prog`, flag.ExitOnError) 13 | c.RegisterFlags(f) 14 | f.Parse([]string{ 15 | `-gpu-per-host`, `8`, 16 | `-hosts`, `1.2.3.4,4.3.2.1`, 17 | }) 18 | t.Logf("%#v", c) 19 | if c.GPUsPerHost != 8 { 20 | t.Errorf("parse -gpu-per-host failed: %q", c.GPUsPerHost) 21 | } 22 | if c.GPUsPerContainer != 4 { 23 | t.Errorf("default -gpu-per-container failed: %q", c.GPUsPerContainer) 24 | } 25 | if c.Hosts[0] != `1.2.3.4` { 26 | t.Errorf("parse -hosts failed: %q", c.Hosts) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_horovod/train-imagenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | run() { 5 | local np=$1 6 | shift 7 | horovodrun -np $np $@ 8 | } 9 | 10 | train_flags_disk() { 11 | echo --data-dir /data/imagenet/records 12 | } 13 | 14 | train_flags_tenplex() { 15 | echo --mlfs-dir /mnt/mlfs 16 | echo --job fig-13 17 | } 18 | 19 | with_log_file() { 20 | local filename=$1 21 | shift 22 | $@ | tee $filename 23 | echo "logged to $filename $ $@" 24 | } 25 | 26 | # with_log_file 1.log run 2 python3 ./imagenet_resnet.py $(train_flags_disk) 27 | # with_log_file 3.log run 2 python3 ./imagenet_resnet_horovod_elastic.py $(train_flags_disk) 28 | # with_log_file 2.log run 2 python3 ./imagenet_resnet.py $(train_flags_tenplex) 29 | 
// equal reports whether a and b have the same length and elements.
func equal[T int | string](a, b []T) bool {
	if len(a) != len(b) {
		return false
	}
	for i, v := range a {
		if v != b[i] {
			return false
		}
	}
	return true
}

// isInt reports whether str parses as a base-10 integer.
func isInt(str string) bool {
	_, err := strconv.Atoi(str)
	return err == nil
}

// isIntAndCheck reports whether arr[key] exists and parses as an
// integer. A negative or out-of-range key returns false instead of
// panicking (the original only checked the upper bound).
func isIntAndCheck(arr []string, key int) bool {
	if key < 0 || key >= len(arr) {
		return false
	}
	return isInt(arr[key])
}

// equalAndCheck reports whether arr[key] exists and equals val.
// A negative or out-of-range key returns false.
func equalAndCheck[T int | string](arr []T, key int, val T) bool {
	if key < 0 || key >= len(arr) {
		return false
	}
	return arr[key] == val
}
// Detect returns the first IPv4 address found on the network
// interface named nicName, or on any interface when nicName is
// empty. It returns "" when no IPv4 address is found or the
// interfaces cannot be listed.
func Detect(nicName string) string {
	nics, err := net.Interfaces()
	if err != nil {
		return ""
	}
	for _, nic := range nics {
		if nicName != "" && nic.Name != nicName {
			continue
		}
		addrs, err := nic.Addrs()
		if err != nil {
			continue
		}
		for _, addr := range addrs {
			if ip := ipv4Of(addr); ip != "" {
				// fmt.Printf("%s %s\n", nic.Name, ip)
				return ip
			}
		}
	}
	return ""
}

// ipv4Of extracts the IPv4 address from addr, or "" if it has none.
func ipv4Of(addr net.Addr) string {
	var ip net.IP
	switch v := addr.(type) {
	case *net.IPNet:
		ip = v.IP
	case *net.IPAddr:
		ip = v.IP
	}
	if ip4 := ip.To4(); ip4 != nil {
		return ip4.String()
	}
	return ""
}
// Cluster describes the machines available to a job: how many GPUs
// each host has and how many each container receives. The struct
// tags drive flag registration through structflag (see
// RegisterFlags); both GPU counts default to 4.
type Cluster struct {
	GPUsPerHost      int              `flag:"gpu-per-host" default:"4"`
	GPUsPerContainer int              `flag:"gpu-per-container" default:"4"`
	Hosts            listflag.Strings `flag:"hosts"`
}

// NewCluster builds a Cluster from explicit values, bypassing flag
// parsing and the tag defaults.
func NewCluster(gpuPerHost int, gpusPerContainer int, hosts ...string) *Cluster {
	return &Cluster{
		GPUsPerHost:      gpuPerHost,
		GPUsPerContainer: gpusPerContainer,
		Hosts:            hosts,
	}
}

// RegisterFlags registers the tagged fields above as flags on the given FlagSet.
func (c *Cluster) RegisterFlags(flag *flag.FlagSet) { structflag.RegisterFlags(c, flag) }
// pstr is an absolute, slash-joined path string.
type pstr string

// filepath is a path split into its non-empty components.
type filepath []string

// P renders the components as an absolute slash-separated path;
// the empty (root) path renders as "/".
func (p filepath) P() pstr {
	return pstr(`/` + strings.Join(p, `/`))
}

var errNoParent = errors.New(`root has no parent`)

// parent returns the path with its last component removed.
// It panics with errNoParent on the root (empty) path.
func (p filepath) parent() filepath {
	if len(p) == 0 {
		panic(errNoParent)
	}
	return p[:len(p)-1]
}

// basename returns the last component. It panics with errNoParent
// on the root (empty) path.
func (p filepath) basename() string {
	if len(p) == 0 {
		panic(errNoParent)
	}
	return p[len(p)-1]
}

// ParseP splits p on '/' and drops empty components, so leading,
// trailing, and repeated slashes are ignored.
func ParseP(p string) filepath {
	var parts filepath
	for _, name := range strings.Split(p, `/`) {
		if name != `` {
			parts = append(parts, name)
		}
	}
	return parts
}
log.Printf("target device %v", conf.TargetRank) 20 | if err := statetransform.MigrateState(&conf, conf.TargetRank); err != nil { 21 | log.Panicf("Transformation for device %d failed with %v", conf.TargetRank, err) 22 | } 23 | log.Printf("State transformation took %s", time.Since(startTransform)) 24 | } 25 | -------------------------------------------------------------------------------- /tenplex-run/.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | - push 5 | 6 | jobs: 7 | linux: 8 | runs-on: ubuntu-20.04 9 | 10 | steps: 11 | - uses: actions/checkout@v2 12 | 13 | - uses: actions/setup-go@v2 14 | with: 15 | go-version: '1.18' 16 | 17 | - run: sudo apt install -y openssh-client # for ssh-keygen 18 | - run: mkdir -p $HOME/.ssh 19 | - run: echo "${KUNGFU_RSA}" > $HOME/.ssh/id_rsa 20 | env: 21 | KUNGFU_RSA: ${{ secrets.KUNGFU_RSA }} 22 | - run: chmod 0600 $HOME/.ssh/id_rsa 23 | - run: ssh-keygen -y -f $HOME/.ssh/id_rsa > $HOME/.ssh/id_rsa.pub 24 | 25 | - run: git config --global url."git@github.com:".insteadOf "https://github.com/" 26 | - run: go env -w GOPRIVATE=* 27 | 28 | - run: GO=$(which go) make 29 | -------------------------------------------------------------------------------- /mlfs/iotrace/io.go: -------------------------------------------------------------------------------- 1 | package iotrace 2 | 3 | import "io" 4 | 5 | type TracedWriter struct { 6 | w io.Writer 7 | c *Counter 8 | } 9 | 10 | func TraceWriter(w io.Writer, c *Counter) io.Writer { 11 | return &TracedWriter{ 12 | w: w, 13 | c: c, 14 | } 15 | } 16 | 17 | func (w *TracedWriter) Write(bs []byte) (int, error) { 18 | n, err := w.w.Write(bs) 19 | w.c.Add(int64(n)) 20 | return n, err 21 | } 22 | 23 | type TracedReader struct { 24 | r io.Reader 25 | c *Counter 26 | } 27 | 28 | func TraceReader(r io.Reader, c *Counter) io.Reader { 29 | if c == nil { 30 | return r 31 | } 32 | return &TracedReader{ 33 | r: r, 34 | c: c, 
35 | } 36 | } 37 | 38 | func (r *TracedReader) Read(bs []byte) (int, error) { 39 | n, err := r.r.Read(bs) 40 | r.c.Add(int64(n)) 41 | return n, err 42 | } 43 | -------------------------------------------------------------------------------- /state_transformer/statetransform/repartition_test.go: -------------------------------------------------------------------------------- 1 | package statetransform 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestMapToUnitedRequests(t *testing.T) { 8 | sourceDim := 256 9 | targetDim := 512 10 | sourceMPSize := 4 11 | targetMPRank := 1 12 | reqs, err := mapToUnitedRequests(sourceDim, targetDim, sourceMPSize, targetMPRank) 13 | if err != nil { 14 | t.Fail() 15 | } 16 | t.Logf("requests %v", reqs) 17 | } 18 | 19 | func TestMapToSourceRequests(t *testing.T) { 20 | sourceDim := 256 21 | targetDim := 512 22 | sourceMPSize := 4 23 | targetMPRank := 1 24 | reqs, err := mapToUnitedRequests(sourceDim, targetDim, sourceMPSize, targetMPRank) 25 | if err != nil { 26 | t.Fail() 27 | } 28 | reqs = mapToSourceRequests(reqs, sourceDim) 29 | t.Logf("requests %v", reqs) 30 | } 31 | -------------------------------------------------------------------------------- /tenplex/stop.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | import torch 5 | 6 | 7 | def check_stop(scheduler_addr: str): 8 | if scheduler_addr is None: 9 | return False 10 | stop = False 11 | rank = torch.distributed.get_rank() 12 | if rank == 0: 13 | url = scheduler_addr 14 | url = os.path.join(url, "stop") 15 | req = requests.get(url, timeout=12) 16 | txt = req.text 17 | if txt == "stop": 18 | stop = True 19 | if stop: 20 | stop_ten = torch.tensor(1, dtype=torch.int32, device=torch.device("cuda")) 21 | else: 22 | stop_ten = torch.tensor(0, dtype=torch.int32, device=torch.device("cuda")) 23 | torch.distributed.all_reduce(stop_ten) 24 | if stop_ten > 0: 25 | return True 26 | return False 27 | 
def main():
    """CLI entry point: load a checkpoint from disk and upload it via save().

    Flags mirror the parameters of save(); see tenplex/save.py.
    """
    parser = argparse.ArgumentParser(description='Write checkpoint')
    parser.add_argument('--ckpt-path', type=str)
    parser.add_argument('--job-id', type=str)
    # save() annotates step as int; parsing it as str here silently passed a
    # string through. int still accepts the same numeric CLI values.
    parser.add_argument('--step', type=int)
    parser.add_argument('--device-rank', type=int)
    parser.add_argument('--mlfs-path', type=str)
    parser.add_argument('--ip', type=str)
    parser.add_argument('--port', type=int)
    args = parser.parse_args()

    # map_location='cpu' so checkpoints saved on GPU load on GPU-less hosts.
    ckpt = torch.load(args.ckpt_path, map_location='cpu')
    save(ckpt, args.job_id, args.step, args.device_rank, args.mlfs_path,
         args.ip, args.port)


if __name__ == '__main__':
    main()
/benchmark/dynamic_resources/README.md: -------------------------------------------------------------------------------- 1 | # Dynamic resources 2 | _Fig. 9. Elastic DL job convergence with multi-dimensional parallelism under dynamic GPU changes_ 3 | 4 | First, we explore the benefits of supporting elasticity in DL jobs with multi-dimensional parallelism, scaling across all parallelism dimensions when the GPU allocation changes. 5 | In this experiment, we train DL jobs with the GPT-3 XL model on the on-premise 16-GPU cluster. The job runtime and elastic scaling events are derived based on Microsoft’s Philly trace: over the runtime of 538 mins, we scale based on the average every 35 mins. During a scaling event, we change the number of GPUs for a job between 16, 8, and 4 GPUs. 6 | 7 | # Run 8 | ```sh 9 | ./run.sh 10 | ``` 11 | 12 | ## Note 13 | The dynamic resources experiment runs for about 24 hours 14 | -------------------------------------------------------------------------------- /scheduler/scripts/upload-logs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | export PATH=$PATH:$HOME/local/bin 5 | 6 | cd $(dirname $0)/.. 7 | 8 | if [ ! 
// Buffer is a growable in-memory byte store supporting random-access reads
// and writes. The zero value is an empty buffer ready for use.
type Buffer struct {
	bs []byte
}

// NewBuffer returns a new, empty Buffer.
func NewBuffer() *Buffer {
	return &Buffer{}
}

// Open returns a reader over the current contents. The reader aliases the
// underlying storage rather than copying it.
func (f *Buffer) Open() io.ReadCloser {
	r := bytes.NewBuffer(f.bs)
	return io.NopCloser(r)
}

// Size returns the current length of the buffer in bytes.
func (f *Buffer) Size() int64 {
	return int64(len(f.bs))
}

// Truncate discards the contents, resetting the buffer to empty.
func (f *Buffer) Truncate() {
	f.bs = nil
}

// ReadAt implements io.ReaderAt: it copies bytes starting at offset pos
// into buf. The previous implementation sliced f.bs[pos:] unconditionally,
// panicking for pos past the end, and returned a short read with a nil
// error, violating the io.ReaderAt contract; it now reports io.EOF in both
// cases.
func (f *Buffer) ReadAt(buf []byte, pos int64) (int, error) {
	if pos < 0 || pos >= int64(len(f.bs)) {
		return 0, io.EOF
	}
	n := copy(buf, f.bs[pos:])
	if n < len(buf) {
		return n, io.EOF
	}
	return n, nil
}

// WriteAt implements io.WriterAt: it writes buf at offset pos, growing the
// buffer with zero padding as needed. It always writes len(buf) bytes.
func (f *Buffer) WriteAt(buf []byte, pos int64) (int, error) {
	if n := len(buf) + int(pos); n > len(f.bs) {
		f.bs = append(f.bs, make([]byte, n-len(f.bs))...)
	}
	return copy(f.bs[pos:], buf), nil
}
got, err 39 | } 40 | return sum == got, got, nil 41 | } 42 | -------------------------------------------------------------------------------- /mlfs/ds/mnist.go: -------------------------------------------------------------------------------- 1 | package ds 2 | 3 | import "github.com/kungfu-team/tenplex/mlfs/hash" 4 | 5 | var ( 6 | MnistTrainImages = hash.HashedFile{ 7 | MD5: `f68b3c2dcbeaaa9fbdd348bbdeb94873`, 8 | URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz`}, 9 | } 10 | MnistTrainLabels = hash.HashedFile{ 11 | MD5: `d53e105ee54ea40749a09fcbcd1e9432`, 12 | URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz`}, 13 | } 14 | MnistTestImages = hash.HashedFile{ 15 | MD5: `9fb629c4189551a2d022fa330f9573f3`, 16 | URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz`}, 17 | } 18 | MnistTestLabels = hash.HashedFile{ 19 | MD5: `ec29112dd5afa0611ce80d1b7f02629c`, 20 | URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz`}, 21 | } 22 | ) 23 | -------------------------------------------------------------------------------- /scheduler/job/job.go: -------------------------------------------------------------------------------- 1 | package job 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/ds" 7 | "github.com/kungfu-team/tenplex/scheduler/scalepoint" 8 | ) 9 | 10 | type Job struct { 11 | Framework string 12 | Precision string 13 | BatchSize int 14 | MicroBatchSize int 15 | SequenceLength int 16 | Dataset ds.Dataset 17 | Image string 18 | Model string 19 | ID string 20 | Steps int 21 | ModelSize string 22 | NumLayers int 23 | VocabSize int 24 | Failure int 25 | } 26 | 27 | func ShowJobIds(jss ...[]Job) string { 28 | var ids []string 29 | for _, js := range jss { 30 | for _, j := range js { 31 | ids = append(ids, j.ID) 32 | } 33 | } 34 | return strings.Join(ids, ",") 35 | } 36 | 37 | 
// parseTime parses an RFC 3339 timestamp, the format used by the `se`
// (signed expiry) field of an Azure SAS token query string. The previous
// implementation achieved the same thing by round-tripping the string
// through encoding/json (whose time.Time decoding is RFC 3339); calling
// time.Parse directly is equivalent and avoids the Sprintf/Unmarshal hack.
func parseTime(s string) (*time.Time, error) {
	t, err := time.Parse(time.RFC3339, s)
	if err != nil {
		return nil, err
	}
	return &t, nil
}
| args := structflag.ToGoArgs(&a) 36 | want := `-name abc -x 2 -ok` 37 | if got := strings.Join(args, " "); got != want { 38 | t.Errorf("%q != %q", got, want) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /tenplex-run/runop/redundancy.go: -------------------------------------------------------------------------------- 1 | package runop 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/mlfs" 7 | "github.com/kungfu-team/tenplex/mlfs/pid" 8 | "github.com/kungfu-team/tenplex/tenplex-run/job" 9 | ) 10 | 11 | func setRedundancy(jobConf *job.JobConfig) error { 12 | redu := 1 13 | 14 | var peerList mlfs.PeerList 15 | for _, host := range jobConf.Cluster.Hosts { 16 | peerList = append(peerList, mlfs.Peer{IPv4: pid.MustParseIPv4(host), Port: mlfs.DefaultCtrlPort}) 17 | } 18 | 19 | for _, host := range jobConf.Cluster.Hosts { 20 | cli, err := mlfs.NewClientTo(host, mlfs.DefaultCtrlPort) 21 | if err != nil { 22 | return fmt.Errorf("%s %v", host, err) 23 | } 24 | err = cli.SetPeers(peerList) 25 | if err != nil { 26 | return err 27 | } 28 | err = cli.SetRedundency(redu) 29 | if err != nil { 30 | return err 31 | } 32 | } 33 | return nil 34 | } 35 | -------------------------------------------------------------------------------- /mlfs/vfs/tree_debug.go: -------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | ) 7 | 8 | func (t *Tree) Dump(w io.Writer) { 9 | for i, p := range t.ps { 10 | n := t.nodes[i] 11 | var c rune 12 | var size int 13 | var unit string 14 | if n.IsDir() { 15 | c = 'd' 16 | size = len(n.AsDir().Items()) 17 | unit = `files` 18 | } else { 19 | c = '-' 20 | size = int(n.AsFile().Size()) 21 | unit = `bytes` 22 | } 23 | fmt.Fprintf(w, "%8d %c %12d %s %s\n", i, c, size, unit, p) 24 | } 25 | fmt.Fprintf(w, "%d nodes\n", t.Count()) 26 | } 27 | 28 | func (t *Tree) Stat() { 29 | n := t.Count() 30 | nd := t.nDirs 31 | 
nf := n - nd 32 | fmt.Printf("%d nodes, %d dirs, %d files\n", n, nd, nf) 33 | } 34 | 35 | func (t *Tree) AllFiles(w io.Writer) { 36 | for i, p := range t.ps { 37 | n := t.nodes[i] 38 | if !n.IsDir() { 39 | fmt.Fprintf(w, "%s\n", p) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /.github/workflows/deb.yml: -------------------------------------------------------------------------------- 1 | name: deb 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | test: 10 | # https://help.github.com/en/articles/virtual-environments-for-github-actions#supported-virtual-environments 11 | runs-on: ubuntu-20.04 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | with: 16 | fetch-depth: 0 17 | 18 | - run: make 19 | - run: make deb 20 | 21 | - run: | 22 | KEY_FILE=$HOME/gcloud-key.json 23 | echo "${GCLOUD_KEY}" > $KEY_FILE 24 | gcloud auth login --cred-file=$KEY_FILE 25 | rm $KEY_FILE 26 | 27 | gcloud config set project tenplex 28 | env: 29 | GCLOUD_KEY: ${{ secrets.GCLOUD_KEY }} 30 | 31 | - run: | 32 | REPO=tenplex 33 | DEB=`ls build/*.deb` 34 | LOC=europe-west2 35 | gcloud artifacts apt upload $REPO --location=$LOC --source=$DEB 36 | -------------------------------------------------------------------------------- /benchmark/redeployment/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | . 
$(dirname $0)/../common.sh 6 | 7 | hosts() { 8 | echo "10.10.10.1" 9 | echo "10.10.10.2" 10 | echo "10.10.10.3" 11 | echo "10.10.10.4" 12 | } 13 | 14 | model_sizes() { 15 | echo "6.7B" 16 | echo "2.7B" 17 | echo "xl" 18 | } 19 | 20 | comb_flags() { 21 | base_flags 22 | echo -model "gpt" 23 | echo -dataset "enwiki" 24 | echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt" 25 | echo -hosts $(join $(hosts)) 26 | echo -schedule "$(dirname $0)/schedule.json" 27 | echo -model-sizes $(join $(model_sizes)) 28 | echo -batch-sizes 128 29 | echo -micro-batch-sizes 8 30 | echo -para-config "$(dirname $0)/para-config.json" 31 | echo -redeploy 32 | echo -central 33 | } 34 | 35 | tenplex-multi-experiment $(comb_flags) 2>&1 | tee redeploy.log 36 | 37 | python plot.py 38 | -------------------------------------------------------------------------------- /mlfs/cmd/tests/cmd/mlfs-debug/mlfs-debug.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "log" 7 | "os" 8 | 9 | "github.com/kungfu-team/tenplex/mlfs/ds" 10 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile" 11 | ) 12 | 13 | var ( 14 | dataset = flag.String(`ds`, ``, ``) 15 | ) 16 | 17 | func main() { 18 | flag.Parse() 19 | var ds ds.Dataset 20 | panicErr(loadJSONFile(*dataset, &ds)) 21 | log.Printf("%q", ds.IndexURL) 22 | i, err := vfile.LoadIdxFile(ds.IndexURL) 23 | if err != nil { 24 | panic(err) 25 | } 26 | i.SetHost(``) 27 | for _, f := range i { 28 | log.Printf("%q", f.Filepath) 29 | } 30 | } 31 | 32 | func loadJSONFile(filename string, i interface{}) error { 33 | f, err := os.Open(filename) 34 | if err != nil { 35 | return err 36 | } 37 | return json.NewDecoder(f).Decode(i) 38 | } 39 | func panicErr(err error) { 40 | if err != nil { 41 | panic(err) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /mlfs/vfs/file.go: 
-------------------------------------------------------------------------------- 1 | package vfs 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | ) 7 | 8 | type file struct { 9 | bs []byte 10 | } 11 | 12 | func ToFile(bs []byte) *file { return &file{bs: bs} } 13 | 14 | func (f *file) IsDir() bool { return false } 15 | 16 | func (f *file) IsExecutable() bool { return bytes.HasPrefix(f.bs, []byte(`#!`)) } 17 | 18 | func (f *file) AsFile() FileNode { return f } 19 | 20 | func (f *file) AsDir() DirNode { return nil } 21 | 22 | func (f *file) Open() io.ReadCloser { 23 | r := bytes.NewBuffer(f.bs) 24 | return io.NopCloser(r) 25 | } 26 | 27 | func (f *file) Size() int64 { 28 | return int64(len(f.bs)) 29 | } 30 | 31 | func (f *file) ReadAt(buf []byte, pos int64) (int, error) { 32 | n := min(len(buf), len(f.bs)-int(pos)) 33 | copy(buf[:n], f.bs[pos:(pos)+int64(n)]) 34 | return n, nil 35 | } 36 | 37 | func min(a, b int) int { 38 | if a < b { 39 | return a 40 | } 41 | return b 42 | } 43 | -------------------------------------------------------------------------------- /mlfs/hash/md5.go: -------------------------------------------------------------------------------- 1 | package hash 2 | 3 | import ( 4 | "crypto/md5" 5 | "fmt" 6 | "io" 7 | "os" 8 | 9 | "github.com/kungfu-team/tenplex/mlfs/iotrace" 10 | ) 11 | 12 | type md5db struct { 13 | hashToPath map[string]string 14 | pathToHash map[string]string 15 | } 16 | 17 | func NewMD5DB() *md5db { 18 | db := &md5db{ 19 | hashToPath: make(map[string]string), 20 | pathToHash: make(map[string]string), 21 | } 22 | return db 23 | } 24 | 25 | // func (db*md5db) 26 | 27 | func FileMD5(c *iotrace.Counter, filename string) (string, error) { 28 | f, err := os.Open(filename) 29 | if err != nil { 30 | return "", err 31 | } 32 | defer f.Close() 33 | return md5sum(f, c) 34 | } 35 | 36 | func md5sum(r io.Reader, c *iotrace.Counter) (string, error) { 37 | h := md5.New() 38 | if _, err := io.Copy(h, iotrace.TraceReader(r, c)); err != nil { 39 | return "", 
// Tensor is an alias for the shared tensor type used across tenplex.
type Tensor = tensor.Tensor

// TouchTensor publishes tensor t in the virtual tree at path p. Two nodes
// are created: p+".meta", a small text file listing the dtype, the number
// of dimensions, and each dimension (one value per line); and p itself,
// holding the raw tensor bytes. On error the partial state (a created
// .meta without data) is left in the tree — presumably callers retry;
// TODO confirm.
func (e *MLFS) TouchTensor(p string, t *Tensor) error {
	log.Printf("TouchTensor: %q", p)
	// Render the metadata file: dtype, rank, then one line per dimension.
	if _, err := e.tree.TouchText(p+`.meta`, func() string {
		bs := &bytes.Buffer{}
		fmt.Fprintf(bs, "%s\n", t.Dtype)
		dims := t.Dims
		fmt.Fprintf(bs, "%d\n", len(dims))
		for _, d := range dims {
			fmt.Fprintf(bs, "%d\n", d)
		}
		return bs.String()
	}()); err != nil {
		log.Printf("TouchTensor meta: %q", p)
		return err
	}
	// The whole payload is held in RAM by vfs.ToFile (no copy of t.Data).
	// TODO: support write large bytes to read files instead of RAM
	if _, err := e.tree.TouchFile(p, vfs.ToFile(t.Data)); err != nil {
		log.Printf("TouchTensor data: %q", p)
		return err
	}
	return nil
}
| } 37 | 38 | docker_run() { docker run $(docker_run_flags) -t $tag $@; } 39 | 40 | main() { 41 | if [ -z "$1" ]; then 42 | docker_run bash 43 | else 44 | docker_run $@ 45 | fi 46 | } 47 | 48 | main $@ 49 | -------------------------------------------------------------------------------- /benchmark/failure/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | . ../common.sh 5 | 6 | hosts() { 7 | echo "10.10.10.1" 8 | echo "10.10.10.2" 9 | echo "10.10.10.3" 10 | echo "10.10.10.4" 11 | } 12 | 13 | 14 | flags() { 15 | echo -framework "megatron-lm" 16 | echo -model "gpt" 17 | echo -model-size "2.7B" 18 | echo -dataset "enwiki" 19 | echo -batch-size 128 20 | echo -micro-batch-size 8 21 | echo -precision "fp16" 22 | echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt" 23 | echo -hosts "$(join $(hosts))" 24 | echo -schedule-file "schedule.json" 25 | echo -para-config "para-config.json" 26 | echo -mlfs-port 20010 27 | echo -gpu-per-host 4 28 | echo -gpu-per-container 4 29 | echo -seq-length 1024 30 | } 31 | 32 | for i in 4 8 12 33 | do 34 | tenplex-run $(flags) -failure $i 2>&1 | tee failure_$i.log 35 | mv logs logs_$i 36 | done 37 | 38 | python plot.py 39 | -------------------------------------------------------------------------------- /scheduler/scripts/recreate-vmss.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | cd $(dirname $0)/.. 5 | . ./scripts/config.sh 6 | 7 | # The Image type for a Virtual Machine Scale Set may not be changed. 
8 | # image=tenplex-base-image 9 | image="tenplex-mw" 10 | image=$(az image show -n $image -g kungfu | jq -r .id) 11 | echo "Using image $image" 12 | 13 | storage=Premium_LRS # SSD 14 | 15 | flags() { 16 | echo --admin-username kungfu 17 | echo --vnet-name tenplex-relayVNET 18 | echo --subnet tenplex-relaySubnet 19 | echo --disable-overprovision 20 | echo --image $image 21 | echo --instance-count 0 22 | echo --vm-sku $size 23 | echo --location westeurope 24 | echo --storage-sku $storage 25 | # echo --lb '""' 26 | } 27 | 28 | recreate() { 29 | az vmss delete -g $group -n $name 30 | echo "deleted $name" 31 | 32 | az vmss create -g $group -n $name $(flags) --lb "" 33 | echo "created $name" 34 | } 35 | 36 | recreate 37 | -------------------------------------------------------------------------------- /benchmark/dynamic_resources/pytorch-schedule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "time": 35, 4 | "size": 16 5 | }, 6 | { 7 | "time": 35, 8 | "size": 8 9 | }, 10 | { 11 | "time": 35, 12 | "size": 8 13 | }, 14 | { 15 | "time": 35, 16 | "size": 16 17 | }, 18 | { 19 | "time": 35, 20 | "size": 8 21 | }, 22 | { 23 | "time": 35, 24 | "size": 8 25 | }, 26 | { 27 | "time": 35, 28 | "size": 16 29 | }, 30 | { 31 | "time": 35, 32 | "size": 8 33 | }, 34 | { 35 | "time": 35, 36 | "size": 8 37 | }, 38 | { 39 | "time": 35, 40 | "size": 16 41 | }, 42 | { 43 | "time": 35, 44 | "size": 8 45 | }, 46 | { 47 | "time": 13, 48 | "size": 8 49 | }, 50 | { 51 | "time": 0, 52 | "size": 0 53 | } 54 | ] 55 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_cluster_size/recreate-vmss.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | . ./config.sh 5 | 6 | # The Image type for a Virtual Machine Scale Set may not be changed. 
// TestScheduler checks that a locally running scheduler answers the /stop
// endpoint with HTTP 200 and logs the response body for inspection.
// (The file carries a `//go:build exclude` tag, so this runs only when
// explicitly enabled.)
func TestScheduler(t *testing.T) {
	ip := "localhost"
	port := 22222
	url := fmt.Sprintf("http://%s:%d/stop", ip, port)

	resp, err := http.Get(url)
	if err != nil {
		t.Fatalf("error %v", err)
	}
	// The body was previously never closed, leaking the connection.
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		// The old message said "POST failed", but the request is a GET.
		t.Fatalf("GET failed, status code: %d", resp.StatusCode)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		t.Fatalf("error %v", err)
	}
	t.Logf("body %s", string(body))
}
class Logger(object):
    """Tracks training throughput (images/sec) and prints periodic reports.

    Every `log_period` calls to add(), the throughput measured since the
    previous report is printed and recorded for the final summary produced
    by report().
    """

    def __init__(self, log_period=10) -> None:
        self.t0 = time.time()  # start of the current measurement window
        self.img_secs = []  # one throughput sample per completed window
        self.step = 0  # total number of steps recorded
        self.trained = 0  # images processed in the current window
        self.log_period = log_period

    def add(self, trained):
        """Record one training step that processed `trained` images."""
        self.step += 1
        self.trained += trained

        if self.step % self.log_period == 0:
            t1 = time.time()
            # Guard against a zero-duration window on coarse clocks.
            took = max(t1 - self.t0, 1e-12)

            img_sec = self.trained / took
            self.t0 = t1
            self.trained = 0

            print('step #%d : %.1f img/sec' % (self.step, img_sec))
            self.img_secs.append(img_sec)

    def report(self):
        """Print the mean throughput with a 95% confidence interval.

        If no window ever completed, print a placeholder instead of the
        NaN (and numpy RuntimeWarning) that np.mean([]) would produce.
        """
        if not self.img_secs:
            print('RESULT Img/sec: no data')
            return
        img_sec_mean = np.mean(self.img_secs)
        img_sec_conf = 1.96 * np.std(self.img_secs)
        print('RESULT Img/sec: %.1f +-%.1f' % (img_sec_mean, img_sec_conf))
23 | RUN ./scripts/pack.sh 24 | RUN cp ./build/*.deb mlfs.deb 25 | 26 | FROM ubuntu:jammy 27 | 28 | RUN apt update 29 | RUN apt install -y systemd init fuse 30 | 31 | COPY --from=builder /src/mlfs.deb / 32 | RUN dpkg -i /mlfs.deb && rm /mlfs.deb 33 | -------------------------------------------------------------------------------- /benchmark/convergence_impact/README.md: -------------------------------------------------------------------------------- 1 | # Convergence impact 2 | _Fig. 2. Impact of GPU change on training convergence (Changing GPUs from 2 to 4 with GPT-3 and MNIST)_ 3 | 4 | Fig. 2a shows how model convergence, plotted as the loss value, is affected after adding a GPU (vertical orange line) under data parallelism. The solid black line shows regular model convergence with a static GPU allocation; the dashed red line shows convergence after the scale-out event when the dataset is processed inconsistently after re-partitioning: when resuming the training in the middle of the epoch, the first half of the training data is used twice, which overfits the model and reduces the loss value unreasonably. 5 | 6 | In Fig. 2b, we show how the global batch size must be kept constant after adding a GPU (vertical orange line) under data parallelism. The solid black line shows model convergence (measured as loss) without the GPU change. The dashed red line shows the divergence when the GPU allocation changes but the device batch size remains constant. 
def save(
    ckpt: dict,
    job_id: str,
    step: int,
    device_rank: int,
    mlfs_path: str,
    ip: str,
    port: int,
):
    """Upload a (possibly nested) checkpoint dict to the MLFS daemon at ip:port.

    Any existing save directory for this job/device is deleted first so the
    new checkpoint fully replaces it; afterwards the job's ``iter`` file is
    set to ``step``.

    NOTE(review): ``mlfs_path`` is accepted but never used in this body —
    presumably kept for interface compatibility with callers; confirm.
    """
    path = f"/job/{job_id}/save/{device_rank}"
    print(f"save checkpoint to {path}")

    client = MLFSClient(ip, port)

    # Probe for an existing save dir; an HTTPError here just means this is
    # the first save for this job/device.
    dire = None
    try:
        dire = client.get_dir(path)
    except requests.HTTPError:
        print(f"{path} does not exist yet")

    if dire:
        try:
            client.delete(path)
            print("deleted previous save dir")
        except requests.HTTPError as err:
            # A failed delete would leave stale files mixed in with the new
            # checkpoint, so surface the error to the caller.
            print(f"save delete {path} {err}")
            print(f"number of elements in dir {len(dire)}")
            raise err

    # Recursively upload the checkpoint dict, then record the step.
    client.save_traverse(ckpt, path)
    client.upload_txt(f"job/{job_id}/iter", str(step))

    print(f"did save checkpoint to {path}")
err := LoadStructs(&conf, rankMap, true) 32 | if err != nil { 33 | t.Logf("Error %v", err) 34 | return 35 | } 36 | t.Logf("Structures length %d", len(stru)) 37 | } 38 | -------------------------------------------------------------------------------- /mlfs/uri/monitor.go: -------------------------------------------------------------------------------- 1 | package uri 2 | 3 | import ( 4 | "io" 5 | 6 | "github.com/kungfu-team/tenplex/mlfs/closer" 7 | "github.com/kungfu-team/tenplex/mlfs/iotrace" 8 | ) 9 | 10 | type monitor struct { 11 | c *iotrace.Counter 12 | name string 13 | } 14 | 15 | func newMonitor(name string) *monitor { 16 | m := &monitor{ 17 | c: iotrace.NewCounter(), 18 | name: name, 19 | } 20 | go m.Run() 21 | return m 22 | } 23 | 24 | func (m *monitor) Run() { 25 | iotrace.Monitor(m.c, m.name) 26 | } 27 | 28 | func (m *monitor) Trace(r io.ReadCloser) io.ReadCloser { 29 | r1 := iotrace.TraceReader(r, m.c) 30 | return closer.ReadClose(r1, r.Close) 31 | } 32 | 33 | var ( 34 | httpRangeRate = newMonitor(`http partial download rate: `) 35 | httpFullRate = newMonitor(`http full download rate: `) 36 | fileReadRate = newMonitor(`file read rate: `) 37 | ) 38 | 39 | func withHTTPTrace(f io.ReadCloser, bgn, end int64) io.ReadCloser { 40 | if bgn == 0 && end == -1 { 41 | f = httpFullRate.Trace(f) 42 | } else { 43 | f = httpRangeRate.Trace(f) 44 | } 45 | return f 46 | } 47 | -------------------------------------------------------------------------------- /benchmark/reconfiguration_parallelization/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | . 
var str = strconv.Itoa

// TransformerSize captures the three hyper-parameters that fix the size of a
// transformer model: layer count, hidden dimension, and attention head count.
type TransformerSize struct {
	Layers         int
	HiddenSize     int
	AttentionHeads int
}

// ToPyArgs renders the size as Megatron-LM style command line flags.
func (s TransformerSize) ToPyArgs() []string {
	args := make([]string, 0, 6)
	args = append(args, `--num-layers`, str(s.Layers))
	args = append(args, `--hidden-size`, str(s.HiddenSize))
	args = append(args, `--num-attention-heads`, str(s.AttentionHeads))
	return args
}

// TFSize is a shorthand constructor for TransformerSize.
func TFSize(nl, hs, ah int) TransformerSize {
	return TransformerSize{
		Layers:         nl,
		HiddenSize:     hs,
		AttentionHeads: ah,
	}
}
23 | echo = proc.Echo 24 | lmd = proc.Lambda 25 | ignore = proc.Ignore 26 | urpc = proc.Urpc 27 | ) 28 | 29 | func getPubIP(name, group string) string { 30 | o := string(out(psh(azcli.GetPubIP(name, group)))) 31 | return strings.Trim(o, "\n\"") 32 | } 33 | 34 | func getIP(name, group string) string { 35 | o := string(out(psh(azcli.GetIP(name, group)))) 36 | return strings.Trim(o, "\n\"") 37 | } 38 | 39 | func fmap[X any, Y any](f func(X) Y, xs ...X) []Y { 40 | var ys []Y 41 | for _, x := range xs { 42 | ys = append(ys, f(x)) 43 | } 44 | return ys 45 | } 46 | 47 | func parmap[T any](f func(T) P, xs ...T) P { return par(fmap(f, xs...)...) } 48 | -------------------------------------------------------------------------------- /mlfs/docker/ubuntu/2004/Dockerfile: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sh -c 'docker build --rm --build-arg SSH_KEY="${SSH_KEY}" -t $(cat $(dirname $0)/tag.txt) -f $0 $(dirname $0)/../../..' 2 | 3 | FROM ubuntu:focal AS builder 4 | 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | RUN apt update 8 | RUN apt install -y software-properties-common 9 | RUN add-apt-repository ppa:longsleep/golang-backports 10 | RUN apt install -y golang-go make git cmake 11 | 12 | RUN mkdir $HOME/.ssh 13 | RUN echo "StrictHostKeyChecking no" >$HOME/.ssh/config 14 | ARG SSH_KEY 15 | RUN echo "${SSH_KEY}" >$HOME/.ssh/id_rsa 16 | RUN chmod 0600 $HOME/.ssh/id_rsa 17 | RUN ssh-keygen -y -f $HOME/.ssh/id_rsa >$HOME/.ssh/id_rsa.pub 18 | 19 | RUN git config --global url."git@github.com:".insteadOf "https://github.com/" 20 | RUN go env -w GOPRIVATE=* 21 | 22 | WORKDIR /src 23 | ADD . . 24 | RUN GOBIN=$PWD/bin go install -v ./... 
25 | RUN ./scripts/pack.sh 26 | RUN cp ./build/*.deb mlfs.deb 27 | 28 | FROM ubuntu:focal 29 | 30 | RUN apt update 31 | RUN apt install -y systemd init fuse 32 | 33 | COPY --from=builder /src/mlfs.deb / 34 | RUN dpkg -i /mlfs.deb && rm /mlfs.deb 35 | -------------------------------------------------------------------------------- /scheduler/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.5) 2 | PROJECT(tenplex-scheduler) 3 | 4 | SET(CPACK_GENERATOR "DEB") 5 | SET(CPACK_DEBIAN_PACKAGE_MAINTAINER "lg") 6 | SET(CPACK_PACKAGE_VERSION $ENV{VERSION}) 7 | SET(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) 8 | INCLUDE(CPack) 9 | 10 | INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/tenplex-scheduler DESTINATION bin) 11 | INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/tenplex-user DESTINATION bin) 12 | INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/tenplex-state-transformer 13 | DESTINATION bin) 14 | 15 | INSTALL(FILES ${CMAKE_SOURCE_DIR}/etc/os/linux/tenplex-scheduler.service 16 | DESTINATION /lib/systemd/system) 17 | 18 | FUNCTION(INSTALL_SCRIPT target) 19 | INSTALL( 20 | FILES ${target} 21 | PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ # 22 | WORLD_READ WORLD_EXECUTE 23 | DESTINATION /etc/tenplex) 24 | ENDFUNCTION() 25 | 26 | INSTALL_SCRIPT(${CMAKE_SOURCE_DIR}/etc/tenplex/scheduler.sh) 27 | INSTALL_SCRIPT(${CMAKE_SOURCE_DIR}/etc/tenplex/stop-scheduler.sh) 28 | 29 | # INSTALL(DIRECTORY ${CMAKE_SOURCE_DIR}/man DESTINATION share) 30 | -------------------------------------------------------------------------------- /mlfs/iotrace/report.go: -------------------------------------------------------------------------------- 1 | package iotrace 2 | 3 | import ( 4 | golog "log" 5 | "os" 6 | "sync" 7 | "time" 8 | ) 9 | 10 | var log = golog.New(os.Stderr, `[mlfs] io % `, 0) 11 | 12 | type reporter struct { 13 | stopped chan struct{} 14 | wg sync.WaitGroup 15 | } 16 | 17 | func Reporter(c *Counter, prefix string) *reporter { 
18 | r := &reporter{stopped: make(chan struct{}, 1)} 19 | r.wg.Add(1) 20 | go func() { 21 | for { 22 | select { 23 | case <-r.stopped: 24 | log.Printf("%soverall rate: %s", prefix, c.ShowRate()) 25 | r.wg.Done() 26 | return 27 | default: 28 | time.Sleep(1 * time.Second) 29 | log.Printf("%s%s", prefix, c.ShowRate()) 30 | } 31 | } 32 | 33 | }() 34 | return r 35 | } 36 | 37 | func (r *reporter) Stop() { 38 | r.stopped <- struct{}{} 39 | r.wg.Wait() 40 | } 41 | 42 | func Monitor(c *Counter, prefix string) { 43 | r := &reporter{stopped: make(chan struct{}, 1)} 44 | r.wg.Add(1) 45 | go func() { 46 | for { 47 | time.Sleep(1 * time.Second) 48 | if !c.Zero() { 49 | log.Printf("%s%s", prefix, c.ShowRate()) 50 | c.Reset() 51 | } 52 | } 53 | }() 54 | } 55 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.5) 2 | PROJECT(mlfs) 3 | 4 | SET(CPACK_GENERATOR "DEB") 5 | SET(CPACK_DEBIAN_PACKAGE_MAINTAINER "g.li@imperial.ac.uk") 6 | SET(CPACK_PACKAGE_VERSION $ENV{VERSION}) 7 | SET(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) 8 | INCLUDE(CPack) 9 | 10 | FUNCTION(INSTALL_BIN TARGET) 11 | INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/${TARGET} DESTINATION bin) 12 | ENDFUNCTION() 13 | 14 | INSTALL_BIN(mlfs-build-tf-index) 15 | INSTALL_BIN(mlfs-check-index) 16 | INSTALL_BIN(mlfs-download) 17 | INSTALL_BIN(mlfs-edit-index) 18 | INSTALL_BIN(mlfs) 19 | INSTALL_BIN(mlfsd) 20 | INSTALL_BIN(tenplex-state-transformer) 21 | 22 | INSTALL(FILES ${CMAKE_SOURCE_DIR}/mlfs/etc/os/linux/mlfs.service 23 | DESTINATION /lib/systemd/system) 24 | INSTALL( 25 | FILES ${CMAKE_SOURCE_DIR}/mlfs/etc/mlfs/mlfs.sh 26 | PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ WORLD_READ 27 | DESTINATION /etc/mlfs) 28 | INSTALL( 29 | FILES ${CMAKE_SOURCE_DIR}/mlfs/etc/mlfs/stop.sh 30 | PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ WORLD_READ 31 | DESTINATION /etc/mlfs) 32 
| 33 | INSTALL(DIRECTORY ${CMAKE_SOURCE_DIR}/man DESTINATION share) 34 | -------------------------------------------------------------------------------- /mlfs/mlfs/dsidx.go: -------------------------------------------------------------------------------- 1 | package mlfs 2 | 3 | import ( 4 | "image" 5 | "image/color" 6 | 7 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile" 8 | ) 9 | 10 | type DSIDX struct { 11 | idx vfile.IndexedFiles 12 | ridx []int 13 | lidx []int 14 | maxRegions int 15 | totalRegions int 16 | } 17 | 18 | func newDSIDX(idx vfile.IndexedFiles) *DSIDX { 19 | var maxRegions, totalRegions int 20 | var ridx, lidx []int 21 | for i, f := range idx { 22 | n := len(f.Ranges) 23 | if n > maxRegions { 24 | maxRegions = n 25 | } 26 | totalRegions += n 27 | for j := 0; j < n; j++ { 28 | ridx = append(ridx, i) 29 | lidx = append(lidx, j) 30 | } 31 | } 32 | d := &DSIDX{ 33 | idx: idx, 34 | ridx: ridx, 35 | lidx: lidx, 36 | maxRegions: maxRegions, 37 | totalRegions: totalRegions, 38 | } 39 | log.Printf("maxRegions: %d", maxRegions) 40 | return d 41 | } 42 | 43 | func (d *DSIDX) bmap(ids []int) image.Image { 44 | img := makeBitmap(len(d.idx), d.maxRegions) 45 | for _, id := range ids { 46 | i := d.ridx[id] 47 | j := d.lidx[id] 48 | img.Set(j, i, color.White) 49 | } 50 | return img 51 | } 52 | -------------------------------------------------------------------------------- /state_transformer/meta/rankmap.go: -------------------------------------------------------------------------------- 1 | package meta 2 | 3 | import ( 4 | "encoding/json" 5 | "os" 6 | "path" 7 | "strconv" 8 | ) 9 | 10 | type MDPRank struct { 11 | PPRank int 12 | MPRank int 13 | DPRank int 14 | } 15 | 16 | type RankMap struct { 17 | Rank map[MDPRank]int 18 | MDPRank map[int]MDPRank 19 | } 20 | 21 | func CreateRankMap(config *Config, before bool) (*RankMap, error) { 22 | structPath := GetStructPath(config, before) 23 | jsonPath := path.Join(structPath, "rank_map.json") 24 | content, err := 
os.ReadFile(jsonPath) 25 | if err != nil { 26 | return nil, err 27 | } 28 | var payload map[string]map[string]int 29 | err = json.Unmarshal(content, &payload) 30 | if err != nil { 31 | return nil, err 32 | } 33 | 34 | ranks := make(map[MDPRank]int) 35 | MDPRanks := make(map[int]MDPRank) 36 | for r, val := range payload { 37 | rInt, err := strconv.Atoi(r) 38 | if err != nil { 39 | return nil, err 40 | } 41 | mdpRank := MDPRank{PPRank: val["pp_rank"], MPRank: val["mp_rank"], DPRank: val["dp_rank"]} 42 | MDPRanks[rInt] = mdpRank 43 | ranks[mdpRank] = rInt 44 | } 45 | rankMap := RankMap{Rank: ranks, MDPRank: MDPRanks} 46 | return &rankMap, nil 47 | } 48 | -------------------------------------------------------------------------------- /mlfs/benchmarks/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | export PYTHON=$(which python3.6) 6 | 7 | cd $(dirname $0) 8 | 9 | kungfu_run_flags() { 10 | echo -q 11 | echo -logdir logs/$JOB_ID 12 | echo -np 4 13 | } 14 | 15 | kungfu_run() { 16 | echo "JOB_ID: $JOB_ID" 17 | kungfu-run $(kungfu_run_flags) $@ 18 | } 19 | 20 | flags_mount() { 21 | echo --index-file $HOME/tf-index-16.idx.txt 22 | echo --seed 1 23 | echo --global-batch-size 128 24 | echo --tfrecord-fs $PWD/../../bin/tfrecord-fs 25 | } 26 | 27 | flags() { 28 | flags_mount 29 | echo --run-train-op 30 | } 31 | 32 | flags_baseline() { 33 | flags 34 | echo --prefix $HOME/mnt/all 35 | } 36 | 37 | flags_fake_data() { 38 | flags 39 | echo --prefix $HOME/mnt/all 40 | echo --fake-data 41 | } 42 | 43 | flags_vfs() { 44 | flags 45 | } 46 | 47 | main() { 48 | export JOB_ID=vfs 49 | kungfu_run $PYTHON train_resnet50.py $(flags_vfs) 50 | 51 | # export JOB_ID=base 52 | # kungfu_run $PYTHON train_resnet50.py $(flags_baseline) 53 | 54 | # export JOB_ID=fakedata 55 | # kungfu_run $PYTHON train_resnet50.py $(flags_fake_data) 56 | } 57 | 58 | rm -fr *.log 59 | main 60 | 
// isTensor reports whether name is a serialized tensor file, i.e. whether
// everything after the first "." is "numpy.ndarray". A name without any "."
// is considered malformed and yields an error.
func isTensor(name string) (bool, error) {
	_, ext, found := strings.Cut(name, ".")
	if !found {
		return false, fmt.Errorf("string split has not exactly 2 parts")
	}
	return ext == "numpy.ndarray", nil
}
34 | if err != nil { 35 | return nil, err 36 | } 37 | } else { // isFile 38 | isTen, err := isTensor(info.Name()) 39 | if err != nil { 40 | return nil, err 41 | } 42 | if isTen { 43 | tensors = append(tensors, path.Join(basePath, info.Name())) 44 | } 45 | } 46 | } 47 | return tensors, nil 48 | } 49 | -------------------------------------------------------------------------------- /state_transformer/statetransform/replicate.go: -------------------------------------------------------------------------------- 1 | package statetransform 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "strings" 7 | 8 | "github.com/kungfu-team/tenplex/state_transformer/client" 9 | "github.com/kungfu-team/tenplex/state_transformer/meta" 10 | ) 11 | 12 | func replicateTensor(conf *meta.Config, ckptCl *client.CheckpointClient, sourceKey, targetKey []string, sourceMDPRank, targetMDPRank *meta.MDPRank) error { 13 | sourcePath := strings.Join(sourceKey, "/") 14 | sourcePath = fmt.Sprintf("%s.numpy.ndarray", sourcePath) 15 | ten, err := ckptCl.QueryMegatronTensor(sourceMDPRank, conf.InputTimestamp, sourcePath, nil) 16 | if err != nil { 17 | log.Printf("query tensor to replicate failed.\nwith error %v.\nsource key %v, target key %v, source MDP rank %v, target MDP rank %v", err, sourceKey, targetKey, sourceMDPRank, targetMDPRank) 18 | return err 19 | } 20 | targetPath := strings.Join(targetKey, "/") 21 | targetPath = fmt.Sprintf("%s.numpy.ndarray", targetPath) 22 | err = ckptCl.UploadMegatronTensor(ten, targetMDPRank, conf.OutputTimestamp, targetPath) 23 | if err != nil { 24 | return err 25 | } 26 | 27 | if err != nil { 28 | return err 29 | } 30 | 31 | return nil 32 | } 33 | -------------------------------------------------------------------------------- /tests/dataset.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from tenplex.dataset import GPTDataset as TenplexGPTDataset 3 | 4 | 5 | def main(): 6 | num_scaling = 5 7 | idx_path = 
"/data/megatron-lm/gpt-2/enwiki/npzs_seq1024/indices.txt" 8 | dp_size = 2 9 | dp_rank = 1 10 | job_id = "dataset-test" 11 | batch_size = 128 12 | 13 | for _ in range(num_scaling): 14 | progress = num_scaling * batch_size * 2048 15 | 16 | mount_cmd = [ 17 | "mlfs", "mount", 18 | "-idx-name", "openwebtext", 19 | "-index-url", f"{idx_path}", 20 | "-ctrl-port", "20010", 21 | "-progress", f"{progress}", 22 | "-global-batch-size", f"{batch_size}", 23 | "-dp-size", f"{dp_size}", 24 | "-job", job_id, 25 | ] 26 | subprocess.run(mount_cmd, check=True) 27 | print("finished MLFS mount") 28 | 29 | mlfs_path = "/mnt/mlfs" 30 | dataset = TenplexGPTDataset(mlfs_path, job_id, dp_rank) 31 | dataset = iter(dataset) 32 | 33 | for _ in range(50_000): 34 | sample = next(dataset) 35 | txt = sample["text"] 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /mlfs/iotrace/counter.go: -------------------------------------------------------------------------------- 1 | package iotrace 2 | 3 | import ( 4 | "fmt" 5 | "sync/atomic" 6 | "time" 7 | ) 8 | 9 | type Counter struct { 10 | t0 time.Time 11 | n int64 12 | } 13 | 14 | func NewCounter() *Counter { 15 | return &Counter{ 16 | t0: time.Now(), 17 | } 18 | } 19 | 20 | func (c *Counter) Zero() bool { 21 | return atomic.LoadInt64(&c.n) == 0 22 | } 23 | 24 | func (c *Counter) Add(n int64) { 25 | atomic.AddInt64(&c.n, n) 26 | } 27 | 28 | func (c *Counter) Reset() { 29 | c.t0 = time.Now() 30 | atomic.StoreInt64(&c.n, 0) 31 | } 32 | 33 | func (c *Counter) ShowRate() string { 34 | n := atomic.LoadInt64(&c.n) 35 | return ShowRate(Rate(n, time.Since(c.t0))) 36 | } 37 | 38 | func Rate(n int64, d time.Duration) float64 { 39 | return float64(n) / (float64(d) / float64(time.Second)) 40 | } 41 | 42 | func ShowRate(r float64) string { 43 | const Ki = 1 << 10 44 | const Mi = 1 << 20 45 | const Gi = 1 << 30 46 | switch { 47 | case r > Gi: 48 | return fmt.Sprintf("%.2f 
GiB/s", r/float64(Gi)) 49 | case r > Mi: 50 | return fmt.Sprintf("%.2f MiB/s", r/float64(Mi)) 51 | case r > Ki: 52 | return fmt.Sprintf("%.2f KiB/s", r/float64(Ki)) 53 | default: 54 | return fmt.Sprintf("%.2f B/s", r) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /scheduler/data/single-job-time.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "time": 35, 4 | "size": 4 5 | }, 6 | { 7 | "time": 35, 8 | "size": 8 9 | }, 10 | { 11 | "time": 35, 12 | "size": 16 13 | }, 14 | { 15 | "time": 35, 16 | "size": 8 17 | }, 18 | { 19 | "time": 35, 20 | "size": 4 21 | }, 22 | { 23 | "time": 35, 24 | "size": 8 25 | }, 26 | { 27 | "time": 35, 28 | "size": 16 29 | }, 30 | { 31 | "time": 35, 32 | "size": 8 33 | }, 34 | { 35 | "time": 35, 36 | "size": 4 37 | }, 38 | { 39 | "time": 35, 40 | "size": 8 41 | }, 42 | { 43 | "time": 35, 44 | "size": 16 45 | }, 46 | { 47 | "time": 35, 48 | "size": 8 49 | }, 50 | { 51 | "time": 35, 52 | "size": 4 53 | }, 54 | { 55 | "time": 35, 56 | "size": 8 57 | }, 58 | { 59 | "time": 35, 60 | "size": 16 61 | }, 62 | { 63 | "time": 13, 64 | "size": 8 65 | }, 66 | { 67 | "time": 0, 68 | "size": 0 69 | } 70 | ] 71 | -------------------------------------------------------------------------------- /benchmark/dynamic_resources/tenplex-schedule.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "time": 35, 4 | "size": 16 5 | }, 6 | { 7 | "time": 35, 8 | "size": 8 9 | }, 10 | { 11 | "time": 35, 12 | "size": 4 13 | }, 14 | { 15 | "time": 35, 16 | "size": 8 17 | }, 18 | { 19 | "time": 35, 20 | "size": 16 21 | }, 22 | { 23 | "time": 35, 24 | "size": 8 25 | }, 26 | { 27 | "time": 35, 28 | "size": 4 29 | }, 30 | { 31 | "time": 35, 32 | "size": 8 33 | }, 34 | { 35 | "time": 35, 36 | "size": 16 37 | }, 38 | { 39 | "time": 35, 40 | "size": 8 41 | }, 42 | { 43 | "time": 35, 44 | "size": 4 45 | }, 46 | { 47 | "time": 
35, 48 | "size": 8 49 | }, 50 | { 51 | "time": 35, 52 | "size": 16 53 | }, 54 | { 55 | "time": 35, 56 | "size": 8 57 | }, 58 | { 59 | "time": 35, 60 | "size": 4 61 | }, 62 | { 63 | "time": 13, 64 | "size": 8 65 | }, 66 | { 67 | "time": 0, 68 | "size": 0 69 | } 70 | ] 71 | -------------------------------------------------------------------------------- /mlfs/pid/peer.go: -------------------------------------------------------------------------------- 1 | package pid 2 | 3 | import ( 4 | "fmt" 5 | "net" 6 | "strconv" 7 | ) 8 | 9 | // PeerID is the unique identifier of a peer. 10 | type PeerID struct { 11 | IPv4 uint32 12 | Port uint16 13 | } 14 | 15 | func (p PeerID) String() string { 16 | return net.JoinHostPort(FormatIPv4(p.IPv4), strconv.Itoa(int(p.Port))) 17 | } 18 | 19 | func (p PeerID) ColocatedWith(q PeerID) bool { 20 | return p.IPv4 == q.IPv4 21 | } 22 | 23 | func (p PeerID) ListenAddr(strict bool) PeerID { 24 | if strict { 25 | return PeerID{IPv4: p.IPv4, Port: p.Port} 26 | } 27 | return PeerID{IPv4: 0, Port: p.Port} 28 | } 29 | 30 | func (p PeerID) SockFile() string { 31 | return fmt.Sprintf(`/tmp/goml-peer-%d.sock`, p.Port) 32 | } 33 | 34 | func ParsePeerID(val string) (*PeerID, error) { 35 | host, p, err := net.SplitHostPort(val) 36 | if err != nil { 37 | return nil, err 38 | } 39 | ipv4, err := ParseIPv4(host) // FIXME: checkout error 40 | if err != nil { 41 | return nil, err 42 | } 43 | port, err := strconv.Atoi(p) 44 | if err != nil { 45 | return nil, err 46 | } 47 | if int(uint16(port)) != port { 48 | return nil, errInvalidPort 49 | } 50 | return &PeerID{ 51 | IPv4: ipv4, 52 | Port: uint16(port), 53 | }, nil 54 | } 55 | -------------------------------------------------------------------------------- /tenplex-run/listflag/listflag.go: -------------------------------------------------------------------------------- 1 | package listflag 2 | 3 | import ( 4 | "flag" 5 | "strconv" 6 | "strings" 7 | ) 8 | 9 | type Strings []string 10 | 11 | func (v *Strings) 
// LogArgs prints each command line argument with its index.
func LogArgs() {
	for i, a := range os.Args {
		fmt.Printf("[arg] [%d]=%s\n", i, a)
	}
}

// LogEnv prints every environment variable of the process.
func LogEnv() {
	for _, e := range os.Environ() {
		fmt.Printf("[env] %s\n", e)
	}
}

// ShowSize formats a byte count with a binary-unit suffix (KiB/MiB/GiB/TiB),
// or as a bare number below 1 KiB.
func ShowSize(n int64) string {
	const (
		Ki = 1 << 10
		Mi = 1 << 20
		Gi = 1 << 30
		Ti = 1 << 40
	)
	switch {
	case n >= Ti:
		// Fix: was "%.1fTi" — the trailing B was missing, inconsistent with
		// the GiB/MiB/KiB branches below.
		return fmt.Sprintf("%.1fTiB", float64(n)/float64(Ti))
	case n >= Gi:
		return fmt.Sprintf("%.1fGiB", float64(n)/float64(Gi))
	case n >= Mi:
		return fmt.Sprintf("%.1fMiB", float64(n)/float64(Mi))
	case n >= Ki:
		return fmt.Sprintf("%.1fKiB", float64(n)/float64(Ki))
	}
	return fmt.Sprintf("%d", n)
}

// Percent returns p as a percentage of n.
func Percent(p, n int) float64 { return 100.0 * float64(p) / float64(n) }

// LogETA logs the completed percentage, elapsed time, and an estimated time
// remaining extrapolated linearly from progress so far ("?" at progress 0).
func LogETA(t0 time.Time, progress, total int) {
	d := time.Since(t0)
	r := Percent(progress, total)
	if progress == 0 {
		log.Printf("%.1f%% took %s, ETA: %s", r, d, `?`)
		return
	}
	remain := time.Duration(float64(d) * float64(total-progress) / float64(progress))
	log.Printf("%.1f%% took %s, ETA: %s", r, d, remain)
}
// ISeq is a consumable sequence of ints supporting prefix-taking and sharding.
type ISeq struct {
	seq []int
}

// Seq wraps s in an ISeq. The slice is shared, not copied.
func Seq(s []int) ISeq {
	return ISeq{s}
}

// Empty reports whether no elements remain.
func (is *ISeq) Empty() bool {
	return len(is.seq) == 0
}

// Take removes and returns the first n elements (fewer if less remain).
func (is *ISeq) Take(n int) ISeq {
	a, b := split(n, is.seq)
	is.seq = b
	return ISeq{a}
}

// Shard returns the i-th of m contiguous shards of the sequence, each of
// ceiling(len/m) elements; trailing shards may be shorter or empty.
//
// Fix: the start index is now clamped to the sequence length. Previously
// i*k could exceed len(is.seq) for uneven splits (e.g. len 5, m 4, i 3),
// producing an inverted slice range and a runtime panic.
func (is *ISeq) Shard(i, m int) ISeq {
	k := ceilDiv(len(is.seq), m)
	a := min(i*k, len(is.seq))
	b := min(a+k, len(is.seq))
	return ISeq{seq: is.seq[a:b]}
}

// Len returns the number of remaining elements.
func (is *ISeq) Len() int {
	return len(is.seq)
}

// Get returns the remaining elements as a slice sharing the same backing array.
func (is *ISeq) Get() []int {
	return is.seq[:]
}

// Iota returns [0, 1, ..., n-1].
func Iota(n int) []int {
	s := make([]int, n)
	for i := range s {
		s[i] = i
	}
	return s
}

// Shuffle permutes s in place, deterministically for a given seed.
func Shuffle(s []int, seed int) {
	r := rand.New(rand.NewSource(int64(seed)))
	r.Shuffle(len(s), func(i, j int) {
		s[i], s[j] = s[j], s[i]
	})
}

// split divides s into its first n elements and the rest.
func split(n int, s []int) ([]int, []int) {
	if n >= len(s) {
		return s, nil
	}
	return s[:n], s[n:]
}

// ceilDiv returns ceiling(a / b).
func ceilDiv(a, b int) int {
	if a%b == 0 {
		return a / b
	}
	return a/b + 1
}

func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}
vfs.FileNode) *memcached { 21 | return &memcached{f: f} 22 | } 23 | 24 | func (f *memcached) isCached() bool { 25 | return atomic.LoadInt32(&f.cached) > 0 26 | } 27 | 28 | func (f *memcached) Size() int64 { return f.f.Size() } 29 | 30 | func (f *memcached) Open() io.ReadCloser { 31 | if f.isCached() { 32 | return io.NopCloser(bytes.NewReader(f.bs)) 33 | } 34 | return f.f.Open() 35 | } 36 | 37 | func (f *memcached) ReadAt(buf []byte, pos int64) (int, error) { 38 | if f.isCached() { 39 | br := bytes.NewReader(f.bs) 40 | return br.ReadAt(buf, pos) 41 | } 42 | return f.f.ReadAt(buf, pos) 43 | } 44 | 45 | func (f *memcached) Cache() { 46 | if f.isCached() { 47 | return 48 | } 49 | f.mu.Lock() 50 | defer f.mu.Unlock() 51 | r := f.f.Open() 52 | bs, err := io.ReadAll(r) 53 | r.Close() 54 | if err == nil { 55 | f.bs = bs 56 | atomic.StoreInt32(&f.cached, 1) 57 | } 58 | } 59 | 60 | func (f *memcached) Uncache() { 61 | atomic.StoreInt32(&f.cached, 0) 62 | f.mu.Lock() 63 | defer f.mu.Unlock() 64 | f.bs = nil 65 | } 66 | -------------------------------------------------------------------------------- /scheduler/experiments/experiments.go: -------------------------------------------------------------------------------- 1 | package experiments 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "sync" 7 | ) 8 | 9 | type Setup struct { 10 | Prefix string 11 | NWorkers int 12 | Group string 13 | 14 | IPs []string 15 | PubIPs []string 16 | } 17 | 18 | func NewSetup(p string, n int, g string) *Setup { 19 | s := &Setup{ 20 | Prefix: p, 21 | NWorkers: n, 22 | Group: g, 23 | } 24 | s.GetIPs() 25 | return s 26 | } 27 | 28 | func (s Setup) Names() []string { 29 | var names []string 30 | for i := 0; i < s.NWorkers; i++ { 31 | names = append(names, fmt.Sprintf("%s%02d", s.Prefix, i+1)) 32 | } 33 | return names 34 | } 35 | 36 | func (s *Setup) GetIPs() { 37 | names := s.Names() 38 | ips := make([]string, len(names)) 39 | pubIPs := make([]string, len(names)) 40 | { 41 | var wg sync.WaitGroup 42 | for i 
def main():
    """Scan per-rank DeepSpeed checkpoint dirs and write layer_map.json.

    For each rank directory, collects the transformer layer numbers and the
    tensor-parallel rank encoded in the layer-state file names, then dumps
    the rank -> {tp_rank, layer_numbers} mapping as JSON.
    """
    framework = 'deepspeed'
    pp_size = 2
    mp_size = 1
    dp_size = 2
    total_size = pp_size * mp_size * dp_size
    model_size = 'medium'
    direc = f'{framework}/gpt-2/{model_size}/pp{pp_size:02d}/mp{mp_size:02d}/dp{dp_size:02d}'

    # Layer-state files look like "layer_<num>-model_<tp_rank>-model_states.json".
    pattern = r'layer_(\d+)-model_(\d+)-model_states.json'
    mapping = {}

    for rank in range(total_size):
        rank_dir = os.path.join(direc, f'rank{rank:02d}')
        if not os.path.exists(rank_dir):
            continue

        layer_numbers = []
        tp_rank = None
        for entry in os.scandir(rank_dir):
            mat = re.match(pattern, entry.name)
            if mat is None:
                continue
            layer_numbers.append(int(mat.group(1)))
            tp_rank = int(mat.group(2))

        # Only ranks that own at least one layer file appear in the map.
        if layer_numbers:
            mapping[rank] = {
                'tp_rank': tp_rank,
                'layer_numbers': layer_numbers,
            }

    with open(f'{direc}/layer_map.json', 'w') as json_file:
        json.dump(mapping, json_file, indent=4)
/* 2 | e.g. 3 | 4 | mlfs-edit-index -index-url imagenet.idx.txt -o sub-imagenet.idx.txt -take 128 -from https://minddata.blob.core.windows.net -to https://tenplex.blob.core.windows.net 5 | */ 6 | package main 7 | 8 | import ( 9 | "flag" 10 | "log" 11 | "strings" 12 | 13 | "github.com/kungfu-team/tenplex/mlfs/mlfs" 14 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile" 15 | ) 16 | 17 | var ( 18 | idxFile = flag.String("index-url", "", "") 19 | output = flag.String("o", "", "") 20 | 21 | take = flag.Int(`take`, 0, ``) 22 | localize = flag.Bool(`localize`, false, ``) 23 | replaceFrom = flag.String(`from`, ``, ``) 24 | replaceTo = flag.String(`to`, ``, ``) 25 | ) 26 | 27 | func main() { mlfs.Main(Main) } 28 | 29 | func Main() error { 30 | log.Printf("loading from %q", *idxFile) 31 | idx, err := vfile.LoadIdxFile(*idxFile) 32 | if err != nil { 33 | return err 34 | } 35 | if *take > 0 { 36 | idx = idx[:*take] 37 | } 38 | if *localize { 39 | idx.SetHost(``) 40 | } else if len(*replaceFrom) > 0 { 41 | replaceURL(idx, *replaceFrom, *replaceTo) 42 | } 43 | log.Printf("saving to %q", *output) 44 | return vfile.SaveIdxFile(*output, idx) 45 | } 46 | 47 | func replaceURL(fs vfile.IndexedFiles, from, to string) { 48 | for i := range fs { 49 | fs[i].Filepath = strings.Replace(fs[i].Filepath, from, to, 1) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scheduler/configserver/configserver.go: -------------------------------------------------------------------------------- 1 | package configserver 2 | 3 | import ( 4 | "context" 5 | "fmt" 6 | "net" 7 | "net/http" 8 | "strconv" 9 | 10 | kfcs "github.com/lsds/KungFu/srcs/go/kungfu/elastic/configserver" 11 | "github.com/lsds/KungFu/srcs/go/log" 12 | ) 13 | 14 | func RunBuiltinConfigServer(port int) { 15 | const endpoint = `/config` 16 | addr := net.JoinHostPort("", strconv.Itoa(port)) 17 | log.Infof("running builtin config server listening %s%s", addr, endpoint) 18 | _, cancel := 
context.WithCancel(context.TODO()) 19 | defer cancel() 20 | srv := &http.Server{ 21 | Addr: addr, 22 | Handler: logRequest(kfcs.New(cancel, nil, endpoint)), 23 | } 24 | srv.SetKeepAlivesEnabled(false) 25 | if err := srv.ListenAndServe(); err != nil { 26 | log.Errorf("config server stopped: %v", err) 27 | } 28 | } 29 | 30 | func logRequest(h http.Handler) http.Handler { 31 | return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) { 32 | log.Debugf("%s %s from %s, UA: %s", req.Method, req.URL.Path, req.RemoteAddr, req.UserAgent()) 33 | h.ServeHTTP(w, req) 34 | }) 35 | } 36 | 37 | func Stop(port int) error { 38 | resp, err := http.Get(fmt.Sprintf("http://localhost:%d/stop", port)) 39 | if err != nil { 40 | return err 41 | } 42 | if resp.StatusCode != 200 { 43 | return fmt.Errorf("stop failed, status code: %d", resp.StatusCode) 44 | } 45 | return nil 46 | } 47 | -------------------------------------------------------------------------------- /para_config/megatron_lm/rank_map.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def gen_rank_map( 6 | framework: str, 7 | model: str, 8 | model_size: str, 9 | precision: str, 10 | pp_size: int, 11 | tp_size: int, 12 | dp_size: int, 13 | job_dir: str, 14 | repo: str, 15 | ): 16 | size = pp_size * tp_size * dp_size 17 | out_dir = os.path.join(repo, f"{framework}/{precision}/{model}/{model_size}") 18 | out_dir = os.path.join(out_dir, f"pp{pp_size:02d}/mp{tp_size:02d}/dp{dp_size:02d}") 19 | gpus_container = 4 20 | num_nodes = size // gpus_container 21 | 22 | mapping = {} 23 | 24 | for rank in range(size): 25 | for node in range(num_nodes): 26 | rank_path = os.path.join( 27 | job_dir, f"{node}/ckpt/{rank}/rank_{rank:02d}.json" 28 | ) 29 | 30 | if not os.path.exists(rank_path): 31 | continue 32 | 33 | with open(rank_path, "r", encoding="utf-8") as rank_file: 34 | ranks = json.load(rank_file) 35 | 36 | mapping[rank] = { 37 | "pp_rank": 
ranks["pp"], 38 | "mp_rank": ranks["tp"], 39 | "dp_rank": ranks["dp"], 40 | } 41 | 42 | path = os.path.join(out_dir, "rank_map.json") 43 | with open(path, "w", encoding="utf-8") as json_file: 44 | json.dump(mapping, json_file, indent=4) 45 | -------------------------------------------------------------------------------- /ansible/install.yml: -------------------------------------------------------------------------------- 1 | - name: add apt sources list 2 | become: true 3 | ansible.builtin.shell: 4 | cmd: echo "deb https://europe-west2-apt.pkg.dev/projects/tenplex tenplex main" | sudo tee /etc/apt/sources.list.d/tenplex.list 5 | register: log 6 | 7 | - name: add apt key 8 | become: true 9 | ansible.builtin.shell: 10 | cmd: curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/packages-cloud-google-apt.gpg >/dev/null 11 | register: log 12 | 13 | - name: apt update 14 | become: true 15 | ansible.builtin.shell: 16 | cmd: apt update 17 | register: log 18 | 19 | # - name: Update apt repository cache 20 | # ansible.builtin.apt: 21 | # update_cache: yes 22 | 23 | - name: stop mlfs 24 | become: true 25 | ignore_errors: yes 26 | ansible.builtin.shell: 27 | cmd: | 28 | rm /etc/mlfs/tenplex.sas 29 | systemctl stop mlfs 30 | 31 | register: log 32 | - name: Install a list of packages 33 | become: true 34 | ansible.builtin.apt: 35 | pkg: 36 | - fuse3 37 | - mlfs 38 | 39 | - name: reload mlfs 40 | become: true 41 | ignore_errors: yes 42 | ansible.builtin.shell: 43 | cmd: | 44 | systemctl daemon-reload 45 | systemctl restart mlfs 46 | register: log 47 | 48 | - name: show info 49 | # command: mlfs info 50 | ansible.builtin.shell: 51 | cmd: mlfs info 52 | register: log 53 | -------------------------------------------------------------------------------- /mlfs/prefetch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | set -e 4 | 5 | make 6 | 7 | SA=minddata 8 | SAS=$(cat 
URL=https://$SA.blob.core.windows.net/data/imagenet/imagenet.md5.txt?$SAS

# wget -O imagenet.md5.txt $URL

# prefetch MD5 NAME
# Ask the local mlfs daemon (control port 20000) to fetch one imagenet
# record shard, verifying it against the given MD5 checksum.
prefetch() {
    local md5=$1
    local URL="https://minddata.blob.core.windows.net/data/imagenet/records/$2"

    ./bin/mlfs-fetch -ctrl-port 20000 -file $URL -md5 $md5
}

# prefetch_idx_file FILE
# FILE contains one "<md5> <filename>" pair per line; prefetch each shard.
prefetch_idx_file() {
    local idx_file=$1
    while read line; do
        prefetch $(echo $line | awk '{print $1}') $(echo $line | awk '{print $2}')
    done <$idx_file
}

prefetch_idx_file imagenet.md5.txt
// resolveHosts maps each host name to its first resolved IP address,
// falling back to the name itself when resolution fails.
func resolveHosts(hosts []string) []string {
	var ips []string
	for i, h := range hosts {
		ip := resolve(h)
		log.Printf("#%d : %s -> %s", i, h, ip)
		ips = append(ips, ip)
	}
	// Fix: the original returned the input hosts, discarding every
	// resolved address collected above.
	return ips
}

// resolve returns the first address of h, or h itself on lookup failure.
func resolve(h string) string { // TODO: doesn't work for self
	addrs, err := net.LookupHost(h)
	if err != nil {
		return h
	}
	for _, a := range addrs {
		log.Printf("%s", a)
		return a
	}
	return h
}
21 | 22 | install: 23 | $(GO) install -ldflags "$(LDFLAGS)" -v ./... 24 | 25 | test: 26 | $(GO) test -v ./... 27 | 28 | update: 29 | $(GO) get -u ./... 30 | 31 | clean: 32 | $(GO) clean -v -cache ./... 33 | 34 | tidy: 35 | $(GO) mod tidy 36 | 37 | format: 38 | $(GO) fmt ./... 39 | 40 | i: install 41 | 42 | 43 | u: update tidy 44 | 45 | 46 | t: test 47 | 48 | 49 | bin: 50 | mkdir -p $(BIN_DIR) 51 | 52 | deb: binaries 53 | ./scripts/pack.sh 54 | 55 | sys-install: deb 56 | sudo dpkg -i ./build/*.deb 57 | -------------------------------------------------------------------------------- /mlfs/fsutil/fsutil.go: -------------------------------------------------------------------------------- 1 | package fsutil 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "strconv" 7 | "strings" 8 | 9 | "github.com/kungfu-team/tenplex/mlfs/vfs" 10 | ) 11 | 12 | var ( 13 | errNodeNotExists = errors.New("node not exist") 14 | errNotFile = errors.New("not a file") 15 | ) 16 | 17 | func ReadTextLines(r *vfs.Tree, p string) ([]string, error) { 18 | n, _, ok := r.Get(p) 19 | if !ok { 20 | return nil, errNodeNotExists 21 | } 22 | if n.IsDir() { 23 | return nil, errNotFile 24 | } 25 | bs, err := io.ReadAll(n.AsFile().Open()) 26 | if err != nil { 27 | return nil, err 28 | } 29 | var lines []string 30 | for _, line := range strings.Split(string(bs), "\n") { 31 | line = strings.TrimSpace(line) 32 | if len(line) > 0 { 33 | lines = append(lines, line) 34 | } 35 | } 36 | return lines, nil 37 | } 38 | 39 | func ReadIntLines(r *vfs.Tree, p string) ([]int, error) { 40 | n, _, ok := r.Get(p) 41 | if !ok { 42 | return nil, errNodeNotExists 43 | } 44 | if n.IsDir() { 45 | return nil, errNotFile 46 | } 47 | bs, err := io.ReadAll(n.AsFile().Open()) 48 | if err != nil { 49 | return nil, err 50 | } 51 | var xs []int 52 | for _, line := range strings.Split(string(bs), "\n") { 53 | line = strings.TrimSpace(line) 54 | if len(line) > 0 { 55 | x, err := strconv.Atoi(line) 56 | if err != nil { 57 | return xs, err 58 | } 
59 | xs = append(xs, x) 60 | } 61 | } 62 | return xs, nil 63 | } 64 | -------------------------------------------------------------------------------- /mlfs/README: -------------------------------------------------------------------------------- 1 | elastic filesystem 2 | 3 | https://github.com/kungfu-team/mlfs 4 | 5 | 6 | Code Structure 7 | 8 | ./cmd - source for executable binaries 9 | 10 | ./vfs - a plain abstraction of FS 11 | ./vfs/hfs - expose vfs as HTTP endpoint 12 | ./vfs/ufs - expose vfs as FUSE endpoint 13 | ./vfs/vfile - builtin virtual file types 14 | ... 15 | 16 | 17 | 18 | ./mlfs - wraps vfs 19 | 20 | ./mfs - model fs // TODO? 21 | ?? 22 | ?? 23 | 24 | ./dsfs - dataset fs 25 | ./ds - dataset 26 | ./ds/trds - TFRecord dataset 27 | 28 | ./tfrecord - the TFRecord format 29 | 30 | 31 | ./iotrace - trace utils 32 | 33 | 34 | Golang requirements 35 | 1.18 36 | 37 | 38 | system install 39 | make deb 40 | sudo dpkg -i ./build/*.deb 41 | systemctl status mlfs 42 | sudo systemctl enable mlfs 43 | sudo systemctl start mlfs 44 | systemctl status mlfs 45 | 46 | 47 | pre-build deb packages: 48 | 49 | https://tenplex.blob.core.windows.net/public/deb/Packages 50 | 51 | 52 | Install with apt 53 | 54 | echo 'deb https://tenplex.blob.core.windows.net/public/deb ./' | sudo tee /etc/apt/sources.list.d/tenplex.list 55 | curl -s https://tenplex.blob.core.windows.net/public/deb/tenplex.gpg | sudo apt-key add - 56 | sudo apt update 57 | sudo apt install -y mlfs 58 | 59 | Known Bugs 60 | r.OpenAt(88758): 503 Egress is over the account limit. 
def traverse(value, keys=None):
    """Recursively walk a checkpoint structure, printing the key path,
    type, and shape of every tensor / ndarray leaf encountered."""
    if keys is None:
        keys = []

    if isinstance(value, dict):
        for key, val in value.items():
            traverse(val, keys + [key])
        return
    if isinstance(value, (list, set, tuple)):
        for i, val in enumerate(value):
            traverse(val, keys + [str(i)])
        return

    if isinstance(value, torch.Tensor):
        tensor = value.detach().cpu().numpy()
        typ = type(tensor)
        print(f'{keys} is {typ} and shape {value.shape}')
        return
    if isinstance(value, np.ndarray):
        typ = type(value)
        print(f'{keys} is {typ} and shape {value.shape}')
        return


def main():
    """Load a checkpoint via tenplex and print its tensor layout."""
    parser = argparse.ArgumentParser(description='Write checkpoint')
    parser.add_argument('--device-rank', type=int)
    parser.add_argument('--mlfs-path', type=str)
    args = parser.parse_args()

    ckpt = load(args.device_rank, args.mlfs_path)
    traverse(ckpt)


if __name__ == '__main__':
    main()
url.Values{} 32 | q.Set(`path`, p) 33 | u := url.URL{ 34 | Scheme: `http`, 35 | Host: fmt.Sprintf("%s:%d", ip, jobConf.MLFSPort), 36 | Path: "/delete1", 37 | RawQuery: q.Encode(), 38 | } 39 | req, err := http.NewRequest(http.MethodDelete, u.String(), nil) 40 | if err != nil { 41 | return err 42 | } 43 | resp, err := client.Do(req) 44 | if err != nil { 45 | return err 46 | } 47 | defer resp.Body.Close() 48 | bs, _ := io.ReadAll(resp.Body) 49 | if resp.StatusCode != http.StatusOK { 50 | return fmt.Errorf("delete target dir failed, status: %s, error: %s, url: %s", resp.Status, string(bs), u.String()) 51 | } 52 | return nil 53 | } 54 | -------------------------------------------------------------------------------- /tenplex-run/para_config/schedule.go: -------------------------------------------------------------------------------- 1 | package para_config 2 | 3 | import ( 4 | "bytes" 5 | "encoding/json" 6 | "fmt" 7 | "log" 8 | "os" 9 | ) 10 | 11 | type ScalingPoint struct { 12 | Step *int `json:"step"` 13 | Time *int `json:"time"` 14 | Size int `json:"size"` 15 | } 16 | 17 | func (s ScalingPoint) String() string { 18 | buf := &bytes.Buffer{} 19 | if s.Step != nil { 20 | fmt.Fprintf(buf, "step: %d", *s.Step) 21 | } 22 | if s.Time != nil { 23 | fmt.Fprintf(buf, "time: %d", *s.Time) 24 | } 25 | fmt.Fprintf(buf, ", size: %d", s.Size) 26 | return buf.String() 27 | } 28 | 29 | var Empty MultiDimensionalParallelism // Size == PPSize == MPSize == 0 30 | 31 | type Schedule []ScalingPoint 32 | 33 | func (s Schedule) String() string { 34 | buf := &bytes.Buffer{} 35 | for i, sp := range s { 36 | if i > 0 { 37 | fmt.Fprintf(buf, ", ") 38 | } 39 | fmt.Fprintf(buf, "%s", sp) 40 | } 41 | return `Schedule{` + buf.String() + `}` 42 | } 43 | 44 | func GenSchedule(scheduleFile string) Schedule { 45 | s, err := LoadScheduleFile(scheduleFile) 46 | if err != nil { 47 | log.Panicf("LoadScheduleFile: %v", err) 48 | } 49 | for i, p := range s { 50 | log.Printf("schedule[%d/%d]: %s", i+1, 
len(s), p) 51 | } 52 | return s 53 | } 54 | 55 | func LoadScheduleFile(filename string) (Schedule, error) { 56 | f, err := os.Open(filename) 57 | if err != nil { 58 | return nil, err 59 | } 60 | var obj Schedule 61 | if err := json.NewDecoder(f).Decode(&obj); err != nil { 62 | return nil, err 63 | } 64 | return obj, nil 65 | } 66 | -------------------------------------------------------------------------------- /tenplex-run/job/params_bert.go: -------------------------------------------------------------------------------- 1 | package job 2 | 3 | import "log" 4 | 5 | func GenMegatronLMBERTCmd(c TrainingConfig, rank int, jobID string, host string, jConf *JobConfig) []string { 6 | cmd := []string{ 7 | `torchrun`, 8 | } 9 | cmd = append(cmd, jConf.DistFlags(c, rank)...) 10 | cmd = append(cmd, `/workspace/Megatron-LM/pretrain_bert.py`) 11 | var sizes = map[string]TransformerSize{ 12 | `base`: TFSize(12, 768, 12), 13 | `large`: TFSize(24, 1024, 16), 14 | } 15 | bert_args := []string{ 16 | `--seq-length`, str(1024), // default: 512 17 | `--max-position-embeddings`, str(1024), // default: 512 18 | `--lr`, `0.0001`, 19 | `--lr-decay-iters`, str(10000), 20 | `--train-iters`, str(10000), 21 | `--tenplex-train-iters`, str(c.TrainIters), 22 | `--min-lr`, `0.00001`, 23 | `--lr-warmup-fraction`, `0.01`, 24 | `--micro-batch-size`, str(jConf.MicroBatchSize), // default: 4 25 | `--global-batch-size`, str(jConf.BatchSize), // default: 32 26 | `--vocab-file`, `/workspace/Megatron-LM/vocab/bert-large-uncased-vocab.txt`, 27 | `--split`, `949,50,1`, 28 | `--data-path`, `/data/dataset/bert_text_sentence`, 29 | `--distributed-backend`, `nccl`, 30 | } 31 | if ts, ok := sizes[jConf.ModelSize]; ok { 32 | bert_args = append(bert_args, ts.ToPyArgs()...) 33 | } else { 34 | log.Fatalf("Model size not matching %s", jConf.ModelSize) 35 | } 36 | cmd = append(cmd, bert_args...) 37 | cmd = append(cmd, jConf.LogFlags(c)...) 38 | cmd = append(cmd, jConf.TenplexFlags(c, host)...) 
39 | cmd = append(cmd, jConf.OtherFlags(c)...) 40 | return cmd 41 | } 42 | -------------------------------------------------------------------------------- /mlfs/docker/ubuntu/1804/sources.list: -------------------------------------------------------------------------------- 1 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic main restricted 2 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic main restricted 3 | 4 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-updates main restricted 5 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-updates main restricted 6 | 7 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic universe 8 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic universe 9 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-updates universe 10 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-updates universe 11 | 12 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic multiverse 13 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic multiverse 14 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-updates multiverse 15 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-updates multiverse 16 | 17 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-backports main restricted universe multiverse 18 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-backports main restricted universe multiverse 19 | 20 | deb http://security.ubuntu.com/ubuntu bionic-security main restricted 21 | # deb-src http://security.ubuntu.com/ubuntu bionic-security main restricted 22 | deb http://security.ubuntu.com/ubuntu bionic-security universe 23 | # deb-src http://security.ubuntu.com/ubuntu bionic-security universe 24 | deb http://security.ubuntu.com/ubuntu bionic-security multiverse 25 | # deb-src http://security.ubuntu.com/ubuntu bionic-security multiverse 26 | -------------------------------------------------------------------------------- /mlfs/uri/stat.go: 
-------------------------------------------------------------------------------- 1 | package uri 2 | 3 | import ( 4 | "errors" 5 | "net/http" 6 | "net/url" 7 | "os" 8 | "strconv" 9 | ) 10 | 11 | type Info struct { 12 | Size int64 13 | } 14 | 15 | func (o *Opener) Stat(uri string) (*Info, error) { 16 | u, err := url.Parse(uri) 17 | if err != nil { 18 | return nil, err 19 | } 20 | return o.stat(*u) 21 | } 22 | 23 | func (o *Opener) stat(u url.URL) (*Info, error) { 24 | switch u.Scheme { 25 | case "http": 26 | return o.statHTTP(u) 27 | case "https": 28 | return o.statHTTP(o.addAzureCreds(u)) 29 | case "file": 30 | return statFile(u.Path) 31 | case "": 32 | return statFile(u.Path) 33 | } 34 | return nil, errUnsupportedURLScheme 35 | } 36 | 37 | func (o *Opener) statHTTP(u url.URL) (*Info, error) { 38 | req, err := http.NewRequest(http.MethodHead, u.String(), nil) 39 | if err != nil { 40 | return nil, err 41 | } 42 | resp, err := o.client.Do(req) 43 | if err != nil { 44 | return nil, err 45 | } 46 | defer resp.Body.Close() 47 | if resp.StatusCode != http.StatusOK { 48 | return nil, errors.New(resp.Status) 49 | } 50 | if cl := resp.Header.Get(`Content-Length`); len(cl) > 0 { 51 | n, err := strconv.ParseInt(cl, 10, 64) 52 | if err != nil { 53 | return nil, err 54 | } 55 | return &Info{Size: n}, nil 56 | } 57 | return &Info{Size: -1}, nil 58 | } 59 | 60 | func statFile(name string) (*Info, error) { 61 | info, err := os.Stat(name) 62 | if err != nil { 63 | return nil, err 64 | } 65 | return &Info{Size: info.Size()}, nil 66 | } 67 | 68 | func Stat(uri string) (*Info, error) { 69 | return opener.Stat(uri) 70 | } 71 | -------------------------------------------------------------------------------- /tenplex-run/scripts/read-zero-model-state.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import torch 4 | 5 | 6 | def show_pt(o, p='/'): 7 | if isinstance(o, dict): 8 | for k, v in sorted(o.items()): 9 | show_pt(v, p + '/' 
def show_dict(d):
    """Print every key of d in sorted order, left-padded to 32 chars,
    followed by the class of its value."""
    for key in sorted(d):
        print('{:32} :: {}'.format(key, d[key].__class__))
// Range represents the half-open interval [Begin, End) with Begin <= End.
type Range struct {
	Begin uint64
	End   uint64
}

// Len reports the number of bytes covered by r.
func (r Range) Len() uint64 { return r.End - r.Begin }

// NamedRange is a Range tagged with the file it belongs to.
type NamedRange struct {
	Name  string
	Range Range
}

// IndexedRange is a Range tagged with a numeric ID.
type IndexedRange struct {
	ID    int
	Range Range
}

type Ranges []Range

type NamedRanges []NamedRange

// IndexedFile is a file path together with the ranges indexed inside it.
type IndexedFile struct {
	Filepath string
	Ranges   Ranges
}

// IndexedBytes returns the total length of all ranges of f.
func (f IndexedFile) IndexedBytes() uint64 {
	var total uint64
	for _, r := range f.Ranges {
		total += r.Len()
	}
	return total
}

type IndexedFiles []IndexedFile

// NumRange counts the ranges across all files of the index.
func (i IndexedFiles) NumRange() int {
	var count int
	for _, f := range i {
		count += len(f.Ranges)
	}
	return count
}

// NamedRanges flattens the index into (file, range) pairs, preserving
// file order and the range order within each file.
func (i IndexedFiles) NamedRanges() NamedRanges {
	var out NamedRanges
	for _, f := range i {
		for _, r := range f.Ranges {
			out = append(out, NamedRange{f.Filepath, r})
		}
	}
	return out
}

// Select picks the ranges at the given positions, in the given order.
func (rs NamedRanges) Select(s []int) NamedRanges {
	var picked NamedRanges
	for _, j := range s {
		picked = append(picked, rs[j])
	}
	return picked
}

// SetHost rewrites the host of every file URL in place; an empty host
// also clears the scheme, turning the URL into a plain path.
// Unparsable paths are left untouched.
func (idx IndexedFiles) SetHost(host string) {
	for i, f := range idx {
		u, err := url.Parse(f.Filepath)
		if err != nil {
			continue
		}
		u.Host = host
		if u.Host == `` {
			u.Scheme = ``
		}
		idx[i].Filepath = u.String()
	}
}
log.Printf("using self ip: %s", d.SelfIP) 45 | d.Run() 46 | } 47 | 48 | var detectIP = ipv4.Detect 49 | 50 | func setupWorkDir(dir string) error { 51 | if err := os.MkdirAll(dir, os.ModePerm); err != nil { 52 | return err 53 | } 54 | if err := os.Chdir(dir); err != nil { 55 | return err 56 | } 57 | return nil 58 | } 59 | 60 | func logDirs() { 61 | pwd, _ := os.Getwd() 62 | log.Printf("pwd: %s", pwd) 63 | home, _ := os.UserHomeDir() 64 | log.Printf("home: %s", home) 65 | } 66 | -------------------------------------------------------------------------------- /benchmark/performance_impact/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -e 3 | 4 | . $(dirname $0)/../common.sh 5 | 6 | hosts() { 7 | echo "10.10.10.1" 8 | echo "10.10.10.2" 9 | echo "10.10.10.3" 10 | echo "10.10.10.4" 11 | } 12 | 13 | model_sizes() { 14 | # echo "6.7B" 15 | echo "2.7B" 16 | echo "xl" 17 | # echo "large" 18 | } 19 | 20 | batch_sizes() { 21 | echo 128 22 | } 23 | 24 | micro_batch_sizes() { 25 | echo 8 26 | } 27 | 28 | mdp_sizes() { 29 | echo 16 30 | } 31 | 32 | bert_flags() { 33 | echo -model "bert" 34 | echo -model-sizes "large" 35 | 36 | echo -dataset "openwebtext" 37 | echo -index-url "/data/megatron-lm/bert/openwebtext/npzs_seq1024/indices.txt" 38 | } 39 | 40 | gpt_flags() { 41 | echo -model "gpt" 42 | echo -model-sizes "2.7B" 43 | 44 | echo -dataset "enwiki" 45 | echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt" 46 | } 47 | 48 | comb_flags() { 49 | echo -hosts $(join $(hosts)) 50 | 51 | echo -batch-sizes $(join $(batch_sizes)) 52 | echo -micro-batch-sizes $(join $(micro_batch_sizes)) 53 | 54 | echo -mdp-sizes $(join $(mdp_sizes)) 55 | } 56 | 57 | common_flags() { 58 | base_flags 59 | comb_flags 60 | echo -timeout 30 61 | } 62 | 63 | run_bert() { 64 | tenplex-perf-impact $(common_flags) $(bert_flags) 65 | } 66 | 67 | run_gpt() { 68 | tenplex-perf-impact $(common_flags) $(gpt_flags) 69 | } 70 | 71 | 
# Run the given command in the background, detached from the terminal,
# with stdout/stderr captured to log files in the current directory.
# BUG FIX: "$@" must be quoted; unquoted $@ re-splits arguments that
# contain whitespace before they reach the command.
with_nohup() {
    nohup "$@" >out.log 2>err.log &
}
// t0 is the process start time, used by Show to report the "run age".
var t0 = time.Now()

// BuildInfo carries build-time metadata (git state, build host, build
// timestamp) about the running binary.
type BuildInfo struct {
	BuildTimestamp string // Unix epoch seconds, as a decimal string
	BuildTime      time.Time // derived from BuildTimestamp by Parse
	BuildHost      string
	GitCommit      string
	GitBranch      string
	GitRev         string
}

// Parse converts the string BuildTimestamp into the BuildTime field.
// A non-numeric BuildTimestamp leaves BuildTime at its zero value,
// which Show treats as "unknown".
func (i *BuildInfo) Parse() {
	if n, err := strconv.Atoi(i.BuildTimestamp); err == nil {
		i.BuildTime = time.Unix(int64(n), 0)
	}
}

// Show writes a human-readable summary of the build info to w.
func (i *BuildInfo) Show(w io.Writer) {
	fmt.Fprintf(w, "git branch: %s\n", i.GitBranch)
	fmt.Fprintf(w, "git commit: %s\n", i.GitCommit)
	fmt.Fprintf(w, "git rev: %s\n", i.GitRev)
	fmt.Fprintf(w, "build host: %s\n", i.BuildHost)
	if i.BuildTime.Unix() > 0 {
		fmt.Fprintf(w, "build age: %s\n", time.Since(i.BuildTime))
	} else {
		// BuildTime was never parsed; show the raw string for debugging.
		fmt.Fprintf(w, "build age: %s (%q)\n", `?`, i.BuildTimestamp)
	}
	fmt.Fprintf(w, "run age: %s\n", time.Since(t0))
}

// ServeHTTP serves the same summary over HTTP, so the build info can be
// mounted as a diagnostic endpoint.
func (i *BuildInfo) ServeHTTP(w http.ResponseWriter, r *http.Request) { i.Show(w) }

// Default is the package-wide BuildInfo, populated by init from the
// package-level variables below.
var Default BuildInfo

// func Set(i BuildInfo) {
// 	Default = i
// 	Default.Parse()
// }

// These package-level variables are assigned nowhere in this file;
// presumably they are injected at link time via -ldflags "-X ..." —
// TODO confirm against the build scripts.
var (
	BuildHost      string
	BuildTimestamp string
	GitCommit      string
	GitBranch      string
	GitRev         string
)

func init() {
	Default = BuildInfo{
		BuildHost:      BuildHost,
		BuildTimestamp: BuildTimestamp,
		GitCommit:      GitCommit,
		GitBranch:      GitBranch,
		GitRev:         GitRev,
	}
	Default.Parse()
}
def main(args):
    """Load and dump each PyTorch checkpoint file named in args."""
    for path in args:
        read_pt_file(path)
// getRelPath extracts the path component of filepath (which may be a
// URL) and strips any leading slash, yielding a relative path suitable
// for joining under a local root. If filepath does not parse as a URL,
// it is returned unchanged.
func getRelPath(filepath string) string {
	u, err := url.Parse(filepath)
	if err != nil {
		return filepath
	}
	return strings.TrimPrefix(u.Path, `/`)
}
h1:FdILPtKYV4/ShJ38H7WbDunoKQ8l3Q4mJckRfqVbJn4= 9 | github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c h1:u6SKchux2yDvFQnDHS3lPnIRmfVJ5Sxy3ao2SIdysLQ= 10 | golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30= 11 | golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M= 12 | golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f h1:99ci1mjWVBWwJiEKYY6jWa4d2nTQVIEhZIptnrVb1XY= 13 | golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f/go.mod h1:/lliqkxwWAhPjf5oSOIJup2XcqJaw8RGS6k3TGEc7GI= 14 | golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= 15 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= 16 | golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= 17 | golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 18 | golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q= 19 | -------------------------------------------------------------------------------- /mlfs/cmd/mlfs-check-index/mlfs-check-index.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "flag" 6 | "fmt" 7 | "log" 8 | "net/url" 9 | "time" 10 | 11 | "github.com/kungfu-team/tenplex/mlfs/mlfs" 12 | "github.com/kungfu-team/tenplex/mlfs/uri" 13 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile" 14 | ) 15 | 16 | // ./bin/mlfs-check-index $(cat tests/data/*.json | jq -r '."idx-url"') 17 | 18 | func main() { mlfs.Main(Main) } 19 | 20 | func Main() error { 21 | for _, f := range flag.Args() { 22 | if err := checkIndexFile(f); err != nil { 23 | return err 24 | } 25 | } 26 | return nil 27 | } 28 | 29 | func checkIndexFile(filename string) error { 30 | t0 := time.Now() 31 | defer func() { log.Printf("took %s", time.Since(t0)) }() 32 | fs, err := vfile.LoadIdxFile(filename) 33 | if err != nil { 34 | return err 35 | } 36 | domain := getDomain(filename) 37 | for i, f := 
range fs { 38 | log.Printf("checking %d/%d", i+1, len(fs)) 39 | info, err := uri.Stat(f.Filepath) 40 | if err != nil { 41 | return err 42 | } 43 | if info.Size < 0 { 44 | return errCannotGetSize 45 | } 46 | if size := int64(f.IndexedBytes()); size != info.Size { 47 | return fmt.Errorf("%v: %d, expect %d", errUnexpectedSize, info.Size, size) 48 | } 49 | if getDomain(f.Filepath) != domain { 50 | log.Printf("%s file has different domain", f.Filepath) 51 | } 52 | } 53 | fmt.Printf("OK: %s\n", filename) 54 | return nil 55 | } 56 | 57 | var ( 58 | errCannotGetSize = errors.New(`can't get size`) 59 | errUnexpectedSize = errors.New(`unexpected get size`) 60 | ) 61 | 62 | func getDomain(filepath string) string { 63 | u, err := url.Parse(filepath) 64 | if err != nil { 65 | return "" 66 | } 67 | return u.Host 68 | } 69 | -------------------------------------------------------------------------------- /tensor/concat.go: -------------------------------------------------------------------------------- 1 | package tensor 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func copyCatDim(t *Tensor, tens []*Tensor) *Tensor { 8 | tIdx := 0 9 | for _, ten := range tens { 10 | for tenIdx := 0; tenIdx < ten.Dims[0]; tenIdx++ { 11 | x := t.Sub(tIdx) 12 | y := ten.Sub(tenIdx) 13 | copy(x.Data, y.Data) 14 | tIdx += 1 15 | } 16 | 17 | } 18 | 19 | return t 20 | } 21 | 22 | func copyInto(t *Tensor, tens []*Tensor, dim int) *Tensor { 23 | if dim == 0 { 24 | return copyCatDim(t, tens) 25 | } 26 | for i := 0; i < t.Dims[0]; i++ { 27 | subTens := make([]*Tensor, len(tens)) 28 | for j, ten := range tens { 29 | subTens[j] = ten.Sub(i) 30 | } 31 | x := t.Sub(i) 32 | copyInto(x, subTens, dim-1) 33 | } 34 | return t 35 | } 36 | 37 | func Concat(tens []*Tensor, dim int) (*Tensor, error) { 38 | l := len(tens[0].Dims) 39 | if l <= dim { 40 | return nil, fmt.Errorf("dim %d larger tensor rank %d", dim, l) 41 | } 42 | for tenDim := range tens[0].Dims { 43 | dtypeZero := tens[0].Dtype 44 | sizeZero := 
tens[0].Dims[tenDim] 45 | for i := 1; i < len(tens); i++ { 46 | if tenDim != dim { 47 | size := tens[i].Dims[tenDim] 48 | if sizeZero != size { 49 | return nil, fmt.Errorf("dimension tenDim is unequal for tensors") 50 | } 51 | } 52 | if tens[i].Dtype != dtypeZero { 53 | return nil, fmt.Errorf("dtype of tensor %d is unequal", i) 54 | } 55 | } 56 | } 57 | newDimSize := 0 58 | for _, t := range tens { 59 | newDimSize = newDimSize + t.Dims[dim] 60 | } 61 | newDims := make([]int, len(tens[0].Dims)) 62 | copy(newDims, tens[0].Dims) 63 | newDims[dim] = newDimSize 64 | newTen := New(tens[0].Dtype, newDims...) 65 | newTen = copyInto(newTen, tens, dim) 66 | 67 | return newTen, nil 68 | } 69 | --------------------------------------------------------------------------------