├── mlfs
├── mlfstest
│ ├── .gitignore
│ ├── go.work
│ ├── go.mod
│ ├── Makefile
│ ├── cmd
│ │ └── mlfstest-tf-imagenet
│ │ │ └── lib.go
│ └── go.sum
├── docker
│ ├── ubuntu
│ │ ├── 1804
│ │ │ ├── tf.tag.txt
│ │ │ └── sources.list
│ │ ├── 2004
│ │ │ ├── tag.txt
│ │ │ ├── tf.tag.txt
│ │ │ └── Dockerfile
│ │ └── 2204
│ │ │ ├── tag.txt
│ │ │ └── Dockerfile
│ ├── test-tf.py
│ ├── docker-compose.yaml
│ └── start.sh
├── vfs
│ ├── note_test.go
│ ├── dir_test.go
│ ├── fiile_test.go
│ ├── example.go
│ ├── vfile
│ │ ├── shard.go
│ │ ├── link.go
│ │ ├── buffer.go
│ │ └── range.go
│ ├── ufs
│ │ ├── ufs_darwin.go
│ │ └── ufs.go
│ ├── utils_test.go
│ ├── node.go
│ ├── dir.go
│ ├── path.go
│ ├── tree_debug.go
│ └── file.go
├── benchmarks
│ ├── bench-http.sh
│ ├── bench-fuse.sh
│ ├── bench-tf-read.sh
│ └── run.sh
├── .gitignore
├── test-daemon.sh
├── state
│ └── state.go
├── tests
│ └── data
│ │ ├── imagenet.json
│ │ └── squad1.json
├── etc
│ ├── apt
│ │ └── sources.list.d
│ │ │ └── tenplex.list
│ ├── mlfs
│ │ ├── stop.sh
│ │ └── mlfs.sh
│ └── os
│ │ └── linux
│ │ └── mlfs.service
├── utils
│ ├── error.go
│ ├── text.go
│ └── log.go
├── test-client.sh
├── scripts
│ ├── get-go.sh
│ ├── cache-squad1.sh
│ ├── system-install.sh
│ └── upload.sh
├── build-imagenet-index.sh
├── notes
├── ds
│ ├── trds
│ │ ├── trds_test.go
│ │ └── example.go
│ ├── imagenet.go
│ ├── dataset.go
│ ├── squad1.go
│ └── mnist.go
├── build-squad-index.sh
├── uri
│ ├── path.go
│ ├── sas.go
│ ├── monitor.go
│ └── stat.go
├── cmd
│ ├── mlfsd
│ │ └── mlfsd.go
│ ├── tests
│ │ ├── cmd
│ │ │ ├── test-fuse
│ │ │ │ └── test-fuse.go
│ │ │ ├── test-md5
│ │ │ │ └── test-md5.go
│ │ │ ├── mlfs-test-dist
│ │ │ │ └── mlfs-test-dist.go
│ │ │ └── mlfs-debug
│ │ │ │ └── mlfs-debug.go
│ │ └── mlfs-test-upload
│ │ │ └── mlfs-test-upload.go
│ ├── mlfs-build-tf-index
│ │ └── mlfs-build-tf-index.go
│ ├── mlfs-edit-index
│ │ └── mlfs-edit-index.go
│ ├── mlfs-download
│ │ └── mlfs-download.go
│ └── mlfs-check-index
│ │ └── mlfs-check-index.go
├── mlfs
│ ├── app.go
│ ├── bitmap.go
│ ├── tensorfile.go
│ ├── dsidx.go
│ └── replicate.go
├── build-cloud-index.sh
├── add-imagenet.sh
├── build-imagenet-md5sum.sh
├── add-enwiki-numpy.sh
├── closer
│ └── closer.go
├── test-numpy.sh
├── par
│ └── par.go
├── .vscode
│ └── tasks.json
├── debug-p2p.sh
├── cache
│ ├── stat.go
│ └── memory.go
├── local-serve.sh
├── convert_index.py
├── .github
│ └── workflows
│ │ └── docker.yml
├── fuse
│ └── fuse.go
├── local-ci.sh
├── www
│ └── js
│ │ └── bmp.js
├── bimap
│ └── bimap.go
├── iotrace
│ ├── io.go
│ ├── report.go
│ └── counter.go
├── hash
│ ├── file.go
│ └── md5.go
├── pid
│ └── peer.go
├── iseq
│ └── iseq.go
├── prefetch.sh
├── fsutil
│ └── fsutil.go
├── README
└── buildinfo
│ └── buildinfo.go
├── tenplex-run
├── README.md
├── debug-ssh.sh
├── local_prepare.sh
├── local_clean.sh
├── scripts
│ ├── install
│ │ └── torch
│ │ │ └── cpu.sh
│ ├── read-zero-model-state.py
│ └── read-zero-optimizer-state.py
├── create-vnet.sh
├── clean.sh
├── pull.sh
├── dbg
│ └── dgb.go
├── listflag
│ ├── listflag_test.go
│ └── listflag.go
├── counter
│ └── id.go
├── job
│ ├── lib.go
│ ├── hosts.go
│ ├── params.go
│ └── params_bert.go
├── timeout
│ └── timeout.go
├── cancelgroup
│ └── cancelgroup.go
├── web
│ └── web.go
├── docker
│ └── lib.go
├── cluster
│ ├── cluster_test.go
│ └── cluster.go
├── .github
│ └── workflows
│ │ └── go.yml
├── structflag
│ └── structflag_test.go
├── runop
│ ├── redundancy.go
│ ├── dataset.go
│ └── failure.go
└── para_config
│ └── schedule.go
├── benchmark
├── reconfiguration_horovod
│ ├── tag.txt
│ ├── add-imagenet.sh
│ ├── README.md
│ ├── Dockerfile
│ ├── run.sh
│ ├── train-imagenet.sh
│ ├── with-docker
│ └── logger.py
├── README.md
├── convergence_impact
│ ├── requirements.txt
│ ├── run.sh
│ └── README.md
├── dynamic_resources
│ ├── hosts.txt
│ ├── pytorch-para-config.json
│ ├── tenplex-para-config.json
│ ├── README.md
│ ├── pytorch-schedule.json
│ └── tenplex-schedule.json
├── reconfiguration_cluster_size
│ ├── README.md
│ ├── tasks
│ │ ├── add_group.yml
│ │ └── pull_image.yml
│ ├── scale-cluster.sh
│ ├── config.sh
│ ├── para-config-tp-4to8.json
│ ├── para-config-tp-16to32.json
│ ├── para-config-tp-8to16.json
│ ├── docker.yml
│ ├── list-ips.sh
│ ├── schedule_16.json
│ ├── schedule_8.json
│ ├── schedule_32.json
│ ├── tenplex.yml
│ ├── para-config-dp.json
│ ├── para-config-pp.json
│ ├── upgrade.sh
│ └── recreate-vmss.sh
├── redeployment
│ ├── para-config.json
│ ├── schedule.json
│ ├── README.md
│ └── run.sh
├── model_convergence
│ ├── schedule-static.json
│ ├── schedule-up.json
│ ├── para-config-dp.json
│ ├── para-config-pp.json
│ ├── para-config-tp.json
│ ├── schedule-down.json
│ └── README.md
├── failure
│ ├── schedule.json
│ ├── para-config.json
│ ├── README.md
│ ├── run.sh
│ └── plot.py
├── reconfiguration_parallelization
│ ├── para-config-dp.json
│ ├── para-config-pp.json
│ ├── para-config-tp.json
│ ├── schedule.json
│ ├── README.md
│ └── run.sh
├── reconfiguration
│ ├── run.sh
│ ├── stop.py
│ └── README.md
├── common-cloud.sh
├── common.sh
└── performance_impact
│ ├── README.md
│ └── run.sh
├── show-go-mod.sh
├── tests
├── requirements.txt
├── test_delete.py
├── test_load_http.py
├── test_save.py
├── dataset.py
├── test-tensor-file.py
└── test_load.py
├── para_config
├── deepspeed
│ ├── README.md
│ └── layer_map.py
└── megatron_lm
│ ├── util.py
│ ├── gen_para_config.sh
│ ├── README.md
│ └── rank_map.py
├── scheduler
├── .gitignore
├── scalepoint
│ └── scalepoint.go
├── etc
│ ├── tenplex
│ │ ├── stop-scheduler.sh
│ │ └── scheduler.sh
│ └── os
│ │ └── linux
│ │ └── tenplex-scheduler.service
├── README
├── scripts
│ ├── config.sh
│ ├── scale-cluster.sh
│ ├── list-ips.sh
│ ├── gen-log-index.py
│ ├── build-deb.sh
│ ├── collect-logs.sh
│ ├── list-ips-komodo.sh
│ ├── upload.sh
│ ├── install-mlfs.sh
│ ├── plot.gp
│ ├── upload-logs.sh
│ └── recreate-vmss.sh
├── data
│ ├── plan-1.json
│ ├── plan-2.json
│ ├── plan-3.json
│ ├── trace.json
│ ├── plan-komodo.json
│ └── single-job-time.json
├── azure
│ ├── run_scheduler.sh
│ └── run_user.sh
├── logging
│ └── logging.go
├── run_scheduler.sh
├── experiments
│ ├── mlfs.go
│ ├── lib.go
│ └── experiments.go
├── run_user.sh
├── job
│ └── job.go
├── scheduler
│ └── scheduler_test.go
├── CMakeLists.txt
├── configserver
│ └── configserver.go
└── cmd
│ ├── tenplex-user
│ └── tenplex-user.go
│ └── tenplex-scheduler
│ └── tenplex-scheduler.go
├── state_transformer
├── build_docker.sh
├── statetransform
│ ├── padding.go
│ ├── padding_test.go
│ ├── iter.go
│ ├── lib.go
│ ├── repartition_test.go
│ └── replicate.go
├── meta
│ ├── metadata.go
│ ├── path.go
│ ├── modelkeys.go
│ ├── struct_test.go
│ └── rankmap.go
├── lib
│ └── lib.go
├── Dockerfile
├── mapslice
│ └── mapslice_test.go
├── test_state_migrator.sh
├── run_state_migrator.sh
├── cmd
│ └── tenplex-state-transformer
│ │ └── tenplex-state-transformer.go
└── search
│ └── file-system.go
├── man
└── man1
│ ├── mlfsd.1
│ └── mlfs.1
├── .gitignore
├── run_test_load.sh
├── Dockerfile
├── tenplex
├── __init__.py
├── mlfs_path.py
├── arguments.py
├── stop.py
└── save.py
├── Dockerfile-deepspeed
├── ansible
├── tenplex.yml
├── uninstall.yml
└── install.yml
├── .github
└── workflows
│ ├── go.yml
│ └── deb.yml
├── scripts
└── pack.sh
├── go.mod
├── .azure
└── release-pip.yml
├── azure-pipelines.yml
├── setup.py
├── tensor
├── tensor_test.go
├── dtypes.go
└── concat.go
├── ipv4
└── detect.go
├── CMakeLists.txt
├── Makefile
└── go.sum
/mlfs/mlfstest/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 |
--------------------------------------------------------------------------------
/tenplex-run/README.md:
--------------------------------------------------------------------------------
1 | # elastique-controller
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/2004/tag.txt:
--------------------------------------------------------------------------------
1 | kungfu.azurecr.io/mlfs-focal:snapshot
2 |
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/2204/tag.txt:
--------------------------------------------------------------------------------
1 | kungfu.azurecr.io/mlfs-jammy:snapshot
2 |
--------------------------------------------------------------------------------
/mlfs/mlfstest/go.work:
--------------------------------------------------------------------------------
1 | go 1.19
2 |
3 | use (
4 | .
5 | ..
6 | )
7 |
--------------------------------------------------------------------------------
/mlfs/vfs/note_test.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | func isNode(i Node) {}
4 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/tag.txt:
--------------------------------------------------------------------------------
1 | reconfiguration_horovod:snapshot
2 |
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
1 | # Artifact evaluation
2 |
3 | Run `run.sh` in every directory.
4 |
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/1804/tf.tag.txt:
--------------------------------------------------------------------------------
1 | kungfu.azurecr.io/mlfs-bionic-tf1.13.2:snapshot
2 |
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/2004/tf.tag.txt:
--------------------------------------------------------------------------------
1 | kungfu.azurecr.io/mlfs-focal-tf1.13.2:snapshot
2 |
--------------------------------------------------------------------------------
/show-go-mod.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | cat go.mod | head -n 1 | awk '{print $2}'
5 |
--------------------------------------------------------------------------------
/tenplex-run/debug-ssh.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | make
5 | ./bin/elastique-test-ssh
6 |
--------------------------------------------------------------------------------
/benchmark/convergence_impact/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy
2 | matplotlib
3 | torch
4 | torchvision
5 |
--------------------------------------------------------------------------------
/benchmark/dynamic_resources/hosts.txt:
--------------------------------------------------------------------------------
1 | 10.10.10.1
2 | 10.10.10.2
3 | 10.10.10.3
4 | 10.10.10.4
5 |
--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S sh -c 'python3 -m pip install -r $0'
2 | pip
3 | torch
4 |
--------------------------------------------------------------------------------
/mlfs/benchmarks/bench-http.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | make
5 |
6 | ./bin/mlfs-test -port 19999
7 |
--------------------------------------------------------------------------------
/mlfs/docker/test-tf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import tensorflow as tf
3 |
4 |
5 | print(tf.__version__)
6 |
--------------------------------------------------------------------------------
/mlfs/.gitignore:
--------------------------------------------------------------------------------
1 | *.idx.txt
2 | *.log
3 | *.md5.txt
4 | *.tf_record
5 | /bin
6 | /build
7 | /tmp
8 | __pycache__
9 |
--------------------------------------------------------------------------------
/para_config/deepspeed/README.md:
--------------------------------------------------------------------------------
1 | # Generate parallelisation configuration
2 |
3 | Old files. Might not work anymore.
4 |
--------------------------------------------------------------------------------
/tenplex-run/local_prepare.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | mkdir -p ~/.tenplex/bin
5 | sudo systemctl restart mlfs
6 |
--------------------------------------------------------------------------------
/mlfs/benchmarks/bench-fuse.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | make
5 |
6 | root=$HOME/mnt/efs
7 | ./bin/mlfs-test -mnt $root
8 |
--------------------------------------------------------------------------------
/mlfs/test-daemon.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | make
5 |
6 | ./bin/mlfs daemon -ctrl-port 9999 -http-port 9998 -mnt ./tmp
7 |
--------------------------------------------------------------------------------
/mlfs/vfs/dir_test.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | import "testing"
4 |
5 | func Test_2(t *testing.T) {
6 | d := &dir{}
7 | isNode(d)
8 | }
9 |
--------------------------------------------------------------------------------
/mlfs/vfs/fiile_test.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | import "testing"
4 |
5 | func Test_1(t *testing.T) {
6 | f := &file{}
7 | isNode(f)
8 | }
9 |
--------------------------------------------------------------------------------
/scheduler/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.log
3 | *.pdf
4 | *.ps
5 | /build
6 | /logs
7 | /transformer-checkpoint
8 | bin
9 | run-id.txt
10 |
--------------------------------------------------------------------------------
/tenplex-run/local_clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | docker ps -f "name=trainer" -q | xargs docker stop
4 | sudo rm -r ~/.tenplex/training/*
5 |
--------------------------------------------------------------------------------
/mlfs/state/state.go:
--------------------------------------------------------------------------------
1 | package state
2 |
3 | type ElasticState struct {
4 | InitProgres int64
5 | ClusterSize int
6 | Rank int
7 | }
8 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/README.md:
--------------------------------------------------------------------------------
1 | # Reconfiguration cluster sizes
2 | _Fig. 15. Reconfiguration time with different cluster sizes_
3 |
--------------------------------------------------------------------------------
/benchmark/redeployment/para-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "8": {
3 | "dp_size": 1,
4 | "pp_size": 2,
5 | "mp_size": 4
6 | }
7 | }
8 |
--------------------------------------------------------------------------------
/mlfs/tests/data/imagenet.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "imagenet",
3 | "index-url": "https://tenplex.blob.core.windows.net/data/imagenet.idx.txt"
4 | }
5 |
--------------------------------------------------------------------------------
/mlfs/tests/data/squad1.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "squad1",
3 | "index-url": "https://tenplex.blob.core.windows.net/data/squad1/squad1.idx.txt"
4 | }
5 |
--------------------------------------------------------------------------------
/scheduler/scalepoint/scalepoint.go:
--------------------------------------------------------------------------------
1 | package scalepoint
2 |
3 | type ScalePoint struct {
4 | Time int `json:"time"`
5 | Size int `json:"size"`
6 | }
7 |
--------------------------------------------------------------------------------
/scheduler/etc/tenplex/stop-scheduler.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | pid=$(pgrep -f /usr/bin/tenplex-scheduler)
5 | kill -9 $pid
6 |
7 | echo "killed $pid"
8 |
--------------------------------------------------------------------------------
/state_transformer/build_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | make
5 |
6 | ./Dockerfile
7 |
8 | docker push kungfu.azurecr.io/mw-megatron-lm-go:latest
9 |
--------------------------------------------------------------------------------
/para_config/megatron_lm/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | def remove_dir(path: str):
5 | if os.path.exists(path):
6 | shutil.rmtree(path)
7 |
--------------------------------------------------------------------------------
/scheduler/README:
--------------------------------------------------------------------------------
1 | tenplex scheduler
2 |
3 | Run locally:
4 | mlfs serve -index-url /data/megatron-lm/bert/openwebtext/npzs_seq1024/indices.txt -self 155.198.152.18
5 |
--------------------------------------------------------------------------------
/man/man1/mlfsd.1:
--------------------------------------------------------------------------------
1 | .TH mlfsd
2 |
3 | .SH SYNOPSIS
4 | .B mlfsd
5 |
6 | .SY
7 | The mlfs daemon.
8 | .YS
9 |
10 | .SH SEE ALSO
11 | .BR mlfs-build-tf-index
12 |
--------------------------------------------------------------------------------
/mlfs/etc/apt/sources.list.d/tenplex.list:
--------------------------------------------------------------------------------
1 | # deb https://tenplex.blob.core.windows.net/public/deb ./
2 |
3 | deb https://europe-west2-apt.pkg.dev/projects/tenplex tenplex main
4 |
--------------------------------------------------------------------------------
/mlfs/utils/error.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "log"
5 | "os"
6 | )
7 |
8 | func ExitErr(err error) {
9 | log.Printf("%v", err)
10 | os.Exit(1)
11 | }
12 |
--------------------------------------------------------------------------------
/mlfs/test-client.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | make
5 |
6 | ./bin/mlfs mount -index-url http://155.198.152.18:20110/ -idx-name a -ctrl-port 9999
7 | ./bin/mlfs bench -mnt ./tmp
8 |
--------------------------------------------------------------------------------
/mlfs/vfs/example.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | func InitExample(r *Tree) {
4 | r.Mkdir(`/`)
5 | r.Mkdir(`/a`)
6 | r.Mkdir(`/a/b`)
7 | r.TouchText(`/a/b/c.txt`, "hello world\n")
8 | }
9 |
--------------------------------------------------------------------------------
/man/man1/mlfs.1:
--------------------------------------------------------------------------------
1 | .TH mlfs
2 |
3 | .SH SYNOPSIS
4 | .B mlfs
5 |
6 | .SY
7 | The mlfs command line tool.
8 | .YS
9 |
10 | .SH SEE ALSO
11 | .BR mlfsd
12 | .BR mlfs-build-tf-index
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.log
3 | .DS_Store
4 | .mypy_cache
5 | training
6 | __pycache__
7 | bin
8 | build
9 | events.out.tfevents.*
10 | *.npz
11 | *.pdf
12 | *.csv
13 | data
14 | *.pt
15 |
--------------------------------------------------------------------------------
/mlfs/scripts/get-go.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | mkdir -p $HOME/local
5 | cd $HOME/local
6 |
7 | wget https://dl.google.com/go/go1.18.linux-amd64.tar.gz
8 | tar -xf go1.18.linux-amd64.tar.gz
9 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/schedule-static.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 8
5 | },
6 | {
7 | "step": 200,
8 | "size": 0
9 | }
10 | ]
11 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/tasks/add_group.yml:
--------------------------------------------------------------------------------
1 | - name: add docker group
2 | become: true
3 | ansible.builtin.user:
4 | name: sospreviewer01
5 | groups: [docker]
6 | append: yes
7 |
--------------------------------------------------------------------------------
/scheduler/scripts/config.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | v100x1=Standard_NC6s_v3
4 | v100x2=Standard_NC12s_v3
5 | v100x4=Standard_NC24s_v3
6 |
7 | group=kungfu
8 | size=$v100x4
9 | name=tenplex-mw-v100x4
10 |
--------------------------------------------------------------------------------
/mlfs/scripts/cache-squad1.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | ./bin/mlfs-fetch -ctrl-port 20000 -file 'https://minddata.blob.core.windows.net/data/squad1/train.tf_record' -md5 67eb6da21920dda01ec75cd6e1a5b8d7
5 |
--------------------------------------------------------------------------------
/tenplex-run/scripts/install/torch/cpu.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | python3 -m pip install torch==1.10.2+cpu torchvision==0.11.3+cpu torchaudio==0.10.2+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
4 |
--------------------------------------------------------------------------------
/mlfs/build-imagenet-index.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | list_tf_records() {
5 | ls /data/imagenet/records/train* | sort
6 | }
7 |
8 | mlfs-build-tf-index -m 16 -output imagenet.idx.txt $(list_tf_records)
9 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/scale-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | . ./config.sh
5 |
6 | n="$1"
7 | az vmss scale -g $group -n $name --new-capacity $n -o table
8 |
9 | echo "scaled to $n"
10 |
--------------------------------------------------------------------------------
/run_test_load.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | export PYTHONPATH="$HOME/Elasticity/Repo/Megatron-LM"
6 |
7 | python test_load.py \
8 | --device-rank 0 \
9 | --mlfs-path "/mnt/mlfs/job/job-single"
10 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/config.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | v100x1=Standard_NC6s_v3
4 | v100x2=Standard_NC12s_v3
5 | v100x4=Standard_NC24s_v3
6 |
7 | group=kungfu
8 | size=$v100x4
9 | name=tenplex-mw-v100x4
10 |
--------------------------------------------------------------------------------
/mlfs/etc/mlfs/stop.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | # https://superuser.com/questions/1146388/systemd-state-stop-sigterm-timed-out
5 |
6 | pid=$(pgrep -f /usr/bin/mlfsd)
7 | kill -9 $pid
8 |
9 | echo "killed $pid"
10 |
--------------------------------------------------------------------------------
/para_config/megatron_lm/gen_para_config.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | python gen_para_config.py \
5 | --model gpt \
6 | --size large \
7 | --precision fp16 \
8 | --pp 1 \
9 | --tp 2 \
10 | --dp 2
11 |
--------------------------------------------------------------------------------
/scheduler/scripts/scale-cluster.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | cd $(dirname $0)/..
5 | . ./scripts/config.sh
6 |
7 | n="$1"
8 | az vmss scale -g $group -n $name --new-capacity $n -o table
9 |
10 | echo "scaled to $n"
11 |
--------------------------------------------------------------------------------
/scheduler/scripts/list-ips.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | cd $(dirname $0)/..
5 | . ./scripts/config.sh
6 |
7 | az vmss nic list -g kungfu --vmss-name $name --query '[].ipConfigurations[0].privateIpAddress' -o table -o table | sed 1,2d
8 |
--------------------------------------------------------------------------------
/tenplex-run/create-vnet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | name=elastique
5 |
6 | docker network create --driver overlay --scope swarm --attachable elastique
7 |
8 | # TODO: extract Subnet from JSON
9 | docker network inspect $name
10 |
--------------------------------------------------------------------------------
/scheduler/data/plan-1.json:
--------------------------------------------------------------------------------
1 | {
2 | "jobs": [
3 | {
4 | "steps": 100,
5 | "delay": 0
6 | },
7 | {
8 | "steps": 100,
9 | "delay": 6
10 | }
11 | ]
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/failure/schedule.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 16
5 | },
6 | {
7 | "step": 50,
8 | "size": 8
9 | },
10 | {
11 | "step": 60,
12 | "size": 0
13 | }
14 | ]
15 |
--------------------------------------------------------------------------------
/benchmark/redeployment/schedule.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 8
5 | },
6 | {
7 | "step": 50,
8 | "size": 8
9 | },
10 | {
11 | "step": 60,
12 | "size": 0
13 | }
14 | ]
15 |
--------------------------------------------------------------------------------
/mlfs/benchmarks/bench-tf-read.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64
5 | cd $(dirname $0)/..
6 | export CUDA_VISIBLE_DEVICES=0
7 |
8 | ./benchmarks/tf_read.py --fake-data 1
9 | # ./benchmarks/tf_read.py
10 |
--------------------------------------------------------------------------------
/benchmark/failure/para-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "16": {
3 | "dp_size": 2,
4 | "pp_size": 2,
5 | "mp_size": 4
6 | },
7 | "8":{
8 | "dp_size": 1,
9 | "pp_size": 2,
10 | "mp_size": 4
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/schedule-up.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 4
5 | },
6 | {
7 | "step": 100,
8 | "size": 8
9 | },
10 | {
11 | "step": 200,
12 | "size": 0
13 | }
14 | ]
15 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/para-config-dp.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 4,
4 | "pp_size": 1,
5 | "mp_size": 1
6 | },
7 | "8": {
8 | "dp_size": 8,
9 | "pp_size": 1,
10 | "mp_size": 1
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/para-config-pp.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 1,
4 | "pp_size": 4,
5 | "mp_size": 1
6 | },
7 | "8": {
8 | "dp_size": 1,
9 | "pp_size": 8,
10 | "mp_size": 1
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/para-config-tp.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 1,
4 | "pp_size": 1,
5 | "mp_size": 4
6 | },
7 | "8": {
8 | "dp_size": 1,
9 | "pp_size": 1,
10 | "mp_size": 8
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/schedule-down.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 8
5 | },
6 | {
7 | "step": 100,
8 | "size": 4
9 | },
10 | {
11 | "step": 200,
12 | "size": 0
13 | }
14 | ]
15 |
--------------------------------------------------------------------------------
/scheduler/scripts/gen-log-index.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # import glob
3 | import sys
4 |
5 |
6 | def main(args):
7 | for f in args:
8 | a = '<a href="%s">%s</a>' % (f, f)
9 | print(a)
10 |
11 |
12 | main(sys.argv[1:])
13 |
--------------------------------------------------------------------------------
/benchmark/dynamic_resources/pytorch-para-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "8": {
3 | "dp_size": 1,
4 | "pp_size": 4,
5 | "mp_size": 2
6 | },
7 | "16": {
8 | "dp_size": 2,
9 | "pp_size": 4,
10 | "mp_size": 2
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/para_config/megatron_lm/README.md:
--------------------------------------------------------------------------------
1 | # Generate parallelisation configuration
2 |
3 | ## Example
4 | ```py
5 | python gen_para_config.py \
6 | --model gpt \
7 | --size large \
8 | --precision fp16 \
9 | --pp 1 \
10 | --tp 2 \
11 | --dp 2
12 | ```
13 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S sh -c 'docker build --rm -t kungfu.azurecr.io/mw-megatron-lm-23.06-tenplex:latest -f $0 .'
2 |
3 | FROM kungfu.azurecr.io/mw-megatron-lm-23.06:latest
4 |
5 | # Tenplex
6 | ADD . /workspace/tenplex
7 | RUN cd /workspace/tenplex && \
8 | pip install .
9 |
--------------------------------------------------------------------------------
/tenplex-run/clean.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | HOSTS="155.198.152.18 155.198.152.19 155.198.152.23"
4 |
5 | for host in $HOSTS; do
6 | ssh $host "docker ps -f \"name=trainer\" -q | xargs docker stop" &
7 | ssh $host "sudo rm -r ~/.tenplex/training/*" &
8 | done
9 |
10 | wait
11 |
--------------------------------------------------------------------------------
/benchmark/convergence_impact/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | python -u mnist.py 2>&1 | tee mnist.log
5 | python -u mnist.py --inconsistent-dataset 2>&1 | tee inconsistent_dataset.log
6 | python -u mnist_batch_size.py 2>&1 | tee inconsistent_batch_size.log
7 |
8 | python plot.py
9 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/para-config-tp-4to8.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 1,
4 | "pp_size": 1,
5 | "mp_size": 4
6 | },
7 | "8": {
8 | "dp_size": 1,
9 | "pp_size": 1,
10 | "mp_size": 8
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_parallelization/para-config-dp.json:
--------------------------------------------------------------------------------
1 | {
2 | "8": {
3 | "dp_size": 1,
4 | "pp_size": 2,
5 | "mp_size": 4
6 | },
7 | "16": {
8 | "dp_size": 2,
9 | "pp_size": 2,
10 | "mp_size": 4
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_parallelization/para-config-pp.json:
--------------------------------------------------------------------------------
1 | {
2 | "8": {
3 | "dp_size": 1,
4 | "pp_size": 2,
5 | "mp_size": 4
6 | },
7 | "16": {
8 | "dp_size": 1,
9 | "pp_size": 4,
10 | "mp_size": 4
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_parallelization/para-config-tp.json:
--------------------------------------------------------------------------------
1 | {
2 | "8": {
3 | "dp_size": 1,
4 | "pp_size": 2,
5 | "mp_size": 4
6 | },
7 | "16": {
8 | "dp_size": 1,
9 | "pp_size": 2,
10 | "mp_size": 8
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/mlfs/notes:
--------------------------------------------------------------------------------
1 | 7.68TB Samsung PM883 2.5" Enterprise SSD, SATA 3 (RAID 6)
2 |
3 | 64 parallel read: 8.57 GiB/s
4 |
5 |
6 | sequential read speed, 550MB/s
7 |
8 |
9 | tmpfs
10 |
11 | 17.07 GiB/s
12 |
13 | Samsung 980 PRO 500GB M.2 PCIe 4.0 NVMe SSD (Mirrored)
14 | 16.92 GiB/s
15 |
--------------------------------------------------------------------------------
/tenplex/__init__.py:
--------------------------------------------------------------------------------
"""Tenplex public package interface.

Re-exports the checkpoint load/save helpers and the stop check. These
imports are best-effort: they depend on torch, which may not be
installed in every environment (e.g. tooling-only installs).
"""

try:
    from .load import load, load_http
    from .save import save
    from .stop import check_stop
except ImportError:
    # When torch is not installed:
    # ModuleNotFoundError: No module named 'torch'
    # Only import failures are tolerated; any other error still surfaces
    # (the previous bare `except:` hid every exception, including typos
    # inside the imported modules).
    pass
12 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Run the reconfiguration benchmark (scale-down case); output is logged
# to run_scale_down.log. Scale-up and plotting are kept for reference.
set -e
# Propagate `python` failures through the `tee` pipeline; without this,
# `set -e` only sees tee's exit status.
set -o pipefail

sudo rm -rf /mnt/k1d2/ckpt/*
python -u training.py 2>&1 | tee run_scale_down.log

# sudo rm -rf /mnt/k1d2/ckpt/*
# python -u training.py --scale-up 2>&1 | tee run_scale_up.log

# python plot.py
11 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/para-config-tp-16to32.json:
--------------------------------------------------------------------------------
1 | {
2 | "16": {
3 | "dp_size": 2,
4 | "pp_size": 4,
5 | "mp_size": 2
6 | },
7 | "32": {
8 | "dp_size": 2,
9 | "pp_size": 4,
10 | "mp_size": 4
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/para-config-tp-8to16.json:
--------------------------------------------------------------------------------
1 | {
2 | "8": {
3 | "dp_size": 1,
4 | "pp_size": 4,
5 | "mp_size": 2
6 | },
7 | "16": {
8 | "dp_size": 1,
9 | "pp_size": 4,
10 | "mp_size": 4
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/mlfs/etc/mlfs/mlfs.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Launch the MLFS daemon with its standard ports and mount points
# (invoked by the mlfs systemd service).
set -e

# Daemon flags, one per line; word-split into arguments below.
flags() {
    echo -http-port 19999
    echo -ctrl-port 20010
    echo -mnt /mnt/mlfs
    echo -tmp /tmp/mlfs
    echo -su
    echo -log-req
}

/usr/bin/mlfsd $(flags)

echo "$0 stopped"
16 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/docker.yml:
--------------------------------------------------------------------------------
1 | # ansible-playbook -i hosts.txt ansible/tenplex.yml
2 |
3 | - name: pull image
4 | hosts: all
5 |
6 | tasks:
7 | - import_tasks: ./tasks/pull_image.yml
8 |
9 | #- debug: msg="{{ log.stdout }}"
10 | #- debug: msg="{{ log.stderr }}"
11 |
--------------------------------------------------------------------------------
/Dockerfile-deepspeed:
--------------------------------------------------------------------------------
#!/usr/bin/env -S sh -c 'docker build --rm -t kungfu.azurecr.io/mw-deepspeed-tenplex:latest -f $0 .'
# The shebang above lets this Dockerfile be executed directly to build itself.

FROM kungfu.azurecr.io/mw-deepspeed:latest

# The base image may run as a non-root user; pip install needs root.
USER root

# Tenplex: copy the repository in and install the Python package.
ADD . /workspace/tenplex
RUN cd /workspace/tenplex && \
    pip install .
11 |
--------------------------------------------------------------------------------
/mlfs/mlfstest/go.mod:
--------------------------------------------------------------------------------
1 | module mlfstest
2 |
3 | go 1.19
4 |
5 | require github.com/lgarithm/proc v0.3.2-0.20221205141105-3ebbaa57acfd
6 |
7 | require (
8 | golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9 // indirect
9 | golang.org/x/sys v0.0.0-20190412213103-97732733099d // indirect
10 | )
11 |
--------------------------------------------------------------------------------
/tenplex-run/pull.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Collect one job's training state from all hosts into the local
# ~/.tenplex/training directory.
set -e

HOSTS="10.0.0.5 10.0.0.6 10.0.0.8 10.0.0.9"
JOBID="cd1e6f634c"

mkdir -p ~/.tenplex/training/$JOBID

# Copy from each host in parallel.
for host in $HOSTS; do
    scp -r $host:~/.tenplex/training/$JOBID/* ~/.tenplex/training/$JOBID &
done

# Wait for all copies to finish.
wait
14 |
--------------------------------------------------------------------------------
/mlfs/vfs/vfile/shard.go:
--------------------------------------------------------------------------------
1 | package vfile
2 |
3 | import "github.com/kungfu-team/tenplex/mlfs/iseq"
4 |
5 | func (f IndexedFiles) Shard(i, n int) *vfile {
6 | seq := iseq.Seq(iseq.Iota(f.NumRange()))
7 | seq = seq.Shard(i, n)
8 | return &vfile{
9 | ranges: f.NamedRanges().Select(seq.Get()),
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/mlfs/ds/trds/trds_test.go:
--------------------------------------------------------------------------------
1 | package trds
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 | )
7 |
8 | func Test_1(t *testing.T) {
9 | xs := []int{2, 2, 2, 2, 2}
10 | ps := groupIntList(xs)
11 | fmt.Printf("%v\n", ps)
12 | if len(ps) != 1 {
13 | t.Fail()
14 | }
15 | if ps[0].Second != 5 {
16 | t.Fail()
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/ansible/tenplex.yml:
--------------------------------------------------------------------------------
1 | # ansible-playbook -i hosts.txt ansible/tenplex.yml
2 |
3 | - name: install tenplex
4 | hosts: all
5 | remote_user: kungfu
6 |
7 | tasks:
8 | # - import_tasks: ./uninstall.yml
9 | - import_tasks: ./install.yml
10 |
11 | - debug: msg="{{ log.stdout }}"
12 | - debug: msg="{{ log.stderr }}"
13 |
--------------------------------------------------------------------------------
/mlfs/build-squad-index.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Build the TFRecord index for the SQuAD v1 training set.
set -e

cd $(dirname $0)

# Print the path of the SQuAD v1 training TFRecord; it differs between
# macOS dev machines and the Linux data server.
list_squad_records() {
    case $(uname) in
    Darwin) echo $HOME/squad1_train.tf_record ;;
    *) echo /data/squad1/train.tf_record ;;
    esac
}

./bin/mlfs-build-tf-index $(list_squad_records)
15 |
--------------------------------------------------------------------------------
/scheduler/data/plan-2.json:
--------------------------------------------------------------------------------
1 | {
2 | "jobs": [
3 | {
4 | "steps": 300,
5 | "delay": 0
6 | },
7 | {
8 | "steps": 200,
9 | "delay": 10
10 | },
11 | {
12 | "steps": 200,
13 | "delay": 10
14 | }
15 | ]
16 | }
17 |
--------------------------------------------------------------------------------
/scheduler/data/plan-3.json:
--------------------------------------------------------------------------------
1 | {
2 | "jobs": [
3 | {
4 | "steps": 300,
5 | "delay": 0
6 | },
7 | {
8 | "steps": 200,
9 | "delay": 10
10 | },
11 | {
12 | "steps": 200,
13 | "delay": 10
14 | }
15 | ]
16 | }
17 |
--------------------------------------------------------------------------------
/scheduler/etc/tenplex/scheduler.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Launch the Tenplex scheduler (invoked by the tenplex-scheduler
# systemd service).
set -e

# Scheduler flags, one per line; word-split into arguments below.
flags() {
    echo -detect-self-ip eth0

    # echo -reinstall
    echo -u kungfu

    echo -tenplex-state-transformer /usr/bin/tenplex-state-transformer
}

/usr/bin/tenplex-scheduler $(flags)

echo "$0 stopped"
16 |
--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on:
4 | - push
5 | - pull_request
6 |
7 | jobs:
8 | test:
9 | # https://help.github.com/en/articles/virtual-environments-for-github-actions#supported-virtual-environments
10 | runs-on: ubuntu-20.04
11 |
12 | steps:
13 | - uses: actions/checkout@v3
14 | - run: make
15 |
--------------------------------------------------------------------------------
/mlfs/uri/path.go:
--------------------------------------------------------------------------------
1 | package uri
2 |
3 | import (
4 | "net/url"
5 | "path"
6 | "strings"
7 | )
8 |
// AppendPath joins b onto the path component of URL a. If a does not
// parse as a URL, the two are joined as plain slash-separated paths.
func AppendPath(a, b string) string {
	trimmed := strings.TrimLeft(b, `/`)
	parsed, err := url.Parse(a)
	if err != nil {
		return path.Join(a, trimmed)
	}
	parsed.Path = path.Join(parsed.Path, trimmed)
	return parsed.String()
}
18 |
--------------------------------------------------------------------------------
/scheduler/azure/run_scheduler.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | export GO=/usr/local/go/bin/go
5 |
6 | echo "Building scheduler ..."
7 | make all
8 | mkdir -p $HOME/.tenplex/scheduler/bin
9 | cp -v ./vendors/tenplex-run/mlfs/bin/mlfsd $HOME/.tenplex/scheduler/bin
10 |
11 | echo "Running scheduler ..."
12 | ./bin/tenplex-scheduler
13 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/list-ips.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# List the private IPs of the benchmark VM scale set into hosts.txt.
set -e

cd $(dirname $0)

# config.sh defines $name (the VM scale set name).
. ./config.sh
echo $name

# Query Azure for each VM's private IP; `sed 1,2d` drops the two table
# header lines of the `-o table` output.
list_hosts() {
    az vmss nic list -g kungfu --vmss-name $name --query '[].ipConfigurations[0].privateIPAddress' -o table | sed 1,2d
}

list_hosts | tee hosts.txt
14 |
15 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/schedule_16.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 8
5 | },
6 | {
7 | "step": 50,
8 | "size": 16
9 | },
10 | {
11 | "step": 60,
12 | "size": 8
13 | },
14 | {
15 | "step": 70,
16 | "size": 0
17 | }
18 | ]
19 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/schedule_8.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 4
5 | },
6 | {
7 | "step": 50,
8 | "size": 8
9 | },
10 | {
11 | "step": 60,
12 | "size": 4
13 | },
14 | {
15 | "step": 70,
16 | "size": 0
17 | }
18 | ]
19 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_parallelization/schedule.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 8
5 | },
6 | {
7 | "step": 50,
8 | "size": 16
9 | },
10 | {
11 | "step": 60,
12 | "size": 8
13 | },
14 | {
15 | "step": 70,
16 | "size": 0
17 | }
18 | ]
19 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/schedule_32.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "step": 0,
4 | "size": 16
5 | },
6 | {
7 | "step": 50,
8 | "size": 32
9 | },
10 | {
11 | "step": 60,
12 | "size": 16
13 | },
14 | {
15 | "step": 70,
16 | "size": 0
17 | }
18 | ]
19 |
--------------------------------------------------------------------------------
/mlfs/vfs/ufs/ufs_darwin.go:
--------------------------------------------------------------------------------
1 | package ufs
2 |
3 | import (
4 | "log"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/vfs"
7 | )
8 |
// Umount unmounts the FUSE filesystem at mnt.
// Darwin stub: FUSE is not supported here yet, so this only logs.
func Umount(mnt string) {
	log.Printf("TODO: support FUSE Umount on darwin")
}
12 |
// Start serves tree r as a FUSE filesystem at mnt.
// super presumably enables superuser mount options — confirm against
// the linux implementation. Darwin stub: only logs.
func Start(mnt string, r *vfs.Tree, super bool) {
	log.Printf("TODO: support FUSE Mount on darwin")
}
16 |
--------------------------------------------------------------------------------
/mlfs/docker/docker-compose.yaml:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S sh -c 'docker compose -f $0 up'
2 |
3 | services:
4 | mlfs:
5 | # build: .
6 | image: kungfu.azurecr.io/mlfs:snapshot
7 | command: /sbin/init
8 | privileged: true
9 | deploy:
10 | replicas: 4
11 |
12 | networks:
13 | default:
14 | name: mlfs
15 | # external: true
16 |
--------------------------------------------------------------------------------
/mlfs/etc/os/linux/mlfs.service:
--------------------------------------------------------------------------------
1 | # /lib/systemd/system/mlfs.service
2 |
3 | [Unit]
4 | Description=MLFS
5 | After=network.target
6 | StartLimitIntervalSec=0
7 |
8 | [Service]
9 | ExecStart=/etc/mlfs/mlfs.sh
10 | ExecStop=-/etc/mlfs/stop.sh
11 |
12 | Restart=always
13 | RestartSec=1
14 | Type=simple
15 |
16 | [Install]
17 | WantedBy=multi-user.target
18 |
--------------------------------------------------------------------------------
/mlfs/cmd/mlfsd/mlfsd.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "context"
5 | "flag"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
8 | "github.com/kungfu-team/tenplex/mlfs/utils"
9 | )
10 |
11 | func main() {
12 | var d mlfs.Daemon
13 | d.RegisterFlags(flag.CommandLine)
14 | flag.Parse()
15 | utils.LogArgs()
16 | d.RunCtx(context.Background())
17 | }
18 |
--------------------------------------------------------------------------------
/mlfs/mlfs/app.go:
--------------------------------------------------------------------------------
1 | package mlfs
2 |
3 | import (
4 | "flag"
5 | "os"
6 | "path"
7 | "time"
8 | )
9 |
10 | func Main(main func() error) {
11 | flag.Parse()
12 | t0 := time.Now()
13 | prog := path.Base(os.Args[0])
14 | defer func() { log.Printf("%s took %s", prog, time.Since(t0)) }()
15 | if err := main(); err != nil {
16 | log.Fatal(err)
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/tenplex-run/dbg/dgb.go:
--------------------------------------------------------------------------------
1 | package dbg
2 |
3 | import "github.com/lgarithm/proc"
4 |
5 | func SSH(p proc.Proc) proc.P {
6 | target := p.Host
7 | if len(p.User) > 0 {
8 | target = p.User + `@` + p.Host
9 | }
10 | args := []string{
11 | `-v`,
12 | target,
13 | p.Prog,
14 | }
15 | args = append(args, p.Args...)
16 | return proc.PC(`ssh`, args...)
17 | }
18 |
--------------------------------------------------------------------------------
/mlfs/mlfs/bitmap.go:
--------------------------------------------------------------------------------
1 | package mlfs
2 |
3 | import (
4 | "image"
5 | "image/color"
6 | )
7 |
// BitVec is a placeholder for a bit-vector type.
type BitVec struct{}

// makeBitmap returns an h-by-w RGBA image with every pixel set to
// opaque black.
func makeBitmap(h, w int) *image.RGBA {
	bounds := image.Rect(0, 0, w, h)
	img := image.NewRGBA(bounds)
	for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
		for x := bounds.Min.X; x < bounds.Max.X; x++ {
			img.Set(x, y, color.Black)
		}
	}
	return img
}
20 |
--------------------------------------------------------------------------------
/benchmark/dynamic_resources/tenplex-para-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 1,
4 | "pp_size": 2,
5 | "mp_size": 2
6 | },
7 | "8": {
8 | "dp_size": 1,
9 | "pp_size": 4,
10 | "mp_size": 2
11 | },
12 | "16": {
13 | "dp_size": 2,
14 | "pp_size": 4,
15 | "mp_size": 2
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/benchmark/model_convergence/README.md:
--------------------------------------------------------------------------------
1 | # Model convergence
2 | _Fig. 16. Model convergence with reconfiguration_
3 |
4 | We evaluate Tenplex’s impact on model convergence. For this, we use the BERT-large model with the OpenWeb-Text dataset deployed on the on-premise cluster. At training step 100, we either increase or decrease the resources and compare them to a baseline without change.
5 |
--------------------------------------------------------------------------------
/state_transformer/statetransform/padding.go:
--------------------------------------------------------------------------------
1 | package statetransform
2 |
// VocabSizePadding returns vocabSize rounded up to the next multiple of
// 128*mpSize, mirroring Megatron-LM's make-vocab-size-divisible-by
// padding so the embedding splits evenly across mpSize partitions.
// mpSize must be positive. A vocabSize already on a multiple is
// returned unchanged.
func VocabSizePadding(vocabSize int, mpSize int) int {
	const makeVocabSizeDivisibleBy = 128
	multiple := makeVocabSizeDivisibleBy * mpSize
	// Round up in O(1); the previous version incremented one at a time,
	// doing up to `multiple` iterations.
	return (vocabSize + multiple - 1) / multiple * multiple
}
16 |
--------------------------------------------------------------------------------
/state_transformer/statetransform/padding_test.go:
--------------------------------------------------------------------------------
1 | package statetransform
2 |
3 | import "testing"
4 |
5 | func TestVocabSizePadding(t *testing.T) {
6 | mp := 2
7 | s := VocabSizePadding(30524, mp)
8 | t.Logf("vocab size with padding with MP %d: %d", mp, s)
9 | mp = 4
10 | s = VocabSizePadding(30524, mp)
11 | t.Logf("vocab size with padding with MP %d: %d", mp, s)
12 | }
13 |
--------------------------------------------------------------------------------
/mlfs/build-cloud-index.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Build the TFRecord index for the cloud-hosted ImageNet shards.
set -e

cd $(dirname $0)

host=minddata.blob.core.windows.net

# Print the URLs of the 1024 ImageNet training TFRecord shards
# (train-00000-of-01024 .. train-01023-of-01024).
list_tf_records() {
    for i in $(seq 1024); do
        echo https://$host/data/imagenet/records/train-$(printf "%05d" $((i - 1)))-of-01024
    done
}

# Index with 8 parallel fetchers.
./bin/mlfs-build-tf-index -m 8 -output imagenet.idx.txt $(list_tf_records)
15 |
--------------------------------------------------------------------------------
/scheduler/data/trace.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "time": 300,
4 | "size": 4
5 | },
6 | {
7 | "time": 141,
8 | "size": 2
9 | },
10 | {
11 | "time": 437,
12 | "size": 8
13 | },
14 | {
15 | "time": 11,
16 | "size": 16
17 | },
18 | {
19 | "time": 0,
20 | "size": 0
21 | }
22 | ]
23 |
--------------------------------------------------------------------------------
/scripts/pack.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | cd $(dirname $0)/..
5 | rm -rf build
6 | mkdir -p build
7 | cd build
8 |
9 | branch=$(git rev-parse --abbrev-ref HEAD)
10 | rev=$(git rev-list --count HEAD)
11 | commit=$(git rev-parse --short HEAD)
12 |
13 | export VERSION="0.0.${rev}-git-${branch}-rev${rev}-${commit}"
14 |
15 | cmake ..
16 | make package
17 |
18 | dpkg -c *.deb
19 |
--------------------------------------------------------------------------------
/mlfs/add-imagenet.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Mount the imagenet dataset into MLFS via the control port.
set -e

# Mount flags, one per line; word-split into arguments below.
flags() {
    echo -idx-name imagenet
    echo -index-url https://tenplex.blob.core.windows.net/data/imagenet.idx.txt
    echo -ctrl-port 20000

    echo -progress 0
    echo -global-batch-size 23
    echo -dp-size 4

    # echo -fetch
    # echo -m 64
}

mlfs mount $(flags)
18 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/tenplex.yml:
--------------------------------------------------------------------------------
1 | # ansible-playbook -i hosts.txt ansible/tenplex.yml
2 |
3 | - name: install tenplex
4 | hosts: all
5 |
6 | tasks:
7 | - import_tasks: ../../ansible/uninstall.yml
8 | - import_tasks: ../../ansible/install.yml
9 | - import_tasks: ./tasks/add_group.yml
10 |
11 | #- debug: msg="{{ log.stdout }}"
12 | #- debug: msg="{{ log.stderr }}"
13 |
--------------------------------------------------------------------------------
/mlfs/build-imagenet-md5sum.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Compute MD5 checksums of the local ImageNet TFRecord shards.
set -e

make

cd $(dirname $0)

# Print the shard paths; alternative storage locations are kept
# commented out for benchmarking different media.
list_tf_records() {
    # SSD/HDD
    ls /data/imagenet/records/train* | sort

    # tmpfs
    # ls $HOME/mnt/tmp/train* | sort

    # NVMe
    #ls $HOME/data/train* | sort
}

# 64 parallel checksum workers.
./bin/mlfs-md5sum -m 64 -output imagenet.md5.txt $(list_tf_records)
20 |
--------------------------------------------------------------------------------
/mlfs/cmd/tests/cmd/test-fuse/test-fuse.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/fuse"
7 | "github.com/kungfu-team/tenplex/mlfs/utils"
8 | )
9 |
var (
	// mnt is the mount point for the FUSE filesystem (required).
	mnt = flag.String("mnt", "", "")
)
13 |
14 | func main() {
15 | flag.Parse()
16 | f, err := fuse.New(*mnt)
17 | if err != nil {
18 | utils.ExitErr(err)
19 | }
20 | f.Run()
21 | }
22 |
--------------------------------------------------------------------------------
/tenplex-run/listflag/listflag_test.go:
--------------------------------------------------------------------------------
1 | package listflag_test
2 |
3 | import (
4 | "flag"
5 | "testing"
6 |
7 | "github.com/kungfu-team/tenplex/tenplex-run/listflag"
8 | )
9 |
10 | func isFlagValue(flag.Value) {}
11 |
12 | func Test_1(t *testing.T) {
13 | var x listflag.Strings
14 | isFlagValue(&x)
15 | }
16 |
17 | func Test_2(t *testing.T) {
18 | var x listflag.Ints
19 | isFlagValue(&x)
20 | }
21 |
--------------------------------------------------------------------------------
/mlfs/add-enwiki-numpy.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Mount the enwiki numpy dataset into MLFS via the control port.
set -e

make

# Mount flags, one per line; word-split into arguments below.
flags() {
    echo -idx-name enwiki
    echo -idx-file /data/megatron-lm/bert/enwiki/npzs_seq512/indices.txt
    echo -ctrl-port 20010

    echo -progress 0
    echo -global-batch-size 32
    echo -cluster-size 4

    # echo -fetch

    echo -m 64
}

./bin/mlfs mount $(flags)
21 |
--------------------------------------------------------------------------------
/scheduler/azure/run_user.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build and run the tenplex user client against the worker hosts.
set -e

export GO=/usr/local/go/bin/go

make

# Join all arguments with commas.
join_() {
    local IFS=","
    echo "$*"
}

# Worker host IPs.
w1=10.0.0.9
w2=10.0.0.8

ips() {
    echo $w1
    echo $w2
}

# Client flags, one per line; word-split into arguments below.
flags() {
    echo -gpu-per-host 4
    echo -hosts "$(join_ $(ips))"
}

# echo
./bin/tenplex-user $(flags)
28 |
--------------------------------------------------------------------------------
/state_transformer/meta/metadata.go:
--------------------------------------------------------------------------------
1 | package meta
2 |
// Metadata carries the bookkeeping needed to transform training state
// from a source parallelisation to a target parallelisation.
type Metadata struct {
	// Rank layouts of the source and target configurations.
	SourceRankMap *RankMap
	TargetRankMap *RankMap
	// Per-rank checkpoint structures — presumably keyed by rank; confirm
	// against the producer of these maps.
	SourceStructs map[int]map[string]interface{}
	TargetStructs map[int]map[string]interface{}
	// Per-rank group sizes (outer key appears to be rank) — TODO confirm
	// the meaning of the inner key.
	SourceGroupSizes map[int]map[int]int
	TargetGroupSizes map[int]map[int]int
	// Per-rank lists of model parameter key paths.
	SourceModelKeys map[int][][]string
	TargetModelKeys map[int][][]string
}
13 |
--------------------------------------------------------------------------------
/tenplex-run/counter/id.go:
--------------------------------------------------------------------------------
1 | package counter
2 |
// NewCounterFunc returns a closure that yields 0, 1, 2, ... on
// successive calls. Each returned closure counts independently.
func NewCounterFunc() func() int {
	next := 0
	return func() int {
		cur := next
		next++
		return cur
	}
}
7 |
// New returns a Counter starting at zero.
func New() *Counter {
	return new(Counter)
}

// Counter hands out consecutive integer IDs starting from zero.
// It is not safe for concurrent use.
type Counter struct {
	n int
}

// Next returns the current ID and advances the counter.
func (c *Counter) Next() int {
	cur := c.n
	c.n++
	return cur
}

// Reset restarts the counter at zero.
func (c *Counter) Reset() {
	c.n = 0
}
25 |
--------------------------------------------------------------------------------
/mlfs/closer/closer.go:
--------------------------------------------------------------------------------
1 | package closer
2 |
3 | import "io"
4 |
// closer pairs an io.Reader with a caller-supplied close function.
type closer struct {
	rd      io.Reader
	onClose func() error
}

// ReadClose turns r into an io.ReadCloser whose Close calls close.
func ReadClose(r io.Reader, close func() error) io.ReadCloser {
	return &closer{rd: r, onClose: close}
}

// Read forwards to the wrapped reader.
func (c *closer) Read(p []byte) (int, error) {
	return c.rd.Read(p)
}

// Close invokes the supplied close function.
func (c *closer) Close() error {
	return c.onClose()
}
21 |
--------------------------------------------------------------------------------
/mlfs/test-numpy.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Run the numpy generator against a local test dataset.
set -e

make

# DATA_DIR=/data/megatron-lm/bert/npz_concat
DATA_DIR=/data/megatron-lm/bert/test

# Generator flags, one per line; word-split into arguments below.
flags() {
    # echo -index-file $DATA_DIR/indices.txt
    echo -index-file $DATA_DIR/old-format.txt
    # echo -data-file $DATA_DIR/samples.npzs

    echo -dp-size 4
    echo -global-batch-size 32
}

./bin/mlfs-gen-numpy $(flags)
19 |
--------------------------------------------------------------------------------
/mlfs/scripts/system-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | set -e
4 |
5 | cd $(dirname $0)/..
6 |
7 | export PATH=/usr/local/go/bin:$PATH
8 |
9 | make
10 |
11 | rm -fr build
12 | ./scripts/pack.sh
13 |
14 | set +e
15 | echo "stopping mlfsd"
16 | sudo systemctl stop mlfs
17 | echo "stopped mlfsd"
18 | set -e
19 |
20 | sudo dpkg -i ./build/*.deb
21 | sudo systemctl daemon-reload
22 |
23 | sudo systemctl start mlfs
24 |
25 | echo "done $0"
26 |
--------------------------------------------------------------------------------
/scheduler/etc/os/linux/tenplex-scheduler.service:
--------------------------------------------------------------------------------
1 | # /lib/systemd/system/tenplex-scheduler.service
2 |
3 | [Unit]
4 | Description=Tenplex Scheduler
5 | After=network.target
6 | StartLimitIntervalSec=0
7 |
8 | [Service]
9 | ExecStart=/etc/tenplex/scheduler.sh
10 | ExecStop=-/etc/tenplex/stop-scheduler.sh
11 | User=kungfu
12 |
13 | Restart=always
14 | RestartSec=1
15 | Type=simple
16 |
17 | [Install]
18 | WantedBy=multi-user.target
19 |
--------------------------------------------------------------------------------
/scheduler/scripts/build-deb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | cd $(dirname $0)/..
5 |
6 | GOBIN=$PWD/bin go install -v ./...
7 |
8 | rm -rf build
9 | mkdir -p build
10 | cd build
11 |
12 | branch=$(git rev-parse --abbrev-ref HEAD)
13 | rev=$(git rev-list --count HEAD)
14 | commit=$(git rev-parse --short HEAD)
15 | export VERSION="0.0.1-git-${branch}-rev${rev}-${commit}"
16 |
17 | cmake ..
18 | make package
19 |
20 | dpkg -c *.deb
21 |
--------------------------------------------------------------------------------
/mlfs/ds/trds/example.go:
--------------------------------------------------------------------------------
1 | package trds
2 |
3 | import (
4 | "bytes"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/vfs"
7 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
8 | )
9 |
10 | func InitExample(r *vfs.Tree) {
11 | r.Mkdir(`/`)
12 | idx, err := vfile.LoadIdxFile(`a.idx.txt`)
13 | if err != nil {
14 | return
15 | }
16 | bs := &bytes.Buffer{}
17 | vfile.SaveIdx(bs, idx)
18 | r.TouchText(`/index.txt`, bs.String())
19 | }
20 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration/stop.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import subprocess
3 |
4 |
def main():
    """Stop all `worker` containers on the benchmark hosts via ssh."""
    # hosts = ["komodo01", "komodo02", "komodo03", "komodo04"]
    hosts = ["komodo01", "komodo02"]

    cmd_template = "ssh {} docker ps -q -f name='worker' | xargs docker stop"
    for host in hosts:
        # The pipe runs on the remote side: ssh joins its arguments and
        # hands them to the remote shell.
        subprocess.run(cmd_template.format(host).split(" "), check=False)
14 |
15 |
16 | if __name__ == "__main__":
17 | main()
18 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/add-imagenet.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Mount the imagenet dataset for the Fig. 13 (Horovod comparison) job.
set -e

# Mount flags, one per line; word-split into arguments below.
flags() {
    echo -idx-name imagenet
    echo -index-url /data/imagenet/imagenet.idx.txt

    echo -ctrl-port 20010

    echo -progress 0
    echo -global-batch-size 64
    echo -dp-size 1

    echo -job fig-13
}

# Restart the MLFS daemon to start from a clean state.
sudo systemctl stop mlfs
sudo systemctl start mlfs

mlfs info
mlfs mount $(flags)
22 |
--------------------------------------------------------------------------------
/mlfs/par/par.go:
--------------------------------------------------------------------------------
1 | package par
2 |
3 | import "sync"
4 |
// par runs functions concurrently with a bounded degree of parallelism.
type par struct {
	wg sync.WaitGroup
	ch chan struct{}
}

// New returns a par that runs at most m functions at once.
func New(m int) *par {
	return &par{ch: make(chan struct{}, m)}
}

// Do schedules f on a new goroutine, blocking the caller until a
// concurrency slot is free.
func (p *par) Do(f func()) {
	p.wg.Add(1)
	p.ch <- struct{}{} // acquire a slot; blocks at the limit
	go func() {
		defer p.wg.Done()
		defer func() { <-p.ch }() // release the slot before Done
		f()
	}()
}

// Wait blocks until every scheduled function has finished.
func (p *par) Wait() {
	p.wg.Wait()
}
30 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/README.md:
--------------------------------------------------------------------------------
1 | # Reconfiguration Horovod
2 | _Fig. 13. Reconfiguration time against Horovod_
3 |
4 | We also compare Tenplex’s overhead to Horovod, a distributed training library without elasticity support, and Horovod-Elastic, which supports scaling only under data parallelism by periodically checkpointing the model state. We deploy a ResNet50 model with the ImageNet dataset in the on-premise cluster, and measure throughput when training on 2 GPUs.
5 |
--------------------------------------------------------------------------------
/benchmark/redeployment/README.md:
--------------------------------------------------------------------------------
1 | # Redeployment
2 | _Fig. 10. Redeployment time of DL job_
3 |
4 | We evaluate how long Tenplex takes to redeploy DL jobs with different model sizes onto a new set of GPU resources. As a baseline, we compare against Tenplex-Central, which follows the approach of PyTorch Elastic or DeepSpeed: it holds all DL job state at a single central worker. In this experiment, we therefore specifically explore the benefit of Tenplex’s distributed state management.
5 |
--------------------------------------------------------------------------------
/scheduler/scripts/collect-logs.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Snapshot each worker's MLFS mount listing and collect training logs.
set -e

cd $(dirname $0)/..

# Copy each worker's training logs into ./logs.
collect_logs() {
    mkdir -p logs
    for ip in $(./scripts/list-ips.sh); do
        echo $ip
        scp -r $ip:.tenplex/training logs
    done
}

main() {
    # `tee logs/...` below needs logs/ to exist; previously it was only
    # created later inside collect_logs, so a fresh checkout failed here.
    mkdir -p logs
    for h in $(./scripts/list-ips.sh); do
        echo $h
        ssh $h find /mnt/mlfs | tee logs/$h.mlfs.log
    done

    collect_logs
}

main
24 |
--------------------------------------------------------------------------------
/scheduler/scripts/list-ips-komodo.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Print the IPv4 address of every komodo cluster node, one per line.
set -e

# Addresses are currently pinned statically; the commented-out block below
# is the previous DNS-based lookup, kept for reference.
for i in 1 2 3 4; do
    echo "10.10.10.$i"
done

# cd $(dirname $0)/..

# for i in $(seq 4); do
#     if [ $i -eq 1 ]; then # hack
#         ip -o -4 addr list eth0 | awk -F ' *|/' '{print $4}'
#     else
#         domain=komodo$(printf "%02d" $i).doc.res.ic.ac.uk
#         host $domain | awk '{print $4}'
#     fi
# done
19 |
--------------------------------------------------------------------------------
/state_transformer/lib/lib.go:
--------------------------------------------------------------------------------
1 | package lib
2 |
3 | import "golang.org/x/exp/slices"
4 |
// InSlice reports whether ele occurs in sl.
func InSlice(ele string, sl []string) bool {
	for _, cur := range sl {
		if cur == ele {
			return true
		}
	}
	return false
}
8 |
// IsSubSlice reports whether subSl occurs as a contiguous run inside sl.
// An empty subSl is trivially contained.
//
// BUG FIXES over the previous version:
//   - no longer indexes past the end of sl (previously panicked when a
//     candidate match ran off the end);
//   - tries every possible starting position instead of only the first
//     occurrence of subSl[0] (e.g. sub [a b] in [a a b] was missed);
//   - no longer panics on an empty subSl.
func IsSubSlice(subSl []string, sl []string) bool {
	if len(subSl) == 0 {
		return true
	}
	for i := 0; i+len(subSl) <= len(sl); i++ {
		match := true
		for j := range subSl {
			if sl[i+j] != subSl[j] {
				match = false
				break
			}
		}
		if match {
			return true
		}
	}
	return false
}
22 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/tasks/pull_image.yml:
--------------------------------------------------------------------------------
1 | - name: check UID
2 | ignore_errors: yes
3 | ansible.builtin.shell:
4 | cmd: id
5 | register: log
6 |
7 | - name: docker ps
8 | ignore_errors: yes
9 | ansible.builtin.shell:
10 | cmd: docker ps
11 | register: log
12 |
13 | - name: pull docker image
14 | ignore_errors: yes
15 | ansible.builtin.shell:
16 | cmd: docker pull kungfu.azurecr.io/mw-megatron-lm-23.06-update-v100
17 | register: log
18 |
--------------------------------------------------------------------------------
/mlfs/ds/imagenet.go:
--------------------------------------------------------------------------------
1 | package ds
2 |
3 | import "github.com/kungfu-team/tenplex/mlfs/hash"
4 |
var (
	// ImagenetIndex is the ImageNet index file together with its expected
	// MD5 checksum.
	ImagenetIndex = hash.HashedFile{
		MD5:  `dfe57e9541f8cb7affedefd3c633326e`,
		URLs: []string{`https://minddata.blob.core.windows.net/data/imagenet.idx.txt`},
	}

	// ImagenetMd5 is the per-file md5sum listing for the ImageNet dataset.
	ImagenetMd5 = hash.HashedFile{
		MD5:  `91d0846314a61c32f42726aaa05ea9e7`,
		URLs: []string{`https://minddata.blob.core.windows.net/data/imagenet/md5sum.txt`},
	}
)
16 |
--------------------------------------------------------------------------------
/state_transformer/Dockerfile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S sh -c 'docker build --rm -t kungfu.azurecr.io/mw-megatron-lm-go:latest -f $0 .'
2 | FROM kungfu.azurecr.io/mw-megatron-lm-data-commit:latest
3 |
4 | ADD go /usr/local
5 | ENV PATH=$PATH:/usr/local/go/bin
6 | RUN whereis go
7 | RUN go version
8 | ENV GO=/usr/local/bin/go
9 |
10 | # State Transformer
11 | ADD . /workspace/state_transformer
12 | WORKDIR /workspace/state_transformer
13 | RUN make
14 | WORKDIR /workspace/Megatron-LM
15 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/para-config-dp.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 1,
4 | "pp_size": 1,
5 | "mp_size": 4
6 | },
7 | "8": {
8 | "dp_size": 2,
9 | "pp_size": 1,
10 | "mp_size": 4
11 | },
12 | "16": {
13 | "dp_size": 4,
14 | "pp_size": 1,
15 | "mp_size": 4
16 | },
17 | "32": {
18 | "dp_size": 8,
19 | "pp_size": 1,
20 | "mp_size": 4
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/para-config-pp.json:
--------------------------------------------------------------------------------
1 | {
2 | "4": {
3 | "dp_size": 1,
4 | "pp_size": 1,
5 | "mp_size": 4
6 | },
7 | "8": {
8 | "dp_size": 1,
9 | "pp_size": 2,
10 | "mp_size": 4
11 | },
12 | "16": {
13 | "dp_size": 1,
14 | "pp_size": 4,
15 | "mp_size": 4
16 | },
17 | "32": {
18 | "dp_size": 1,
19 | "pp_size": 8,
20 | "mp_size": 4
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/mlfs/.vscode/tasks.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "2.0.0",
3 | "tasks": [
4 | {
5 | "label": "echo",
6 | "type": "shell",
7 | "command": "./build",
8 | "group": {
9 | "kind": "build",
10 | "isDefault": true
11 | }
12 | },
13 | {
14 | "label": "run",
15 | "type": "shell",
16 | "command": "./x"
17 | },
18 | {
19 | "label": "debug",
20 | "type": "shell",
21 | "command": "./x"
22 | }
23 | ]
24 | }
25 |
--------------------------------------------------------------------------------
/ansible/uninstall.yml:
--------------------------------------------------------------------------------
1 | # - name: cleanup
2 | # become: true
3 | # ansible.builtin.shell:
4 | # cmd: killall apt
5 | # register: log
6 |
7 | - name: stop
8 | become: true
9 | ignore_errors: yes
10 | ansible.builtin.shell:
11 | cmd: systemctl stop mlfs
12 | register: log
13 |
14 | - name: uninstall
15 | become: true
16 | ansible.builtin.apt:
17 | state: absent
18 | pkg:
19 | - mlfs
20 |
21 | - debug: msg="{{ log.stdout }}"
22 | - debug: msg="{{ log.stderr }}"
23 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/kungfu-team/tenplex
2 |
3 | go 1.18
4 |
5 | require (
6 | bazil.org/fuse v0.0.0-20230120002735-62a210ff1fd5
7 | github.com/lgarithm/go v0.0.0-20230108194319-abf8008ecd81
8 | github.com/lgarithm/proc v0.4.5-0.20240417004737-9b169ad5c322
9 | github.com/lsds/KungFu v0.2.5
10 | golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f
11 | golang.org/x/sync v0.7.0
12 | )
13 |
14 | require (
15 | golang.org/x/crypto v0.22.0 // indirect
16 | golang.org/x/sys v0.19.0 // indirect
17 | )
18 |
--------------------------------------------------------------------------------
/benchmark/common-cloud.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Shared helpers for the cloud benchmark scripts: flag construction and logging.

# join_ SEP ARG...: print ARGs joined by SEP.
join_() {
    local IFS=$1
    shift
    echo "$*"
}

# join ARG...: comma-join the arguments.
# BUG FIX: quote "$@" so each argument keeps its word boundaries.
join() { join_ , "$@"; }

logfile="$(basename "$0").log"

# base_flags: print the tenplex-run flags common to all cloud benchmarks.
base_flags() {
    echo -image kungfu.azurecr.io/mw-megatron-lm-23.06-update-v100

    echo -user "$USER"

    echo -mlfs-port 20010
    echo -tenplex-prefix "$HOME/.tenplex"

    # echo -logfile
}

# tenplex_run_with FLAG_FN: run tenplex-run with the flags printed by FLAG_FN.
# $($1) is intentionally unquoted so each printed flag splits into its own word.
tenplex_run_with() {
    tenplex-run $($1) 2>&1 | tee "$logfile"
}
27 |
--------------------------------------------------------------------------------
/benchmark/common.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Shared helpers for the on-premise benchmark scripts: flag construction and logging.

# join_ SEP ARG...: print ARGs joined by SEP.
join_() {
    local IFS=$1
    shift
    echo "$*"
}

# join ARG...: comma-join the arguments.
# BUG FIX: quote "$@" so each argument keeps its word boundaries.
join() { join_ , "$@"; }

logfile="$(basename "$0").log"

# base_flags: print the tenplex-run flags common to all benchmarks.
base_flags() {
    echo -image "kungfu.azurecr.io/mw-megatron-lm-23.06-update:v0.0.3"

    echo -user "$USER"

    echo -mlfs-port 20010
    echo -tenplex-prefix "$HOME/.tenplex"

    # echo -logfile
}

# tenplex_run_with FLAG_FN: run tenplex-run with the flags printed by FLAG_FN.
# $($1) is intentionally unquoted so each printed flag splits into its own word.
tenplex_run_with() {
    tenplex-run $($1) 2>&1 | tee "$logfile"
}
27 |
--------------------------------------------------------------------------------
/mlfs/cmd/tests/cmd/test-md5/test-md5.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 | "time"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/ds"
8 | )
9 |
// main runs the Check routine of every known dataset file (imagenet, squad1,
// mnist) and logs the total time taken. Check's exact behavior lives in the
// ds/hash packages — presumably it verifies each file's MD5; confirm there.
func main() {
	t0 := time.Now()
	defer func() { log.Printf("took %s", time.Since(t0)) }()
	ds.ImagenetIndex.Check()
	ds.ImagenetMd5.Check()
	ds.Squad1Index.Check()
	ds.Squad1MD5.Check()

	ds.MnistTrainImages.Check()
	ds.MnistTrainLabels.Check()
	ds.MnistTestImages.Check()
	ds.MnistTestLabels.Check()
}
23 |
--------------------------------------------------------------------------------
/mlfs/debug-p2p.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Launch two local mlfs daemons as peers of each other for p2p debugging.
set -e

make
# ./bin/mlfs mount

peer1="127.0.0.1:8080"
peer2="127.0.0.1:8081"

# Flags shared by both daemons.
peer_flags() {
    echo -r 1
    echo -peers "$peer1,$peer2"
    echo -log-req
}

localhost="127.0.0.1"

./bin/mlfs daemon $(peer_flags) -host $localhost -ctrl-port 8080 -http-port 10000 &
pid1=$!
# BUG FIX: echo the variable that was actually assigned ($pid1, not $p1).
echo $pid1

./bin/mlfs daemon $(peer_flags) -host $localhost -ctrl-port 8081 -http-port 10001 &
pid2=$!
echo $pid2

wait
27 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/Dockerfile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S sh -c 'docker build --rm -t $(cat $(dirname $0)/tag.txt) -f $0 $(dirname $0)'
2 |
3 | FROM nvcr.io/nvidia/tensorflow:23.03-tf1-py3
4 |
5 | RUN python3 -m pip install tenplex -i https://pkgs.dev.azure.com/gli7/releases/_packaging/nightly/pypi/simple -U
6 |
7 | WORKDIR /work
8 | ADD logger.py .
9 | ADD imagenet.py .
10 | ADD imagenet_resnet.py .
11 | ADD imagenet_resnet_horovod_elastic.py .
12 | ADD train-imagenet.sh .
13 |
14 | # ENV OMPI_ALLOW_RUN_AS_ROOT=1
15 | ENTRYPOINT []
16 |
--------------------------------------------------------------------------------
/mlfs/cache/stat.go:
--------------------------------------------------------------------------------
1 | package cache
2 |
3 | import (
4 | "log"
5 | "sync/atomic"
6 | )
7 |
// Stat counts cache hits and misses; the counters are updated atomically,
// so a single Stat may be shared across goroutines.
type Stat struct {
	miss int64 // number of cache misses
	hit  int64 // number of cache hits
}

// Hit atomically records one cache hit.
func (s *Stat) Hit() {
	atomic.AddInt64(&s.hit, 1)
}

// Miss atomically records one cache miss.
func (s *Stat) Miss() {
	atomic.AddInt64(&s.miss, 1)
}
20 |
21 | func (s *Stat) Log() {
22 | h := (atomic.LoadInt64(&s.hit))
23 | m := (atomic.LoadInt64(&s.miss))
24 | r := float32(m) / float32(h+m)
25 | log.Printf("miss rate: %.2f%% (%d / %d)", r*100.0, m, m+h)
26 | }
27 |
28 | var LogCache = false
29 |
--------------------------------------------------------------------------------
/scheduler/data/plan-komodo.json:
--------------------------------------------------------------------------------
1 | {
2 | "jobs": [
3 | {
4 | "steps": 1000,
5 | "delay": 0,
6 | "dataset": {
7 | "Name": "openwebtext",
8 | "IndexURL": "http://155.198.152.18:20110/"
9 | }
10 | },
11 | {
12 | "steps": 1000,
13 | "delay": 4,
14 | "dataset": {
15 | "Name": "openwebtext",
16 | "IndexURL": "http://155.198.152.18:20110/"
17 | }
18 | }
19 | ]
20 | }
21 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration/README.md:
--------------------------------------------------------------------------------
1 | # Reconfiguration
2 | _Fig. 12. Reconfiguration time against DeepSpeed and Singularity_
3 |
4 | This experiment compares the reconfiguration approach of Tenplex with (i) a model library of an elastic DL system (DeepSpeed) and (ii) a virtual device approach that performs full GPU state migration (Singularity).
5 |
6 | ## Megatron-DeepSpeed
7 | [Repo](https://github.com/kungfu-team/Megatron-DeepSpeed/tree/mw-before-rebase)
8 |
9 | ## Note
10 | For the roundevouz to work `/etc/hosts` must include the host's domain name.
11 |
--------------------------------------------------------------------------------
/mlfs/cmd/tests/mlfs-test-upload/mlfs-test-upload.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "log"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
8 | "github.com/kungfu-team/tenplex/tensor"
9 | )
10 |
var (
	// port is the control port of the local mlfs daemon to connect to.
	port = flag.Int("p", 8080, ``)
)

// main connects to a local mlfs daemon and uploads the raw bytes of a small
// test tensor to the virtual path /a/b/c, panicking on any failure.
func main() {
	flag.Parse()
	c, err := mlfs.NewClient(*port)
	if err != nil {
		log.Panic(err)
	}
	// A 2x2x2 float32 tensor; only its backing byte buffer is uploaded.
	x := tensor.New(`f32`, 2, 2, 2)
	if err := c.Upload(`/a/b/c`, x.Data); err != nil {
		log.Panic(err)
	}
	log.Printf("done")
}
27 |
--------------------------------------------------------------------------------
/mlfs/ds/dataset.go:
--------------------------------------------------------------------------------
1 | package ds
2 |
// Dataset describes a named dataset served by mlfs. The `flag` tags suggest
// fields can also be populated from command-line flags — see the flag
// handling elsewhere in the project.
type Dataset struct {
	Name     string `json:"name" flag:"dataset"`
	IndexURL string `json:"index-url" flag:"index-url"`
	Size     int    `json:"size"` // Total number of samples
}

// Predefined datasets hosted on Azure blob storage.
var (
	SQuAD1Test = Dataset{
		Name:     `squad1-test`,
		IndexURL: `https://minddata.blob.core.windows.net/data/squad1/squad1.idx.txt`,
	}

	Imagenet = Dataset{
		Name:     `imagenet`,
		IndexURL: `https://minddata.blob.core.windows.net/data/imagenet.idx.txt`,
	}
)
20 |
--------------------------------------------------------------------------------
/mlfs/ds/squad1.go:
--------------------------------------------------------------------------------
1 | package ds
2 |
3 | import "github.com/kungfu-team/tenplex/mlfs/hash"
4 |
5 | // https://minddata.blob.core.windows.net/data/squad1/squad1.md5.txt
6 |
var (
	// Squad1Index is the SQuAD v1 index file with its expected MD5 checksum.
	Squad1Index = hash.HashedFile{
		MD5:  `57015fef3d187f14a57a55ff04166e0c`,
		URLs: []string{`https://minddata.blob.core.windows.net/data/squad1/squad1.idx.txt`},
	}

	// Squad1MD5 is the per-file md5sum listing for the SQuAD v1 dataset.
	Squad1MD5 = hash.HashedFile{
		MD5:  `9e1ed608ed476e8fed2fbf84ff378884`,
		URLs: []string{`https://minddata.blob.core.windows.net/data/squad1/squad1.md5.txt`},
	}
)
18 |
--------------------------------------------------------------------------------
/.azure/release-pip.yml:
--------------------------------------------------------------------------------
1 | parameters:
2 | - name: feed # name of the parameter; required
3 | type: string # data type of the parameter; required
4 |
5 | steps:
6 | - script: python3 -m pip install wheel twine
7 | displayName: 'install deps'
8 |
9 | - script: python3 -m pip wheel -v .
10 | displayName: 'build whl'
11 |
12 | - task: TwineAuthenticate@1
13 | inputs:
14 | artifactFeed: 'releases/${{ parameters.feed }}'
15 |
16 | - script: python3 -m twine upload -r ${{ parameters.feed }} --config-file $(PYPIRC_PATH) ./*.whl
17 | displayName: Publish
18 |
--------------------------------------------------------------------------------
/state_transformer/mapslice/mapslice_test.go:
--------------------------------------------------------------------------------
1 | package mapslice
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "log"
7 | "testing"
8 | )
9 |
// Test_1 round-trips a MapSlice through JSON marshal/unmarshal and prints
// both representations.
// NOTE(review): log.Fatal inside a test calls os.Exit and bypasses the
// testing harness; t.Fatal(err) would be preferable, but switching would
// leave the file-level "log" import unused, so it is only flagged here.
func Test_1(t *testing.T) {
	ms := MapSlice{
		MapItem{"abc", 123, 0},
		MapItem{"def", 456, 0},
		MapItem{"ghi", 789, 0},
	}

	b, err := json.Marshal(ms)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(string(b))

	// Decode back into a fresh MapSlice to exercise Unmarshal.
	ms = MapSlice{}
	if err := json.Unmarshal(b, &ms); err != nil {
		log.Fatal(err)
	}

	fmt.Println(ms)
}
30 |
--------------------------------------------------------------------------------
/mlfs/local-serve.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Build mlfs, localize the imagenet index, sanity-check it, then serve it.
set -e

make

# ./bin/mlfs-check-index $(cat tests/data/*.json | jq -r '."index-url"')

# ./bin/mlfs-debug -ds ./tests/data/squad1.json
# ./bin/mlfs-debug -ds ./tests/data/imagenet.json

# Rewrite the remote imagenet index into a localized copy (a.index.txt).
./bin/mlfs-edit-index \
    -index-url $(cat tests/data/imagenet.json | jq -r '."index-url"') \
    -o a.index.txt \
    -localize

# Validate the localized index, then serve it, advertising this host's IP.
./bin/mlfs-check-index ./a.index.txt
./bin/mlfs serve -index-url ./a.index.txt -self 155.198.152.18
# ./bin/mlfs daemon -ctrl-port 9999 -http-port 9998 -mnt ./tmp
19 |
--------------------------------------------------------------------------------
/tenplex/mlfs_path.py:
--------------------------------------------------------------------------------
class MLFSPath(object):
    """Resolve per-rank sample file lists from an MLFS mount point."""

    def __init__(self, mnt='/mnt/mlfs') -> None:
        # mnt: root directory of the MLFS mount.
        self.mnt = mnt

    def _path(self, p):
        # MLFS virtual paths are absolute ('/job/...'), so plain string
        # concatenation is the intended mapping onto the mount.
        return self.mnt + p

    def _read_lines(self, filename):
        # BUG FIX: use a context manager so the file handle is closed;
        # the previous version leaked the open file object.
        with open(self._path(filename)) as f:
            return [line.strip() for line in f]

    def filenames(self, job, rank):
        """Return the absolute sample paths assigned to `rank` of `job`.

        Follows the MLFS layout: head.txt names the partition list, whose
        rank-th entry points at a directory containing list.txt.
        """
        lines = self._read_lines
        head = lines(f'/job/{job}/head.txt')[0]
        part = lines(head)[rank]
        names = lines(f'{part}/list.txt')
        return [self._path(n) for n in names]
18 |
--------------------------------------------------------------------------------
/state_transformer/statetransform/iter.go:
--------------------------------------------------------------------------------
1 | package statetransform
2 |
3 | import (
4 | "strconv"
5 |
6 | "github.com/kungfu-team/tenplex/state_transformer/client"
7 | "github.com/kungfu-team/tenplex/state_transformer/meta"
8 | )
9 |
10 | func setIter(conf *meta.Config, targetDevice int, cl client.CheckpointClient) error {
11 | if targetDevice%conf.GpusPerHost != 0 { // only once per host
12 | return nil
13 | }
14 |
15 | err := cl.UploadValue([]byte(strconv.Itoa(conf.Step)), "iter", targetDevice, true)
16 | if err != nil {
17 | return err
18 | }
19 |
20 | return nil
21 | }
22 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # Starter pipeline
2 | # Start with a minimal pipeline that you can customize to build and deploy your code.
3 | # Add steps that build, run tests, deploy, and more:
4 | # https://aka.ms/yaml
5 |
6 | trigger:
7 | - main
8 |
9 | pool:
10 | # ERROR InvalidDistribution: Invalid distribution metadata: unrecognized or
11 | # malformed field 'license-file'
12 | # vmImage: ubuntu-24.04
13 |
14 | vmImage: ubuntu-22.04 # FIXME: deprecate setup.py
15 |
16 | steps:
17 |
18 | - template: ./.azure/release-pip.yml
19 | parameters:
20 | feed: nightly
21 |
--------------------------------------------------------------------------------
/mlfs/scripts/upload.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Upload a local file to Azure blob storage using a pre-authorized SAS token.
# Usage: SAS=<token> [ContentType=<mime>] ./upload.sh <filename> <blob-path>
set -e

# The SAS token must be supplied via the environment.
if [ -z "$SAS" ]; then
    echo "SAS NOT set"
    exit 1
fi

# upload FILENAME PATH: PUT the file into the tenplex storage account.
upload() {
    local filename=$1
    local path=$2
    SA=tenplex
    URI="https://$SA.blob.core.windows.net/$path"

    echo "uploading $filename to $URI"
    # NOTE(review): $ContentType comes from the environment and may be empty;
    # confirm callers export it when the MIME type matters.
    curl -v -X PUT \
        -H 'x-ms-blob-type: BlockBlob' \
        -H 'x-ms-version: 2015-02-21' \
        -H "Content-Type: $ContentType" \
        "$URI?$SAS" --data-binary @$filename
    echo "uploaded $URI"
}

upload "$1" "$2"
25 |
--------------------------------------------------------------------------------
/mlfs/utils/text.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "io"
5 | "strings"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/uri"
8 | )
9 |
// Readlines opens filename via uri.Open (so it may be a local path or a URI —
// see the uri package) and returns its non-empty lines, each trimmed of
// surrounding whitespace, in file order.
func Readlines(filename string) ([]string, error) {
	f, err := uri.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	bs, err := io.ReadAll(f)
	if err != nil {
		return nil, err
	}
	var ls []string
	for _, l := range strings.Split(string(bs), "\n") {
		l = strings.TrimSpace(l)
		// Blank lines are dropped entirely.
		if len(l) > 0 {
			ls = append(ls, l)
		}
	}
	return ls, nil
}
29 |
--------------------------------------------------------------------------------
/mlfs/convert_index.py:
--------------------------------------------------------------------------------
def main():
    """Convert a one-offset-per-line index file into '<start> <next-line>' pairs."""
    old_index_file_path = '/data/megatron-lm/bert/test/indices.txt'
    new_index_file_path = './new_indices.txt'

    with open(old_index_file_path, 'r') as old_index_file:
        raw_lines = old_index_file.readlines()

    starts = [int(line) for line in raw_lines]

    # Pair each offset with the raw text of the following line (which still
    # carries its trailing newline, terminating each output record).
    with open(new_index_file_path, 'w') as new_index_file:
        for start, next_raw in zip(starts, raw_lines[1:]):
            new_index_file.write(f'{start} {next_raw}')


if __name__ == '__main__':
    main()
17 |
--------------------------------------------------------------------------------
/tenplex-run/job/lib.go:
--------------------------------------------------------------------------------
1 | package job
2 |
3 | import (
4 | "github.com/lgarithm/proc"
5 | )
6 |
// Re-exported helpers from the proc package, so the rest of this package can
// build process pipelines without importing proc everywhere.
var (
	Par    = proc.Par // (P, P, ...) -> P
	Seq    = proc.Seq // (P, ...) -> P
	Term   = proc.Term
	Echo   = proc.Echo
	Shell  = proc.Shell
	Ignore = proc.Ignore
	Run    = proc.Run
	Ssh    = proc.SSH
	// Ssh = dbg.SSH
)

// Type aliases mirroring the proc package's core types.
type (
	P    = proc.P
	Proc = proc.Proc
)
23 |
24 | func Pmap(f func(string) P, hs ...string) []P {
25 | var ps []P
26 | for _, h := range hs {
27 | ps = append(ps, f(h))
28 | }
29 | return ps
30 | }
31 |
--------------------------------------------------------------------------------
/mlfs/.github/workflows/docker.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: Docker
3 | 'on':
4 | # - push
5 | # - pull_request
6 | - workflow_dispatch
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-20.04
11 | steps:
12 | - run: docker images
13 | - uses: actions/checkout@v2
14 |
15 | - run: ./docker/ubuntu/2004/Dockerfile
16 | env:
17 | SSH_KEY: ${{ secrets.KUNGFU_RSA }}
18 |
19 | # - run: ./docker/ubuntu/2004/Dockerfile.tf1.13.2-gpu
20 | - run: ./docker/ubuntu/1804/Dockerfile.tf1.13.2-gpu
21 | env:
22 | SSH_KEY: ${{ secrets.KUNGFU_RSA }}
23 |
24 | - run: docker images
25 |
--------------------------------------------------------------------------------
/mlfs/fuse/fuse.go:
--------------------------------------------------------------------------------
1 | package fuse
2 |
3 | import (
4 | "log"
5 | "os"
6 | )
7 |
// FUSE wraps an open handle on the raw /dev/fuse device for a mount point.
type FUSE struct {
	mnt string        // mount point path
	ch  chan struct{} // Run blocks on a receive from this channel after each device read
	dev *os.File      // open handle to /dev/fuse
}
13 |
14 | func New(mnt string) (*FUSE, error) {
15 | dev, err := os.Open(`/dev/fuse`)
16 | if err != nil {
17 | return nil, err
18 | }
19 | f := &FUSE{
20 | mnt: mnt,
21 | ch: make(chan struct{}),
22 | dev: dev,
23 | }
24 | return f, nil
25 | }
26 |
27 | func (f *FUSE) Run() {
28 | for {
29 | buf := make([]byte, 1024)
30 | n, err := f.dev.Read(buf)
31 | log.Printf("%d,%v", n, err)
32 | _ = <-f.ch
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # pip3 install --no-index --user -U .
2 |
3 | import os, time
4 |
5 |
def auto_version():
    """Build a '0.0.<patch>' version string.

    The patch component is the GIT_COMMIT_TIMESTAMP environment variable
    when set and non-empty, otherwise the current Unix time.
    """
    stamp = os.getenv('GIT_COMMIT_TIMESTAMP') or str(int(time.time()))
    return '.'.join(['0', '0', stamp])
12 |
13 |
from setuptools import find_packages, setup

# Minimal package metadata; the version encodes the git commit timestamp
# (or the current time) as the patch number — see auto_version above.
setup(
    name='tenplex',
    version=auto_version(),
    packages=find_packages(),
    description='',
    url='',
    ext_modules=[],
    setup_requires=[],
    install_requires=[],
)
26 |
27 |
--------------------------------------------------------------------------------
/tenplex-run/timeout/timeout.go:
--------------------------------------------------------------------------------
1 | package timeout
2 |
3 | import (
4 | "context"
5 | "log"
6 | "sync/atomic"
7 | "time"
8 | )
9 |
// timeout triggers a cancel function after a delay unless Done is called first.
type timeout struct {
	done int32 // set to 1 by Done; read atomically by the watcher goroutine
}
13 |
14 | func New(d time.Duration, cancel context.CancelFunc) *timeout {
15 | t := &timeout{}
16 | go func() {
17 | time.Sleep(d)
18 | done := atomic.LoadInt32(&t.done)
19 | if done != 0 {
20 | return
21 | }
22 | log.Printf("timeout adter %s", d)
23 | if cancel != nil {
24 | cancel()
25 | }
26 | }()
27 | return t
28 | }
29 |
// Done marks the guarded operation as finished, preventing the pending cancel.
func (t *timeout) Done() {
	atomic.StoreInt32(&t.done, 1)
}
33 |
--------------------------------------------------------------------------------
/tensor/tensor_test.go:
--------------------------------------------------------------------------------
1 | package tensor_test
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "testing"
7 |
8 | "github.com/kungfu-team/tenplex/tensor"
9 | )
10 |
// Test_1 builds a 4x4 i32 tensor with sequential values, takes the 2x2
// sub-range [1:3, 1:3], and prints its elements to stderr.
func Test_1(t *testing.T) {
	/*
		0 1 2 3
		4 5 6 7
		8 9 10 11
		12 13 14 15
	*/
	x := tensor.New(`i32`, 4, 4)
	{
		// Fill with 0..15 via the typed i32 view.
		x := tensor.I32(x)
		for i := range x {
			x[i] = int32(i)
		}
	}

	// Expected contents of the range: 5 6 / 9 10.
	y := x.Range(tensor.Slice(1, 3), tensor.Slice(1, 3))
	{
		y := tensor.I32(y)
		for _, e := range y {
			fmt.Fprintf(os.Stderr, "%d\n", e)
		}
	}
}
34 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_parallelization/README.md:
--------------------------------------------------------------------------------
1 | # Reconfiguration Parallelizations
2 | _Fig. 14. Reconfiguration time with different parallelizations_
3 |
4 | We examine the impact of the parallelization configuration on reconfiguration time for different model sizes. We deploy Tenplex and Tenplex-Central, which manages the state in a single node, with the different GPT-3 models on the on-premise cluster. For data parallelism (D), we change the configuration from (M, P, D) = (4, 2, 1) to (4, 2, 2); for pipeline parallelism (P) from (4, 2, 1) to (4, 4, 1); and for model parallelism (M) from (4, 2, 1) to (8, 2, 1).
5 |
--------------------------------------------------------------------------------
/mlfs/vfs/utils_test.go:
--------------------------------------------------------------------------------
1 | package vfs_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/vfs"
7 | )
8 |
9 | func Test_RmRecursive(t *testing.T) {
10 | r := vfs.New()
11 | script := `
12 | mkdir /
13 | mkdir /a
14 | mkdir /a/b
15 | mkdir /a/c
16 | touch /a/b/x.txt
17 | touch /a/b/y.txt
18 | touch /a/c/z.txt
19 | `
20 | if err := runScript(r, script); err != nil {
21 | t.Fail()
22 | }
23 | nf, nd, err := vfs.RmRecursive(r, `/a`)
24 | if err != nil {
25 | t.Fail()
26 | }
27 | if nf != 3 {
28 | t.Fail()
29 | }
30 | if nd != 3 {
31 | t.Fail()
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/scheduler/logging/logging.go:
--------------------------------------------------------------------------------
1 | package logging
2 |
3 | import (
4 | "io"
5 | "log"
6 | "os"
7 | "path"
8 | )
9 |
// SetupLogger prefixes all standard-log messages with [name] and tees the
// log stream to both stderr and logs/<name>.log.
func SetupLogger(name string) {
	log.SetPrefix(`[` + name + `] `)
	log.SetFlags(0)
	// Route log output through a pipe; the goroutine below duplicates it.
	r, w := io.Pipe()
	log.SetOutput(w)
	go func(r io.Reader) {
		logfile := path.Join(`logs`, name+`.log`)
		if err := os.MkdirAll(path.Dir(logfile), os.ModePerm); err != nil {
			log.Printf("create logdir failed: %v", err)
		}
		// Best effort: if the file cannot be created, log to stderr only.
		if lf, err := os.Create(logfile); err == nil {
			r = io.TeeReader(r, lf)
		}
		io.Copy(os.Stderr, r)
	}(r)
}
26 |
--------------------------------------------------------------------------------
/mlfs/cmd/tests/cmd/mlfs-test-dist/mlfs-test-dist.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "log"
5 | "os"
6 | "path"
7 | "time"
8 |
9 | "github.com/kungfu-team/tenplex/mlfs/ds"
10 | "github.com/kungfu-team/tenplex/mlfs/mlfs/t"
11 | )
12 |
// pwd is the working directory at startup; scratch space is created under it.
var pwd, _ = os.Getwd()

// main runs a distributed mlfs test over the imagenet dataset with 4-way
// data parallelism, base HTTP/ctrl ports 30000/40000, job ID "A", and
// scratch space under ./tmp. See mlfs/t.DistTest for the test body.
func main() {
	t0 := time.Now()
	defer func() { log.Printf("took %s", time.Since(t0)) }()
	dt := t.DistTest{
		HTTPPort: 30000,
		CtrlPort: 40000,
		// Mount: path.Join(pwd, `mnt`),
		Tmp:   path.Join(pwd, `tmp`),
		JobID: `A`,
		DP:    4,
		DS:    ds.Imagenet,
	}
	dt.Run()
}
29 |
--------------------------------------------------------------------------------
/scheduler/scripts/upload.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Upload a local file to Azure blob storage using a pre-authorized SAS token.
# Usage: SAS=<token> [ContentType=<mime>] ./upload.sh <filename> <blob-path>
set -e

# The SAS token must be supplied via the environment.
if [ -z "$SAS" ]; then
    echo "SAS NOT set"
    exit 1
fi

# upload FILENAME PATH: PUT the file into the tenplex storage account;
# the HTTP status code is printed on the last line of curl's output.
upload() {
    local filename=$1
    local path=$2
    SA=tenplex
    URI="https://$SA.blob.core.windows.net/$path"

    echo "uploading $filename to $URI"
    # NOTE(review): $ContentType comes from the environment and may be empty;
    # confirm callers export it when the MIME type matters.
    curl -s -w "\n%{http_code}\n" -X PUT \
        -H 'x-ms-blob-type: BlockBlob' \
        -H 'x-ms-version: 2015-02-21' \
        -H "Content-Type: $ContentType" \
        "$URI?$SAS" --data-binary @$filename
    echo "uploaded $URI"
}

upload "$1" "$2"
25 |
--------------------------------------------------------------------------------
/state_transformer/meta/path.go:
--------------------------------------------------------------------------------
1 | package meta
2 |
3 | import (
4 | "fmt"
5 | "path"
6 | )
7 |
8 | func GetStructPath(c *Config, before bool) string {
9 | suffix := func(pp, mp, dp int) string {
10 | return fmt.Sprintf("pp%02d/mp%02d/dp%02d", pp, mp, dp)
11 | }
12 | var sfx string
13 | if before {
14 | sfx = suffix(c.SourcePPDegree, c.SourceMPDegree, c.SourceDPDegree)
15 | } else {
16 | sfx = suffix(c.TargetPPDegree, c.TargetMPDegree, c.TargetDPDegree)
17 | }
18 | return path.Join(
19 | c.CkptStructDir,
20 | c.MdpLibrary,
21 | c.Precision,
22 | c.Model,
23 | c.ModelSize,
24 | sfx,
25 | )
26 | }
27 |
--------------------------------------------------------------------------------
/tensor/dtypes.go:
--------------------------------------------------------------------------------
1 | package tensor
2 |
3 | import "fmt"
4 |
// eq panics when the two dtype names differ; used to guard typed views.
func eq(a, b string) {
	if a == b {
		return
	}
	panic(fmt.Errorf("%s != %s", a, b))
}
10 |
// to reinterprets the tensor's raw byte buffer as a []R without copying,
// panicking if the tensor's dtype name does not match name.
func to[R any](name string, t *Tensor) []R {
	eq(t.Dtype, name)
	return *(*[]R)(t.sliceHeader())
}

// to_ returns a dtype-checked view constructor for element type R.
func to_[R any](name string) func(*Tensor) []R { return func(t *Tensor) []R { return to[R](name, t) } }

// Typed views over a Tensor's data; each panics if the dtype differs.
var (
	U8  = to_[uint8](`u8`)
	U32 = to_[uint32](`u32`)
	I8  = to_[int8](`i8`)
	I32 = to_[int32](`i32`)
	F32 = to_[float32](`f32`)
)

// Raw returns the tensor's underlying byte buffer without copying.
func Raw(t *Tensor) []byte { return t.Data }
27 |
--------------------------------------------------------------------------------
/state_transformer/test_state_migrator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | make
5 |
6 | ./bin/tests \
7 | --ckpt-dir "/data/$USER/mlfs" \
8 | --ckpt-struct-dir "$HOME/Elasticity/Repo/tenplex-run/transformer-checkpoint" \
9 | --source-pp-degree 2 \
10 | --target-pp-degree 3 \
11 | --source-mp-degree 2 \
12 | --target-mp-degree 4 \
13 | --source-size 8 \
14 | --target-size 12 \
15 | --precision "fp16" \
16 | --input-timestamp "a" \
17 | --output-timestamp "b" \
18 | --hosts "10.10.10.1" \
19 | --mdp-library "megatron-lm" \
20 | --sequence-length 1024 \
21 | --target-rank 0
22 |
--------------------------------------------------------------------------------
/benchmark/performance_impact/README.md:
--------------------------------------------------------------------------------
1 | # Performance impact
2 | _Fig. 3. Performance impact of different parallelization configurations on 16 GPUs_
3 |
4 | When the GPU resources of a DL job change at runtime, a parallelization configuration that was optimal at deployment time may no longer be optimal with the new GPUs. We demonstrate this empirically in Fig. 3, which shows the training throughput (in samples/second) when training BERT and GPT-3 models using Megatron-LM on 16 GPUs under a range of parallelization configurations. Each parallelization configuration varies the degree of model, pipeline and data parallelism, and thus alters the GPU allocation.
5 |
--------------------------------------------------------------------------------
/state_transformer/run_state_migrator.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | make
5 |
6 | ./bin/tenplex-state-transformer \
7 | --ckpt-dir "/data/$USER/mlfs" \
8 | --ckpt-struct-dir "$HOME/Elasticity/Repo/tenplex-run/transformer-checkpoint" \
9 | --source-pp-degree 2 \
10 | --target-pp-degree 3 \
11 | --source-mp-degree 2 \
12 | --target-mp-degree 4 \
13 | --source-size 8 \
14 | --target-size 12 \
15 | --precision "fp16" \
16 | --input-timestamp "a" \
17 | --output-timestamp "b" \
18 | --hosts "10.10.10.1" \
19 | --mdp-library "megatron-lm" \
20 | --sequence-length 1024 \
21 | --target-rank 0
22 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set -e

# Run everything relative to this script's directory; quote the expansion so
# paths containing spaces survive word splitting.
cd "$(dirname "$0")"

# with_log_file <logfile> <command...>
# Runs the command, teeing its output to the log file.
with_log_file() {
    local filename=$1
    shift
    # Quote "$@" and "$filename": the original unquoted $@ broke any
    # argument containing whitespace.
    "$@" | tee "$filename"
    echo "logged to $filename $ $@"
}

./add-imagenet.sh

with_log_file 1.log ./with-docker horovodrun -np 2 python3 ./imagenet_resnet.py --data-dir /data/imagenet/records
with_log_file 2.log ./with-docker horovodrun -np 2 python3 ./imagenet_resnet_horovod_elastic.py --data-dir /data/imagenet/records
with_log_file 3.log ./with-docker horovodrun -np 2 python3 ./imagenet_resnet.py --mlfs-dir /mnt/mlfs --job fig-13

python3 plot.py
20 |
--------------------------------------------------------------------------------
/benchmark/failure/README.md:
--------------------------------------------------------------------------------
1 | # Failure recovery
2 | _Fig. 11. Failure recovery time (GPT-3 2.7 B)_
3 |
We explore how Tenplex manages to recover efficiently from failures, even in scenarios that require dynamic reconfiguration due to a change in the number of GPUs. We emulate faults of 4, 8, and 12 GPUs and measure the failure recovery and reconfiguration time. We use the GPT-3 2.7 B model with the Wikipedia dataset on the on-premise cluster. We compare Tenplex to a system that always recovers from the last checkpoint, which results in an average loss of 50 training steps. The parallelization configuration is (M, D, P) = (4, 2, 2), i.e. there are two model replicas.
5 |
--------------------------------------------------------------------------------
/mlfs/vfs/ufs/ufs.go:
--------------------------------------------------------------------------------
1 | package ufs
2 |
3 | import (
4 | "errors"
5 | "log"
6 | "os"
7 |
8 | "github.com/kungfu-team/tenplex/mlfs/vfs"
9 | )
10 |
// FS adapts a vfs.Tree to a FUSE filesystem.
type FS struct {
	r          *vfs.Tree   // backing virtual file tree
	allowWrite bool        // presumably gates mutating operations (see errReadOnly) — confirm in platform-specific files
	log        *log.Logger // "[fuse]"-prefixed logger writing to stderr
}
16 |
17 | func New(r *vfs.Tree) *FS {
18 | return &FS{
19 | r: r,
20 | log: log.New(os.Stderr, `[fuse] `, 0),
21 | }
22 | }
23 |
// Dir is the FUSE-side handle for a directory node in the tree.
type Dir struct {
	fs *FS
	r  *vfs.Tree
	id int         // node id within the tree
	n  vfs.DirNode // underlying directory node
}
30 |
// File is the FUSE-side handle for a file node in the tree.
type File struct {
	fs *FS
	id int          // node id within the tree
	n  vfs.FileNode // underlying file node

	// debug
	name string // kept only for debug logging
}
39 |
40 | var errReadOnly = errors.New(`readonly`)
41 |
--------------------------------------------------------------------------------
/tests/test_delete.py:
--------------------------------------------------------------------------------
1 | from tenplex.mlfs_client import MLFSClient
2 |
3 |
def createTestTree(client):
    """Upload a small fixture tree: 3 files spread over /a and /a/c."""
    fixtures = [("/a/b.txt", "1"), ("/a/c/d.txt", "2"), ("/a/c/e.txt", "3")]
    for path, text in fixtures:
        client.upload_txt(path, text)
8 |
9 |
def test():
    """Delete /a recursively and verify the reported file/dir counts."""
    client = MLFSClient("localhost", 20010)

    createTestTree(client)

    num_files, num_dirs = client.delete("/a")
    print(f"num files {num_files}")
    print(f"num dirs {num_dirs}")
    # Fixture holds 3 files (b.txt, d.txt, e.txt) in 2 dirs (/a, /a/c).
    assert num_files == 3
    assert num_dirs == 2
23 |
24 |
25 | if __name__ == "__main__":
26 | test()
27 |
--------------------------------------------------------------------------------
/mlfs/vfs/vfile/link.go:
--------------------------------------------------------------------------------
1 | package vfile
2 |
3 | import (
4 | "io"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/uri"
7 | )
8 |
9 | type link struct {
10 | string
11 | int64
12 | }
13 |
14 | func Link(url string, size int64) link { return link{url, size} }
15 |
16 | func (f link) Size() int64 { return f.int64 }
17 |
18 | func (f link) Open() io.ReadCloser {
19 | r := io.NewSectionReader(f, 0, f.Size())
20 | return io.NopCloser(r)
21 | }
22 |
23 | func (f link) ReadAt(buf []byte, pos int64) (int, error) {
24 | r, err := uri.OpenRange(f.string, pos, f.int64)
25 | if err != nil {
26 | return 0, err
27 | }
28 | defer r.Close()
29 | return r.Read(buf)
30 | }
31 |
--------------------------------------------------------------------------------
/mlfs/local-ci.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | make
5 |
6 | make sys-install
7 | make reload
8 |
9 | ./bin/mlfs info
10 |
11 | # ./bin/mlfs-cli -sas "minddata:$(cat $HOME/.az/minddata.sas)"
12 |
13 | ./bin/mlfs mount -global-batch-size 23 -dp-size 4 \
14 | -idx-name squad1 \
15 | -index-url https://minddata.blob.core.windows.net/data/squad1/squad1.idx.txt
16 |
17 | ./bin/mlfs fetch -file 'https://minddata.blob.core.windows.net/data/squad1/train.tf_record' -md5 67eb6da21920dda01ec75cd6e1a5b8d7
18 |
19 | sleep 1 # 2023/01/16 10:00:56 open /mnt/mlfs/job/0/head.txt: transport endpoint is not connected
20 | ./bin/mlfs bench -mnt /mnt/mlfs
21 |
22 | tree /mnt/mlfs/
23 |
--------------------------------------------------------------------------------
/tenplex-run/cancelgroup/cancelgroup.go:
--------------------------------------------------------------------------------
1 | package cancelgroup
2 |
3 | import (
4 | "context"
5 |
6 | "github.com/lgarithm/proc"
7 | )
8 |
type (
	// P aliases proc.P, the composable process type used throughout.
	P = proc.P
)
12 |
// CancelGroup composes ps into one process that runs them all (proc.Par
// presumably runs them concurrently); any member that does not finish
// successfully triggers cancel so siblings can stop early, and that branch
// reports defaultErr.
func CancelGroup(ps []P, defaultErr error, cancel context.CancelFunc) P {
	var qs []P
	for _, p := range ps {
		// err is captured per iteration by the two closures below; it stays
		// defaultErr unless p completes successfully.
		var err error = defaultErr
		qs = append(qs,
			proc.Seq(
				// Ignore p's own failure status: success is recorded solely
				// by FnOk clearing err after p completes.
				proc.Ignore(
					proc.Seq(
						p,
						proc.FnOk(func() { err = nil }),
					),
				),
				// After p finishes, cancel the group if it did not succeed.
				proc.Fn(func() error {
					if err != nil {
						cancel()
					}
					return err
				}),
			))
	}
	return proc.Par(qs...)
}
35 |
--------------------------------------------------------------------------------
/mlfs/www/js/bmp.js:
--------------------------------------------------------------------------------
// Create a w-by-h canvas filled with a solid red rectangle.
// (The first parameter t is unused; kept for call compatibility.)
draw = (t, h, w) => {
    const canvas = document.createElement('canvas');
    canvas.id = "CursorLayer";
    canvas.width = w;
    canvas.height = h;

    const ctx = canvas.getContext("2d");
    ctx.beginPath();
    ctx.rect(0, 0, w, h);
    ctx.fillStyle = "red";
    ctx.fill();

    return canvas;
};
19 |
// Render a 1024x128 red canvas and attach it to the page body.
main = () => {
    c = draw(0, 128, 1024);
    document.body.appendChild(c);
};
24 |
// Defer rendering until the DOM is ready.
window.onload = () => {
    // console.log(document.body);
    main()
}
29 |
--------------------------------------------------------------------------------
/mlfs/mlfstest/Makefile:
--------------------------------------------------------------------------------
# GO: path to the go tool; override with `make GO=/usr/bin/go`.
GO := $(if $(GO),$(GO),$(HOME)/local/go/bin/go)
# CUDA: "cuda" when an NVIDIA device node is present, empty otherwise.
CUDA := $(if $(CUDA),$(CUDA),$(shell [ -c /dev/nvidia0 ] && echo cuda))
# TAGS: extra go build tags passed to every go command.
TAGS := $(if $(TAGS),$(TAGS),)

default: binaries test


# Build all commands into ./bin.
binaries:
	GOBIN=$(CURDIR)/bin $(GO) install -v -tags "$(TAGS)" ./...

# Install all commands into the default GOBIN.
install:
	$(GO) install -v -tags "$(TAGS)" ./...

test:
	$(GO) test -v -tags "$(TAGS)" ./...

# Upgrade all module dependencies.
update:
	$(GO) get -u ./...

clean:
	$(GO) clean -v -cache ./...

tidy:
	$(GO) mod tidy

format:
	$(GO) fmt ./...

# Short aliases.
i: install


u: update tidy


t: test


--------------------------------------------------------------------------------
/mlfs/vfs/node.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | import "io"
4 |
// Node is one entry in the virtual file tree: a file or a directory.
// Implementations return nil from the accessor that does not match IsDir
// (see fileNode below and dir in dir.go).
type Node interface {
	IsDir() bool

	AsFile() FileNode
	AsDir() DirNode
}
11 |
// DirNode lists and mutates the entries of a directory.
type DirNode interface {
	Items() []Item
	Add(string, int, bool) // Add(name, id, isDir); see dir.Add in dir.go
	Del(id int)            // remove the entry with the given node id
}
17 |
// FileNode is a readable file with random access and a known size.
type FileNode interface {
	io.ReaderAt

	Open() io.ReadCloser
	Size() int64
}
24 |
// FileMode is an optional interface a FileNode may implement to expose
// the executable permission bit.
type FileMode interface {
	IsExecutable() bool
}
28 |
29 | type fileNode struct {
30 | f FileNode
31 | }
32 |
33 | func (f *fileNode) IsDir() bool { return false }
34 |
35 | func (f *fileNode) AsFile() FileNode { return f.f }
36 |
37 | func (f *fileNode) AsDir() DirNode { return nil }
38 |
--------------------------------------------------------------------------------
/tenplex-run/web/web.go:
--------------------------------------------------------------------------------
1 | package web
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "net/http"
7 | "os"
8 | )
9 |
10 | func WithLogReq(h http.Handler) http.Handler {
11 | return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
12 | LogRequest(req)
13 | h.ServeHTTP(w, req)
14 | })
15 | }
16 |
// LogRequest writes one access-log line per request; it is a variable so
// callers can replace the default behavior.
var LogRequest = func(r *http.Request) {
	accessLog.Printf("%s %s | %s %s", r.Method, r.URL, r.RemoteAddr, r.UserAgent())
}

// accessLog writes "[access]"-prefixed lines to stderr.
var accessLog = logger{l: log.New(os.Stderr, "[access] ", 0)}
21 |
// logger wraps *log.Logger so that log lines are attributed to the
// caller of Printf rather than to Printf itself.
type logger struct{ l *log.Logger }

// Printf formats its arguments and emits a single log line.
func (l *logger) Printf(format string, v ...interface{}) {
	msg := fmt.Sprintf(format, v...)
	// Calldepth 2 skips Printf so file/line info points at its caller.
	l.l.Output(2, msg)
}
27 |
--------------------------------------------------------------------------------
/mlfs/bimap/bimap.go:
--------------------------------------------------------------------------------
1 | package bimap
2 |
// BiMap is a one-to-one mapping between string keys and string values
// supporting lookup in both directions.
type BiMap struct {
	f, g map[string]string // f: key -> value, g: value -> key
}

// New returns an empty BiMap.
func New() *BiMap {
	m := &BiMap{}
	m.f = make(map[string]string)
	m.g = make(map[string]string)
	return m
}

// Add inserts the pair (k, v). It reports false and leaves the map
// unchanged when either k or v is already present.
func (m *BiMap) Add(k, v string) bool {
	// log.Printf("adding %s: %s", k, v)
	if _, taken := m.f[k]; taken {
		return false
	}
	if _, taken := m.g[v]; taken {
		return false
	}
	m.f[k] = v
	m.g[v] = k
	return true
}

// Get looks up the value stored under key k.
func (m *BiMap) Get(k string) (string, bool) {
	v, ok := m.f[k]
	return v, ok
}

// RGet performs the reverse lookup: the key whose value is v.
func (m *BiMap) RGet(v string) (string, bool) {
	// log.Printf("RGet %s", v)
	k, ok := m.g[v]
	return k, ok
}
36 |
--------------------------------------------------------------------------------
/scheduler/run_scheduler.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build and start the tenplex scheduler with a freshly-pulled copy of the
# transformer-checkpoint repository.
# NOTE(review): user name, NIC, and the state-transformer path are
# machine-specific; adjust before running.
set -e

echo "Building scheduler ..."
make

echo "Running scheduler ..."
PREFIX=$HOME/.tenplex/scheduler

# Command-line flags for the scheduler, one per echoed line.
flag() {
    # echo -detect-self-ip eth0
    echo -detect-self-ip ib0
    echo -reinstall
    echo -u marcel
    echo -tenplex-state-transformer /home/marcel/Elasticity/Repo/tenplex-state-transformer/go/bin/tenplex-state-transformer
}

# Clone the checkpoint-structure repo on first run, then keep it updated.
if [ ! -d transformer-checkpoint ]; then
    git clone git@github.com:/kungfu-team/transformer-checkpoint.git
fi

cd transformer-checkpoint
git pull
cd -

$PREFIX/bin/tenplex-scheduler $(flag)

echo "$0 done"
29 |
--------------------------------------------------------------------------------
/tenplex-run/docker/lib.go:
--------------------------------------------------------------------------------
1 | package docker
2 |
3 | import (
4 | "github.com/lgarithm/proc"
5 | )
6 |
type (
	// At aliases proc.UserHost: a user@host target for remote execution.
	At = proc.UserHost
	// P is the composable process type.
	P = proc.P
	// Proc is a single concrete process description.
	Proc = proc.Proc
)
12 |
// Short local aliases for the proc combinators used in this package.
var (
	par    = proc.Par
	out    = proc.Output
	seq    = proc.Seq
	Main   = proc.Main
	psh    = proc.Psh
	at     = proc.At
	echo   = proc.Echo
	lmd    = proc.Lambda
	ignore = proc.Ignore
	urpc   = proc.Urpc
)
25 |
// fmap applies f to each element of xs and returns the results in order.
// It returns nil for an empty input (matching the previous behavior).
func fmap[X any, Y any](f func(X) Y, xs ...X) []Y {
	if len(xs) == 0 {
		return nil
	}
	// Pre-size the result: the output length is known up front.
	ys := make([]Y, 0, len(xs))
	for _, x := range xs {
		ys = append(ys, f(x))
	}
	return ys
}
33 |
34 | func parmap[T any](f func(T) P, xs ...T) P { return par(fmap(f, xs...)...) }
35 |
--------------------------------------------------------------------------------
/mlfs/docker/start.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set -e

# Image tag of the Ubuntu 18.04 TF image built under docker/ubuntu/1804.
tag=$(cat $(dirname $0)/ubuntu/1804/tf.tag.txt)

# Emit the docker-run flags, one (group) per line.
docker_run_flags() {
    # echo --privileged
    echo --cap-add SYS_ADMIN
    echo --device /dev/fuse

    # https://forum.rclone.org/t/fusermount-permission-denied-in-docker-rclone/13914/6
    # Fix: the profile name is "unconfined" (the original "unconfine" is not
    # a valid AppArmor profile and makes docker run fail on AppArmor hosts).
    echo --security-opt apparmor:unconfined # For FUSE

    # https://medium.com/swlh/docker-and-systemd-381dfd7e4628
    echo -v /sys/fs/cgroup/:/sys/fs/cgroup:ro # For systemd

    echo -v $PWD/benchmarks:/benchmarks
    echo --rm
    echo -d
    echo --name mlfs
}

# Remove any previous container; ignore the failure when none exists,
# which would otherwise abort the script under `set -e`.
docker rm -f mlfs || true
docker run $(docker_run_flags) --gpus "device=0" -it $tag /sbin/init
25 |
--------------------------------------------------------------------------------
/tenplex/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
def add_tenplex_args(parser: argparse.ArgumentParser):
    """Register the Tenplex command-line options on *parser*.

    Returns the same parser so calls can be chained.
    """
    group = parser.add_argument_group(title="Tenplex")

    # (flag name, add_argument keyword arguments), in display order.
    specs = [
        ("--tenplex", dict(action="store_true")),
        ("--mlfs-path", dict(type=str, default=None)),
        ("--jobid", dict(type=str, default=None)),
        ("--host-ip", dict(type=str, default=None)),
        ("--mlfs-port", dict(type=int, default=None)),
        ("--scheduler-addr", dict(type=str, default=None)),
        ("--tenplex-train-iters", dict(type=int, default=None)),
        ("--gen-para-config", dict(action="store_true")),
    ]
    for name, kwargs in specs:
        group.add_argument(name, **kwargs)

    return parser
17 |
--------------------------------------------------------------------------------
/mlfs/mlfstest/cmd/mlfstest-tf-imagenet/lib.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "strconv"
5 |
6 | "github.com/lgarithm/proc"
7 | )
8 |
type (
	// P aliases proc.P, the composable process type.
	P = proc.P
)
12 |
// Short local aliases for proc combinators and strconv.Itoa.
var (
	seq    = proc.Seq
	pc     = proc.PC
	ignore = proc.Ignore
	echo   = proc.Echo
	try    = proc.Try

	str = strconv.Itoa
)
22 |
23 | func dockerExec(cmd string, args ...string) P {
24 | ss := []string{
25 | `exec`, `-t`, name,
26 | cmd,
27 | }
28 | ss = append(ss, args...)
29 | return pc(`docker`, ss...)
30 | }
31 |
32 | func dockerCp(a, b string) P {
33 | return pc(`docker`, `cp`, a, name+`:`+b)
34 | }
35 |
36 | func If(ok bool, p P) P {
37 | if ok {
38 | return p
39 | }
40 | return seq()
41 | }
42 |
--------------------------------------------------------------------------------
/mlfs/vfs/dir.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | type Item struct {
4 | IsDir bool
5 | Name string
6 | Id int
7 | }
8 |
9 | type dir struct {
10 | items []Item
11 | }
12 |
13 | func (d *dir) IsDir() bool { return true }
14 |
15 | func (d *dir) AsFile() FileNode { return nil }
16 |
17 | func (d *dir) AsDir() DirNode { return d }
18 |
19 | func (d *dir) Items() []Item { return d.items }
20 |
21 | func (d *dir) Add(name string, id int, isdir bool) {
22 | d.items = append(d.items, Item{IsDir: isdir, Id: id, Name: name})
23 | }
24 |
25 | func (d *dir) Del(id int) {
26 | var j int
27 | for i := range d.items {
28 | if d.items[i].Id != id {
29 | d.items[j] = d.items[i]
30 | j++
31 | }
32 | }
33 | d.items = d.items[:j]
34 | }
35 |
--------------------------------------------------------------------------------
/tests/test_load_http.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from tenplex.load import load_http
4 |
5 |
def main():
    """Fetch a checkpoint over HTTP and print a summary of its contents."""
    job_id = "13b4a21fc1"
    device_rank = 0
    # ip = "155.198.152.18"
    ip = "localhost"
    port = 20010

    ckpt, step = load_http(job_id, device_rank, ip, port)

    print(f"ckpt {ckpt.keys()}")
    print(f"step {step}")
    print(ckpt["optimizer"]["fp32_from_fp16_params"][0])
23 |
24 |
# Manual test entry point (requires a local mlfs daemon on port 20010).
if __name__ == '__main__':
    main()
27 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/upgrade.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set -e

# Current Unix time in seconds.
now_sec() {
    date +%s
}

_show_duration() {
    echo "$1s"
}

# measure <command...>: run the command and append its wall-clock duration
# to measure.log.
measure() {
    echo "BEGIN $@"
    local begin=$(now_sec)
    # Quote "$@": the original unquoted $@ re-split arguments containing
    # whitespace before executing the command.
    "$@"
    local end=$(now_sec)
    local duration=$((end - begin))
    echo "END $@, took $(_show_duration $duration)" | tee -a measure.log
}

wait_docker() {
    measure ansible-playbook -i hosts.txt ./docker.yml # took 269s
}

upgrade_cluster() {
    measure ansible-playbook -i hosts.txt ./tenplex.yml

    # Re-run the docker playbook a few times until the hosts settle.
    for i in $(seq 10); do
        wait_docker
        sleep 2
    done
}

measure upgrade_cluster
35 |
--------------------------------------------------------------------------------
/scheduler/experiments/mlfs.go:
--------------------------------------------------------------------------------
1 | package experiments
2 |
3 | import "github.com/lgarithm/proc/experimental"
4 |
// ReInstallMLFS returns a process that, on host a, points apt at the
// tenplex package repository, reinstalls the mlfs package, restarts the
// systemd service, and prints its info banner.
func ReInstallMLFS(a At) P {
	const script = `
set -e
echo "deb https://europe-west2-apt.pkg.dev/projects/tenplex tenplex main" | sudo tee /etc/apt/sources.list.d/tenplex.list
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/packages-cloud-google-apt.gpg >/dev/null
sudo apt update
sudo apt remove -y mlfs
sudo apt reinstall -y mlfs
sudo systemctl stop mlfs
sudo systemctl start mlfs
mlfs info
`
	return seq(
		runScript(a, script, `install-mlfs.sh`, false),
	)
}

// runScript uploads and executes a shell script on a remote host.
var runScript = experimental.RunScript
23 |
--------------------------------------------------------------------------------
/tenplex-run/cluster/cluster_test.go:
--------------------------------------------------------------------------------
1 | package cluster_test
2 |
3 | import (
4 | "flag"
5 | "testing"
6 |
7 | "github.com/kungfu-team/tenplex/tenplex-run/cluster"
8 | )
9 |
10 | func Test_1(t *testing.T) {
11 | var c cluster.Cluster
12 | f := flag.NewFlagSet(`prog`, flag.ExitOnError)
13 | c.RegisterFlags(f)
14 | f.Parse([]string{
15 | `-gpu-per-host`, `8`,
16 | `-hosts`, `1.2.3.4,4.3.2.1`,
17 | })
18 | t.Logf("%#v", c)
19 | if c.GPUsPerHost != 8 {
20 | t.Errorf("parse -gpu-per-host failed: %q", c.GPUsPerHost)
21 | }
22 | if c.GPUsPerContainer != 4 {
23 | t.Errorf("default -gpu-per-container failed: %q", c.GPUsPerContainer)
24 | }
25 | if c.Hosts[0] != `1.2.3.4` {
26 | t.Errorf("parse -hosts failed: %q", c.Hosts)
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/train-imagenet.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set -e

# run <np> <command...>: launch the command under horovodrun with np workers.
run() {
    local np=$1
    shift
    # Quote "$@" so arguments with whitespace are passed through intact.
    horovodrun -np $np "$@"
}

train_flags_disk() {
    echo --data-dir /data/imagenet/records
}

train_flags_tenplex() {
    echo --mlfs-dir /mnt/mlfs
    echo --job fig-13
}

# with_log_file <logfile> <command...>: run the command, teeing its output.
with_log_file() {
    local filename=$1
    shift
    # Quote "$@" and "$filename" (the original unquoted forms broke
    # arguments and filenames containing spaces).
    "$@" | tee "$filename"
    echo "logged to $filename $ $@"
}

# with_log_file 1.log run 2 python3 ./imagenet_resnet.py $(train_flags_disk)
# with_log_file 3.log run 2 python3 ./imagenet_resnet_horovod_elastic.py $(train_flags_disk)
# with_log_file 2.log run 2 python3 ./imagenet_resnet.py $(train_flags_tenplex)
29 |
--------------------------------------------------------------------------------
/scheduler/scripts/install-mlfs.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Install (or reinstall) the mlfs package from the tenplex apt repository
# and restart its systemd service.

# https://tenplex.blob.core.windows.net/public/deb/Release
# https://tenplex.blob.core.windows.net/public/deb/Packages
# https://tenplex.blob.core.windows.net/public/deb/mlfs_0.0.1-git-main-rev1-97718f7_amd64.deb

install_mlfs() {
    echo 'deb https://tenplex.blob.core.windows.net/public/deb ./' | sudo tee /etc/apt/sources.list.d/tenplex.list
    # NOTE(review): apt-key is deprecated on newer Ubuntu releases; consider
    # a keyring under /etc/apt/trusted.gpg.d instead (as done in
    # scheduler/experiments/mlfs.go).
    curl -s https://tenplex.blob.core.windows.net/public/deb/tenplex.gpg | sudo apt-key add -
    sudo apt update
    sudo apt remove -y mlfs # TODO: fix deb package version number
    sudo apt reinstall -y mlfs
    sudo systemctl stop mlfs
    sudo systemctl start mlfs
    mlfs-admin
}

install_mlfs
19 |
--------------------------------------------------------------------------------
/state_transformer/statetransform/lib.go:
--------------------------------------------------------------------------------
1 | package statetransform
2 |
3 | import "strconv"
4 |
// equal reports whether a and b have the same length and equal elements
// at every position.
func equal[T int | string](a, b []T) bool {
	if len(a) != len(b) {
		return false
	}
	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
16 |
// isInt reports whether str parses as a base-10 integer.
func isInt(str string) bool {
	if _, err := strconv.Atoi(str); err != nil {
		return false
	}
	return true
}
21 |
// isIntAndCheck reports whether index key is in range for arr and the
// element at key parses as a base-10 integer.
func isIntAndCheck(arr []string, key int) bool {
	// Guard both bounds: a negative key would otherwise panic on arr[key].
	if key < 0 || key >= len(arr) {
		return false
	}
	_, err := strconv.Atoi(arr[key])
	return err == nil
}
29 |
// equalAndCheck reports whether index key is in range for arr and the
// element at key equals val.
func equalAndCheck[T int | string](arr []T, key int, val T) bool {
	// Guard both bounds: a negative key would otherwise panic on arr[key].
	if key < 0 || key >= len(arr) {
		return false
	}
	return arr[key] == val
}
36 |
--------------------------------------------------------------------------------
/ipv4/detect.go:
--------------------------------------------------------------------------------
1 | package ipv4
2 |
3 | import "net"
4 |
// Detect returns the first IPv4 address found on the interface named
// nicName, or on any interface when nicName is empty. It returns "" when
// enumeration fails or nothing matches.
func Detect(nicName string) string {
	ifaces, err := net.Interfaces()
	if err != nil {
		return ""
	}
	for _, iface := range ifaces {
		if nicName != "" && iface.Name != nicName {
			continue
		}
		addrs, err := iface.Addrs()
		if err != nil {
			continue
		}
		for _, a := range addrs {
			var ip net.IP
			switch x := a.(type) {
			case *net.IPNet:
				ip = x.IP
			case *net.IPAddr:
				ip = x.IP
			}
			if ip == nil {
				continue
			}
			// Only report IPv4 addresses.
			if v4 := ip.To4(); v4 != nil {
				return v4.String()
			}
		}
	}
	return ""
}
37 |
--------------------------------------------------------------------------------
/mlfs/cmd/mlfs-build-tf-index/mlfs-build-tf-index.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "log"
6 | "time"
7 |
8 | "github.com/kungfu-team/tenplex/mlfs/tfrecord"
9 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
10 | )
11 |
var (
	m        = flag.Int("m", 2, "")                 // parallelism for index building — confirm in tfrecord.BuildIndex
	filename = flag.String("output", "a.idx.txt", "") // output index file path
)
16 |
17 | func main() {
18 | flag.Parse()
19 | t0 := time.Now()
20 | defer func() { log.Printf("took %s", time.Since(t0)) }()
21 | idx, err := tfrecord.BuildIndex(flag.Args(), *m)
22 | if err != nil {
23 | log.Printf("%v", err)
24 | return
25 | }
26 | if err := vfile.SaveIdxFile(*filename, idx); err != nil {
27 | log.Printf("%v", err)
28 | return
29 | }
30 | log.Printf("generated %s", *filename)
31 | }
32 |
--------------------------------------------------------------------------------
/tenplex-run/job/hosts.go:
--------------------------------------------------------------------------------
1 | package job
2 |
3 | import "fmt"
4 |
var (
	// Cluster host addresses (hostname forms kept as comments).
	// komodo01 = `komodo01.doc.res.ic.ac.uk`
	komodo01 = `10.10.10.1`
	// komodo02 = `komodo02.doc.res.ic.ac.uk`
	komodo02 = `10.10.10.2`
	// komodo03 = `komodo03.doc.res.ic.ac.uk`
	komodo03 = `10.10.10.3`
	// komodo04 = `komodo04.doc.res.ic.ac.uk`
	komodo04 = `10.10.10.4`

	// dockerIPs are the container hostnames used by docker swarm.
	dockerIPs = genDockerIPRange(32)
)
17 |
// genDockerIPRange generates n container hostnames ("trainer-00",
// "trainer-01", ...) for the docker swarm; it returns nil when n <= 0.
func genDockerIPRange(n int) []string {
	if n <= 0 {
		return nil
	}
	// Pre-size: the output length is known up front.
	ips := make([]string, 0, n)
	for i := 0; i < n; i++ {
		// TODO: extract subnet range from JSON
		// ip := fmt.Sprintf("10.10.10.%d", 140+i)
		ips = append(ips, fmt.Sprintf("trainer-%02d", i))
	}
	return ips
}
29 |
--------------------------------------------------------------------------------
/tenplex-run/cluster/cluster.go:
--------------------------------------------------------------------------------
1 | package cluster
2 |
3 | import (
4 | "flag"
5 |
6 | "github.com/kungfu-team/tenplex/tenplex-run/listflag"
7 | "github.com/kungfu-team/tenplex/tenplex-run/structflag"
8 | )
9 |
// Cluster describes the training cluster shape; fields are bound to
// command-line flags via the struct tags (see structflag).
type Cluster struct {
	GPUsPerHost      int              `flag:"gpu-per-host" default:"4"`
	GPUsPerContainer int              `flag:"gpu-per-container" default:"4"`
	Hosts            listflag.Strings `flag:"hosts"`
}
15 |
16 | func NewCluster(gpuPerHost int, gpusPerContainer int, hosts ...string) *Cluster {
17 | return &Cluster{
18 | GPUsPerHost: gpuPerHost,
19 | GPUsPerContainer: gpusPerContainer,
20 | Hosts: hosts,
21 | }
22 | }
23 |
24 | func (c *Cluster) RegisterFlags(flag *flag.FlagSet) { structflag.RegisterFlags(c, flag) }
25 |
--------------------------------------------------------------------------------
/state_transformer/meta/modelkeys.go:
--------------------------------------------------------------------------------
1 | package meta
2 |
3 | import (
4 | "encoding/json"
5 | "os"
6 | "path"
7 | "strconv"
8 | )
9 |
10 | func LoadModelKeys(conf *Config, before bool) (map[int][][]string, error) {
11 | structPath := GetStructPath(conf, before)
12 | modelKeysPath := path.Join(structPath, "model_keys.json")
13 | var payload map[string][][]string
14 | content, err := os.ReadFile(modelKeysPath)
15 | if err != nil {
16 | return nil, err
17 | }
18 | err = json.Unmarshal(content, &payload)
19 | if err != nil {
20 | return nil, err
21 | }
22 | modelKeys := make(map[int][][]string)
23 | for k, v := range payload {
24 | i, err := strconv.Atoi(k)
25 | if err != nil {
26 | return nil, err
27 | }
28 | modelKeys[i] = v
29 | }
30 | return modelKeys, nil
31 | }
32 |
--------------------------------------------------------------------------------
/mlfs/vfs/path.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | import (
4 | "errors"
5 | "strings"
6 | )
7 |
// pstr is a canonical absolute path string such as "/a/b/c".
type pstr string

// filepath is a path split into its non-empty components.
type filepath []string

// P renders the components as an absolute, slash-joined path.
func (p filepath) P() pstr {
	return pstr(`/` + strings.Join(p, `/`))
}

var errNoParent = errors.New(`root has no parent`)

// parent returns the path with its last component removed;
// it panics on the root path.
func (p filepath) parent() filepath {
	if len(p) == 0 {
		panic(errNoParent)
	}
	return p[:len(p)-1]
}

// basename returns the last component; it panics on the root path.
func (p filepath) basename() string {
	if len(p) == 0 {
		panic(errNoParent)
	}
	return p[len(p)-1]
}

// ParseP splits p on '/' and keeps only the non-empty components, so
// repeated and trailing slashes are ignored.
func ParseP(p string) filepath {
	var parts filepath
	for _, name := range strings.Split(p, `/`) {
		if name != `` {
			parts = append(parts, name)
		}
	}
	return parts
}
42 |
--------------------------------------------------------------------------------
/state_transformer/cmd/tenplex-state-transformer/tenplex-state-transformer.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "log"
6 | "time"
7 |
8 | "github.com/kungfu-team/tenplex/state_transformer/meta"
9 | "github.com/kungfu-team/tenplex/state_transformer/statetransform"
10 | )
11 |
12 | func main() {
13 | startTransform := time.Now()
14 | var conf meta.Config
15 | conf.RegisterFlags(flag.CommandLine)
16 | flag.Parse()
17 | conf.Complete()
18 | log.Printf("config %+v", conf)
19 | log.Printf("target device %v", conf.TargetRank)
20 | if err := statetransform.MigrateState(&conf, conf.TargetRank); err != nil {
21 | log.Panicf("Transformation for device %d failed with %v", conf.TargetRank, err)
22 | }
23 | log.Printf("State transformation took %s", time.Since(startTransform))
24 | }
25 |
--------------------------------------------------------------------------------
/tenplex-run/.github/workflows/go.yml:
--------------------------------------------------------------------------------
1 | name: Go
2 |
3 | on:
4 | - push
5 |
6 | jobs:
7 | linux:
8 | runs-on: ubuntu-20.04
9 |
10 | steps:
11 | - uses: actions/checkout@v2
12 |
13 | - uses: actions/setup-go@v2
14 | with:
15 | go-version: '1.18'
16 |
17 | - run: sudo apt install -y openssh-client # for ssh-keygen
18 | - run: mkdir -p $HOME/.ssh
19 | - run: echo "${KUNGFU_RSA}" > $HOME/.ssh/id_rsa
20 | env:
21 | KUNGFU_RSA: ${{ secrets.KUNGFU_RSA }}
22 | - run: chmod 0600 $HOME/.ssh/id_rsa
23 | - run: ssh-keygen -y -f $HOME/.ssh/id_rsa > $HOME/.ssh/id_rsa.pub
24 |
25 | - run: git config --global url."git@github.com:".insteadOf "https://github.com/"
26 | - run: go env -w GOPRIVATE=*
27 |
28 | - run: GO=$(which go) make
29 |
--------------------------------------------------------------------------------
/mlfs/iotrace/io.go:
--------------------------------------------------------------------------------
1 | package iotrace
2 |
3 | import "io"
4 |
// TracedWriter wraps an io.Writer and counts the bytes written through it.
type TracedWriter struct {
	w io.Writer
	c *Counter // accumulates the number of bytes written
}
9 |
10 | func TraceWriter(w io.Writer, c *Counter) io.Writer {
11 | return &TracedWriter{
12 | w: w,
13 | c: c,
14 | }
15 | }
16 |
// Write forwards to the underlying writer and adds the byte count to the
// counter, even on a partial write.
func (w *TracedWriter) Write(bs []byte) (int, error) {
	n, err := w.w.Write(bs)
	w.c.Add(int64(n))
	return n, err
}
22 |
// TracedReader wraps an io.Reader and counts the bytes read through it.
type TracedReader struct {
	r io.Reader
	c *Counter // accumulates the number of bytes read
}
27 |
28 | func TraceReader(r io.Reader, c *Counter) io.Reader {
29 | if c == nil {
30 | return r
31 | }
32 | return &TracedReader{
33 | r: r,
34 | c: c,
35 | }
36 | }
37 |
// Read forwards to the underlying reader and adds the byte count to the
// counter, even on a partial read.
func (r *TracedReader) Read(bs []byte) (int, error) {
	n, err := r.r.Read(bs)
	r.c.Add(int64(n))
	return n, err
}
43 |
--------------------------------------------------------------------------------
/state_transformer/statetransform/repartition_test.go:
--------------------------------------------------------------------------------
1 | package statetransform
2 |
3 | import (
4 | "testing"
5 | )
6 |
7 | func TestMapToUnitedRequests(t *testing.T) {
8 | sourceDim := 256
9 | targetDim := 512
10 | sourceMPSize := 4
11 | targetMPRank := 1
12 | reqs, err := mapToUnitedRequests(sourceDim, targetDim, sourceMPSize, targetMPRank)
13 | if err != nil {
14 | t.Fail()
15 | }
16 | t.Logf("requests %v", reqs)
17 | }
18 |
19 | func TestMapToSourceRequests(t *testing.T) {
20 | sourceDim := 256
21 | targetDim := 512
22 | sourceMPSize := 4
23 | targetMPRank := 1
24 | reqs, err := mapToUnitedRequests(sourceDim, targetDim, sourceMPSize, targetMPRank)
25 | if err != nil {
26 | t.Fail()
27 | }
28 | reqs = mapToSourceRequests(reqs, sourceDim)
29 | t.Logf("requests %v", reqs)
30 | }
31 |
--------------------------------------------------------------------------------
/tenplex/stop.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import requests
4 | import torch
5 |
6 |
def check_stop(scheduler_addr: str) -> bool:
    """Ask the scheduler whether this job should stop.

    Rank 0 queries the scheduler's "stop" endpoint; the decision is then
    shared with every rank via an all_reduce so all ranks agree.
    Returns True when any rank observed a stop signal.
    """
    if scheduler_addr is None:
        return False
    stop = False
    rank = torch.distributed.get_rank()
    if rank == 0:
        url = scheduler_addr
        # NOTE(review): os.path.join is used to build a URL; works for simple
        # cases but urllib.parse.urljoin would be more appropriate — confirm.
        url = os.path.join(url, "stop")
        req = requests.get(url, timeout=12)
        txt = req.text
        if txt == "stop":
            stop = True
    # Every rank must reach this collective call with the same ordering,
    # otherwise the distributed job hangs.
    if stop:
        stop_ten = torch.tensor(1, dtype=torch.int32, device=torch.device("cuda"))
    else:
        stop_ten = torch.tensor(0, dtype=torch.int32, device=torch.device("cuda"))
    torch.distributed.all_reduce(stop_ten)
    # Sum over ranks is positive iff some rank saw "stop".
    if stop_ten > 0:
        return True
    return False
27 |
--------------------------------------------------------------------------------
/scheduler/scripts/plot.gp:
--------------------------------------------------------------------------------
#!/usr/bin/env gnuplot
# Plot loss-vs-time curves from two scheduler logs (a.log, b.log) into p1.ps.

set terminal postscript portrait dashed color size 14,9 font 44 fontscale 1
# set terminal postscript portrait dashed monochrome size 14,9 font 44 fontscale 1
set datafile missing '-'
set boxwidth 0.9 absolute

set style fill solid 1.00 border lt -1
set style data lines

# Rotated, unmirrored x tics.
set xtics ()
set xtics border in scale 0,0 nomirror autojustify
set xtics nomirror rotate by -45
set xtics norangelimit

set key fixed right bottom vertical Right noreverse noenhanced autotitle nobox
set key outside;

NO_ANIMATION = 1

set output 'p1.ps'

f1 = 'a.log'
f2 = 'b.log'

# Column 1 is seconds (converted to minutes); column 2 is the loss.
plot f1 using ($1 / 60.0):2 title 'a loss', \
     f2 using ($1 / 60.0):2 title 'b loss' \
--------------------------------------------------------------------------------
/tests/test_save.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import torch
4 |
5 | from .save import save
6 |
7 |
def main():
    """CLI entry point: load a PyTorch checkpoint from disk and upload it
    to the MLFS store via save()."""
    parser = argparse.ArgumentParser(description='Write checkpoint')
    parser.add_argument('--ckpt-path', type=str)
    parser.add_argument('--job-id', type=str)
    # save() annotates `step: int`; parse it as an int instead of passing
    # a raw string through.
    parser.add_argument('--step', type=int)
    parser.add_argument('--device-rank', type=int)
    parser.add_argument('--mlfs-path', type=str)
    parser.add_argument('--ip', type=str)
    parser.add_argument('--port', type=int)
    args = parser.parse_args()

    # map_location='cpu' so loading does not require a GPU.
    ckpt = torch.load(args.ckpt_path, map_location='cpu')
    save(ckpt, args.job_id, args.step, args.device_rank, args.mlfs_path,
         args.ip, args.port)


if __name__ == '__main__':
    main()
26 |
--------------------------------------------------------------------------------
/benchmark/dynamic_resources/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic resources
2 | _Fig. 9. Elastic DL job convergence with multi-dimensional parallelism under dynamic GPU changes_
3 |
4 | First, we explore the benefits of supporting elasticity in DL jobs with multi-dimensional parallelism, scaling across all parallelism dimensions when the GPU allocation changes.
In this experiment, we train DL jobs with the GPT-3 XL model on the on-premise 16-GPU cluster. The job runtime and elastic scaling events are derived from Microsoft’s Philly trace: over the job’s runtime of 538 mins, a scaling event occurs on average every 35 mins. During each scaling event, we change the job’s GPU allocation between 16, 8, and 4 GPUs.
6 |
7 | # Run
8 | ```sh
9 | ./run.sh
10 | ```
11 |
12 | ## Note
13 | The dynamic resources experiment runs for about 24 hours
14 |
--------------------------------------------------------------------------------
/scheduler/scripts/upload-logs.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Upload all scheduler log files to Azure blob storage under a per-run
# prefix, then generate and upload an index.html listing them.
set -e

export PATH=$PATH:$HOME/local/bin

cd $(dirname $0)/..

# Create a persistent run id (unix timestamp) on first use so repeated
# uploads for the same run share one prefix.
if [ ! -f run-id.txt ]; then
    date +%s >run-id.txt
fi

RUN_ID=$(cat run-id.txt)

echo "Using RUN_ID: $RUN_ID"

SA=tenplex
PREFIX="https://$SA.blob.core.windows.net/public/_debug/scheduler/$RUN_ID"

# Emit candidate log files: everything under logs/ plus top-level *.log.
_list_logs() {
    find logs -type f
    ls *.log
}

list_logs() { _list_logs | sort; }

# upload FILE: copy FILE to $PREFIX/FILE with ucp.
upload() {
    URL=$PREFIX/$1
    ucp $1 $URL
    echo "uploaded to $URL"
}

main() {
    for f in $(list_logs); do
        echo $f
        upload $f
    done
    # Build and publish an HTML index of all uploaded logs.
    ./scripts/gen-log-index.py $(list_logs) >index.html
    upload index.html
}

main
42 |
--------------------------------------------------------------------------------
/mlfs/vfs/vfile/buffer.go:
--------------------------------------------------------------------------------
1 | package vfile
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | )
7 |
// Buffer is an in-memory, growable byte store supporting random-access
// reads and writes.
type Buffer struct {
	bs []byte
}

// NewBuffer returns an empty Buffer.
func NewBuffer() *Buffer {
	return &Buffer{}
}

// Open returns a reader over the current contents. Close is a no-op.
func (f *Buffer) Open() io.ReadCloser {
	r := bytes.NewBuffer(f.bs)
	return io.NopCloser(r)
}

// Size returns the number of bytes stored.
func (f *Buffer) Size() int64 {
	return int64(len(f.bs))
}

// Truncate discards all contents.
func (f *Buffer) Truncate() {
	f.bs = nil
}

// ReadAt implements io.ReaderAt. The original panicked when pos was past
// the end of the buffer (slice out of range) and returned a nil error on
// short reads; per the io.ReaderAt contract it now returns io.EOF in both
// cases.
func (f *Buffer) ReadAt(buf []byte, pos int64) (int, error) {
	if pos < 0 || pos >= int64(len(f.bs)) {
		return 0, io.EOF
	}
	n := copy(buf, f.bs[pos:])
	if n < len(buf) {
		return n, io.EOF
	}
	return n, nil
}

// WriteAt implements io.WriterAt, growing the buffer with zero bytes as
// needed to cover [pos, pos+len(buf)).
func (f *Buffer) WriteAt(buf []byte, pos int64) (int, error) {
	if n := len(buf) + int(pos); n > len(f.bs) {
		f.bs = append(f.bs, make([]byte, n-len(f.bs))...)
	}
	return copy(f.bs[pos:], buf), nil
}
40 |
--------------------------------------------------------------------------------
/scheduler/run_user.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build the scheduler and launch tenplex-user against all cluster hosts.
set -e

cd $(dirname $0)
make

# join_ SEP ARGS...: join ARGS with separator SEP.
join_() {
    local IFS=$1
    shift
    echo "$*"
}

echo "Listing IPs"
# host=$(join_ , $(./scripts/list-ips.sh))
host=$(join_ , $(./scripts/list-ips-komodo.sh))
echo "using host=$host"

for h in $(echo $host | tr ',' '\n'); do
    gpu_per_host=$(ssh $h nvidia-smi -L | wc -l)
    echo "$gpu_per_host GPUs on $h"
done
# NOTE(review): gpu_per_host below keeps the value from the LAST host in
# the loop; this assumes a homogeneous cluster — confirm.

# echo -failure
flags() {
    echo -hosts $host
    echo -gpu-per-host $gpu_per_host # TODO: auto detect
    echo -image kungfu.azurecr.io/mw-megatron-lm-update
    echo -plan ./data/single-job-time.json
    echo -timed-job
}

PREFIX=$HOME/.tenplex/scheduler
$PREFIX/bin/tenplex-user $(flags)

echo "$0 done"
36 |
--------------------------------------------------------------------------------
/mlfs/hash/file.go:
--------------------------------------------------------------------------------
1 | package hash
2 |
3 | import (
4 | "fmt"
5 | "log"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/uri"
8 | )
9 |
// HashedFile is a content-addressed remote file: an MD5 checksum together
// with one or more mirror URLs that should all serve the same bytes.
type HashedFile struct {
	MD5  string   // hex-encoded MD5 of the file contents
	URLs []string // candidate download locations
}
14 |
15 | func (f *HashedFile) Check() {
16 | for _, u := range f.URLs {
17 | ok, got, err := md5Check(f.MD5, u)
18 | if err != nil {
19 | log.Printf("%v", err)
20 | continue
21 | }
22 | if !ok {
23 | fmt.Printf("failed: %s != md5(%s) = %s\n", f.MD5, u, got)
24 | continue
25 | }
26 | fmt.Printf("OK: %s = md5(%s)\n", f.MD5, u)
27 | }
28 | }
29 |
30 | func md5Check(sum string, url string) (bool, string, error) {
31 | f, err := uri.Open(url)
32 | if err != nil {
33 | return false, "", err
34 | }
35 | defer f.Close()
36 | got, err := md5sum(f, nil)
37 | if err != nil {
38 | return false, got, err
39 | }
40 | return sum == got, got, nil
41 | }
42 |
--------------------------------------------------------------------------------
/mlfs/ds/mnist.go:
--------------------------------------------------------------------------------
1 | package ds
2 |
3 | import "github.com/kungfu-team/tenplex/mlfs/hash"
4 |
// MD5 checksums and mirror URLs for the four MNIST archive files
// (training/test images and labels), served from the CVDF mirror.
var (
	MnistTrainImages = hash.HashedFile{
		MD5:  `f68b3c2dcbeaaa9fbdd348bbdeb94873`,
		URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz`},
	}
	MnistTrainLabels = hash.HashedFile{
		MD5:  `d53e105ee54ea40749a09fcbcd1e9432`,
		URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz`},
	}
	MnistTestImages = hash.HashedFile{
		MD5:  `9fb629c4189551a2d022fa330f9573f3`,
		URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz`},
	}
	MnistTestLabels = hash.HashedFile{
		MD5:  `ec29112dd5afa0611ce80d1b7f02629c`,
		URLs: []string{`https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz`},
	}
)
23 |
--------------------------------------------------------------------------------
/scheduler/job/job.go:
--------------------------------------------------------------------------------
1 | package job
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/ds"
7 | "github.com/kungfu-team/tenplex/scheduler/scalepoint"
8 | )
9 |
// Job describes a single training job submitted to the scheduler.
type Job struct {
	Framework      string     // training framework name
	Precision      string     // numeric precision, e.g. "fp16"
	BatchSize      int        // global batch size
	MicroBatchSize int        // per-device micro batch size
	SequenceLength int        // input sequence length
	Dataset        ds.Dataset // dataset the job trains on
	Image          string     // container image to run
	Model          string     // model name
	ID             string     // unique job identifier
	Steps          int        // number of training steps
	ModelSize      string     // model size label, e.g. "xl"
	NumLayers      int        // number of transformer layers
	VocabSize      int        // vocabulary size
	Failure        int        // failure count to inject (benchmarking)
}
26 |
27 | func ShowJobIds(jss ...[]Job) string {
28 | var ids []string
29 | for _, js := range jss {
30 | for _, j := range js {
31 | ids = append(ids, j.ID)
32 | }
33 | }
34 | return strings.Join(ids, ",")
35 | }
36 |
// TimedJob pairs a Job with the schedule of scale points that drive its
// elastic resizing over time.
type TimedJob struct {
	Job    Job
	Timing []scalepoint.ScalePoint
}
41 |
--------------------------------------------------------------------------------
/mlfs/uri/sas.go:
--------------------------------------------------------------------------------
1 | package uri
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "io"
7 | "log"
8 | "net/url"
9 | "time"
10 | )
11 |
12 | var t0 = time.Now()
13 |
14 | func checkSAS(filename, sas string) error {
15 | q, err := url.ParseQuery(sas)
16 | if err != nil {
17 | return err
18 | }
19 | se, err := parseTime(q.Get(`se`))
20 | if err != nil {
21 | return err
22 | }
23 | if se.Before(t0) {
24 | log.Printf("%s expired %s ago", filename, t0.Sub(*se))
25 | }
26 | return nil
27 | }
28 |
29 | func parseTime(s string) (*time.Time, error) {
30 | var i struct {
31 | T time.Time `json:"time"`
32 | }
33 | if err := json.Unmarshal([]byte(fmt.Sprintf(`{"time": %q}`, s)), &i); err != nil {
34 | return nil, err
35 | }
36 | return &i.T, nil
37 | }
38 |
// Debug writes every configured storage account and its SAS token to w,
// quoted, for troubleshooting.
func Debug(w io.Writer) {
	for sa, sas := range opener.azSAS {
		fmt.Fprintf(w, "%q: %q\n", sa, sas)
	}
}
44 |
--------------------------------------------------------------------------------
/tenplex-run/structflag/structflag_test.go:
--------------------------------------------------------------------------------
1 | package structflag_test
2 |
3 | import (
4 | "flag"
5 | "strings"
6 | "testing"
7 |
8 | "github.com/kungfu-team/tenplex/tenplex-run/structflag"
9 | )
10 |
// Base is an embedded struct used to observe how RegisterFlags treats
// embedded fields.
type Base struct {
	Y int `flag:"y"`
}

// App is a sample flag-bearing struct with an embedded Base.
type App struct {
	Base
	Name string `flag:"name"`
	X    int    `flag:"x"`
	OK   bool   `flag:"ok"`
}
21 |
// Test_1 registers App's flags on a fresh FlagSet. As the inline comment
// notes, RegisterFlags does not descend into the embedded Base.
func Test_1(t *testing.T) {
	var a App
	f := flag.NewFlagSet(`cmd`, flag.ExitOnError)
	structflag.RegisterFlags(&a, f) // won't register Base
	// structflag.RegisterFlags(&a.Base, f)
}
28 |
// Test_2 checks that ToGoArgs renders fields as Go-style flags, with the
// boolean emitted as a bare flag (no value).
func Test_2(t *testing.T) {
	a := App{
		Name: `abc`,
		X:    2,
		OK:   true,
	}
	args := structflag.ToGoArgs(&a)
	want := `-name abc -x 2 -ok`
	if got := strings.Join(args, " "); got != want {
		t.Errorf("%q != %q", got, want)
	}
}
41 |
--------------------------------------------------------------------------------
/tenplex-run/runop/redundancy.go:
--------------------------------------------------------------------------------
1 | package runop
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
7 | "github.com/kungfu-team/tenplex/mlfs/pid"
8 | "github.com/kungfu-team/tenplex/tenplex-run/job"
9 | )
10 |
11 | func setRedundancy(jobConf *job.JobConfig) error {
12 | redu := 1
13 |
14 | var peerList mlfs.PeerList
15 | for _, host := range jobConf.Cluster.Hosts {
16 | peerList = append(peerList, mlfs.Peer{IPv4: pid.MustParseIPv4(host), Port: mlfs.DefaultCtrlPort})
17 | }
18 |
19 | for _, host := range jobConf.Cluster.Hosts {
20 | cli, err := mlfs.NewClientTo(host, mlfs.DefaultCtrlPort)
21 | if err != nil {
22 | return fmt.Errorf("%s %v", host, err)
23 | }
24 | err = cli.SetPeers(peerList)
25 | if err != nil {
26 | return err
27 | }
28 | err = cli.SetRedundency(redu)
29 | if err != nil {
30 | return err
31 | }
32 | }
33 | return nil
34 | }
35 |
--------------------------------------------------------------------------------
/mlfs/vfs/tree_debug.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | )
7 |
// Dump writes an ls-like listing of every node in the tree to w: index,
// type flag ('d' for dirs, '-' for files), a size (entry count for dirs,
// bytes for files) with its unit, and the node's path — followed by a
// total node count.
func (t *Tree) Dump(w io.Writer) {
	for i, p := range t.ps {
		n := t.nodes[i]
		var c rune
		var size int
		var unit string
		if n.IsDir() {
			c = 'd'
			size = len(n.AsDir().Items())
			unit = `files`
		} else {
			c = '-'
			size = int(n.AsFile().Size())
			unit = `bytes`
		}
		fmt.Fprintf(w, "%8d %c %12d %s %s\n", i, c, size, unit, p)
	}
	fmt.Fprintf(w, "%d nodes\n", t.Count())
}
27 |
// Stat prints the total node, directory and file counts to stdout.
// The file count is derived as total nodes minus directories.
func (t *Tree) Stat() {
	n := t.Count()
	nd := t.nDirs
	nf := n - nd
	fmt.Printf("%d nodes, %d dirs, %d files\n", n, nd, nf)
}
34 |
35 | func (t *Tree) AllFiles(w io.Writer) {
36 | for i, p := range t.ps {
37 | n := t.nodes[i]
38 | if !n.IsDir() {
39 | fmt.Fprintf(w, "%s\n", p)
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/.github/workflows/deb.yml:
--------------------------------------------------------------------------------
# Build the Debian package on every push to main and publish it to the
# Google Artifact Registry apt repository.
name: deb

on:
  push:
    branches:
      - main

jobs:
  test:
    # https://help.github.com/en/articles/virtual-environments-for-github-actions#supported-virtual-environments
    runs-on: ubuntu-20.04

    steps:
      - uses: actions/checkout@v3
        with:
          # Full history (fetch-depth 0), not a shallow clone.
          fetch-depth: 0

      - run: make
      - run: make deb

      # Authenticate gcloud with a service-account key held in secrets;
      # the key file is removed right after login.
      - run: |
          KEY_FILE=$HOME/gcloud-key.json
          echo "${GCLOUD_KEY}" > $KEY_FILE
          gcloud auth login --cred-file=$KEY_FILE
          rm $KEY_FILE

          gcloud config set project tenplex
        env:
          GCLOUD_KEY: ${{ secrets.GCLOUD_KEY }}

      # Upload the built .deb to the apt repository in europe-west2.
      - run: |
          REPO=tenplex
          DEB=`ls build/*.deb`
          LOC=europe-west2
          gcloud artifacts apt upload $REPO --location=$LOC --source=$DEB
36 |
--------------------------------------------------------------------------------
/benchmark/redeployment/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Redeployment benchmark: run tenplex-multi-experiment over three GPT model
# sizes with redeployment enabled, then plot the results.

set -e

. $(dirname $0)/../common.sh

# Cluster hosts taking part in the experiment.
hosts() {
    echo "10.10.10.1"
    echo "10.10.10.2"
    echo "10.10.10.3"
    echo "10.10.10.4"
}

# Model sizes to sweep over.
model_sizes() {
    echo "6.7B"
    echo "2.7B"
    echo "xl"
}

# Flags for tenplex-multi-experiment (base_flags/join come from common.sh).
comb_flags() {
    base_flags
    echo -model "gpt"
    echo -dataset "enwiki"
    echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt"
    echo -hosts $(join $(hosts))
    echo -schedule "$(dirname $0)/schedule.json"
    echo -model-sizes $(join $(model_sizes))
    echo -batch-sizes 128
    echo -micro-batch-sizes 8
    echo -para-config "$(dirname $0)/para-config.json"
    echo -redeploy
    echo -central
}

tenplex-multi-experiment $(comb_flags) 2>&1 | tee redeploy.log

python plot.py
38 |
--------------------------------------------------------------------------------
/mlfs/cmd/tests/cmd/mlfs-debug/mlfs-debug.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "flag"
6 | "log"
7 | "os"
8 |
9 | "github.com/kungfu-team/tenplex/mlfs/ds"
10 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
11 | )
12 |
var (
	// dataset is the path to a JSON file describing a ds.Dataset.
	dataset = flag.String(`ds`, ``, ``)
)

// main loads a Dataset description from the file given by -ds, prints its
// index URL, loads the index and lists every file path in it.
func main() {
	flag.Parse()
	var ds ds.Dataset
	panicErr(loadJSONFile(*dataset, &ds))
	log.Printf("%q", ds.IndexURL)
	i, err := vfile.LoadIdxFile(ds.IndexURL)
	if err != nil {
		panic(err)
	}
	// NOTE(review): SetHost("") appears to clear the host prefix so paths
	// print unprefixed — confirm against vfile.
	i.SetHost(``)
	for _, f := range i {
		log.Printf("%q", f.Filepath)
	}
}
31 |
// loadJSONFile decodes the JSON document in filename into i.
func loadJSONFile(filename string, i interface{}) error {
	f, err := os.Open(filename)
	if err != nil {
		return err
	}
	// The original leaked the file handle; close it when decoding is done.
	defer f.Close()
	return json.NewDecoder(f).Decode(i)
}
// panicErr panics on a non-nil error and is a no-op otherwise.
func panicErr(err error) {
	if err == nil {
		return
	}
	panic(err)
}
44 |
--------------------------------------------------------------------------------
/mlfs/vfs/file.go:
--------------------------------------------------------------------------------
1 | package vfs
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | )
7 |
8 | type file struct {
9 | bs []byte
10 | }
11 |
12 | func ToFile(bs []byte) *file { return &file{bs: bs} }
13 |
14 | func (f *file) IsDir() bool { return false }
15 |
16 | func (f *file) IsExecutable() bool { return bytes.HasPrefix(f.bs, []byte(`#!`)) }
17 |
18 | func (f *file) AsFile() FileNode { return f }
19 |
20 | func (f *file) AsDir() DirNode { return nil }
21 |
22 | func (f *file) Open() io.ReadCloser {
23 | r := bytes.NewBuffer(f.bs)
24 | return io.NopCloser(r)
25 | }
26 |
27 | func (f *file) Size() int64 {
28 | return int64(len(f.bs))
29 | }
30 |
31 | func (f *file) ReadAt(buf []byte, pos int64) (int, error) {
32 | n := min(len(buf), len(f.bs)-int(pos))
33 | copy(buf[:n], f.bs[pos:(pos)+int64(n)])
34 | return n, nil
35 | }
36 |
37 | func min(a, b int) int {
38 | if a < b {
39 | return a
40 | }
41 | return b
42 | }
43 |
--------------------------------------------------------------------------------
/mlfs/hash/md5.go:
--------------------------------------------------------------------------------
1 | package hash
2 |
3 | import (
4 | "crypto/md5"
5 | "fmt"
6 | "io"
7 | "os"
8 |
9 | "github.com/kungfu-team/tenplex/mlfs/iotrace"
10 | )
11 |
// md5db is an in-memory bidirectional map between file paths and their MD5
// checksums.
type md5db struct {
	hashToPath map[string]string // checksum -> path
	pathToHash map[string]string // path -> checksum
}

// NewMD5DB returns an empty md5db with both indexes initialised.
func NewMD5DB() *md5db {
	db := &md5db{
		hashToPath: make(map[string]string),
		pathToHash: make(map[string]string),
	}
	return db
}
24 |
25 | // func (db*md5db)
26 |
27 | func FileMD5(c *iotrace.Counter, filename string) (string, error) {
28 | f, err := os.Open(filename)
29 | if err != nil {
30 | return "", err
31 | }
32 | defer f.Close()
33 | return md5sum(f, c)
34 | }
35 |
// md5sum computes the hex-encoded MD5 of everything read from r. The
// counter c is passed through to iotrace.TraceReader; md5Check in this
// package calls it with a nil counter.
func md5sum(r io.Reader, c *iotrace.Counter) (string, error) {
	h := md5.New()
	if _, err := io.Copy(h, iotrace.TraceReader(r, c)); err != nil {
		return "", err
	}
	return fmt.Sprintf("%x", h.Sum(nil)), nil
}
43 |
--------------------------------------------------------------------------------
/mlfs/mlfs/tensorfile.go:
--------------------------------------------------------------------------------
1 | package mlfs
2 |
3 | import (
4 | "bytes"
5 | "fmt"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/vfs"
8 | "github.com/kungfu-team/tenplex/tensor"
9 | )
10 |
// Tensor is an alias for the shared tensor type.
type Tensor = tensor.Tensor

// TouchTensor publishes tensor t in the virtual tree at path p: a small
// text file at p+".meta" describing the dtype and shape, plus p itself
// holding the raw tensor bytes.
func (e *MLFS) TouchTensor(p string, t *Tensor) error {
	log.Printf("TouchTensor: %q", p)
	// The meta file lists the dtype, the number of dimensions, then one
	// dimension per line.
	if _, err := e.tree.TouchText(p+`.meta`, func() string {
		bs := &bytes.Buffer{}
		fmt.Fprintf(bs, "%s\n", t.Dtype)
		dims := t.Dims
		fmt.Fprintf(bs, "%d\n", len(dims))
		for _, d := range dims {
			fmt.Fprintf(bs, "%d\n", d)
		}
		return bs.String()
	}()); err != nil {
		log.Printf("TouchTensor meta: %q", p)
		return err
	}
	// TODO: support write large bytes to read files instead of RAM
	if _, err := e.tree.TouchFile(p, vfs.ToFile(t.Data)); err != nil {
		log.Printf("TouchTensor data: %q", p)
		return err
	}
	return nil
}
35 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/with-docker:
--------------------------------------------------------------------------------
#!/bin/sh
# Run the reconfiguration_horovod benchmark image with GPUs and the needed
# volume mounts; with no arguments it drops into bash inside the container.

set -e

# name=$(cat $(dirname $0)/name.txt)
name=reconfiguration_horovod
tag=$(cat $(dirname $0)/tag.txt)

# join_ SEP ARGS...: join ARGS with separator SEP.
join_() {
    local IFS=$1
    shift
    echo "$*"
}

join() { join_ , $@; }

# gpus: list GPU indices 0..n-1 (n is hard-coded to 2 here).
gpus() {
    local n=2
    seq 0 $((n - 1))
}

docker_mount() { echo -v $1:$2; }
# docker_forward PATH: mount PATH at the same path inside the container.
docker_forward() { docker_mount $1 $1; }

docker_run_flags() {
    echo --rm
    echo --gpus
    echo "\"device=$(join $(gpus))\""

    echo -i

    echo --name $name

    # Forward the MLFS mount and the ImageNet records into the container.
    docker_forward /mnt/mlfs
    docker_forward /data/imagenet/records
}

docker_run() { docker run $(docker_run_flags) -t $tag $@; }

main() {
    if [ -z "$1" ]; then
        docker_run bash
    else
        docker_run $@
    fi
}

main $@
49 |
--------------------------------------------------------------------------------
/benchmark/failure/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Failure-recovery benchmark: run tenplex-run with increasing injected
# failure counts (4, 8, 12), keeping logs per run, then plot.
set -e

. ../common.sh

# Cluster hosts taking part in the experiment.
hosts() {
    echo "10.10.10.1"
    echo "10.10.10.2"
    echo "10.10.10.3"
    echo "10.10.10.4"
}


# Common tenplex-run flags (join comes from common.sh).
flags() {
    echo -framework "megatron-lm"
    echo -model "gpt"
    echo -model-size "2.7B"
    echo -dataset "enwiki"
    echo -batch-size 128
    echo -micro-batch-size 8
    echo -precision "fp16"
    echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt"
    echo -hosts "$(join $(hosts))"
    echo -schedule-file "schedule.json"
    echo -para-config "para-config.json"
    echo -mlfs-port 20010
    echo -gpu-per-host 4
    echo -gpu-per-container 4
    echo -seq-length 1024
}

# One run per failure count; each run's logs directory is preserved.
for i in 4 8 12
do
    tenplex-run $(flags) -failure $i 2>&1 | tee failure_$i.log
    mv logs logs_$i
done

python plot.py
39 |
--------------------------------------------------------------------------------
/scheduler/scripts/recreate-vmss.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Delete and recreate the Azure VM scale set from the tenplex-mw image.
set -e

cd $(dirname $0)/..
. ./scripts/config.sh # expected to define $size, $group and $name

# The Image type for a Virtual Machine Scale Set may not be changed.
# image=tenplex-base-image
image="tenplex-mw"
image=$(az image show -n $image -g kungfu | jq -r .id)
echo "Using image $image"

storage=Premium_LRS # SSD

# Flags for `az vmss create`.
flags() {
    echo --admin-username kungfu
    echo --vnet-name tenplex-relayVNET
    echo --subnet tenplex-relaySubnet
    echo --disable-overprovision
    echo --image $image
    echo --instance-count 0
    echo --vm-sku $size
    echo --location westeurope
    echo --storage-sku $storage
    # echo --lb '""'
}

recreate() {
    az vmss delete -g $group -n $name
    echo "deleted $name"

    az vmss create -g $group -n $name $(flags) --lb ""
    echo "created $name"
}

recreate
37 |
--------------------------------------------------------------------------------
/benchmark/dynamic_resources/pytorch-schedule.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "time": 35,
4 | "size": 16
5 | },
6 | {
7 | "time": 35,
8 | "size": 8
9 | },
10 | {
11 | "time": 35,
12 | "size": 8
13 | },
14 | {
15 | "time": 35,
16 | "size": 16
17 | },
18 | {
19 | "time": 35,
20 | "size": 8
21 | },
22 | {
23 | "time": 35,
24 | "size": 8
25 | },
26 | {
27 | "time": 35,
28 | "size": 16
29 | },
30 | {
31 | "time": 35,
32 | "size": 8
33 | },
34 | {
35 | "time": 35,
36 | "size": 8
37 | },
38 | {
39 | "time": 35,
40 | "size": 16
41 | },
42 | {
43 | "time": 35,
44 | "size": 8
45 | },
46 | {
47 | "time": 13,
48 | "size": 8
49 | },
50 | {
51 | "time": 0,
52 | "size": 0
53 | }
54 | ]
55 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_cluster_size/recreate-vmss.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Delete and recreate the Azure VM scale set for the cluster-size benchmark
# from the tenplex-2024-08 image.
set -e

. ./config.sh # expected to define $size, $group and $name

# The Image type for a Virtual Machine Scale Set may not be changed.
# image=tenplex-base-image
# image="tenplex-mw"
image="tenplex-2024-08"
image=$(az image show -n $image -g kungfu | jq -r .id)

echo "Using image $image"

storage=Premium_LRS # SSD

# Flags for `az vmss create`.
create_flags() {
    echo --admin-username $USER
    #echo --admin-username kungfu
    echo --vnet-name tenplex-relayVNET
    echo --subnet tenplex-relaySubnet
    echo --image $image
    echo --instance-count 0
    echo --vm-sku $size
    echo --location westeurope
    echo --storage-sku $storage
    echo --lb-sku Standard
}

recreate() {
    az vmss delete -g $group -n $name
    echo "deleted $name"

    az vmss create -g $group -n $name $(create_flags) --lb ""
    echo "created $name"
}

recreate
38 |
--------------------------------------------------------------------------------
/scheduler/scheduler/scheduler_test.go:
--------------------------------------------------------------------------------
1 | //go:build exclude
2 |
3 | package scheduler
4 |
5 | import (
6 | "fmt"
7 | "io"
8 | "log"
9 | "net/http"
10 | "strings"
11 | "testing"
12 | )
13 |
14 | func TestScheduler(t *testing.T) {
15 | ip := "localhost"
16 | port := 22222
17 | url := fmt.Sprintf("http://%s:%d/stop", ip, port)
18 |
19 | resp, err := http.Get(url)
20 | if err != nil {
21 | t.Fatalf("error %v", err)
22 | }
23 | if resp.StatusCode != 200 {
24 | t.Fatalf("POST failed, status code: %d", resp.StatusCode)
25 | }
26 | body, err := io.ReadAll(resp.Body)
27 | if err != nil {
28 | t.Fatalf("error %v", err)
29 | }
30 | t.Logf("body %s", string(body))
31 | }
32 |
33 | func TestNextLowerPowTwo(t *testing.T) {
34 | x := nextLowerPowTwo(33)
35 | if x == 32 {
36 | t.Logf("success")
37 | return
38 | }
39 | t.Logf("failed")
40 | }
41 |
// TestPlayground is a scratch test: SplitN on a string that lacks the
// separator returns a single-element slice, so this logs "1".
func TestPlayground(t *testing.T) {
	n := "iter"
	splitName := strings.SplitN(n, ".", 2)
	log.Printf("%d", len(splitName))
}
47 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_horovod/logger.py:
--------------------------------------------------------------------------------
1 | import time
2 | import numpy as np
3 |
4 |
class Logger(object):
    """Tracks training throughput (images/sec), sampled once per
    `log_period` steps."""

    def __init__(self, log_period=10) -> None:
        self.t0 = time.time()  # start of the current logging period
        self.img_secs = []  # one throughput sample per completed period
        self.step = 0  # total steps recorded
        self.trained = 0  # images processed in the current period
        self.log_period = log_period

    def add(self, trained):
        """Record one step that processed `trained` images; every
        `log_period` steps, print and store the period's throughput."""
        self.step += 1
        self.trained += trained

        if self.step % self.log_period == 0:
            t1 = time.time()
            took = t1 - self.t0

            img_sec = self.trained / took
            # Reset the window for the next period.
            self.t0 = t1
            self.trained = 0

            print('step #%d : %.1f img/sec' % (self.step, img_sec))
            self.img_secs.append(img_sec)

    def report(self):
        """Print mean throughput with a 1.96-sigma confidence band."""
        if not self.img_secs:
            # np.mean([]) would warn and yield NaN; report explicitly.
            print('RESULT Img/sec: no samples recorded')
            return
        img_sec_mean = np.mean(self.img_secs)
        img_sec_conf = 1.96 * np.std(self.img_secs)
        print('RESULT Img/sec: %.1f +-%.1f' % (img_sec_mean, img_sec_conf))
33 |
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/2204/Dockerfile:
--------------------------------------------------------------------------------
#!/usr/bin/env -S sh -c 'docker build --rm --build-arg SSH_KEY="${SSH_KEY}" -t $(cat $(dirname $0)/tag.txt) -f $0 $(dirname $0)/../../..'
# (Fixed: the shebang above was missing the space between --rm and
# --build-arg, so executing the Dockerfile directly failed.)

# Stage 1: build mlfs from source and pack it into a .deb.
FROM ubuntu:jammy AS builder

ENV DEBIAN_FRONTEND=noninteractive

RUN apt update
RUN apt install -y golang-go make git cmake

# Install the build SSH key so git can fetch private repositories.
RUN mkdir $HOME/.ssh
RUN echo "StrictHostKeyChecking no" >$HOME/.ssh/config
ARG SSH_KEY
RUN echo "${SSH_KEY}" >$HOME/.ssh/id_rsa
RUN chmod 0600 $HOME/.ssh/id_rsa
RUN ssh-keygen -y -f $HOME/.ssh/id_rsa >$HOME/.ssh/id_rsa.pub

# Fetch all Go modules over SSH and treat everything as private.
RUN git config --global url."git@github.com:".insteadOf "https://github.com/"
RUN go env -w GOPRIVATE=*

WORKDIR /src
ADD . .
RUN GOBIN=$PWD/bin go install -v ./...
RUN ./scripts/pack.sh
RUN cp ./build/*.deb mlfs.deb

# Stage 2: minimal runtime image with only the packaged daemon.
FROM ubuntu:jammy

RUN apt update
RUN apt install -y systemd init fuse

COPY --from=builder /src/mlfs.deb /
RUN dpkg -i /mlfs.deb && rm /mlfs.deb
--------------------------------------------------------------------------------
/benchmark/convergence_impact/README.md:
--------------------------------------------------------------------------------
1 | # Convergence impact
2 | _Fig. 2. Impact of GPU change on training convergence (Changing GPUs from 2 to 4 with GPT-3 and MNIST)_
3 |
4 | Fig. 2a shows how model convergence, plotted as the loss value, is affected after adding a GPU (vertical orange line) under data parallelism. The solid black line shows regular model convergence with a static GPU allocation; the dashed red line shows convergence after the scale-out event when the dataset is processed inconsistently after re-partitioning: when resuming the training in the middle of the epoch, the first half of the training data is used twice, which overfits the model and reduces the loss value unreasonably.
5 |
6 | In Fig. 2b, we show how the global batch size must be kept constant after adding a GPU (vertical orange line) under data parallelism. The solid black line shows model convergence (measured as loss) without the GPU change. The dashed red line shows the divergence when the GPU allocation changes but the device batch size remains constant.
7 |
--------------------------------------------------------------------------------
/tenplex/save.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from .mlfs_client import MLFSClient
4 |
5 |
def save(
    ckpt: dict,
    job_id: str,
    step: int,
    device_rank: int,
    mlfs_path: str,
    ip: str,
    port: int,
):
    """Upload checkpoint `ckpt` to the MLFS daemon at ip:port under
    /job/<job_id>/save/<device_rank>, removing any previous save dir
    first, then record `step` under job/<job_id>/iter.

    Note: `mlfs_path` is currently unused; it is kept for interface
    stability with existing callers.
    """
    path = f"/job/{job_id}/save/{device_rank}"
    print(f"save checkpoint to {path}")

    client = MLFSClient(ip, port)

    # Probe for a previous save dir; a missing dir is expected on first save.
    dire = None
    try:
        dire = client.get_dir(path)
    except requests.HTTPError:
        print(f"{path} does not exist yet")

    if dire:
        # A stale save dir must be removed before re-saving; a failed
        # delete is fatal.
        try:
            client.delete(path)
            print("deleted previous save dir")
        except requests.HTTPError as err:
            print(f"save delete {path} {err}")
            print(f"number of elements in dir {len(dire)}")
            raise err

    client.save_traverse(ckpt, path)
    client.upload_txt(f"job/{job_id}/iter", str(step))

    print(f"did save checkpoint to {path}")
39 |
--------------------------------------------------------------------------------
/state_transformer/meta/struct_test.go:
--------------------------------------------------------------------------------
1 | //go:build exclude
2 |
3 | package meta
4 |
5 | import "testing"
6 |
7 | func Test_Load(t *testing.T) {
8 | conf := Config{
9 | CkptStructDir: "/home/marcel/.tenplex/transformer-checkpoint",
10 | SourceMPDegree: 4,
11 | TargetMPDegree: 2,
12 | SourcePPDegree: 3,
13 | TargetPPDegree: 2,
14 | SourceSize: 12,
15 | TargetSize: 8,
16 | SourceDPDegree: 1,
17 | TargetDPDegree: 2,
18 | Precision: "fp16",
19 | OutputTimestamp: "",
20 | InputTimestamp: "",
21 | SourceHosts: []string{"a", "b", "c"},
22 | TargetHosts: []string{"a", "b"},
23 | Port: 20010,
24 | GpusPerHost: 4,
25 | MdpLibrary: "megatron-lm",
26 | SeqLength: 1024,
27 | JobID: "jobid",
28 | NumLayers: 24,
29 | }
30 | rankMap, err := CreateRankMap(&conf, true)
31 | stru, err := LoadStructs(&conf, rankMap, true)
32 | if err != nil {
33 | t.Logf("Error %v", err)
34 | return
35 | }
36 | t.Logf("Structures length %d", len(stru))
37 | }
38 |
--------------------------------------------------------------------------------
/mlfs/uri/monitor.go:
--------------------------------------------------------------------------------
1 | package uri
2 |
3 | import (
4 | "io"
5 |
6 | "github.com/kungfu-team/tenplex/mlfs/closer"
7 | "github.com/kungfu-team/tenplex/mlfs/iotrace"
8 | )
9 |
// monitor pairs an iotrace byte counter with a label under which its rate
// is reported.
type monitor struct {
	c    *iotrace.Counter
	name string
}

// newMonitor creates a monitor and starts its background reporting
// goroutine, which runs for the life of the process.
func newMonitor(name string) *monitor {
	m := &monitor{
		c:    iotrace.NewCounter(),
		name: name,
	}
	go m.Run()
	return m
}

// Run delegates to iotrace.Monitor, reporting the counter under m.name.
func (m *monitor) Run() {
	iotrace.Monitor(m.c, m.name)
}

// Trace wraps r so every byte read is counted by this monitor, while
// preserving r's original Close.
func (m *monitor) Trace(r io.ReadCloser) io.ReadCloser {
	r1 := iotrace.TraceReader(r, m.c)
	return closer.ReadClose(r1, r.Close)
}
32 |
// Package-wide transfer-rate monitors; each starts its reporting goroutine
// at package init.
var (
	httpRangeRate = newMonitor(`http partial download rate: `)
	httpFullRate  = newMonitor(`http full download rate: `)
	fileReadRate  = newMonitor(`file read rate: `)
)

// withHTTPTrace attaches the matching rate monitor to an HTTP body reader:
// full downloads (bgn == 0, end == -1) are counted separately from range
// requests.
func withHTTPTrace(f io.ReadCloser, bgn, end int64) io.ReadCloser {
	if bgn == 0 && end == -1 {
		f = httpFullRate.Trace(f)
	} else {
		f = httpRangeRate.Trace(f)
	}
	return f
}
47 |
--------------------------------------------------------------------------------
/benchmark/reconfiguration_parallelization/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Reconfiguration-parallelization benchmark: one multi-experiment run per
# parallelization config (DP, TP, PP), then plot the combined results.

set -e

. $(dirname $0)/../common.sh

# Worker host IPs, one per line.
hosts() {
    echo "10.10.10.1"
    echo "10.10.10.2"
    echo "10.10.10.3"
    echo "10.10.10.4"
}

# GPT model sizes to sweep over.
model_sizes() {
    echo "6.7B"
    echo "2.7B"
    echo "xl"
}

# Flags shared by all three runs (base_flags and join come from common.sh).
comb_flags() {
    base_flags
    echo -model "gpt"
    echo -dataset "enwiki"
    echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt"
    echo -hosts $(join $(hosts))
    echo -schedule "$(dirname $0)/schedule.json"
    echo -model-sizes $(join $(model_sizes))
    echo -batch-sizes 128
    echo -micro-batch-sizes 8
    echo -central
}

# One run per parallelization config; capture combined output per run.
tenplex-multi-experiment $(comb_flags) -para-config para-config-dp.json 2>&1 | tee para_dp.log
tenplex-multi-experiment $(comb_flags) -para-config para-config-tp.json 2>&1 | tee para_tp.log
tenplex-multi-experiment $(comb_flags) -para-config para-config-pp.json 2>&1 | tee para_pp.log

python plot.py
38 |
--------------------------------------------------------------------------------
/tenplex-run/job/params.go:
--------------------------------------------------------------------------------
1 | package job
2 |
3 | import (
4 | "strconv"
5 |
6 | "github.com/kungfu-team/tenplex/tenplex-run/para_config"
7 | )
8 |
9 | var str = strconv.Itoa
10 |
// TrainingConfig describes one training run: cluster shape, parallelism
// settings, iteration counts, and numeric precision.
type TrainingConfig struct {
	NumNodes   int // number of worker nodes
	GPUPerNode int // GPUs per node
	MDP        para_config.MultiDimensionalParallelism // multi-dimensional parallelism settings

	TrainIters int // total training iterations

	LogInterval  int // iterations between log lines
	SaveInterval int // iterations between checkpoint saves
	EvalInterval int // iterations between evaluations

	Precision string // numeric precision, e.g. "fp16"
}
24 |
25 | type GenCmdFunc func(c TrainingConfig, rank int, jobID string, host string, jConf *JobConfig) []string
26 |
// TransformerSize captures the three size hyper-parameters of a
// transformer model.
type TransformerSize struct {
	Layers         int
	HiddenSize     int
	AttentionHeads int
}
32 |
33 | func (s TransformerSize) ToPyArgs() []string {
34 | return []string{
35 | `--num-layers`, str(s.Layers),
36 | `--hidden-size`, str(s.HiddenSize),
37 | `--num-attention-heads`, str(s.AttentionHeads),
38 | }
39 | }
40 |
41 | func TFSize(nl, hs, ah int) TransformerSize {
42 | return TransformerSize{
43 | Layers: nl,
44 | HiddenSize: hs,
45 | AttentionHeads: ah,
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/mlfs/mlfstest/go.sum:
--------------------------------------------------------------------------------
1 | github.com/lgarithm/proc v0.3.2-0.20221205141105-3ebbaa57acfd h1:3TKH+pOzdcVhdd3owi+PhIadcdH5C5U51CiR/ltdutc=
2 | github.com/lgarithm/proc v0.3.2-0.20221205141105-3ebbaa57acfd/go.mod h1:ODAmNzweK407/Z8BaSNqs6tTac6/JkLr+injrsAXq20=
3 | golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
4 | golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9 h1:vEg9joUBmeBcK9iSJftGNf3coIG4HqZElCPehJsfAYM=
5 | golang.org/x/crypto v0.0.0-20200604202706-70a84ac30bf9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
6 | golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
7 | golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
8 | golang.org/x/sys v0.0.0-20190412213103-97732733099d h1:+R4KGOnez64A81RvjARKc4UT5/tI9ujCIVX+P5KiHuI=
9 | golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
10 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
11 |
--------------------------------------------------------------------------------
/scheduler/experiments/lib.go:
--------------------------------------------------------------------------------
1 | package experiments
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/kungfu-team/tenplex/scheduler/azcli"
7 | "github.com/lgarithm/proc"
8 | )
9 |
// Local aliases for the proc package's core types, to keep the experiment
// scripts below terse.
type (
	At = proc.UserHost
	P = proc.P
	Proc = proc.Proc
)
15 |
// Short local names for frequently used proc combinators.
var (
	par = proc.Par
	out = proc.Output
	seq = proc.Seq
	Main = proc.Main
	psh = proc.Psh
	at = proc.At
	echo = proc.Echo
	lmd = proc.Lambda
	ignore = proc.Ignore
	urpc = proc.Urpc
)
28 |
// getPubIP returns the public IP of the named VM in the given resource
// group (via az cli), with surrounding quotes and newline stripped.
func getPubIP(name, group string) string {
	o := string(out(psh(azcli.GetPubIP(name, group))))
	return strings.Trim(o, "\n\"")
}
33 |
// getIP returns the private IP of the named VM in the given resource
// group (via az cli), with surrounding quotes and newline stripped.
func getIP(name, group string) string {
	o := string(out(psh(azcli.GetIP(name, group))))
	return strings.Trim(o, "\n\"")
}
38 |
// fmap maps f over xs and returns the results in order. It returns nil for
// an empty input (matching the previous behavior) and pre-sizes the result
// otherwise to avoid repeated growth.
func fmap[X any, Y any](f func(X) Y, xs ...X) []Y {
	if len(xs) == 0 {
		return nil
	}
	ys := make([]Y, 0, len(xs))
	for _, x := range xs {
		ys = append(ys, f(x))
	}
	return ys
}
46 |
47 | func parmap[T any](f func(T) P, xs ...T) P { return par(fmap(f, xs...)...) }
48 |
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/2004/Dockerfile:
--------------------------------------------------------------------------------
#!/usr/bin/env -S sh -c 'docker build --rm --build-arg SSH_KEY="${SSH_KEY}" -t $(cat $(dirname $0)/tag.txt) -f $0 $(dirname $0)/../../..'

# Stage 1: build the mlfs binaries and the .deb package from source.
FROM ubuntu:focal AS builder

ENV DEBIAN_FRONTEND=noninteractive

RUN apt update
RUN apt install -y software-properties-common
# Newer Go toolchain than focal's default.
RUN add-apt-repository ppa:longsleep/golang-backports
RUN apt install -y golang-go make git cmake

# Install the build-time SSH key so private GitHub modules can be fetched.
RUN mkdir $HOME/.ssh
RUN echo "StrictHostKeyChecking no" >$HOME/.ssh/config
ARG SSH_KEY
RUN echo "${SSH_KEY}" >$HOME/.ssh/id_rsa
RUN chmod 0600 $HOME/.ssh/id_rsa
RUN ssh-keygen -y -f $HOME/.ssh/id_rsa >$HOME/.ssh/id_rsa.pub

# Route module downloads through SSH and bypass the public module proxy.
RUN git config --global url."git@github.com:".insteadOf "https://github.com/"
RUN go env -w GOPRIVATE=*

WORKDIR /src
ADD . .
RUN GOBIN=$PWD/bin go install -v ./...
RUN ./scripts/pack.sh
RUN cp ./build/*.deb mlfs.deb

# Stage 2: minimal runtime image containing only the packaged artifacts.
FROM ubuntu:focal

RUN apt update
RUN apt install -y systemd init fuse

COPY --from=builder /src/mlfs.deb /
RUN dpkg -i /mlfs.deb && rm /mlfs.deb
35 |
--------------------------------------------------------------------------------
/scheduler/CMakeLists.txt:
--------------------------------------------------------------------------------
# Debian packaging for the tenplex scheduler binaries and service files.
CMAKE_MINIMUM_REQUIRED(VERSION 3.5)
PROJECT(tenplex-scheduler)

# CPack: build a .deb; the version comes from the VERSION env variable.
SET(CPACK_GENERATOR "DEB")
SET(CPACK_DEBIAN_PACKAGE_MAINTAINER "lg")
SET(CPACK_PACKAGE_VERSION $ENV{VERSION})
SET(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT)
INCLUDE(CPack)

# Pre-built binaries (expected under bin/) shipped in the package.
INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/tenplex-scheduler DESTINATION bin)
INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/tenplex-user DESTINATION bin)
INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/tenplex-state-transformer
        DESTINATION bin)

# systemd unit for the scheduler service.
INSTALL(FILES ${CMAKE_SOURCE_DIR}/etc/os/linux/tenplex-scheduler.service
        DESTINATION /lib/systemd/system)

# Install a script world-readable/executable under /etc/tenplex.
FUNCTION(INSTALL_SCRIPT target)
  INSTALL(
    FILES ${target}
    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ #
                WORLD_READ WORLD_EXECUTE
    DESTINATION /etc/tenplex)
ENDFUNCTION()

INSTALL_SCRIPT(${CMAKE_SOURCE_DIR}/etc/tenplex/scheduler.sh)
INSTALL_SCRIPT(${CMAKE_SOURCE_DIR}/etc/tenplex/stop-scheduler.sh)

# INSTALL(DIRECTORY ${CMAKE_SOURCE_DIR}/man DESTINATION share)
30 |
--------------------------------------------------------------------------------
/mlfs/iotrace/report.go:
--------------------------------------------------------------------------------
1 | package iotrace
2 |
3 | import (
4 | golog "log"
5 | "os"
6 | "sync"
7 | "time"
8 | )
9 |
10 | var log = golog.New(os.Stderr, `[mlfs] io % `, 0)
11 |
// reporter owns a background rate-logging goroutine: stopped signals it to
// finish, and wg lets Stop wait for the final log line.
type reporter struct {
	stopped chan struct{}
	wg      sync.WaitGroup
}
16 |
// Reporter starts a goroutine that logs the counter's average rate once a
// second until Stop is called; an "overall rate" summary is logged on stop.
// The caller must call Stop to terminate the goroutine.
func Reporter(c *Counter, prefix string) *reporter {
	r := &reporter{stopped: make(chan struct{}, 1)}
	r.wg.Add(1)
	go func() {
		for {
			select {
			case <-r.stopped:
				// Final summary line, then release Stop's Wait.
				log.Printf("%soverall rate: %s", prefix, c.ShowRate())
				r.wg.Done()
				return
			default:
				time.Sleep(1 * time.Second)
				log.Printf("%s%s", prefix, c.ShowRate())
			}
		}

	}()
	return r
}
36 |
// Stop signals the reporting goroutine to emit its final summary and waits
// until it has exited.
func (r *reporter) Stop() {
	r.stopped <- struct{}{}
	r.wg.Wait()
}
41 |
42 | func Monitor(c *Counter, prefix string) {
43 | r := &reporter{stopped: make(chan struct{}, 1)}
44 | r.wg.Add(1)
45 | go func() {
46 | for {
47 | time.Sleep(1 * time.Second)
48 | if !c.Zero() {
49 | log.Printf("%s%s", prefix, c.ShowRate())
50 | c.Reset()
51 | }
52 | }
53 | }()
54 | }
55 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# Debian packaging for the mlfs binaries, systemd unit, and helper scripts.
CMAKE_MINIMUM_REQUIRED(VERSION 3.5)
PROJECT(mlfs)

# CPack: build a .deb; the version comes from the VERSION env variable.
SET(CPACK_GENERATOR "DEB")
SET(CPACK_DEBIAN_PACKAGE_MAINTAINER "g.li@imperial.ac.uk")
SET(CPACK_PACKAGE_VERSION $ENV{VERSION})
SET(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT)
INCLUDE(CPack)

# Install one pre-built binary from bin/ into the package.
FUNCTION(INSTALL_BIN TARGET)
  INSTALL(PROGRAMS ${CMAKE_SOURCE_DIR}/bin/${TARGET} DESTINATION bin)
ENDFUNCTION()

INSTALL_BIN(mlfs-build-tf-index)
INSTALL_BIN(mlfs-check-index)
INSTALL_BIN(mlfs-download)
INSTALL_BIN(mlfs-edit-index)
INSTALL_BIN(mlfs)
INSTALL_BIN(mlfsd)
INSTALL_BIN(tenplex-state-transformer)

# systemd unit plus executable start/stop scripts under /etc/mlfs.
INSTALL(FILES ${CMAKE_SOURCE_DIR}/mlfs/etc/os/linux/mlfs.service
        DESTINATION /lib/systemd/system)
INSTALL(
  FILES ${CMAKE_SOURCE_DIR}/mlfs/etc/mlfs/mlfs.sh
  PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ WORLD_READ
  DESTINATION /etc/mlfs)
INSTALL(
  FILES ${CMAKE_SOURCE_DIR}/mlfs/etc/mlfs/stop.sh
  PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ WORLD_READ
  DESTINATION /etc/mlfs)

INSTALL(DIRECTORY ${CMAKE_SOURCE_DIR}/man DESTINATION share)
34 |
--------------------------------------------------------------------------------
/mlfs/mlfs/dsidx.go:
--------------------------------------------------------------------------------
1 | package mlfs
2 |
3 | import (
4 | "image"
5 | "image/color"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
8 | )
9 |
// DSIDX is a flattened view over an index of files with ranges: a global
// region id maps back to (file index, range index within that file).
type DSIDX struct {
	idx          vfile.IndexedFiles // the underlying per-file range index
	ridx         []int              // region id -> index of its file
	lidx         []int              // region id -> position of the range within its file
	maxRegions   int                // largest number of ranges in any single file
	totalRegions int                // sum of range counts over all files
}
17 |
18 | func newDSIDX(idx vfile.IndexedFiles) *DSIDX {
19 | var maxRegions, totalRegions int
20 | var ridx, lidx []int
21 | for i, f := range idx {
22 | n := len(f.Ranges)
23 | if n > maxRegions {
24 | maxRegions = n
25 | }
26 | totalRegions += n
27 | for j := 0; j < n; j++ {
28 | ridx = append(ridx, i)
29 | lidx = append(lidx, j)
30 | }
31 | }
32 | d := &DSIDX{
33 | idx: idx,
34 | ridx: ridx,
35 | lidx: lidx,
36 | maxRegions: maxRegions,
37 | totalRegions: totalRegions,
38 | }
39 | log.Printf("maxRegions: %d", maxRegions)
40 | return d
41 | }
42 |
// bmap renders the given region ids as a bitmap with one row per file and
// one column per range; covered regions are drawn white.
func (d *DSIDX) bmap(ids []int) image.Image {
	img := makeBitmap(len(d.idx), d.maxRegions)
	for _, id := range ids {
		i := d.ridx[id] // row: file index
		j := d.lidx[id] // column: range index within the file
		img.Set(j, i, color.White)
	}
	return img
}
51 | }
52 |
--------------------------------------------------------------------------------
/state_transformer/meta/rankmap.go:
--------------------------------------------------------------------------------
1 | package meta
2 |
3 | import (
4 | "encoding/json"
5 | "os"
6 | "path"
7 | "strconv"
8 | )
9 |
// MDPRank identifies a worker by its coordinates in the parallelism grid:
// pipeline (PP), model (MP), and data (DP) rank.
type MDPRank struct {
	PPRank int
	MPRank int
	DPRank int
}
15 |
// RankMap is a bidirectional mapping between flat global ranks and their
// multi-dimensional parallelism coordinates.
type RankMap struct {
	Rank    map[MDPRank]int // MDP coordinates -> global rank
	MDPRank map[int]MDPRank // global rank -> MDP coordinates
}
20 |
21 | func CreateRankMap(config *Config, before bool) (*RankMap, error) {
22 | structPath := GetStructPath(config, before)
23 | jsonPath := path.Join(structPath, "rank_map.json")
24 | content, err := os.ReadFile(jsonPath)
25 | if err != nil {
26 | return nil, err
27 | }
28 | var payload map[string]map[string]int
29 | err = json.Unmarshal(content, &payload)
30 | if err != nil {
31 | return nil, err
32 | }
33 |
34 | ranks := make(map[MDPRank]int)
35 | MDPRanks := make(map[int]MDPRank)
36 | for r, val := range payload {
37 | rInt, err := strconv.Atoi(r)
38 | if err != nil {
39 | return nil, err
40 | }
41 | mdpRank := MDPRank{PPRank: val["pp_rank"], MPRank: val["mp_rank"], DPRank: val["dp_rank"]}
42 | MDPRanks[rInt] = mdpRank
43 | ranks[mdpRank] = rInt
44 | }
45 | rankMap := RankMap{Rank: ranks, MDPRank: MDPRanks}
46 | return &rankMap, nil
47 | }
48 |
--------------------------------------------------------------------------------
/mlfs/benchmarks/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# ResNet-50 training benchmark under kungfu-run; variants for VFS, raw
# mount (baseline), and fake data are defined below (only VFS is active).

set -e

export PYTHON=$(which python3.6)

cd $(dirname $0)

# Flags for kungfu-run itself; logs go to logs/$JOB_ID.
kungfu_run_flags() {
    echo -q
    echo -logdir logs/$JOB_ID
    echo -np 4
}

kungfu_run() {
    echo "JOB_ID: $JOB_ID"
    kungfu-run $(kungfu_run_flags) $@
}

# Dataset/mount parameters shared by every variant.
flags_mount() {
    echo --index-file $HOME/tf-index-16.idx.txt
    echo --seed 1
    echo --global-batch-size 128
    echo --tfrecord-fs $PWD/../../bin/tfrecord-fs
}

flags() {
    flags_mount
    echo --run-train-op
}

# Baseline: read from the plain mounted directory.
flags_baseline() {
    flags
    echo --prefix $HOME/mnt/all
}

# Same mount, plus the --fake-data switch.
flags_fake_data() {
    flags
    echo --prefix $HOME/mnt/all
    echo --fake-data
}

# VFS variant: no --prefix, reads go through the virtual file system.
flags_vfs() {
    flags
}

main() {
    export JOB_ID=vfs
    kungfu_run $PYTHON train_resnet50.py $(flags_vfs)

    # export JOB_ID=base
    # kungfu_run $PYTHON train_resnet50.py $(flags_baseline)

    # export JOB_ID=fakedata
    # kungfu_run $PYTHON train_resnet50.py $(flags_fake_data)
}

rm -fr *.log
main
60 |
--------------------------------------------------------------------------------
/state_transformer/search/file-system.go:
--------------------------------------------------------------------------------
1 | package search
2 |
3 | import (
4 | "fmt"
5 | "os"
6 | "path"
7 | "strings"
8 | )
9 |
// isTensor reports whether a file name denotes a serialized tensor, i.e.
// everything after the first '.' is exactly "numpy.ndarray". Names without
// a '.' are an error.
func isTensor(name string) (bool, error) {
	_, ext, found := strings.Cut(name, ".")
	if !found {
		return false, fmt.Errorf("string split has not exactly 2 parts")
	}
	return ext == "numpy.ndarray", nil
}
17 |
18 | func SearchFSForTensors(basePath string) ([]string, error) {
19 | dirEntries, err := os.ReadDir(basePath)
20 | if err != nil {
21 | return nil, err
22 | }
23 |
24 | var tensors []string
25 | for _, dirEntry := range dirEntries {
26 | info, err := dirEntry.Info()
27 | if err != nil {
28 | return nil, err
29 | }
30 |
31 | if dirEntry.IsDir() {
32 | newTensors, err := SearchFSForTensors(path.Join(basePath, info.Name()))
33 | tensors = append(tensors, newTensors...)
34 | if err != nil {
35 | return nil, err
36 | }
37 | } else { // isFile
38 | isTen, err := isTensor(info.Name())
39 | if err != nil {
40 | return nil, err
41 | }
42 | if isTen {
43 | tensors = append(tensors, path.Join(basePath, info.Name()))
44 | }
45 | }
46 | }
47 | return tensors, nil
48 | }
49 |
--------------------------------------------------------------------------------
/state_transformer/statetransform/replicate.go:
--------------------------------------------------------------------------------
1 | package statetransform
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "strings"
7 |
8 | "github.com/kungfu-team/tenplex/state_transformer/client"
9 | "github.com/kungfu-team/tenplex/state_transformer/meta"
10 | )
11 |
12 | func replicateTensor(conf *meta.Config, ckptCl *client.CheckpointClient, sourceKey, targetKey []string, sourceMDPRank, targetMDPRank *meta.MDPRank) error {
13 | sourcePath := strings.Join(sourceKey, "/")
14 | sourcePath = fmt.Sprintf("%s.numpy.ndarray", sourcePath)
15 | ten, err := ckptCl.QueryMegatronTensor(sourceMDPRank, conf.InputTimestamp, sourcePath, nil)
16 | if err != nil {
17 | log.Printf("query tensor to replicate failed.\nwith error %v.\nsource key %v, target key %v, source MDP rank %v, target MDP rank %v", err, sourceKey, targetKey, sourceMDPRank, targetMDPRank)
18 | return err
19 | }
20 | targetPath := strings.Join(targetKey, "/")
21 | targetPath = fmt.Sprintf("%s.numpy.ndarray", targetPath)
22 | err = ckptCl.UploadMegatronTensor(ten, targetMDPRank, conf.OutputTimestamp, targetPath)
23 | if err != nil {
24 | return err
25 | }
26 |
27 | if err != nil {
28 | return err
29 | }
30 |
31 | return nil
32 | }
33 |
--------------------------------------------------------------------------------
/tests/dataset.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | from tenplex.dataset import GPTDataset as TenplexGPTDataset
3 |
4 |
def main():
    # Repeatedly remount the MLFS dataset and stream samples through it,
    # simulating a job that is rescaled `num_scaling` times.
    num_scaling = 5
    idx_path = "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024/indices.txt"
    dp_size = 2
    dp_rank = 1
    job_id = "dataset-test"
    batch_size = 128

    for _ in range(num_scaling):
        # NOTE(review): `progress` is identical on every iteration because it
        # uses the constant `num_scaling` instead of the loop index — confirm
        # whether a per-iteration progress step was intended.
        progress = num_scaling * batch_size * 2048

        mount_cmd = [
            "mlfs", "mount",
            "-idx-name", "openwebtext",
            "-index-url", f"{idx_path}",
            "-ctrl-port", "20010",
            "-progress", f"{progress}",
            "-global-batch-size", f"{batch_size}",
            "-dp-size", f"{dp_size}",
            "-job", job_id,
        ]
        subprocess.run(mount_cmd, check=True)  # raises CalledProcessError on failure
        print("finished MLFS mount")

        mlfs_path = "/mnt/mlfs"
        dataset = TenplexGPTDataset(mlfs_path, job_id, dp_rank)
        dataset = iter(dataset)

        # Drain samples to exercise the read path; values are discarded.
        for _ in range(50_000):
            sample = next(dataset)
            txt = sample["text"]  # payload read intentionally unused


if __name__ == "__main__":
    main()
40 |
--------------------------------------------------------------------------------
/mlfs/iotrace/counter.go:
--------------------------------------------------------------------------------
1 | package iotrace
2 |
3 | import (
4 | "fmt"
5 | "sync/atomic"
6 | "time"
7 | )
8 |
// Counter accumulates a count (e.g. bytes) together with the start of the
// current measuring window, so an average rate can be derived.
type Counter struct {
	t0 time.Time // window start; only rewritten by Reset
	n  int64     // accumulated count; accessed atomically
}

// NewCounter returns a Counter whose window starts now.
func NewCounter() *Counter {
	return &Counter{
		t0: time.Now(),
	}
}

// Zero reports whether nothing has been counted in the current window.
func (c *Counter) Zero() bool {
	return atomic.LoadInt64(&c.n) == 0
}

// Add atomically adds n to the counter.
func (c *Counter) Add(n int64) {
	atomic.AddInt64(&c.n, n)
}

// Reset clears the count and starts a new measuring window.
// NOTE(review): t0 is written without synchronization; this is safe only if
// Reset and ShowRate run on the same goroutine — confirm at call sites.
func (c *Counter) Reset() {
	c.t0 = time.Now()
	atomic.StoreInt64(&c.n, 0)
}

// ShowRate formats the average rate since the window started.
func (c *Counter) ShowRate() string {
	n := atomic.LoadInt64(&c.n)
	return ShowRate(Rate(n, time.Since(c.t0)))
}
37 |
38 | func Rate(n int64, d time.Duration) float64 {
39 | return float64(n) / (float64(d) / float64(time.Second))
40 | }
41 |
// ShowRate formats a rate in bytes per second using binary (1024-based)
// units, e.g. "2.00 MiB/s".
func ShowRate(r float64) string {
	const (
		Ki = 1 << 10
		Mi = 1 << 20
		Gi = 1 << 30
	)
	if r > Gi {
		return fmt.Sprintf("%.2f GiB/s", r/float64(Gi))
	}
	if r > Mi {
		return fmt.Sprintf("%.2f MiB/s", r/float64(Mi))
	}
	if r > Ki {
		return fmt.Sprintf("%.2f KiB/s", r/float64(Ki))
	}
	return fmt.Sprintf("%.2f B/s", r)
}
57 |
--------------------------------------------------------------------------------
/scheduler/data/single-job-time.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "time": 35,
4 | "size": 4
5 | },
6 | {
7 | "time": 35,
8 | "size": 8
9 | },
10 | {
11 | "time": 35,
12 | "size": 16
13 | },
14 | {
15 | "time": 35,
16 | "size": 8
17 | },
18 | {
19 | "time": 35,
20 | "size": 4
21 | },
22 | {
23 | "time": 35,
24 | "size": 8
25 | },
26 | {
27 | "time": 35,
28 | "size": 16
29 | },
30 | {
31 | "time": 35,
32 | "size": 8
33 | },
34 | {
35 | "time": 35,
36 | "size": 4
37 | },
38 | {
39 | "time": 35,
40 | "size": 8
41 | },
42 | {
43 | "time": 35,
44 | "size": 16
45 | },
46 | {
47 | "time": 35,
48 | "size": 8
49 | },
50 | {
51 | "time": 35,
52 | "size": 4
53 | },
54 | {
55 | "time": 35,
56 | "size": 8
57 | },
58 | {
59 | "time": 35,
60 | "size": 16
61 | },
62 | {
63 | "time": 13,
64 | "size": 8
65 | },
66 | {
67 | "time": 0,
68 | "size": 0
69 | }
70 | ]
71 |
--------------------------------------------------------------------------------
/benchmark/dynamic_resources/tenplex-schedule.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "time": 35,
4 | "size": 16
5 | },
6 | {
7 | "time": 35,
8 | "size": 8
9 | },
10 | {
11 | "time": 35,
12 | "size": 4
13 | },
14 | {
15 | "time": 35,
16 | "size": 8
17 | },
18 | {
19 | "time": 35,
20 | "size": 16
21 | },
22 | {
23 | "time": 35,
24 | "size": 8
25 | },
26 | {
27 | "time": 35,
28 | "size": 4
29 | },
30 | {
31 | "time": 35,
32 | "size": 8
33 | },
34 | {
35 | "time": 35,
36 | "size": 16
37 | },
38 | {
39 | "time": 35,
40 | "size": 8
41 | },
42 | {
43 | "time": 35,
44 | "size": 4
45 | },
46 | {
47 | "time": 35,
48 | "size": 8
49 | },
50 | {
51 | "time": 35,
52 | "size": 16
53 | },
54 | {
55 | "time": 35,
56 | "size": 8
57 | },
58 | {
59 | "time": 35,
60 | "size": 4
61 | },
62 | {
63 | "time": 13,
64 | "size": 8
65 | },
66 | {
67 | "time": 0,
68 | "size": 0
69 | }
70 | ]
71 |
--------------------------------------------------------------------------------
/mlfs/pid/peer.go:
--------------------------------------------------------------------------------
1 | package pid
2 |
3 | import (
4 | "fmt"
5 | "net"
6 | "strconv"
7 | )
8 |
// PeerID is the unique identifier of a peer: its IPv4 address packed into
// a uint32 plus a port.
type PeerID struct {
	IPv4 uint32
	Port uint16
}
14 |
// String renders the peer in host:port form.
func (p PeerID) String() string {
	return net.JoinHostPort(FormatIPv4(p.IPv4), strconv.Itoa(int(p.Port)))
}
18 |
// ColocatedWith reports whether both peers share the same IPv4 address,
// i.e. run on the same host.
func (p PeerID) ColocatedWith(q PeerID) bool {
	return p.IPv4 == q.IPv4
}
22 |
23 | func (p PeerID) ListenAddr(strict bool) PeerID {
24 | if strict {
25 | return PeerID{IPv4: p.IPv4, Port: p.Port}
26 | }
27 | return PeerID{IPv4: 0, Port: p.Port}
28 | }
29 |
// SockFile returns the per-port local socket path for this peer.
func (p PeerID) SockFile() string {
	return fmt.Sprintf(`/tmp/goml-peer-%d.sock`, p.Port)
}
33 |
// ParsePeerID parses a "host:port" string into a PeerID. The host must be
// a literal IPv4 address and the port must fit in 16 bits.
func ParsePeerID(val string) (*PeerID, error) {
	host, p, err := net.SplitHostPort(val)
	if err != nil {
		return nil, err
	}
	ipv4, err := ParseIPv4(host) // non-IPv4 hosts are rejected here
	if err != nil {
		return nil, err
	}
	port, err := strconv.Atoi(p)
	if err != nil {
		return nil, err
	}
	if int(uint16(port)) != port {
		// Reject ports outside [0, 65535] rather than silently truncating.
		return nil, errInvalidPort
	}
	return &PeerID{
		IPv4: ipv4,
		Port: uint16(port),
	}, nil
}
55 |
--------------------------------------------------------------------------------
/tenplex-run/listflag/listflag.go:
--------------------------------------------------------------------------------
1 | package listflag
2 |
3 | import (
4 | "flag"
5 | "strconv"
6 | "strings"
7 | )
8 |
// Strings is a flag.Value that parses a comma-separated list into a slice
// of strings, dropping empty and whitespace-only items.
type Strings []string

// Set replaces the current value with the items parsed from args.
func (v *Strings) Set(args string) error {
	var parsed Strings
	for _, part := range strings.Split(args, ",") {
		if trimmed := strings.TrimSpace(part); trimmed != "" {
			parsed = append(parsed, trimmed)
		}
	}
	*v = parsed
	return nil
}

// String renders the value as a comma-separated list.
func (v *Strings) String() string { return strings.Join(*v, ",") }
22 |
// String declares a comma-separated string-list flag with the given default
// and returns a pointer to the registered value. The default is copied so
// the caller's slice is not mutated by flag parsing.
func String(name string, v Strings, usage string) *Strings {
	r := make(Strings, len(v))
	copy(r, v)
	flag.Var(&r, name, usage)
	return &r
}
29 |
// Ints is a flag.Value that parses a comma-separated list of integers.
type Ints []int

// Set replaces the current value with the integers parsed from args
// (items are whitespace-trimmed). On a parse error the previous value is
// left unchanged; the original variant left a partially-parsed result
// behind.
func (v *Ints) Set(args string) error {
	var parsed Ints
	for _, t := range strings.Split(args, ",") {
		n, err := strconv.Atoi(strings.TrimSpace(t))
		if err != nil {
			return err
		}
		parsed = append(parsed, n)
	}
	*v = parsed
	return nil
}

// String renders the value as a comma-separated list.
func (v *Ints) String() string {
	ss := make([]string, 0, len(*v))
	for _, n := range *v {
		ss = append(ss, strconv.Itoa(n))
	}
	return strings.Join(ss, ",")
}
52 |
// Int declares a comma-separated integer-list flag with the given default
// and returns a pointer to the registered value. The default is copied so
// the caller's slice is not mutated by flag parsing.
func Int(name string, v Ints, usage string) *Ints {
	r := make(Ints, len(v))
	copy(r, v)
	flag.Var(&r, name, usage)
	return &r
}
59 |
--------------------------------------------------------------------------------
/mlfs/utils/log.go:
--------------------------------------------------------------------------------
1 | package utils
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "os"
7 | "time"
8 | )
9 |
// LogArgs prints every command-line argument with its position, one per
// line, to stdout.
func LogArgs() {
	for i, a := range os.Args {
		fmt.Printf("[arg] [%d]=%s\n", i, a)
	}
}
15 |
// LogEnv prints every environment variable ("KEY=value"), one per line, to
// stdout.
func LogEnv() {
	for _, e := range os.Environ() {
		fmt.Printf("[env] %s\n", e)
	}
}
21 |
// ShowSize formats a byte count using binary (1024-based) units, e.g.
// "1.5GiB". Values below 1 KiB are printed as a plain integer.
func ShowSize(n int64) string {
	const (
		Ki = 1 << 10
		Mi = 1 << 20
		Gi = 1 << 30
		Ti = 1 << 40
	)
	switch {
	case n >= Ti:
		// Was "%.1fTi"; "TiB" keeps the suffix consistent with the other units.
		return fmt.Sprintf("%.1fTiB", float64(n)/float64(Ti))
	case n >= Gi:
		return fmt.Sprintf("%.1fGiB", float64(n)/float64(Gi))
	case n >= Mi:
		return fmt.Sprintf("%.1fMiB", float64(n)/float64(Mi))
	case n >= Ki:
		return fmt.Sprintf("%.1fKiB", float64(n)/float64(Ki))
	}
	return fmt.Sprintf("%d", n)
}
39 | }
40 |
41 | func Percent(p, n int) float64 { return 100.0 * float64(p) / float64(n) }
42 |
// LogETA logs the completion percentage, the time elapsed since t0, and an
// ETA extrapolated linearly from progress/total. With zero progress the
// ETA is unknown and printed as "?".
func LogETA(t0 time.Time, progress, total int) {
	d := time.Since(t0)
	r := Percent(progress, total)
	if progress == 0 {
		log.Printf("%.1f%% took %s, ETA: %s", r, d, `?`)
		return
	}
	// remaining time = elapsed * (work left / work done)
	remain := time.Duration(float64(d) * float64(total-progress) / float64(progress))
	log.Printf("%.1f%% took %s, ETA: %s", r, d, remain)
}
53 |
--------------------------------------------------------------------------------
/tenplex-run/runop/dataset.go:
--------------------------------------------------------------------------------
1 | package runop
2 |
3 | import (
4 | "fmt"
5 | "log"
6 |
7 | "github.com/kungfu-team/tenplex/mlfs/ds"
8 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
9 | "github.com/kungfu-team/tenplex/tenplex-run/job"
10 | )
11 |
// mount registers the dataset's index with the MLFS daemon behind cli and
// mounts it for the given job with the requested sharding parameters.
func mount(cli *mlfs.Client, ds ds.Dataset, jobID string, batchSize, progress, dpSize, seed int, noShuffle bool) error {
	if err := cli.AddIndex(ds.Name, ds.IndexURL); err != nil {
		return err
	}
	if err := cli.Mount(jobID, ds.Name, int64(progress), batchSize, dpSize, seed, noShuffle); err != nil {
		return err
	}
	var s string
	// NOTE(review): the value fetched by GetRoot is discarded; presumably
	// the call is made for its side effect or as a liveness check — confirm.
	if err := cli.GetRoot(&s); err != nil {
		return err
	}
	return nil
}
25 |
26 | func AddDataset(dpSize, progress int, jobConf *job.JobConfig) error {
27 | for _, host := range jobConf.Cluster.Hosts {
28 | cli, err := mlfs.NewClientTo(host, jobConf.MLFSPort)
29 | if err != nil {
30 | return fmt.Errorf("%s %v", host, err)
31 | }
32 | if err := mount(cli, jobConf.Dataset, jobConf.ID, jobConf.BatchSize, progress, dpSize, jobConf.Seed, jobConf.NoShuffle); err != nil {
33 | return fmt.Errorf("%s %v", host, err)
34 | }
35 | log.Printf("Dataset added: host %s, batch size %d, progress %d, DP size %d", host, jobConf.BatchSize, progress, dpSize)
36 | }
37 | return nil
38 | }
39 |
--------------------------------------------------------------------------------
/mlfs/iseq/iseq.go:
--------------------------------------------------------------------------------
1 | package iseq
2 |
3 | import (
4 | "math/rand"
5 | )
6 |
// ISeq is a consumable sequence of ints: Take removes elements from the
// front; Shard yields one of m contiguous partitions.
type ISeq struct {
	seq []int
}

// Seq wraps s in an ISeq. The slice is NOT copied; the caller must not
// mutate s afterwards.
func Seq(s []int) ISeq {
	return ISeq{s}
}

// Empty reports whether all elements have been consumed.
func (is *ISeq) Empty() bool {
	return len(is.seq) == 0
}

// Take removes up to n elements from the front and returns them as a new
// ISeq; the receiver keeps the remainder.
func (is *ISeq) Take(n int) ISeq {
	a, b := split(n, is.seq)
	is.seq = b
	return ISeq{a}
}

// Shard returns the i-th of m near-equal contiguous partitions of the
// remaining elements (the last shard may be shorter). The receiver is not
// modified.
func (is *ISeq) Shard(i, m int) ISeq {
	k := ceilDiv(len(is.seq), m)
	a := i * k
	b := min(a+k, len(is.seq))
	return ISeq{seq: is.seq[a:b]}
}

// Len returns the number of remaining elements.
func (is *ISeq) Len() int {
	return len(is.seq)
}

// Get returns the remaining elements. Note: s[:] re-slices rather than
// copies, so the result aliases the internal buffer.
func (is *ISeq) Get() []int {
	return is.seq[:]
}
39 |
// Iota returns the slice [0, 1, ..., n-1].
func Iota(n int) []int {
	out := make([]int, n)
	for i := 0; i < n; i++ {
		out[i] = i
	}
	return out
}
47 |
// Shuffle permutes s in place with a PRNG seeded from seed, so the same
// seed always produces the same permutation.
func Shuffle(s []int, seed int) {
	rng := rand.New(rand.NewSource(int64(seed)))
	swap := func(i, j int) { s[i], s[j] = s[j], s[i] }
	rng.Shuffle(len(s), swap)
}
54 |
// split cuts s after its first n elements, returning (head, tail). When n
// covers the whole slice, the tail is nil. Both results alias s.
func split(n int, s []int) ([]int, []int) {
	if n >= len(s) {
		return s, nil
	}
	head, tail := s[:n], s[n:]
	return head, tail
}
61 |
// ceilDiv returns a/b rounded toward the next integer when there is a
// remainder.
func ceilDiv(a, b int) int {
	q, r := a/b, a%b
	if r == 0 {
		return q
	}
	return q + 1
}
68 |
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
75 |
--------------------------------------------------------------------------------
/mlfs/cache/memory.go:
--------------------------------------------------------------------------------
1 | package cache
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | "sync"
7 | "sync/atomic"
8 |
9 | "github.com/kungfu-team/tenplex/mlfs/vfs"
10 | )
11 |
// memcached wraps a vfs.FileNode and, once Cache has been called, serves
// the file's entire content from an in-memory copy.
type memcached struct {
	f vfs.FileNode // the underlying file

	cached int32      // 1 when bs holds a valid copy; accessed atomically
	bs     []byte     // in-memory copy of the file content
	mu     sync.Mutex // serializes Cache/Uncache transitions of bs
}
19 |
// Memcache wraps f in an (initially uncached) memcached node; call Cache
// to populate the in-memory copy.
func Memcache(f vfs.FileNode) *memcached {
	return &memcached{f: f}
}
23 |
// isCached reports whether the in-memory copy is currently valid.
func (f *memcached) isCached() bool {
	return atomic.LoadInt32(&f.cached) > 0
}
27 |
28 | func (f *memcached) Size() int64 { return f.f.Size() }
29 |
// Open returns a reader over the cached bytes when available, otherwise a
// reader on the underlying file. A reader handed out before Uncache keeps
// its own reference to the old slice, so it remains readable.
func (f *memcached) Open() io.ReadCloser {
	if f.isCached() {
		return io.NopCloser(bytes.NewReader(f.bs))
	}
	return f.f.Open()
}
36 |
// ReadAt reads from the cached bytes when available, otherwise from the
// underlying file.
func (f *memcached) ReadAt(buf []byte, pos int64) (int, error) {
	if f.isCached() {
		br := bytes.NewReader(f.bs)
		return br.ReadAt(buf, pos)
	}
	return f.f.ReadAt(buf, pos)
}
44 |
// Cache loads the whole file into memory. Concurrent callers serialize on
// mu; the cached flag is only set once bs is fully populated. A read
// failure leaves the node uncached (best effort; the error is dropped).
func (f *memcached) Cache() {
	if f.isCached() {
		return
	}
	f.mu.Lock()
	defer f.mu.Unlock()
	r := f.f.Open()
	bs, err := io.ReadAll(r)
	r.Close()
	if err == nil {
		f.bs = bs
		atomic.StoreInt32(&f.cached, 1)
	}
}
59 |
// Uncache drops the in-memory copy. The flag is cleared first so new
// readers fall through to the underlying file before bs is released.
func (f *memcached) Uncache() {
	atomic.StoreInt32(&f.cached, 0)
	f.mu.Lock()
	defer f.mu.Unlock()
	f.bs = nil
}
66 |
--------------------------------------------------------------------------------
/scheduler/experiments/experiments.go:
--------------------------------------------------------------------------------
1 | package experiments
2 |
3 | import (
4 | "fmt"
5 | "log"
6 | "sync"
7 | )
8 |
// Setup describes a named group of worker VMs and caches their private and
// public IP addresses.
type Setup struct {
	Prefix   string // VM name prefix; instances are Prefix + two-digit index
	NWorkers int    // number of worker VMs
	Group    string // resource group passed to the az cli lookups

	IPs    []string // private IPs, filled by GetIPs
	PubIPs []string // public IPs, filled by GetIPs (falls back to private)
}
17 |
// NewSetup builds a Setup and eagerly resolves all worker IP addresses.
func NewSetup(p string, n int, g string) *Setup {
	s := &Setup{
		Prefix:   p,
		NWorkers: n,
		Group:    g,
	}
	s.GetIPs()
	return s
}
27 |
28 | func (s Setup) Names() []string {
29 | var names []string
30 | for i := 0; i < s.NWorkers; i++ {
31 | names = append(names, fmt.Sprintf("%s%02d", s.Prefix, i+1))
32 | }
33 | return names
34 | }
35 |
// GetIPs resolves the private and public IP of every worker concurrently
// and stores the results in s.IPs / s.PubIPs. Workers without a public IP
// fall back to their private one.
func (s *Setup) GetIPs() {
	names := s.Names()
	ips := make([]string, len(names))
	pubIPs := make([]string, len(names))
	{
		// Two lookups per worker, all in flight at once.
		var wg sync.WaitGroup
		for i := range names {
			wg.Add(1)
			go func(i int) {
				pubIPs[i] = getPubIP(names[i], s.Group)
				wg.Done()
			}(i)
			wg.Add(1)
			go func(i int) {
				ips[i] = getIP(names[i], s.Group)
				wg.Done()
			}(i)
		}
		wg.Wait()
	}
	// Fall back to the private IP when no public IP was found.
	for i, name := range names {
		if len(pubIPs[i]) == 0 {
			pubIPs[i] = ips[i]
		}
		log.Printf("public IP of %s: %s", name, pubIPs[i])
	}
	// Publish the results on the receiver.
	s.IPs = ips
	s.PubIPs = pubIPs
}
67 |
--------------------------------------------------------------------------------
/para_config/deepspeed/layer_map.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import re
4 |
5 |
def main():
    """Scan a DeepSpeed checkpoint directory layout and write layer_map.json.

    For every rank directory, the layer-state file names are matched to
    recover the tensor-parallel rank and the list of layer numbers stored
    there; the mapping is dumped as JSON next to the rank directories.
    """
    framework = 'deepspeed'
    pp_size = 2
    mp_size = 1
    dp_size = 2
    total_size = pp_size * mp_size * dp_size
    model_size = 'medium'
    direc = f'{framework}/gpt-2/{model_size}/pp{pp_size:02d}/mp{mp_size:02d}/dp{dp_size:02d}'

    mapping = dict()

    # File names look like "layer_03-model_00-model_states.json". The dot
    # before "json" is escaped: an unescaped '.' would match any character.
    # The pattern is compiled once, outside the loops.
    pattern = re.compile(r'layer_(\d+)-model_(\d+)-model_states\.json')

    for rank in range(total_size):
        rank_dir = os.path.join(direc, f'rank{rank:02d}')
        if not os.path.exists(rank_dir):
            continue

        layer_numbers = []
        tp_rank = None
        for entry in os.scandir(rank_dir):
            mat = pattern.match(entry.name)
            if mat is None:
                continue
            layer_numbers.append(int(mat.group(1)))
            tp_rank = int(mat.group(2))

        if layer_numbers:
            mapping[rank] = {
                'tp_rank': tp_rank,
                'layer_numbers': layer_numbers
            }

    with open(f'{direc}/layer_map.json', 'w') as json_file:
        json.dump(mapping, json_file, indent=4)
41 |
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------
/mlfs/cmd/mlfs-edit-index/mlfs-edit-index.go:
--------------------------------------------------------------------------------
1 | /*
2 | e.g.
3 |
4 | mlfs-edit-index -index-url imagenet.idx.txt -o sub-imagenet.idx.txt -take 128 -from https://minddata.blob.core.windows.net -to https://tenplex.blob.core.windows.net
5 | */
6 | package main
7 |
8 | import (
9 | "flag"
10 | "log"
11 | "strings"
12 |
13 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
14 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
15 | )
16 |
var (
	// idxFile is the URL or path of the input index file.
	idxFile = flag.String("index-url", "", "")
	// output is the path the edited index is written to.
	output = flag.String("o", "", "")

	// take keeps only the first N index entries when > 0.
	take = flag.Int(`take`, 0, ``)
	// localize strips scheme and host, turning entries into local paths.
	localize = flag.Bool(`localize`, false, ``)
	// replaceFrom/replaceTo rewrite the first occurrence of a URL prefix.
	replaceFrom = flag.String(`from`, ``, ``)
	replaceTo   = flag.String(`to`, ``, ``)
)
26 |
// main delegates to mlfs.Main, which runs Main and handles its error.
func main() { mlfs.Main(Main) }
28 |
29 | func Main() error {
30 | log.Printf("loading from %q", *idxFile)
31 | idx, err := vfile.LoadIdxFile(*idxFile)
32 | if err != nil {
33 | return err
34 | }
35 | if *take > 0 {
36 | idx = idx[:*take]
37 | }
38 | if *localize {
39 | idx.SetHost(``)
40 | } else if len(*replaceFrom) > 0 {
41 | replaceURL(idx, *replaceFrom, *replaceTo)
42 | }
43 | log.Printf("saving to %q", *output)
44 | return vfile.SaveIdxFile(*output, idx)
45 | }
46 |
47 | func replaceURL(fs vfile.IndexedFiles, from, to string) {
48 | for i := range fs {
49 | fs[i].Filepath = strings.Replace(fs[i].Filepath, from, to, 1)
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/scheduler/configserver/configserver.go:
--------------------------------------------------------------------------------
1 | package configserver
2 |
3 | import (
4 | "context"
5 | "fmt"
6 | "net"
7 | "net/http"
8 | "strconv"
9 |
10 | kfcs "github.com/lsds/KungFu/srcs/go/kungfu/elastic/configserver"
11 | "github.com/lsds/KungFu/srcs/go/log"
12 | )
13 |
14 | func RunBuiltinConfigServer(port int) {
15 | const endpoint = `/config`
16 | addr := net.JoinHostPort("", strconv.Itoa(port))
17 | log.Infof("running builtin config server listening %s%s", addr, endpoint)
18 | _, cancel := context.WithCancel(context.TODO())
19 | defer cancel()
20 | srv := &http.Server{
21 | Addr: addr,
22 | Handler: logRequest(kfcs.New(cancel, nil, endpoint)),
23 | }
24 | srv.SetKeepAlivesEnabled(false)
25 | if err := srv.ListenAndServe(); err != nil {
26 | log.Errorf("config server stopped: %v", err)
27 | }
28 | }
29 |
30 | func logRequest(h http.Handler) http.Handler {
31 | return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
32 | log.Debugf("%s %s from %s, UA: %s", req.Method, req.URL.Path, req.RemoteAddr, req.UserAgent())
33 | h.ServeHTTP(w, req)
34 | })
35 | }
36 |
// Stop asks the config server on localhost:port to shut down by calling
// its /stop endpoint.
func Stop(port int) error {
	resp, err := http.Get(fmt.Sprintf("http://localhost:%d/stop", port))
	if err != nil {
		return err
	}
	// Close the body so the underlying connection is released; the
	// original leaked it.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("stop failed, status code: %d", resp.StatusCode)
	}
	return nil
}
47 |
--------------------------------------------------------------------------------
/para_config/megatron_lm/rank_map.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 |
def gen_rank_map(
    framework: str,
    model: str,
    model_size: str,
    precision: str,
    pp_size: int,
    tp_size: int,
    dp_size: int,
    job_dir: str,
    repo: str,
):
    """Collect the (pp, tp, dp) coordinates written by every training rank
    and save them as rank_map.json under the parallelism-specific output
    directory inside `repo`.

    For each rank, every node directory is scanned for
    {job_dir}/{node}/ckpt/{rank}/rank_{rank:02d}.json; when several nodes
    have the file, the last one scanned wins (unchanged behavior).
    """
    size = pp_size * tp_size * dp_size
    out_dir = os.path.join(repo, f"{framework}/{precision}/{model}/{model_size}")
    out_dir = os.path.join(out_dir, f"pp{pp_size:02d}/mp{tp_size:02d}/dp{dp_size:02d}")
    gpus_container = 4  # GPUs per node/container -- TODO: confirm; was hard-coded
    # Round up so deployments smaller than one full node still scan node 0;
    # the original floor division yielded 0 nodes for size < 4.
    num_nodes = -(-size // gpus_container)

    mapping = {}

    for rank in range(size):
        for node in range(num_nodes):
            rank_path = os.path.join(
                job_dir, f"{node}/ckpt/{rank}/rank_{rank:02d}.json"
            )

            if not os.path.exists(rank_path):
                continue

            with open(rank_path, "r", encoding="utf-8") as rank_file:
                ranks = json.load(rank_file)

            mapping[rank] = {
                "pp_rank": ranks["pp"],
                "mp_rank": ranks["tp"],
                "dp_rank": ranks["dp"],
            }

    # Create the output directory first; writing previously failed when it
    # did not already exist.
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "rank_map.json")
    with open(path, "w", encoding="utf-8") as json_file:
        json.dump(mapping, json_file, indent=4)
45 |
--------------------------------------------------------------------------------
/ansible/install.yml:
--------------------------------------------------------------------------------
# Tasks to install/upgrade mlfs from the tenplex apt repository and
# (re)start its systemd service.

- name: add apt sources list
  become: true
  ansible.builtin.shell:
    cmd: echo "deb https://europe-west2-apt.pkg.dev/projects/tenplex tenplex main" | sudo tee /etc/apt/sources.list.d/tenplex.list
  register: log

- name: add apt key
  become: true
  ansible.builtin.shell:
    cmd: curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/packages-cloud-google-apt.gpg >/dev/null
  register: log

- name: apt update
  become: true
  ansible.builtin.shell:
    cmd: apt update
  register: log

# - name: Update apt repository cache
#   ansible.builtin.apt:
#     update_cache: yes

# Stop the running service and remove the old SAS token before upgrading;
# errors are ignored so a fresh install (nothing to stop) still proceeds.
- name: stop mlfs
  become: true
  ignore_errors: yes
  ansible.builtin.shell:
    cmd: |
      rm /etc/mlfs/tenplex.sas
      systemctl stop mlfs

  register: log
- name: Install a list of packages
  become: true
  ansible.builtin.apt:
    pkg:
      - fuse3
      - mlfs

- name: reload mlfs
  become: true
  ignore_errors: yes
  ansible.builtin.shell:
    cmd: |
      systemctl daemon-reload
      systemctl restart mlfs
  register: log

# Sanity check: print the installed mlfs info.
- name: show info
  # command: mlfs info
  ansible.builtin.shell:
    cmd: mlfs info
  register: log
--------------------------------------------------------------------------------
/mlfs/prefetch.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Prefetch ImageNet record shards into the local mlfs daemon, verifying
# each shard against the md5 listed in the index file.

set -e

make

SA=minddata
# Shared-access signature for the storage account, kept under ~/.az.
SAS=$(cat $HOME/.az/$SA.sas)

URL=https://$SA.blob.core.windows.net/data/imagenet/imagenet.md5.txt?$SAS

# wget -O imagenet.md5.txt $URL

# prefetch <md5> <shard>: ask the local daemon (ctrl port 20000) to fetch
# one record shard and check it against the given md5.
prefetch() {
    local md5=$1
    local URL="https://minddata.blob.core.windows.net/data/imagenet/records/$2"

    ./bin/mlfs-fetch -ctrl-port 20000 -file $URL -md5 $md5
}

# prefetch_idx_file <file>: fetch every "<md5> <filename>" line of the index.
prefetch_idx_file() {
    local idx_file=$1
    cat $idx_file | while read line; do
        local md5=$(echo $line | awk '{print $1}')
        local filename=$(echo $line | awk '{print $2}')
        prefetch $md5 $filename
    done
}

prefetch_idx_file imagenet.md5.txt

# prefetch 8c7f3aa5f4f227f261717028d6c76c6e train-00000-of-01024
# prefetch 99943ca2bd3c48baa633a2f4ee805f6c train-00001-of-01024
# prefetch c117e44c7f83b80ebfbbddf990773b8a train-00002-of-01024
# prefetch 47644a7c6c924358e207cba2f2c51727 train-00003-of-01024
# prefetch c733217f52e73fd72f6566c9569d2d40 train-00004-of-01024
# prefetch 05170c43f2c4be60b46c391d98b52481 train-00005-of-01024
# prefetch 190dbbfdd581623a1a90835bb9a23460 train-00006-of-01024
# prefetch 0663659d61497f6546e90abcf8b1e08d train-00007-of-01024

# https://minddata.blob.core.windows.net/data/imagenet/records/train-01001-of-01024 1251
--------------------------------------------------------------------------------
/scheduler/cmd/tenplex-user/tenplex-user.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "log"
6 | "net"
7 | "os"
8 | "path"
9 | "time"
10 |
11 | "github.com/kungfu-team/tenplex/scheduler/experiments/fakeuser"
12 | "github.com/kungfu-team/tenplex/scheduler/logging"
13 | "github.com/lgarithm/go/tr"
14 | )
15 |
// main parses flags into a fake-user driver and runs either a single
// timed job, a sequence of plans, or the deprecated legacy Run loop.
func main() {
	prog := path.Base(os.Args[0])
	logging.SetupLogger(prog)
	// NOTE(review): tr.Patient presumably emits a trace/warning when the
	// process outlives 30s — confirm against the tr package.
	defer tr.Patient(prog, 30*time.Second).Done()
	var u fakeuser.User
	u.RegisterFlags(flag.CommandLine)
	flag.Parse()
	// u.Hosts = resolveHosts(*hosts) // TODO: make it work
	if len(u.PlansFile) > 0 {
		if u.SingleTimedJob {
			if err := u.RunSingleJob(); err != nil {
				log.Panic(err)
			}
		} else {
			if err := u.RunPlans(); err != nil {
				log.Panic(err)
			}
		}
	} else {
		log.Printf("! using deprecated Run")
		u.Run()
	}
}
39 |
40 | func resolveHosts(hosts []string) []string {
41 | var ips []string
42 | for i, h := range hosts {
43 | ip := resolve(h)
44 | log.Printf("#%d : %s -> %s", i, h, ip)
45 | ips = append(ips, ip)
46 | }
47 | return hosts
48 | }
49 |
// resolve returns the first address the resolver reports for h, or h
// itself when the lookup fails or yields nothing.
// TODO: doesn't work for self.
func resolve(h string) string {
	addrs, err := net.LookupHost(h)
	if err != nil || len(addrs) == 0 {
		return h
	}
	log.Printf("%s", addrs[0])
	return addrs[0]
}
61 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Build configuration; PREFIX, GO, CUDA and BIN_DIR may be overridden
# from the environment.
PREFIX := $(if $(PREFIX),$(PREFIX),$(HOME)/local)
WHICH_GO := $(shell which go)
# Fall back to a user-local Go toolchain when `go` is not on PATH.
DEFAULT_GO := $(if $(WHICH_GO),$(WHICH_GO),$(HOME)/local/go/bin/go)
GO := $(if $(GO),$(GO),$(DEFAULT_GO))
# Detect CUDA via the first NVIDIA device node.
CUDA := $(if $(CUDA),$(CUDA),$(shell [ -c /dev/nvidia0 ] && echo cuda))
# BIN_DIR := $(if $(BIN_DIR),$(BIN_DIR),$(HOME)/.tenplex/bin)
BIN_DIR := $(CURDIR)/bin

# Embed build metadata into the mlfs/buildinfo package via -ldflags -X.
GO_MOD := $(shell ./show-go-mod.sh)
buildinfo := $(GO_MOD)/mlfs/buildinfo
LDFLAGS += -X $(buildinfo).BuildHost=$(shell hostname)
LDFLAGS += -X $(buildinfo).BuildTimestamp=$(shell date +%s)
LDFLAGS += -X $(buildinfo).GitCommit=$(shell git rev-list -1 HEAD)
LDFLAGS += -X $(buildinfo).GitBranch=$(shell git rev-parse --abbrev-ref HEAD)
LDFLAGS += -X $(buildinfo).GitRev=$(shell git rev-list --count HEAD)

default: binaries test

# Build all binaries into ./bin.
binaries: bin
	GOBIN=$(PWD)/bin $(GO) install -ldflags "$(LDFLAGS)" -v ./...

# Install into the default GOBIN.
install:
	$(GO) install -ldflags "$(LDFLAGS)" -v ./...

test:
	$(GO) test -v ./...

update:
	$(GO) get -u ./...

clean:
	$(GO) clean -v -cache ./...

tidy:
	$(GO) mod tidy

format:
	$(GO) fmt ./...

# Short aliases.
i: install


u: update tidy


t: test


bin:
	mkdir -p $(BIN_DIR)

# Build a Debian package from the compiled binaries.
deb: binaries
	./scripts/pack.sh

sys-install: deb
	sudo dpkg -i ./build/*.deb
--------------------------------------------------------------------------------
/mlfs/fsutil/fsutil.go:
--------------------------------------------------------------------------------
1 | package fsutil
2 |
3 | import (
4 | "errors"
5 | "io"
6 | "strconv"
7 | "strings"
8 |
9 | "github.com/kungfu-team/tenplex/mlfs/vfs"
10 | )
11 |
var (
	// errNodeNotExists reports that the requested path is absent from the tree.
	errNodeNotExists = errors.New("node not exist")
	// errNotFile reports that the path names a directory, not a file.
	errNotFile = errors.New("not a file")
)
16 |
17 | func ReadTextLines(r *vfs.Tree, p string) ([]string, error) {
18 | n, _, ok := r.Get(p)
19 | if !ok {
20 | return nil, errNodeNotExists
21 | }
22 | if n.IsDir() {
23 | return nil, errNotFile
24 | }
25 | bs, err := io.ReadAll(n.AsFile().Open())
26 | if err != nil {
27 | return nil, err
28 | }
29 | var lines []string
30 | for _, line := range strings.Split(string(bs), "\n") {
31 | line = strings.TrimSpace(line)
32 | if len(line) > 0 {
33 | lines = append(lines, line)
34 | }
35 | }
36 | return lines, nil
37 | }
38 |
39 | func ReadIntLines(r *vfs.Tree, p string) ([]int, error) {
40 | n, _, ok := r.Get(p)
41 | if !ok {
42 | return nil, errNodeNotExists
43 | }
44 | if n.IsDir() {
45 | return nil, errNotFile
46 | }
47 | bs, err := io.ReadAll(n.AsFile().Open())
48 | if err != nil {
49 | return nil, err
50 | }
51 | var xs []int
52 | for _, line := range strings.Split(string(bs), "\n") {
53 | line = strings.TrimSpace(line)
54 | if len(line) > 0 {
55 | x, err := strconv.Atoi(line)
56 | if err != nil {
57 | return xs, err
58 | }
59 | xs = append(xs, x)
60 | }
61 | }
62 | return xs, nil
63 | }
64 |
--------------------------------------------------------------------------------
/mlfs/README:
--------------------------------------------------------------------------------
1 | elastic filesystem
2 |
3 | https://github.com/kungfu-team/mlfs
4 |
5 |
6 | Code Structure
7 |
8 | ./cmd - source for executable binaries
9 |
10 | ./vfs - a plain abstraction of FS
11 | ./vfs/hfs - expose vfs as HTTP endpoint
12 | ./vfs/ufs - expose vfs as FUSE endpoint
13 | ./vfs/vfile - builtin virtual file types
14 | ...
15 |
16 |
17 |
18 | ./mlfs - wraps vfs
19 |
20 | ./mfs - model fs // TODO?
21 | ??
22 | ??
23 |
24 | ./dsfs - dataset fs
25 | ./ds - dataset
26 | ./ds/trds - TFRecord dataset
27 |
28 | ./tfrecord - the TFRecord format
29 |
30 |
31 | ./iotrace - trace utils
32 |
33 |
34 | Golang requirements
35 | 1.18
36 |
37 |
38 | system install
39 | make deb
40 | sudo dpkg -i ./build/*.deb
41 | systemctl status mlfs
42 | sudo systemctl enable mlfs
43 | sudo systemctl start mlfs
44 | systemctl status mlfs
45 |
46 |
pre-built deb packages:
48 |
49 | https://tenplex.blob.core.windows.net/public/deb/Packages
50 |
51 |
52 | Install with apt
53 |
54 | echo 'deb https://tenplex.blob.core.windows.net/public/deb ./' | sudo tee /etc/apt/sources.list.d/tenplex.list
55 | curl -s https://tenplex.blob.core.windows.net/public/deb/tenplex.gpg | sudo apt-key add -
56 | sudo apt update
57 | sudo apt install -y mlfs
58 |
59 | Known Bugs
60 | r.OpenAt(88758): 503 Egress is over the account limit.
61 |
--------------------------------------------------------------------------------
/tests/test-tensor-file.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 |
5 | from tensor_file import query_tensor_file, read_tensor_file, upload_tensor
6 |
7 |
def test_1():
    """Smoke test: read tensor file 'a' and print its contents."""
    x = read_tensor_file('a')
    print(x)
11 |
12 |
def test_2():
    """Query tensor file 'a' with slice selectors.

    NOTE(review): `slice()` requires at least one argument, so this call
    raises TypeError if executed; `slice(None)` was probably intended
    (cf. test_query). The function is currently never called.
    """
    # x = T()
    # x[1:2, 3:4, :]
    x = query_tensor_file('a', [slice(), slice()])
    print(x)
18 |
19 |
# Login user; used to build the per-user MLFS mount path in test_upload_with.
usr = os.getenv('USER')
21 |
22 |
def test_upload_with(x, path):
    """Upload array x to `path`, read it back through the MLFS mount, and
    print both for manual comparison."""
    print(x)
    upload_tensor(path, x)
    mnt = f'/data/{usr}/mlfs'
    y = read_tensor_file(mnt + path)
    print(y)
    # assert (x .eq() y)
30 |
31 |
def test_upload():
    """Round-trip two small arrays of different dtypes through upload."""
    cases = [
        (np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32), '/x'),
        (np.array([[1, 2, 3], [7, 8, 9], [4, 5, 6]], dtype=np.int8), '/y'),
    ]
    for arr, dest in cases:
        test_upload_with(arr, dest)
38 |
39 |
def test_query():
    """Upload a 4x4 float32 tensor and exercise several slice queries."""
    data = list(range(16))
    x = np.array(data, dtype=np.float32).reshape((4, 4))
    test_upload_with(x, '/x')

    # Inner 2x2 block.
    y = query_tensor_file('/x', [slice(1, 3), slice(1, 3)])
    print(y)

    # Full tensor.
    y = query_tensor_file('/x', [slice(None), slice(None)])
    print(y)

    # Row range only.
    y = query_tensor_file('/x', [slice(1, 3)])
    print(y)

    # All rows, middle columns.
    y = query_tensor_file('/x', [slice(None), slice(1, 3)])
    print(y)


# test_1()
# test_2()

# test_upload()
test_query()
63 |
--------------------------------------------------------------------------------
/tests/test_load.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import copy
3 | import os
4 |
5 | import numpy as np
6 | import torch
7 |
8 | from tenplex.load import load
9 |
10 |
def traverse(value, keys=None):
    """Recursively walk nested dicts/sequences, printing the key path,
    type and shape of every torch.Tensor / np.ndarray leaf; other leaf
    values are silently ignored."""
    if keys is None:
        keys = []

    if isinstance(value, dict):
        for key, val in value.items():
            traverse(val, keys + [key])
        return
    if isinstance(value, (list, set, tuple)):
        for idx, val in enumerate(value):
            traverse(val, keys + [f'{idx}'])
        return

    if isinstance(value, torch.Tensor):
        arr = value.detach().cpu().numpy()
        print(f'{keys} is {type(arr)} and shape {value.shape}')
        return
    if isinstance(value, np.ndarray):
        print(f'{keys} is {type(value)} and shape {value.shape}')
        return
37 |
38 |
def main():
    """Load a checkpoint for a device rank from an MLFS path and print the
    structure (key paths, types, shapes) of every tensor it contains."""
    parser = argparse.ArgumentParser(description='Write checkpoint')
    parser.add_argument('--device-rank', type=int)
    parser.add_argument('--mlfs-path', type=str)
    args = parser.parse_args()

    ckpt = load(args.device_rank, args.mlfs_path)
    traverse(ckpt)
47 |
48 |
49 | if __name__ == '__main__':
50 | main()
51 |
--------------------------------------------------------------------------------
/tenplex-run/runop/failure.go:
--------------------------------------------------------------------------------
1 | package runop
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "log"
7 | "net/http"
8 | "net/url"
9 | "path"
10 | "time"
11 |
12 | "github.com/kungfu-team/tenplex/tenplex-run/job"
13 | )
14 |
// client is the shared HTTP client for failure injection; the short
// timeout keeps an unreachable host from stalling the experiment.
var client = http.Client{Timeout: 3 * time.Second}
16 |
17 | func simulateFailures(jc *job.JobConfig, n int) error {
18 | for i := 0; i < n; i++ {
19 | if err := simulateOneFailure(jc, i/jc.Cluster.GPUsPerHost, i); err != nil {
20 | log.Printf("%s failed: %v", "simulateOneFailure", err)
21 | return err
22 | }
23 | }
24 | return nil
25 | }
26 |
27 | func simulateOneFailure(jobConf *job.JobConfig, hostIdx int, workerID int) error {
28 | ip := jobConf.Cluster.Hosts[hostIdx]
29 | p := path.Join("/job", jobConf.ID, "save", str(workerID))
30 | log.Printf("simulateOneFailure by del: %s from [%d]=%s", p, hostIdx, ip)
31 | q := url.Values{}
32 | q.Set(`path`, p)
33 | u := url.URL{
34 | Scheme: `http`,
35 | Host: fmt.Sprintf("%s:%d", ip, jobConf.MLFSPort),
36 | Path: "/delete1",
37 | RawQuery: q.Encode(),
38 | }
39 | req, err := http.NewRequest(http.MethodDelete, u.String(), nil)
40 | if err != nil {
41 | return err
42 | }
43 | resp, err := client.Do(req)
44 | if err != nil {
45 | return err
46 | }
47 | defer resp.Body.Close()
48 | bs, _ := io.ReadAll(resp.Body)
49 | if resp.StatusCode != http.StatusOK {
50 | return fmt.Errorf("delete target dir failed, status: %s, error: %s, url: %s", resp.Status, string(bs), u.String())
51 | }
52 | return nil
53 | }
54 |
--------------------------------------------------------------------------------
/tenplex-run/para_config/schedule.go:
--------------------------------------------------------------------------------
1 | package para_config
2 |
3 | import (
4 | "bytes"
5 | "encoding/json"
6 | "fmt"
7 | "log"
8 | "os"
9 | )
10 |
// ScalingPoint is one entry of a scaling schedule: the new total Size
// takes effect at the given step and/or wall-clock time trigger
// (whichever of the optional fields is set).
type ScalingPoint struct {
	Step *int `json:"step"`
	Time *int `json:"time"`
	Size int  `json:"size"`
}

// String renders whichever triggers are set, followed by the size.
func (s ScalingPoint) String() string {
	var b bytes.Buffer
	if s.Step != nil {
		fmt.Fprintf(&b, "step: %d", *s.Step)
	}
	if s.Time != nil {
		fmt.Fprintf(&b, "time: %d", *s.Time)
	}
	fmt.Fprintf(&b, ", size: %d", s.Size)
	return b.String()
}
28 |
// Empty is the zero parallelism configuration.
var Empty MultiDimensionalParallelism // Size == PPSize == MPSize == 0
30 |
31 | type Schedule []ScalingPoint
32 |
33 | func (s Schedule) String() string {
34 | buf := &bytes.Buffer{}
35 | for i, sp := range s {
36 | if i > 0 {
37 | fmt.Fprintf(buf, ", ")
38 | }
39 | fmt.Fprintf(buf, "%s", sp)
40 | }
41 | return `Schedule{` + buf.String() + `}`
42 | }
43 |
// GenSchedule loads a Schedule from scheduleFile, logging every scaling
// point; it panics when the file cannot be read or parsed.
func GenSchedule(scheduleFile string) Schedule {
	s, err := LoadScheduleFile(scheduleFile)
	if err != nil {
		log.Panicf("LoadScheduleFile: %v", err)
	}
	for i, p := range s {
		log.Printf("schedule[%d/%d]: %s", i+1, len(s), p)
	}
	return s
}
54 |
55 | func LoadScheduleFile(filename string) (Schedule, error) {
56 | f, err := os.Open(filename)
57 | if err != nil {
58 | return nil, err
59 | }
60 | var obj Schedule
61 | if err := json.NewDecoder(f).Decode(&obj); err != nil {
62 | return nil, err
63 | }
64 | return obj, nil
65 | }
66 |
--------------------------------------------------------------------------------
/tenplex-run/job/params_bert.go:
--------------------------------------------------------------------------------
1 | package job
2 |
3 | import "log"
4 |
// GenMegatronLMBERTCmd assembles the torchrun command line that launches
// Megatron-LM BERT pre-training for the given rank and host, combining
// distributed, model, logging, tenplex and miscellaneous flags.
func GenMegatronLMBERTCmd(c TrainingConfig, rank int, jobID string, host string, jConf *JobConfig) []string {
	cmd := []string{
		`torchrun`,
	}
	cmd = append(cmd, jConf.DistFlags(c, rank)...)
	cmd = append(cmd, `/workspace/Megatron-LM/pretrain_bert.py`)
	// Per-model-size transformer dimensions; TFSize(12, 768, 12) matches
	// BERT base and TFSize(24, 1024, 16) BERT large — presumably
	// (layers, hidden size, attention heads); confirm against TFSize.
	var sizes = map[string]TransformerSize{
		`base`:  TFSize(12, 768, 12),
		`large`: TFSize(24, 1024, 16),
	}
	bert_args := []string{
		`--seq-length`, str(1024), // default: 512
		`--max-position-embeddings`, str(1024), // default: 512
		`--lr`, `0.0001`,
		`--lr-decay-iters`, str(10000),
		`--train-iters`, str(10000),
		`--tenplex-train-iters`, str(c.TrainIters),
		`--min-lr`, `0.00001`,
		`--lr-warmup-fraction`, `0.01`,
		`--micro-batch-size`, str(jConf.MicroBatchSize), // default: 4
		`--global-batch-size`, str(jConf.BatchSize), // default: 32
		`--vocab-file`, `/workspace/Megatron-LM/vocab/bert-large-uncased-vocab.txt`,
		`--split`, `949,50,1`,
		`--data-path`, `/data/dataset/bert_text_sentence`,
		`--distributed-backend`, `nccl`,
	}
	if ts, ok := sizes[jConf.ModelSize]; ok {
		bert_args = append(bert_args, ts.ToPyArgs()...)
	} else {
		// An unknown model size is a configuration error; abort the run.
		log.Fatalf("Model size not matching %s", jConf.ModelSize)
	}
	cmd = append(cmd, bert_args...)
	cmd = append(cmd, jConf.LogFlags(c)...)
	cmd = append(cmd, jConf.TenplexFlags(c, host)...)
	cmd = append(cmd, jConf.OtherFlags(c)...)
	return cmd
}
42 |
--------------------------------------------------------------------------------
/mlfs/docker/ubuntu/1804/sources.list:
--------------------------------------------------------------------------------
1 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic main restricted
2 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic main restricted
3 |
4 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-updates main restricted
5 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-updates main restricted
6 |
7 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic universe
8 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic universe
9 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-updates universe
10 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-updates universe
11 |
12 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic multiverse
13 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic multiverse
14 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-updates multiverse
15 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-updates multiverse
16 |
17 | deb http://azure.archive.ubuntu.com/ubuntu/ bionic-backports main restricted universe multiverse
18 | # deb-src http://azure.archive.ubuntu.com/ubuntu/ bionic-backports main restricted universe multiverse
19 |
20 | deb http://security.ubuntu.com/ubuntu bionic-security main restricted
21 | # deb-src http://security.ubuntu.com/ubuntu bionic-security main restricted
22 | deb http://security.ubuntu.com/ubuntu bionic-security universe
23 | # deb-src http://security.ubuntu.com/ubuntu bionic-security universe
24 | deb http://security.ubuntu.com/ubuntu bionic-security multiverse
25 | # deb-src http://security.ubuntu.com/ubuntu bionic-security multiverse
26 |
--------------------------------------------------------------------------------
/mlfs/uri/stat.go:
--------------------------------------------------------------------------------
1 | package uri
2 |
3 | import (
4 | "errors"
5 | "net/http"
6 | "net/url"
7 | "os"
8 | "strconv"
9 | )
10 |
// Info holds metadata about a local or remote resource.
type Info struct {
	// Size in bytes; -1 when the size is unknown (no Content-Length).
	Size int64
}
14 |
// Stat parses uri and returns size information for the resource it
// names, dispatching on the URL scheme.
func (o *Opener) Stat(uri string) (*Info, error) {
	u, err := url.Parse(uri)
	if err != nil {
		return nil, err
	}
	return o.stat(*u)
}
22 |
23 | func (o *Opener) stat(u url.URL) (*Info, error) {
24 | switch u.Scheme {
25 | case "http":
26 | return o.statHTTP(u)
27 | case "https":
28 | return o.statHTTP(o.addAzureCreds(u))
29 | case "file":
30 | return statFile(u.Path)
31 | case "":
32 | return statFile(u.Path)
33 | }
34 | return nil, errUnsupportedURLScheme
35 | }
36 |
37 | func (o *Opener) statHTTP(u url.URL) (*Info, error) {
38 | req, err := http.NewRequest(http.MethodHead, u.String(), nil)
39 | if err != nil {
40 | return nil, err
41 | }
42 | resp, err := o.client.Do(req)
43 | if err != nil {
44 | return nil, err
45 | }
46 | defer resp.Body.Close()
47 | if resp.StatusCode != http.StatusOK {
48 | return nil, errors.New(resp.Status)
49 | }
50 | if cl := resp.Header.Get(`Content-Length`); len(cl) > 0 {
51 | n, err := strconv.ParseInt(cl, 10, 64)
52 | if err != nil {
53 | return nil, err
54 | }
55 | return &Info{Size: n}, nil
56 | }
57 | return &Info{Size: -1}, nil
58 | }
59 |
60 | func statFile(name string) (*Info, error) {
61 | info, err := os.Stat(name)
62 | if err != nil {
63 | return nil, err
64 | }
65 | return &Info{Size: info.Size()}, nil
66 | }
67 |
// Stat reports resource info for uri using the package-level default opener.
func Stat(uri string) (*Info, error) {
	return opener.Stat(uri)
}
71 |
--------------------------------------------------------------------------------
/tenplex-run/scripts/read-zero-model-state.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 |
5 |
def show_pt(o, p='/'):
    """Recursively print the key path and type of every non-dict leaf in a
    nested checkpoint dict."""
    if not isinstance(o, dict):
        print('{} :: {}'.format(p, o.__class__))
        return
    for key in sorted(o):
        show_pt(o[key], p + '/' + key)
12 |
13 |
def show_dict(d):
    """Print each key (padded to 32 chars) with the type of its value,
    in sorted key order."""
    for key in sorted(d):
        print('{:32} :: {}'.format(key, d[key].__class__))
17 |
18 |
def read_pt_file(filename):
    """Load a .pt checkpoint onto the CPU and print its nested structure."""
    print(filename)
    d = torch.load(filename, map_location=torch.device('cpu'))
    show_pt(d)

    # show_dict(d)

    # print('{} buffer_names'.format(len(d['buffer_names'])))
    # for i, s in enumerate(d['buffer_names']):
    #     print('{:6} {}'.format(i, s))
    # print('')

    # print('lr_scheduler:')
    # show_dict(d['lr_scheduler'])
    # print('')

    # print('module:')
    # show_dict(d['module'])
    # print('')

    # print('language_model:')
    # show_dict(d['module']['language_model'])
    # print('')

    # print('embedding:')
    # show_dict(d['module']['language_model']['embedding'])
    # print('')
    # print('transformer:')
    # show_dict(d['module']['language_model']['transformer'])
    # print('')

    #
    # optimizer_state_dict
    # param_shapes
    # ds_config
    # ds_version
55 |
56 |
def main(args):
    """Print the structure of every checkpoint file named in args."""
    for filename in args:
        read_pt_file(filename)
        print('')


main(sys.argv[1:])
64 |
--------------------------------------------------------------------------------
/mlfs/mlfs/replicate.go:
--------------------------------------------------------------------------------
1 | package mlfs
2 |
3 | import (
4 | "bytes"
5 | "io"
6 | "net/http"
7 | "net/url"
8 | )
9 |
const (
	// X_Replica is the request header set on forwarded replica copies;
	// replicated() skips re-replication when it carries a non-zero value,
	// preventing forwarding loops.
	X_Replica = `x-replica`
)
13 |
// replicateRequest buffers the request body and re-sends the request to
// redundency+1 consecutive peers, starting at this node's own rank, so
// the write is stored on multiple replicas.
// NOTE(review): the first target is s.e.peers[s.e.rank] itself —
// presumably that entry routes back to this server; confirm.
func (s *webUI) replicateRequest(w http.ResponseWriter, r *http.Request) error {
	log.Printf("replicate RawQuery: %q", r.URL.RawQuery)
	// Buffer the whole body so it can be replayed for each peer.
	bs, err := io.ReadAll(r.Body)
	if err != nil {
		return err
	}
	log.Printf("body: %d bytes", len(bs))
	for i := 0; i < s.e.redundency+1; i++ {
		id := s.e.peers[(s.e.rank+i)%len(s.e.peers)]
		u := url.URL{
			// Scheme: r.URL.Scheme,// is empty
			Scheme:   `http`,
			Host:     id.String(),
			Path:     r.URL.Path,
			RawQuery: r.URL.RawQuery,
		}
		req, err := http.NewRequest(r.Method, u.String(), bytes.NewBuffer(bs))
		if err != nil {
			return err
		}
		// Copy all original headers onto the forwarded request.
		for k, vs := range r.Header {
			for _, v := range vs {
				req.Header.Add(k, v)
			}
		}
		// Mark the copy as a replica so the receiver handles it locally
		// instead of replicating again.
		req.Header.Set(X_Replica, str(s.e.redundency))
		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return err
		}
		resp.Body.Close()
	}
	return nil
}
48 |
49 | func (s *webUI) replicated(f http.HandlerFunc) http.HandlerFunc {
50 | return func(w http.ResponseWriter, req *http.Request) {
51 | if s.e.redundency > 0 && parseInt(req.Header.Get(X_Replica)) == 0 {
52 | if err := s.replicateRequest(w, req); err != nil {
53 | log.Printf("replicateRequest: %v", err)
54 | http.Error(w, err.Error(), http.StatusInternalServerError)
55 | }
56 | return
57 | }
58 | f(w, req)
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/mlfs/vfs/vfile/range.go:
--------------------------------------------------------------------------------
1 | package vfile
2 |
3 | import "net/url"
4 |
// Range denotes the half-open byte interval [Begin, End), Begin <= End.
type Range struct {
	Begin uint64
	End   uint64
}

// Len returns the number of bytes the interval covers.
func (r Range) Len() uint64 { return r.End - r.Begin }

// NamedRange pairs a Range with the name of the file it belongs to.
type NamedRange struct {
	Name  string
	Range Range
}

// IndexedRange pairs a Range with a numeric identifier.
type IndexedRange struct {
	ID    int
	Range Range
}

// Ranges is a list of byte ranges.
type Ranges []Range

// NamedRanges is a list of named byte ranges.
type NamedRanges []NamedRange

// IndexedFile records the indexed byte ranges of one file.
type IndexedFile struct {
	Filepath string
	Ranges   Ranges
}

// IndexedBytes sums the lengths of all ranges of the file.
func (f IndexedFile) IndexedBytes() uint64 {
	var total uint64
	for _, rg := range f.Ranges {
		total += rg.Len()
	}
	return total
}

// IndexedFiles is a list of indexed files.
type IndexedFiles []IndexedFile

// NumRange counts the ranges across all files.
func (i IndexedFiles) NumRange() int {
	total := 0
	for _, f := range i {
		total += len(f.Ranges)
	}
	return total
}

// NamedRanges flattens all files into a single list of named ranges,
// preserving file and range order.
func (i IndexedFiles) NamedRanges() NamedRanges {
	var out NamedRanges
	for _, f := range i {
		for _, rg := range f.Ranges {
			out = append(out, NamedRange{Name: f.Filepath, Range: rg})
		}
	}
	return out
}

// Select picks the ranges at the given positions, in the given order.
func (rs NamedRanges) Select(s []int) NamedRanges {
	var out NamedRanges
	for _, pos := range s {
		out = append(out, rs[pos])
	}
	return out
}

// SetHost rewrites the host of every file URL in place; an empty host
// also clears the scheme, turning the entry into a plain path.
// Filepaths that do not parse as URLs are left untouched.
func (idx IndexedFiles) SetHost(host string) {
	for i := range idx {
		u, err := url.Parse(idx[i].Filepath)
		if err != nil {
			continue
		}
		u.Host = host
		if u.Host == `` {
			u.Scheme = ``
		}
		idx[i].Filepath = u.String()
	}
}
83 |
--------------------------------------------------------------------------------
/scheduler/cmd/tenplex-scheduler/tenplex-scheduler.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "log"
6 | "os"
7 | "os/user"
8 | "path"
9 | "time"
10 |
11 | "github.com/kungfu-team/tenplex/ipv4"
12 | "github.com/kungfu-team/tenplex/scheduler/logging"
13 | "github.com/kungfu-team/tenplex/scheduler/scheduler"
14 | "github.com/lgarithm/go/tr"
15 | )
16 |
// main configures logging and the working directory (/run/tenplex for
// root, the user's home otherwise — only when started from "/", e.g.
// under systemd), parses flags, fills in the daemon's own IP when
// missing or when detection is requested, and runs the scheduler daemon.
func main() {
	prog := path.Base(os.Args[0])
	logging.SetupLogger(prog)
	// NOTE(review): tr.Patient presumably reports when the process
	// outlives 300s — confirm against the tr package.
	defer tr.Patient(prog, 300*time.Second).Done()
	var runDir = `/run/tenplex`
	user, err := user.Current()
	if err != nil {
		log.Panic(err)
	}
	log.Printf("user: %s", user.Username)
	if user.Username != `root` {
		runDir = user.HomeDir
	}
	// Only relocate when the process was launched with pwd == "/".
	if pwd, _ := os.Getwd(); pwd == `/` {
		if err := setupWorkDir(runDir); err != nil {
			log.Panic(err)
		}
	}
	logDirs()
	var d scheduler.Daemon
	d.RegisterFlags(flag.CommandLine)
	flag.Parse()
	if len(d.DetectIPv4) > 0 || len(d.SelfIP) == 0 {
		if d.SelfIP = detectIP(d.DetectIPv4); len(d.SelfIP) == 0 {
			log.Panic("self IP is empty")
		}
	}
	log.Printf("using self ip: %s", d.SelfIP)
	d.Run()
}
47 |
48 | var detectIP = ipv4.Detect
49 |
50 | func setupWorkDir(dir string) error {
51 | if err := os.MkdirAll(dir, os.ModePerm); err != nil {
52 | return err
53 | }
54 | if err := os.Chdir(dir); err != nil {
55 | return err
56 | }
57 | return nil
58 | }
59 |
60 | func logDirs() {
61 | pwd, _ := os.Getwd()
62 | log.Printf("pwd: %s", pwd)
63 | home, _ := os.UserHomeDir()
64 | log.Printf("home: %s", home)
65 | }
66 |
--------------------------------------------------------------------------------
/benchmark/performance_impact/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | set -e
3 |
4 | . $(dirname $0)/../common.sh
5 |
# hosts prints one worker IP address per line.
hosts() {
    printf '%s\n' \
        "10.10.10.1" \
        "10.10.10.2" \
        "10.10.10.3" \
        "10.10.10.4"
}
12 |
# model_sizes prints the model sizes to benchmark, one per line.
# ("6.7B" and "large" are currently disabled.)
model_sizes() {
    printf '%s\n' "2.7B" "xl"
    # disabled: printf '%s\n' "6.7B" "large"
}
19 |
# batch_sizes prints the global batch sizes to benchmark.
batch_sizes() {
    printf '%s\n' 128
}
23 |
# micro_batch_sizes prints the micro batch sizes to benchmark.
micro_batch_sizes() {
    printf '%s\n' 8
}
27 |
# mdp_sizes prints the model/data-parallel (MDP) sizes to benchmark.
mdp_sizes() {
    printf '%s\n' 16
}
31 |
# bert_flags emits the CLI flags selecting the BERT benchmark
# configuration: model, size, dataset and dataset index location.
bert_flags() {
    echo -model "bert"
    echo -model-sizes "large"

    echo -dataset "openwebtext"
    echo -index-url "/data/megatron-lm/bert/openwebtext/npzs_seq1024/indices.txt"
}
39 |
# gpt_flags emits the CLI flags selecting the GPT benchmark
# configuration: model, size, dataset and dataset index location.
gpt_flags() {
    echo -model "gpt"
    echo -model-sizes "2.7B"

    echo -dataset "enwiki"
    echo -index-url "/data/megatron-lm/gpt-2/enwiki/npzs_seq1024_new/indices.txt"
}
47 |
# comb_flags emits the flags describing the benchmark combination
# space: hosts and the batch/micro-batch/MDP size lists, each joined
# into a comma-separated value (join comes from common.sh).
comb_flags() {
    echo -hosts $(join $(hosts))

    echo -batch-sizes $(join $(batch_sizes))
    echo -micro-batch-sizes $(join $(micro_batch_sizes))

    echo -mdp-sizes $(join $(mdp_sizes))
}
56 |
# common_flags emits the flags shared by all benchmark runs:
# base_flags (from common.sh), the combination flags, and a timeout.
common_flags() {
    base_flags
    comb_flags
    echo -timeout 30
}
62 |
# run_bert runs the performance-impact benchmark for BERT.
run_bert() {
    tenplex-perf-impact $(common_flags) $(bert_flags)
}
66 |
# run_gpt runs the performance-impact benchmark for GPT.
run_gpt() {
    tenplex-perf-impact $(common_flags) $(gpt_flags)
}
70 |
# main runs both benchmark suites in sequence (set -e aborts the
# script if either fails).
main() {
    run_bert
    run_gpt
}
75 |
# with_nohup runs the given command detached from the terminal,
# redirecting stdout/stderr to out.log/err.log.
# "$@" (quoted) preserves arguments that contain whitespace; the
# original unquoted $@ re-split every argument.
# NOTE(review): currently unused by this script.
with_nohup() {
    nohup "$@" >out.log 2>err.log &
}
79 |
80 | main
81 |
82 | python extract.py
83 | python plot.py
84 |
--------------------------------------------------------------------------------
/benchmark/failure/plot.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 |
6 |
def main():
    """Plot recovery time under GPU failures as grouped bars
    (system vs. baseline) and save the figure to ./failure.pdf.
    """
    # NOTE(review): "Scalai" looks like a system label used in the
    # legend -- confirm this is the intended (anonymized?) name.
    sys = "Scalai"
    width = 0.3

    # Number of simultaneous GPU failures per experiment.
    failures = [4, 8, 12]

    # Recovery times in seconds per failure count; the *_rerun arrays
    # add a 360 s full-rerun penalty (the baseline always pays it,
    # the system only at 12 failures) -- TODO confirm provenance of
    # these hard-coded measurements.
    times = np.array([23.5, 19.6, 19.6])
    tenplex_rerun = np.array([0, 0, 360])
    times_tenplex = times + tenplex_rerun
    baseline_rerun = np.array([360, 360, 360])
    times_baseline = times + baseline_rerun

    plt.rcParams["hatch.linewidth"] = 3
    linewidth = 2

    plt.rc("figure", figsize=[8, 4.5])
    fig, ax = plt.subplots()

    x = np.arange(len(failures))

    # Outlined (fill=False) hatched bars, offset by 1.1*width so the
    # two groups sit side by side with a small gap.
    ax.bar(
        x,
        times_tenplex,
        width=width,
        label=sys,
        hatch="//",
        fill=False,
        edgecolor="tab:blue",
        linewidth=linewidth,
    )
    ax.bar(
        x + 1.1 * width,
        times_baseline,
        width=width,
        label="Baseline",
        hatch="--",
        fill=False,
        edgecolor="tab:orange",
        linewidth=linewidth,
    )

    fontsize = 26
    labelsize = 22
    ax.grid(axis="y")
    ax.set_axisbelow(True)
    ax.set_ylim(top=600)
    ax.tick_params(labelsize=labelsize)
    ax.set_xlabel("Number of GPU failures", fontsize=fontsize)
    ax.set_ylabel("Time in seconds", fontsize=fontsize)
    ax.legend(fontsize=labelsize)
    # Center the tick labels between the two bars of each group.
    ax.set_xticks(x + 0.5 * width, failures)

    fig.tight_layout()
    plt.savefig("./failure.pdf")
61 |
62 |
63 | if __name__ == "__main__":
64 | main()
65 |
--------------------------------------------------------------------------------
/mlfs/buildinfo/buildinfo.go:
--------------------------------------------------------------------------------
1 | package buildinfo
2 |
3 | import (
4 | "fmt"
5 | "io"
6 | "net/http"
7 | "strconv"
8 | "time"
9 | )
10 |
11 | var t0 = time.Now()
12 |
// BuildInfo carries build/version metadata for the running binary.
type BuildInfo struct {
	BuildTimestamp string // unix seconds as a decimal string; parsed by Parse
	BuildTime time.Time // parsed form of BuildTimestamp (zero if unparsable)
	BuildHost string // machine the binary was built on
	GitCommit string // git commit hash
	GitBranch string // git branch name
	GitRev string // git revision descriptor -- exact format set by build scripts
}
21 |
22 | func (i *BuildInfo) Parse() {
23 | if n, err := strconv.Atoi(i.BuildTimestamp); err == nil {
24 | i.BuildTime = time.Unix(int64(n), 0)
25 | }
26 | }
27 |
28 | func (i *BuildInfo) Show(w io.Writer) {
29 | fmt.Fprintf(w, "git branch: %s\n", i.GitBranch)
30 | fmt.Fprintf(w, "git commit: %s\n", i.GitCommit)
31 | fmt.Fprintf(w, "git rev: %s\n", i.GitRev)
32 | fmt.Fprintf(w, "build host: %s\n", i.BuildHost)
33 | if i.BuildTime.Unix() > 0 {
34 | fmt.Fprintf(w, "build age: %s\n", time.Since(i.BuildTime))
35 | } else {
36 | fmt.Fprintf(w, "build age: %s (%q)\n", `?`, i.BuildTimestamp)
37 | }
38 | fmt.Fprintf(w, "run age: %s\n", time.Since(t0))
39 | }
40 |
41 | func (i *BuildInfo) ServeHTTP(w http.ResponseWriter, r *http.Request) { i.Show(w) }
42 |
43 | var Default BuildInfo
44 |
45 | // func Set(i BuildInfo) {
46 | // Default = i
47 | // Default.Parse()
48 | // }
49 |
// These package-level strings are presumably injected at link time
// via `-ldflags "-X ..."` by the build scripts -- confirm; they feed
// Default in init.
var (
	BuildHost string
	BuildTimestamp string
	GitCommit string
	GitBranch string
	GitRev string
)
57 |
// init copies the link-time injected variables into Default and
// parses the timestamp so Default is ready before any caller runs.
func init() {
	Default = BuildInfo{
		BuildHost: BuildHost,
		BuildTimestamp: BuildTimestamp,
		GitCommit: GitCommit,
		GitBranch: GitBranch,
		GitRev: GitRev,
	}
	Default.Parse()
}
68 |
--------------------------------------------------------------------------------
/tenplex-run/scripts/read-zero-optimizer-state.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import torch
4 |
5 |
def show_dict(d):
    """Print each key of ``d`` with the class of its value, one per
    line, sorted by key (key left-padded to 32 characters)."""
    for key in sorted(d):
        print('{:32} :: {}'.format(key, d[key].__class__))
9 |
10 |
def read_pt_file(filename):
    """Load a DeepSpeed/ZeRO checkpoint on CPU and print the classes
    of its top-level entries.

    NOTE(review): the early ``return`` below makes everything after it
    dead code; it looks like exploratory/debugging snippets kept for
    reference.  Do not remove the return without confirming intent.
    """
    print(filename)
    d = torch.load(filename, map_location=torch.device('cpu'))
    show_dict(d)
    return
    # --- dead code below: kept exploratory dumps of checkpoint keys ---
    #
    # optimizer_state_dict
    # param_shapes
    # ds_config
    # ds_version
    print(d['ds_version'])  #0.5.9+d93d924
    # /workspace/DeepSpeedExamples/Megatron-LM-v1.1.5-ZeRO3/examples/ds_zero_stage_2_config.json
    print(d['ds_config'])
    print(d['param_shapes'].__class__)  # list
    for i, x in enumerate(d['param_shapes']):
        print(i)
        # print(x)
        for k, v in x.items():
            print(k)
            print(v)
            break
        print(len(x))

    # print(d['optimizer_state_dict'])
    print('optimizer_state_dict:')
    for k, v in d['optimizer_state_dict'].items():
        print(k)
        print(v.__class__)

    print('base_optimizer_state:')
    for s in d['optimizer_state_dict']['base_optimizer_state']:
        print(s)

    print('partition_count:')
    for s in d['optimizer_state_dict']['partition_count']:
        print(s)

    print('single_partition_of_fp32_groups:')
    for s in d['optimizer_state_dict']['single_partition_of_fp32_groups']:
        print(s)
51 |
52 |
def main(args):
    """Dump the checkpoint structure of every file path in ``args``."""
    for checkpoint_path in args:
        read_pt_file(checkpoint_path)
56 |
57 |
58 | main(sys.argv[1:])
59 |
--------------------------------------------------------------------------------
/mlfs/cmd/mlfs-download/mlfs-download.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "flag"
5 | "io"
6 | "log"
7 | "net/url"
8 | "os"
9 | "path"
10 | "strings"
11 | "time"
12 |
13 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
14 | "github.com/kungfu-team/tenplex/mlfs/uri"
15 | "github.com/kungfu-team/tenplex/mlfs/utils"
16 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
17 | )
18 |
var (
	// localRoot (-o) is the directory the downloads are mirrored
	// into; defaults to the current directory.
	localRoot = flag.String("o", ".", "")
	// idxFile (-index-url) is the URL or path of the index file
	// listing the files to download.  Usage strings are left empty.
	idxFile = flag.String("index-url", "", "")
)
23 |
24 | func main() { mlfs.Main(Main) }
25 |
// Main downloads every file listed in the -index-url index into the
// tree rooted at -o, mirroring each file's URL path locally, logging
// per-file progress and an ETA.  Stops at the first failure.
func Main() error {
	t0 := time.Now()
	fs, err := vfile.LoadIdxFile(*idxFile)
	if err != nil {
		return err
	}
	for i, f := range fs {
		// Local path mirrors the remote URL path under localRoot.
		localFile := path.Join(*localRoot, getRelPath(f.Filepath))
		if err := downloadOne(f.Filepath, localFile); err != nil {
			return err
		}
		log.Printf("downloaded %d/%d %s -> %s", i+1, len(fs), f.Filepath, localFile)
		utils.LogETA(t0, i+1, len(fs))
	}
	return nil
}
42 |
43 | func downloadOne(filepath, localFile string) error {
44 | r, err := uri.Open(filepath)
45 | if err != nil {
46 | return err
47 | }
48 | defer r.Close()
49 | if err := os.MkdirAll(path.Dir(localFile), os.ModePerm); err != nil {
50 | return err
51 | }
52 | w, err := os.Create(localFile)
53 | if err != nil {
54 | return err
55 | }
56 | defer w.Close()
57 | if _, err := io.Copy(w, r); err != nil {
58 | return err
59 | }
60 | return nil
61 | }
62 |
// getRelPath extracts the URL path component of filepath with its
// leading slash removed, so it can be joined under a local root.
// If the string does not parse as a URL it is returned unchanged.
func getRelPath(filepath string) string {
	u, err := url.Parse(filepath)
	if err != nil {
		return filepath
	}
	return strings.TrimPrefix(u.Path, `/`)
}
72 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | bazil.org/fuse v0.0.0-20230120002735-62a210ff1fd5 h1:A0NsYy4lDBZAC6QiYeJ4N+XuHIKBpyhAVRMHRQZKTeQ=
2 | bazil.org/fuse v0.0.0-20230120002735-62a210ff1fd5/go.mod h1:gG3RZAMXCa/OTes6rr9EwusmR1OH1tDDy+cg9c5YliY=
3 | github.com/lgarithm/go v0.0.0-20230108194319-abf8008ecd81 h1:GFGlLe+MAKlSJYw3peERfTyLbv86tzf8YLEL3AVSPu4=
4 | github.com/lgarithm/go v0.0.0-20230108194319-abf8008ecd81/go.mod h1:FA2Pf2Af/7iMNJEuHDI79ywTadX28XifXiqO4kkVSIc=
5 | github.com/lgarithm/proc v0.4.5-0.20240417004737-9b169ad5c322 h1:PnylIiY58FSTLvfbuuEgTSO5xEaLOGGAWrs2aG6T70w=
6 | github.com/lgarithm/proc v0.4.5-0.20240417004737-9b169ad5c322/go.mod h1:8Eqa3ExkUYuyj/GwamAMgH09IAO1p7TL29wpeaPgSNg=
7 | github.com/lsds/KungFu v0.2.5 h1:2SJ/PMTcvLLhBZABAbR760c+QzWrCF8qZUqK1xDJiqM=
8 | github.com/lsds/KungFu v0.2.5/go.mod h1:FdILPtKYV4/ShJ38H7WbDunoKQ8l3Q4mJckRfqVbJn4=
9 | github.com/tv42/httpunix v0.0.0-20191220191345-2ba4b9c3382c h1:u6SKchux2yDvFQnDHS3lPnIRmfVJ5Sxy3ao2SIdysLQ=
10 | golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30=
11 | golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
12 | golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f h1:99ci1mjWVBWwJiEKYY6jWa4d2nTQVIEhZIptnrVb1XY=
13 | golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f/go.mod h1:/lliqkxwWAhPjf5oSOIJup2XcqJaw8RGS6k3TGEc7GI=
14 | golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M=
15 | golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
16 | golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=
17 | golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
18 | golang.org/x/term v0.19.0 h1:+ThwsDv+tYfnJFhF4L8jITxu1tdTWRTZpdsWgEgjL6Q=
19 |
--------------------------------------------------------------------------------
/mlfs/cmd/mlfs-check-index/mlfs-check-index.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "errors"
5 | "flag"
6 | "fmt"
7 | "log"
8 | "net/url"
9 | "time"
10 |
11 | "github.com/kungfu-team/tenplex/mlfs/mlfs"
12 | "github.com/kungfu-team/tenplex/mlfs/uri"
13 | "github.com/kungfu-team/tenplex/mlfs/vfs/vfile"
14 | )
15 |
16 | // ./bin/mlfs-check-index $(cat tests/data/*.json | jq -r '."idx-url"')
17 |
18 | func main() { mlfs.Main(Main) }
19 |
20 | func Main() error {
21 | for _, f := range flag.Args() {
22 | if err := checkIndexFile(f); err != nil {
23 | return err
24 | }
25 | }
26 | return nil
27 | }
28 |
// checkIndexFile validates one index file: every referenced file must
// be stat-able and its reported size must equal the sum of its
// indexed ranges.  Files hosted on a different domain than the index
// itself are only warned about, not treated as errors.
func checkIndexFile(filename string) error {
	t0 := time.Now()
	defer func() { log.Printf("took %s", time.Since(t0)) }()
	fs, err := vfile.LoadIdxFile(filename)
	if err != nil {
		return err
	}
	domain := getDomain(filename)
	for i, f := range fs {
		log.Printf("checking %d/%d", i+1, len(fs))
		info, err := uri.Stat(f.Filepath)
		if err != nil {
			return err
		}
		// A negative size means the backend could not report one.
		if info.Size < 0 {
			return errCannotGetSize
		}
		if size := int64(f.IndexedBytes()); size != info.Size {
			return fmt.Errorf("%v: %d, expect %d", errUnexpectedSize, info.Size, size)
		}
		if getDomain(f.Filepath) != domain {
			log.Printf("%s file has different domain", f.Filepath)
		}
	}
	fmt.Printf("OK: %s\n", filename)
	return nil
}
56 |
// Sentinel errors returned by checkIndexFile.  Callers compare with
// errors.Is, so rewording the messages is safe.  The second message
// previously read "unexpected get size" (a typo).
var (
	errCannotGetSize  = errors.New(`can't get size`)
	errUnexpectedSize = errors.New(`unexpected size`)
)
61 |
// getDomain returns the host component of filepath when it parses as
// a URL, and "" otherwise (plain paths have an empty host).
func getDomain(filepath string) string {
	if u, err := url.Parse(filepath); err == nil {
		return u.Host
	}
	return ""
}
69 |
--------------------------------------------------------------------------------
/tensor/concat.go:
--------------------------------------------------------------------------------
1 | package tensor
2 |
3 | import (
4 | "fmt"
5 | )
6 |
7 | func copyCatDim(t *Tensor, tens []*Tensor) *Tensor {
8 | tIdx := 0
9 | for _, ten := range tens {
10 | for tenIdx := 0; tenIdx < ten.Dims[0]; tenIdx++ {
11 | x := t.Sub(tIdx)
12 | y := ten.Sub(tenIdx)
13 | copy(x.Data, y.Data)
14 | tIdx += 1
15 | }
16 |
17 | }
18 |
19 | return t
20 | }
21 |
// copyInto recursively copies the concatenation of tens along
// dimension dim into t.  Each recursion level peels off one leading
// dimension until dim reaches 0, where copyCatDim performs the actual
// element copies.  Returns t.
func copyInto(t *Tensor, tens []*Tensor, dim int) *Tensor {
	if dim == 0 {
		return copyCatDim(t, tens)
	}
	for i := 0; i < t.Dims[0]; i++ {
		// Concatenate the i-th slice of every input one level deeper.
		subTens := make([]*Tensor, len(tens))
		for j, ten := range tens {
			subTens[j] = ten.Sub(i)
		}
		x := t.Sub(i)
		copyInto(x, subTens, dim-1)
	}
	return t
}
36 |
37 | func Concat(tens []*Tensor, dim int) (*Tensor, error) {
38 | l := len(tens[0].Dims)
39 | if l <= dim {
40 | return nil, fmt.Errorf("dim %d larger tensor rank %d", dim, l)
41 | }
42 | for tenDim := range tens[0].Dims {
43 | dtypeZero := tens[0].Dtype
44 | sizeZero := tens[0].Dims[tenDim]
45 | for i := 1; i < len(tens); i++ {
46 | if tenDim != dim {
47 | size := tens[i].Dims[tenDim]
48 | if sizeZero != size {
49 | return nil, fmt.Errorf("dimension tenDim is unequal for tensors")
50 | }
51 | }
52 | if tens[i].Dtype != dtypeZero {
53 | return nil, fmt.Errorf("dtype of tensor %d is unequal", i)
54 | }
55 | }
56 | }
57 | newDimSize := 0
58 | for _, t := range tens {
59 | newDimSize = newDimSize + t.Dims[dim]
60 | }
61 | newDims := make([]int, len(tens[0].Dims))
62 | copy(newDims, tens[0].Dims)
63 | newDims[dim] = newDimSize
64 | newTen := New(tens[0].Dtype, newDims...)
65 | newTen = copyInto(newTen, tens, dim)
66 |
67 | return newTen, nil
68 | }
69 |
--------------------------------------------------------------------------------