├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── COMMITTERS.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bin └── heturun ├── cmake ├── Modules │ ├── FindCUB.cmake │ ├── FindCUDNN.cmake │ ├── FindMETIS.cmake │ ├── FindMKL.cmake │ ├── FindNCCL.cmake │ ├── FindTHRUST.cmake │ └── FindZMQ.cmake └── config.example.cmake ├── environment.yml ├── examples ├── auto_parallel │ ├── .gitignore │ ├── cnn │ │ ├── analyze.py │ │ ├── experiment_scripts │ │ │ ├── .gitignore │ │ │ ├── compare.py │ │ │ ├── gen_nooverlap_scripts.py │ │ │ ├── gen_pipeopt_scripts.py │ │ │ ├── parse.py │ │ │ └── w16.yml │ │ ├── gen_configs.py │ │ ├── main.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── alexnet.py │ │ │ ├── inception_v3.py │ │ │ ├── resnet101.py │ │ │ ├── vgg19.py │ │ │ └── wide_resnet.py │ │ ├── test_models.py │ │ ├── test_scripts │ │ │ ├── compare.py │ │ │ ├── test_flexflow.py │ │ │ ├── test_gpipe.py │ │ │ ├── test_optcnn.py │ │ │ └── test_pipedream.py │ │ ├── test_simple_strategies.py │ │ └── torch_models │ │ │ ├── __init__.py │ │ │ ├── alexnet.py │ │ │ ├── inception_v3.py │ │ │ ├── resnet101.py │ │ │ ├── vgg19.py │ │ │ └── wide_resnet.py │ └── transformer │ │ ├── .gitignore │ │ ├── bert_main.py │ │ ├── experiment_scripts │ │ ├── .gitignore │ │ ├── compare.py │ │ ├── gen_large_noover_scripts.py │ │ ├── gen_pipeopt_scripts.py │ │ ├── parse.py │ │ └── w16.yml │ │ ├── gpt2_main.py │ │ ├── load_data.py │ │ ├── models │ │ ├── __init__.py │ │ ├── bert_config.py │ │ ├── gpt2_config.py │ │ ├── hetu_bert.py │ │ ├── hetu_gpt2.py │ │ ├── torch_bert.py │ │ └── torch_gpt2.py │ │ ├── test_bert.py │ │ ├── test_gpt2.py │ │ └── test_megatronlm.py ├── cnn │ ├── README.md │ ├── local_s1.yml │ ├── main.py │ ├── models │ │ ├── AlexNet.py │ │ ├── CNN.py │ │ ├── LSTM.py │ │ ├── LeNet.py │ │ ├── LogReg.py │ │ ├── MLP.py │ │ ├── RNN.py │ │ ├── ResNet.py │ │ ├── VGG.py │ │ └── __init__.py │ ├── pytorch_models │ │ ├── __init__.py │ │ ├── mlp.py │ │ ├── resnet.py │ │ ├── rnn.py │ │ └── vgg.py │ ├── run_tf_horovod.py │ ├── scripts │ │ ├── hetu_16gpu.sh │ │ ├── hetu_16gpu_ps.sh │ │ ├── hetu_1gpu.sh │ │ ├── hetu_2gpu_ps.sh │ │ ├── hetu_8gpu.sh │ │ ├── hetu_config16allreduce.yml │ │ ├── hetu_config16ps.yml │ │ ├── horovod_16gpu.sh │ │ ├── horovod_8gpu.sh │ │ ├── pytorch_16gpu_0.sh │ │ ├── pytorch_16gpu_1.sh │ │ ├── pytorch_1gpu.sh │ │ ├── pytorch_8gpu.sh │ │ ├── tf_16gpu_worker0.sh │ │ ├── tf_16gpu_worker1.sh │ │ ├── tf_1gpu.sh │ │ └── tf_8gpu.sh │ ├── settings │ │ ├── tf_dist_s1_w16.json │ │ ├── tf_dist_s1_w4.json │ │ └── tf_dist_s1_w8.json │ ├── tf_launch_server.py │ ├── tf_launch_worker.py │ ├── tf_main.py │ ├── tf_models │ │ ├── __init__.py │ │ ├── tf_CNN.py │ │ ├── tf_LSTM.py │ │ ├── tf_LeNet.py │ │ ├── tf_LogReg.py │ │ ├── tf_MLP.py │ │ ├── tf_RNN.py │ │ ├── tf_ResNet.py │ │ └── tf_VGG.py │ ├── torch_main.py │ ├── worker_conf0.json │ └── worker_conf1.json ├── ctr │ ├── .gitignore │ ├── README.md │ ├── data_utils.py │ ├── kill.sh │ ├── models │ │ ├── __init__.py │ │ ├── bce_test.py │ │ ├── dc_criteo.py │ │ ├── dcn_criteo.py │ │ ├── deepfm_criteo.py │ │ ├── load_data.py │ │ ├── wdl_adult.py │ │ └── wdl_criteo.py │ ├── run_hetu.py │ ├── run_tf_horovod.py │ ├── run_tf_local.py │ ├── run_tf_parallax.py │ ├── settings │ │ ├── dist_s2_w4.yml │ │ ├── local_s1_w2.yml │ │ ├── plx_local_spec.yml │ │ ├── tf_local_s1_w2.json │ │ ├── tf_local_s1_w4.json │ │ └── tf_local_s1_w8.json │ ├── tests │ │ ├── README.md │ │ ├── hybrid_dcn_criteo.sh │ │ ├── hybrid_dfm_criteo.sh │ │ ├── hybrid_wdl_adult.sh │ │ ├── 
hybrid_wdl_criteo.sh │ │ ├── local_dcn_criteo.sh │ │ ├── local_dfm_criteo.sh │ │ ├── local_wdl_adult.sh │ │ ├── local_wdl_criteo.sh │ │ ├── ps_dcn_criteo.sh │ │ ├── ps_dfm_criteo.sh │ │ ├── ps_wdl_adult.sh │ │ ├── ps_wdl_criteo.sh │ │ ├── tf_2workers.sh │ │ ├── tf_4workers.sh │ │ └── tf_8workers.sh │ ├── tf_launch_server.py │ ├── tf_launch_worker.py │ └── tf_models │ │ ├── __init__.py │ │ ├── tf_dcn_criteo.py │ │ ├── tf_deepfm_criteo.py │ │ ├── tf_wdl_adult.py │ │ └── tf_wdl_criteo.py ├── gnn │ ├── README.md │ ├── config │ │ ├── local_w2.yml │ │ ├── local_w4.yml │ │ ├── local_w8.yml │ │ └── single.yml │ ├── gnn_model │ │ ├── __init__.py │ │ ├── layer.py │ │ ├── model.py │ │ └── utils.py │ ├── gnn_tools │ │ ├── __init__.py │ │ ├── launcher.py │ │ ├── log.py │ │ ├── part_graph.py │ │ ├── prepare_amazon_data.py │ │ └── sparse_datasets.py │ ├── run_dist.py │ ├── run_dist_hybrid.py │ └── run_single.py ├── moe │ ├── README.md │ ├── scripts │ │ ├── run_2node_comm.sh │ │ ├── run_base.sh │ │ ├── run_hash.sh │ │ ├── run_ktop1.sh │ │ ├── run_mnist.sh │ │ ├── run_sam.sh │ │ ├── run_top1.sh │ │ ├── run_top1_16gpus.sh │ │ └── run_top2.sh │ ├── test_mnist.py │ ├── test_moe_base.py │ ├── test_moe_hash.py │ ├── test_moe_ktop1.py │ ├── test_moe_sam.py │ └── test_moe_top.py ├── nlp │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── bert │ │ ├── .gitignore │ │ ├── README.md │ │ ├── bert_config.py │ │ ├── config1.yml │ │ ├── config2.yml │ │ ├── config4.yml │ │ ├── config8.yml │ │ ├── create_pretraining_data.py │ │ ├── data │ │ │ ├── BooksDownloader.py │ │ │ ├── BookscorpusTextFormatting.py │ │ │ ├── Downloader.py │ │ │ ├── GLUEDownloader.py │ │ │ ├── GooglePretrainedWeightDownloader.py │ │ │ ├── NVIDIAPretrainedWeightDownloader.py │ │ │ ├── SquadDownloader.py │ │ │ ├── TextSharding.py │ │ │ ├── WikiDownloader.py │ │ │ ├── WikicorpusTextFormatting.py │ │ │ ├── __init__.py │ │ │ ├── bert-base-uncased-vocab.txt │ │ │ ├── bertPrep.py │ │ │ ├── preprocess_glue_data.py │ │ │ ├── squad │ │ │ │ └── squad_download.sh │ │ │ └── wikiextractor │ │ │ │ ├── WikiExtractor.py │ │ │ │ ├── __init__.py │ │ │ │ ├── cirrus-extract.py │ │ │ │ ├── clean.py │ │ │ │ ├── extract.py │ │ │ │ └── extractPage.py │ │ ├── file_utils.py │ │ ├── glue_processor │ │ │ └── glue.py │ │ ├── hetu_bert.py │ │ ├── hetu_bert_moe.py │ │ ├── load_data.py │ │ ├── pytorch_bert.py │ │ ├── scripts │ │ │ ├── create_datasets_from_start.sh │ │ │ ├── test_glue_hetu_bert_base.sh │ │ │ ├── test_glue_hetu_bert_large.sh │ │ │ ├── test_glue_pytorch_bert_base.sh │ │ │ ├── test_glue_pytorch_bert_large.sh │ │ │ ├── train_hetu_bert_base.sh │ │ │ ├── train_hetu_bert_base_dp.sh │ │ │ ├── train_hetu_bert_base_moe.sh │ │ │ ├── train_hetu_bert_base_ps.sh │ │ │ ├── train_hetu_bert_large.sh │ │ │ ├── train_hetu_bert_large_dp.sh │ │ │ ├── train_hetu_bert_large_ps.sh │ │ │ ├── train_pytorch_bert_base.sh │ │ │ └── train_pytorch_bert_large.sh │ │ ├── test_glue_hetu_bert.py │ │ ├── test_glue_pytorch_bert.py │ │ ├── tokenization.py │ │ ├── train_hetu_bert.py │ │ ├── train_hetu_bert_dp.py │ │ ├── train_hetu_bert_dp_moe.py │ │ ├── train_hetu_bert_ps.py │ │ └── train_pytorch_bert.py │ ├── data_load.py │ ├── hetu_transformer.py │ ├── hparams.py │ ├── prepare_data.py │ ├── tf_transformer.py │ ├── train_hetu_transformer.py │ └── train_tf_transformer.py ├── rec │ ├── .gitignore │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ ├── gmf.py │ │ ├── load_data.py │ │ ├── mf.py │ │ ├── mlp.py │ │ └── neumf.py │ ├── run_compressed.py │ └── test │ │ ├── .gitignore │ │ ├── compare.py │ │ 
├── config.py │ │ ├── hetu_data.py │ │ ├── hetu_main.py │ │ ├── hetu_ncf.py │ │ ├── torch_data.py │ │ ├── torch_main.py │ │ └── torch_ncf.py └── runner │ ├── README.md │ ├── local_allreduce.yml │ ├── local_ps.yml │ ├── models │ ├── MLP.py │ ├── __init__.py │ ├── load_data.py │ └── wdl_adult.py │ ├── parallel │ ├── README.md │ ├── all_cnn_tests.sh │ ├── all_mlp_tests.sh │ ├── complex_pipeline_mlp.py │ ├── config1.yml │ ├── config2.yml │ ├── config3.yml │ ├── config4.yml │ ├── config6.yml │ ├── config8.yml │ ├── data_model_pipeline_mlp.py │ ├── data_pipeline.py │ ├── data_pipeline_mlp.py │ ├── dist_config8.yml │ ├── dist_data_pipeline_mlp.py │ ├── gpipe.py │ ├── gpipe_multi.py │ ├── pipedream.py │ ├── ps_pipeline.py │ ├── simple_pipeline_mlp.py │ ├── test_mlp_base.py │ ├── test_mlp_mp.py │ ├── test_mlp_mp_pp.py │ ├── test_mlp_pp.py │ ├── test_model_cnn.py │ ├── test_model_cnn_base.py │ ├── test_model_cnn_complex.py │ └── validate_results.py │ ├── remote_allreduce.yml │ ├── remote_ps.yml │ ├── run_mlp.py │ └── run_wdl.py ├── hetu.exp ├── img ├── alibabacloud.png ├── features.png ├── hetu.png ├── kuaishou.png └── tencent.png ├── ps-lite ├── .gitignore ├── CMakeLists.txt ├── README.md ├── include │ ├── common │ │ ├── dmlc_base.h │ │ ├── logging.h │ │ ├── sarray.h │ │ ├── shared_mutex.h │ │ ├── thread_pool.h │ │ └── thread_safe_hash_map.h │ └── ps │ │ ├── base.h │ │ ├── internal │ │ ├── assign_op.h │ │ ├── customer.h │ │ ├── env.h │ │ ├── message.h │ │ ├── parallel_kv_match.h │ │ ├── parallel_sort.h │ │ ├── postoffice.h │ │ ├── threadsafe_pqueue.h │ │ ├── threadsafe_queue.h │ │ ├── utils.h │ │ └── van.h │ │ ├── kvapp.h │ │ ├── partitioner.h │ │ ├── ps.h │ │ ├── psf │ │ ├── PSFunc.h │ │ ├── cachetable.h │ │ ├── dense.h │ │ ├── misc.h │ │ ├── preduce.h │ │ ├── serializer.h │ │ ├── sparse.h │ │ └── ssp.h │ │ ├── range.h │ │ ├── server │ │ ├── PSFHandle.h │ │ ├── kvserver.h │ │ ├── optimizer.h │ │ ├── param.h │ │ ├── preduce_handler.h │ │ └── ssp_handler.h │ │ └── worker │ │ ├── PSAgent.h │ │ ├── callback_store.h │ │ ├── hetu_binding.h │ │ ├── kvworker.h │ │ └── worker.h ├── proto │ └── meta.proto └── src │ ├── PSFhandle_embedding.cc │ ├── PSFunc.cc │ ├── customer.cc │ ├── hetu_binding.cc │ ├── ibverbs_van.h │ ├── network_utils.h │ ├── p3_van.h │ ├── postoffice.cc │ ├── preduce_handler.cc │ ├── python_binding.cc │ ├── resender.h │ ├── thread_pool.cc │ ├── van.cc │ ├── worker.cc │ └── zmq_van.h ├── python ├── graphboard │ ├── __init__.py │ ├── graph2fig.py │ └── index.html ├── hetu │ ├── __init__.py │ ├── _base.py │ ├── communicator │ │ ├── __init__.py │ │ ├── mpi_comm.py │ │ ├── mpi_nccl_comm.py │ │ ├── nccl_comm.py │ │ └── test.sh │ ├── context.py │ ├── cpu_links │ │ ├── __init__.py │ │ └── dnnl_op.py │ ├── cstable.py │ ├── data.py │ ├── dataloader.py │ ├── distributed_strategies │ │ ├── __init__.py │ │ ├── base.py │ │ ├── flexflow.py │ │ ├── gpipe.py │ │ ├── optcnn.py │ │ ├── pipedream.py │ │ ├── pipeopt.py │ │ └── simple.py │ ├── gpu_links │ │ ├── AbsLink.py │ │ ├── AddConstLink.py │ │ ├── AddElewiseLink.py │ │ ├── AddmmLink.py │ │ ├── ArangeLink.py │ │ ├── ArgmaxLink.py │ │ ├── ArgsortLink.py │ │ ├── ArraySetLink.py │ │ ├── AssignWithIndexedSlicesLink.py │ │ ├── AutoDimLink.py │ │ ├── AvgPoolLink.py │ │ ├── BaddbmmLink.py │ │ ├── BatchMatrixMultLink.py │ │ ├── BinaryCrossEntropyLink.py │ │ ├── BoolLink.py │ │ ├── BroadcastLink.py │ │ ├── BroadcastShapeLink.py │ │ ├── ClampLink.py │ │ ├── CloneLink.py │ │ ├── CompressedEmbeddingLink.py │ │ ├── ConcatLink.py │ │ ├── ConcatenateLink.py │ │ ├── 
ConstPowLink.py │ │ ├── Conv2dBroadcastLink.py │ │ ├── Conv2dLink.py │ │ ├── Conv2dReduceSumLink.py │ │ ├── CrossEntropyLink.py │ │ ├── CrossEntropySparseLink.py │ │ ├── CuSparseLink.py │ │ ├── CudnnAvgPoolLink.py │ │ ├── CudnnBnLink.py │ │ ├── CudnnConv2d.py │ │ ├── CudnnConv2dAddBiasLink.py │ │ ├── CudnnDropoutLink.py │ │ ├── CudnnMaxPoolLink.py │ │ ├── CudnnSoftmaxCrossEntropyLink.py │ │ ├── CudnnSoftmaxLink.py │ │ ├── CumSumLink.py │ │ ├── DotLink.py │ │ ├── DropoutLink.py │ │ ├── EmbeddingLookUpLink.py │ │ ├── ExpLink.py │ │ ├── FloorLink.py │ │ ├── GatherLink.py │ │ ├── GeluLink.py │ │ ├── GroupTopKIdxLink.py │ │ ├── HA2ALayoutTransform.py │ │ ├── IndexedSliceLink.py │ │ ├── IndexingLink.py │ │ ├── InitializersLink.py │ │ ├── InstanceNorm2dLink.py │ │ ├── InterpolateLink.py │ │ ├── LayerNormLink.py │ │ ├── LayoutTransform.py │ │ ├── LeakyReluLink.py │ │ ├── LinearLink.py │ │ ├── LogLink.py │ │ ├── MaskLink.py │ │ ├── MaskedFillLink.py │ │ ├── MatrixDivideConstLink.py │ │ ├── MatrixDivideLink.py │ │ ├── MatrixMultLink.py │ │ ├── MatrixRsqrtLink.py │ │ ├── MatrixSqrtLink.py │ │ ├── MatrixTransLink.py │ │ ├── MaxLink.py │ │ ├── MaxPoolLink.py │ │ ├── MinDistLink.py │ │ ├── MinLink.py │ │ ├── MinusByConstLink.py │ │ ├── MinusElewiseLink.py │ │ ├── MultiplyConstLink.py │ │ ├── MultiplyElewiseLink.py │ │ ├── NllLossLink.py │ │ ├── NormLink.py │ │ ├── OneHotLink.py │ │ ├── OppositeLink.py │ │ ├── OptEmbedBinaryStepLink.py │ │ ├── OptimizerLink.py │ │ ├── PadLink.py │ │ ├── ParamClipLink.py │ │ ├── PowLink.py │ │ ├── PowerLink.py │ │ ├── PruneLink.py │ │ ├── QuantizeEmbeddingLink.py │ │ ├── QuantizeLink.py │ │ ├── ReduceMeanLink.py │ │ ├── ReduceMinLink.py │ │ ├── ReduceMulLink.py │ │ ├── ReduceNormLink.py │ │ ├── ReduceSumAxisZeroLink.py │ │ ├── ReduceSumLink.py │ │ ├── ReluLink.py │ │ ├── RepeatLink.py │ │ ├── ReshapeLink.py │ │ ├── ReverseLayoutTransform.py │ │ ├── RollLink.py │ │ ├── SamGroupSumLink.py │ │ ├── SamMaxLink.py │ │ ├── Scatter1DLink.py │ │ ├── ScatterLink.py │ │ ├── SigmoidLink.py │ │ ├── SignLink.py │ │ ├── SinLink.py │ │ ├── SliceAssignLink.py │ │ ├── SliceByMatrixLink.py │ │ ├── SliceLink.py │ │ ├── SoftmaxCrossEntropyLink.py │ │ ├── SoftmaxCrossEntropySparseLink.py │ │ ├── SoftmaxLink.py │ │ ├── SparseEmbeddingLookUpLink.py │ │ ├── SparseSetLink.py │ │ ├── TanhLink.py │ │ ├── TopKIdxLink.py │ │ ├── TopKValLink.py │ │ ├── TrilLookupLink.py │ │ ├── UniqueIndicesLink.py │ │ ├── WhereLink.py │ │ └── __init__.py │ ├── gpu_ops │ │ ├── Abs.py │ │ ├── AddConst.py │ │ ├── AddElewise.py │ │ ├── Addmm.py │ │ ├── AllGatherCommunicate.py │ │ ├── AllReduceCommunicate.py │ │ ├── AllToAll.py │ │ ├── Arange.py │ │ ├── Argmax.py │ │ ├── ArgmaxPartial.py │ │ ├── Argsort.py │ │ ├── AssignWithIndexedSlices.py │ │ ├── AvgPool.py │ │ ├── Baddbmm.py │ │ ├── BalanceAssignment.py │ │ ├── BatchMatrixMult.py │ │ ├── BatchNorm.py │ │ ├── BinaryCrossEntropy.py │ │ ├── BinaryCrossEntropyWithLogits.py │ │ ├── Bool.py │ │ ├── Broadcast.py │ │ ├── BroadcastCommunicate.py │ │ ├── BroadcastShape.py │ │ ├── Clamp.py │ │ ├── CompressedEmbedding.py │ │ ├── Concat.py │ │ ├── Concatenate.py │ │ ├── ConstPow.py │ │ ├── Conv2d.py │ │ ├── Conv2dAddBias.py │ │ ├── Conv2dBroadcast.py │ │ ├── Conv2dReduceSum.py │ │ ├── CrossEntropy.py │ │ ├── CrossEntropySparse.py │ │ ├── CuSparse.py │ │ ├── Cumsum.py │ │ ├── DataTransfer.py │ │ ├── Dispatch.py │ │ ├── DistGCN_15d.py │ │ ├── Division.py │ │ ├── Dropout.py │ │ ├── EmbeddingLookUp.py │ │ ├── Exp.py │ │ ├── Floor.py │ │ ├── Full.py │ │ ├── Gather.py │ │ ├── Gelu.py │ │ 
├── GroupTopKIdx.py │ │ ├── HAllToAll.py │ │ ├── Indexing.py │ │ ├── InstanceNorm2d.py │ │ ├── Interpolate.py │ │ ├── LayerNorm.py │ │ ├── LayoutTransform.py │ │ ├── LeakyRelu.py │ │ ├── Linear.py │ │ ├── LogElewise.py │ │ ├── LogSoftmax.py │ │ ├── Mask.py │ │ ├── MaskedFill.py │ │ ├── MatrixDot.py │ │ ├── MatrixMult.py │ │ ├── Max.py │ │ ├── MaxPool.py │ │ ├── Min.py │ │ ├── MinDist.py │ │ ├── MinusByConst.py │ │ ├── MinusElewise.py │ │ ├── MultiplyConst.py │ │ ├── MultiplyElewise.py │ │ ├── NllLoss.py │ │ ├── Node.py │ │ ├── Norm.py │ │ ├── OneHot.py │ │ ├── OnesLike.py │ │ ├── Opposite.py │ │ ├── OptEmbedBinaryStep.py │ │ ├── Pad.py │ │ ├── ParamClip.py │ │ ├── ParameterServerCommunicate.py │ │ ├── PipelineReceive.py │ │ ├── PipelineSend.py │ │ ├── Pow.py │ │ ├── Power.py │ │ ├── Prune.py │ │ ├── Quantize.py │ │ ├── QuantizeALPTEmb.py │ │ ├── QuantizeEmbedding.py │ │ ├── README.md │ │ ├── Rand.py │ │ ├── ReduceCommunicate.py │ │ ├── ReduceMean.py │ │ ├── ReduceMin.py │ │ ├── ReduceMul.py │ │ ├── ReduceNorm1.py │ │ ├── ReduceNorm2.py │ │ ├── ReduceScatterCommunicate.py │ │ ├── ReduceSum.py │ │ ├── ReduceSumAxisZero.py │ │ ├── Relu.py │ │ ├── Repeat.py │ │ ├── Reshape.py │ │ ├── ReshapeTo.py │ │ ├── ReverseLayoutTransform.py │ │ ├── ReverseLayoutTransformNoGate.py │ │ ├── Roll.py │ │ ├── SamGroupSum.py │ │ ├── SamMax.py │ │ ├── Sample.py │ │ ├── Scatter.py │ │ ├── Scatter1D.py │ │ ├── Sigmoid.py │ │ ├── Sign.py │ │ ├── Sin.py │ │ ├── Slice.py │ │ ├── SliceAssign.py │ │ ├── SliceByMatrix.py │ │ ├── Softmax.py │ │ ├── SoftmaxCrossEntropy.py │ │ ├── SoftmaxCrossEntropySparse.py │ │ ├── SparseEmbeddingLookUp.py │ │ ├── SparseSet.py │ │ ├── Split.py │ │ ├── Sqrt.py │ │ ├── StopGradient.py │ │ ├── Sum.py │ │ ├── SumSparseGradient.py │ │ ├── Tanh.py │ │ ├── Tile.py │ │ ├── TopKIdx.py │ │ ├── TopKVal.py │ │ ├── Transpose.py │ │ ├── TrilLookup.py │ │ ├── Unique.py │ │ ├── Variable.py │ │ ├── Where.py │ │ ├── ZerosLike.py │ │ ├── __init__.py │ │ ├── executor.py │ │ ├── gpipe_subexecutor.py │ │ ├── pipedream_subexecutor.py │ │ ├── pipeline_subexecutor.py │ │ └── timer_subexecutor.py │ ├── initializers.py │ ├── launcher.py │ ├── layers │ │ ├── BalanceGate.py │ │ ├── HashGate.py │ │ ├── KTop1Gate.py │ │ ├── SAMGate.py │ │ ├── TopGate.py │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── base.py │ │ ├── batch_split_layer.py │ │ ├── concatenate.py │ │ ├── conv.py │ │ ├── dropout.py │ │ ├── embedding.py │ │ ├── gates │ │ │ ├── base_gate.py │ │ │ ├── gshard_gate.py │ │ │ └── naive_gate.py │ │ ├── gelu.py │ │ ├── hash_layer.py │ │ ├── identity.py │ │ ├── ktop1_layer.py │ │ ├── linear.py │ │ ├── loss.py │ │ ├── mish.py │ │ ├── moe_layer.py │ │ ├── normalization.py │ │ ├── pooling.py │ │ ├── relu.py │ │ ├── reshape.py │ │ ├── sam_layer.py │ │ ├── sequence.py │ │ ├── slice.py │ │ └── sum.py │ ├── logger.py │ ├── lr_scheduler.py │ ├── memory_pool.py │ ├── metrics.py │ ├── ndarray.py │ ├── onnx │ │ ├── X2hetu │ │ │ ├── __init__.py │ │ │ ├── handler.py │ │ │ └── handlers │ │ │ │ ├── __init__.py │ │ │ │ ├── array.py │ │ │ │ ├── math.py │ │ │ │ └── nn.py │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── graph.py │ │ ├── handler.py │ │ ├── hetu2onnx.py │ │ ├── onnx2hetu.py │ │ ├── onnx_opset │ │ │ ├── AddConst.py │ │ │ ├── AddElewise.py │ │ │ ├── BatchNorm.py │ │ │ ├── Concat.py │ │ │ ├── Conv2d.py │ │ │ ├── Division.py │ │ │ ├── Dropout.py │ │ │ ├── Identity.py │ │ │ ├── MatrixMult.py │ │ │ ├── MultiplyConst.py │ │ │ ├── OneHot.py │ │ │ ├── Opposite.py │ │ │ ├── Pad.py │ │ │ ├── Pool.py │ │ │ ├── Reduces.py │ │ │ ├── 
Relu.py │ │ │ ├── Reshape.py │ │ │ ├── Slice.py │ │ │ ├── Softmax.py │ │ │ ├── Sqrt.py │ │ │ ├── Tanh.py │ │ │ ├── Transpose.py │ │ │ ├── Variable.py │ │ │ ├── Where.py │ │ │ ├── __init__.py │ │ │ └── general.py │ │ └── util.py │ ├── optimizer.py │ ├── preduce.py │ ├── profiler.py │ ├── random.py │ ├── stream.py │ └── tokenizers │ │ ├── __init__.py │ │ ├── bert_tokenizer.py │ │ └── utils.py └── runner.py ├── src ├── CMakeLists.txt ├── common │ ├── c_runtime_api.cc │ ├── c_runtime_api.h │ ├── cpu_device_api.cc │ ├── cpu_device_api.h │ ├── cuda_device_api.h │ ├── device_api.h │ ├── dispatch.h │ ├── dlarray.h │ ├── random.cc │ ├── random.h │ └── runtime_base.h ├── communication │ ├── CMakeLists.txt │ ├── c_communication_nthread.cc │ ├── mpi_communication.cc │ ├── mpi_nccl_communication.cu │ └── nccl_communication.cu ├── cuda_common │ ├── cuda_device_api.cc │ ├── gpu_chunk.cc │ ├── gpu_functions.cuh │ ├── gpu_reduce.h │ ├── gpu_runtime.cc │ └── gpu_runtime.h ├── dnnl_ops │ ├── AddConst.cpp │ ├── AddElewise.cpp │ ├── ArraySet.cpp │ ├── AvgPool.cpp │ ├── BatchNorm.cpp │ ├── BroadcastTo.cpp │ ├── Concat.cpp │ ├── Conv2d.cpp │ ├── DivideConst.cpp │ ├── DivideElewise.cpp │ ├── Dropout.cpp │ ├── EmbeddingLookup.cpp │ ├── Gelu.cpp │ ├── Initializers.cpp │ ├── MatrixMult.cpp │ ├── MaxPool.cpp │ ├── MultiplyConst.cpp │ ├── MultiplyElewise.cpp │ ├── Opposite.cpp │ ├── Optimizers.cpp │ ├── Pad.cpp │ ├── ReduceIndexedSlice.cpp │ ├── ReduceSumAxisZero.cpp │ ├── Relu.cpp │ ├── Reshape.cpp │ ├── Sigmoid.cpp │ ├── Softmax.cpp │ ├── SoftmaxCrossEntropy.cpp │ ├── Sqrt.cpp │ ├── Tanh.cpp │ ├── Transpose.cpp │ ├── UniqueIndices.cpp │ ├── dnnl_runtime.cc │ └── dnnl_runtime.h ├── header │ ├── mpi_communication.h │ ├── mpi_nccl_communication.h │ ├── nccl_communication.h │ └── types.h ├── hetu_cache │ ├── CMakeLists.txt │ ├── include │ │ ├── binding.h │ │ ├── cache.h │ │ ├── embedding.h │ │ ├── hetu_client.h │ │ ├── lfu_cache.h │ │ ├── lfuopt_cache.h │ │ ├── lru_cache.h │ │ └── unqiue_tools.h │ └── src │ │ ├── cache.cc │ │ ├── embedding.cc │ │ ├── hetu_client.cc │ │ ├── lfu_cache.cc │ │ ├── lfuopt_cache.cc │ │ ├── lru_cache.cc │ │ └── python_api.cc ├── memory_pool │ ├── BFC_allocator.cc │ ├── BFC_allocator.h │ ├── allocator.cc │ └── allocator.h └── ops │ ├── Abs.cu │ ├── AddConst.cu │ ├── AddElewise.cu │ ├── Addmm.cu │ ├── Arange.cu │ ├── Argmax.cu │ ├── ArgmaxPartial.cu │ ├── Argsort.cu │ ├── ArrayLazyCallback.cu │ ├── ArraySet.cu │ ├── AssignWithIndexedSlices.cu │ ├── AutoDimOps.cu │ ├── AvgPool.cu │ ├── Baddbmm.cu │ ├── BatchMatrixMult.cu │ ├── BinaryCrossEntropy.cu │ ├── BinaryCrossEntropyWithLogits.cu │ ├── Bool.cu │ ├── Broadcast.cu │ ├── BroadcastShape.cu │ ├── Clamp.cu │ ├── Clip.cu │ ├── Clone.cu │ ├── CompressedEmbedding.cu │ ├── Concat.cu │ ├── Concatenate.cu │ ├── ConstPow.cu │ ├── Conv2d.cu │ ├── Conv2dBroadcast.cu │ ├── Conv2dReduceSum.cu │ ├── CrossEntropy.cu │ ├── CrossEntropySparse.cu │ ├── CuSparseCsrmm.cu │ ├── CuSparseCsrmv.cu │ ├── CudnnAvgPool.cu │ ├── CudnnBn.cu │ ├── CudnnConv2d.cu │ ├── CudnnConv2dAddBias.cu │ ├── CudnnDropout.cu │ ├── CudnnMaxPool.cu │ ├── CudnnSoftmax.cu │ ├── CudnnSoftmaxEntropy.cu │ ├── CumSum.cu │ ├── DivideConst.cu │ ├── DivideElewise.cu │ ├── Dot.cu │ ├── Dropout.cu │ ├── EmbeddingLookup.cu │ ├── Exp.cu │ ├── Floor.cu │ ├── Gather.cu │ ├── Gelu.cu │ ├── GroupTopKIdx.cu │ ├── H_A2A_LayoutTransform.cu │ ├── IndexedSlices.cu │ ├── Indexing.cu │ ├── Initializers.cu │ ├── InstanceNorm2d.cu │ ├── Interpolate.cu │ ├── LayerNorm.cu │ ├── LayoutTransform.cu │ ├── LeakyRelu.cu │ 
├── Linear.cu │ ├── Log.cu │ ├── Mask.cu │ ├── MaskedFill.cu │ ├── MatrixMult.cu │ ├── Max.cu │ ├── MaxPool.cu │ ├── Min.cu │ ├── MinDist.cu │ ├── MinusByConst.cu │ ├── MinusElewise.cu │ ├── MultiplyConst.cu │ ├── MultiplyElewise.cu │ ├── NllLoss.cu │ ├── Norm.cu │ ├── OneHot.cu │ ├── Opposite.cu │ ├── OptEmbedBinaryStep.cu │ ├── Optimizers.cu │ ├── OptimizersSparse.cu │ ├── Outer.cu │ ├── Pad.cu │ ├── Pow.cu │ ├── Power.cu │ ├── PruneMask.cu │ ├── Quantize.cu │ ├── QuantizeEmbedding.cu │ ├── ReduceGeneral.cu │ ├── ReduceIndexedSlice.cu │ ├── ReduceSum.cu │ ├── ReduceSumAxisZero.cu │ ├── Relu.cu │ ├── Repeat.cu │ ├── Reshape.cu │ ├── Roll.cu │ ├── SamGroupSum.cu │ ├── SamMax.cu │ ├── Scatter.cu │ ├── Scatter1D.cu │ ├── Sigmoid.cu │ ├── Sign.cu │ ├── SignedQuantize.cu │ ├── Sin.cu │ ├── Slice.cu │ ├── SliceAssign.cu │ ├── SliceByMatrix.cu │ ├── Softmax.cu │ ├── SoftmaxCrossEntropy.cu │ ├── SoftmaxCrossEntropySparse.cu │ ├── SparseEmbeddingLookup.cu │ ├── SparseSet.cu │ ├── Sqrt.cu │ ├── Tanh.cu │ ├── TopKIdx.cu │ ├── TopKVal.cu │ ├── Transpose.cu │ ├── TrilLookup.cu │ ├── UniqueIndices.cu │ └── Where.cu ├── tests ├── README.md ├── balanced.py ├── draft.py ├── get_gpu_memory.py ├── hetu_cache │ ├── hetu_cache_config.yml │ └── hetu_cache_test.py ├── onnx │ ├── README.md │ ├── cnn_hetu_onnx_tf.py │ ├── cnn_tf_onnx_hetu.py │ ├── dnn_hetu_onnx_tf.py │ ├── dnn_tf_onnx_hetu.py │ ├── rnn_hetu_onnx_tf.py │ ├── rnn_tf_onnx_hetu.py │ ├── test_cnn.py │ ├── test_mlp.py │ └── test_nodes.py ├── pstests │ ├── local_s2_w1.yml │ ├── local_s2_w2.yml │ ├── test_apis.py │ ├── test_bandwidth.py │ ├── test_push_data.py │ ├── test_tf_bandwidth.py │ └── tf_local_s1_w2.json ├── test_DistGCN │ ├── prepare_data_GCN15d.py │ ├── prepare_data_GCN15d_reorder.py │ ├── test_group_comm.py │ └── test_model_distGCN15d.py ├── test_comm.py ├── test_datatransfer_op.py ├── test_dnnl_op.py ├── test_embedding_op.py ├── test_encode_decode.py ├── test_gpu_initializers.py ├── test_gpu_op.py ├── test_ha2agather.py ├── test_lr_scheduler.py ├── test_nccl_bandwidth.py ├── test_ops.py ├── test_optimizer.py ├── test_profiler.py ├── test_ps_preduce.py ├── test_reorder_lookup.py ├── test_resnet_block.py ├── test_simple_version_ops.py ├── test_sparse.py ├── test_sparse_op.py ├── test_split.py ├── test_sum_sparse_grad.py ├── test_transformer_ops.py ├── test_unique.py ├── tester.py └── torch_balance.py └── tools ├── EmbeddingMemoryCompression ├── .gitignore ├── README.md ├── config.cmake ├── methods │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── adapt.py │ │ ├── alpt.py │ │ ├── autodim.py │ │ ├── autosrh.py │ │ ├── compo.py │ │ ├── deduplication.py │ │ ├── deeplight.py │ │ ├── dhe.py │ │ ├── dpq.py │ │ ├── hash.py │ │ ├── mde.py │ │ ├── mgqe.py │ │ ├── optembed.py │ │ ├── pep.py │ │ ├── primes.npy │ │ ├── quantize.py │ │ ├── robe.py │ │ ├── sparse.py │ │ └── tensortrain.py │ └── scheduler │ │ ├── __init__.py │ │ ├── adapt.py │ │ ├── alpt.py │ │ ├── autodim.py │ │ ├── autosrh.py │ │ ├── base.py │ │ ├── compo.py │ │ ├── compressor.py │ │ ├── deduplication.py │ │ ├── deeplight.py │ │ ├── dhe.py │ │ ├── dpq.py │ │ ├── hash.py │ │ ├── md.py │ │ ├── mgqe.py │ │ ├── multistage.py │ │ ├── optembed.py │ │ ├── pep.py │ │ ├── quantize.py │ │ ├── robe.py │ │ ├── switchinference.py │ │ └── tensortrain.py ├── models │ ├── __init__.py │ ├── base.py │ ├── bench │ │ └── dlrm_s_criteo_kaggle.sh │ ├── dcn.py │ ├── deepfm.py │ ├── dlrm.py │ ├── dlrm_data_pytorch.py │ ├── dlrm_s_pytorch.py │ ├── dlrm_test.py │ ├── dlrm_torch.py │ ├── load_data.py │ ├── 
load_data_avazu_variants.py │ ├── load_data_criteo_variants.py │ └── wdl.py ├── run_compressed.py ├── scripts │ ├── .gitignore │ ├── fit_scaling_law.py │ ├── plot_dimension_autosrh.py │ ├── plot_dimension_mde.py │ └── plot_powerlaw.py ├── supplements │ ├── Explain_TTRec.md │ └── static_encoding.png └── test_rag.py └── Galvatron ├── .DS_Store ├── MANIFEST.in ├── Makefile ├── README.md ├── csrc └── dp_core.cpp ├── figs └── api.jpg ├── galvatron.exp ├── galvatron ├── .DS_Store ├── MANIFEST.in ├── __init__.py ├── core │ ├── __init__.py │ ├── arguments.py │ ├── comm_groups.py │ ├── dataloader.py │ ├── hybrid_parallel_config.py │ ├── hybrid_parallel_model.py │ ├── initialize.py │ ├── parallel.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── grad_reduce.py │ │ ├── pipeline.py │ │ └── utils.py │ ├── profiler.py │ ├── redistribute.py │ └── tensor_parallel │ │ ├── __init__.py │ │ ├── transformer.py │ │ └── utils.py ├── models │ ├── README.md │ ├── __init__.py │ ├── baichuan │ │ ├── BaiChuanModel_hybrid_parallel.py │ │ ├── BaiChuanModel_sequential.py │ │ ├── BaiChuanModel_tensor_parallel.py │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configs │ │ │ ├── computation_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ │ ├── galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ └── memory_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── dataloader.py │ │ ├── hf_configs │ │ │ ├── __init__.py │ │ │ ├── baichuan-7b │ │ │ │ ├── config.json │ │ │ │ ├── config_ori.json │ │ │ │ └── configuration_baichuan.py │ │ │ └── config_utils.py │ │ ├── scripts │ │ │ ├── train.sh │ │ │ └── train_dist.sh │ │ ├── train.py │ │ └── train_dist.py │ ├── gpt │ │ ├── GPTModel_hybrid_parallel.py │ │ ├── GPTModel_sequential.py │ │ ├── GPTModel_tensor_parallel.py │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configs │ │ │ ├── computation_profiling_bf16_hidden1600_head32_seqlen1024.json │ │ │ ├── computation_profiling_bf16_hidden2560_head32_seqlen2048.json │ │ │ ├── computation_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ │ ├── galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ ├── galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ ├── galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ ├── memory_profiling_bf16_hidden1600_head32_seqlen1024.json │ │ │ ├── memory_profiling_bf16_hidden2560_head32_seqlen2048.json │ │ │ └── memory_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── dataloader.py │ │ ├── meta_configs │ │ │ ├── __init__.py │ │ │ ├── config_utils.py │ │ │ ├── gpt-0.3b.json │ │ │ ├── gpt-1.5b.json │ │ │ ├── gpt-2.7b.json │ │ │ └── gpt-6.7b.json │ │ ├── scripts │ │ │ ├── train.sh │ │ │ └── train_dist.sh │ │ ├── train.py │ │ └── train_dist.py │ └── llama │ │ ├── LlamaModel_hybrid_parallel.py │ │ ├── LlamaModel_sequential.py │ │ ├── LlamaModel_tensor_parallel.py │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configs │ │ ├── computation_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ ├── galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ └── 
memory_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── dataloader.py │ │ ├── meta_configs │ │ ├── __init__.py │ │ ├── config_utils.py │ │ ├── llama-13b.json │ │ ├── llama-30b.json │ │ └── llama-7b.json │ │ ├── scripts │ │ ├── train.sh │ │ └── train_dist.sh │ │ ├── train.py │ │ └── train_dist.py ├── profile_hardware │ ├── hardware_configs │ │ ├── allreduce_bandwidth_1nodes_4gpus_per_node.json │ │ ├── allreduce_bandwidth_1nodes_8gpus_per_node.json │ │ ├── allreduce_bandwidth_2nodes_8gpus_per_node.json │ │ ├── overlap_coefficient.json │ │ ├── p2p_bandwidth_1nodes_4gpus_per_node.json │ │ ├── p2p_bandwidth_1nodes_8gpus_per_node.json │ │ └── p2p_bandwidth_2nodes_8gpus_per_node.json │ ├── hostfile │ ├── profile_hardware.py │ ├── profile_overlap.py │ └── scripts │ │ ├── build_nccl_test.sh │ │ ├── profile_hardware.sh │ │ ├── profile_overlap.sh │ │ └── run_nccl_test.sh ├── scripts │ ├── flash_attn_ops_install.sh │ └── prepare_env.sh ├── site_package │ ├── __init__.py │ ├── megatron │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── checkpointing.py │ │ ├── core │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── enums.py │ │ │ ├── package_info.py │ │ │ ├── parallel_state.py │ │ │ ├── pipeline_parallel │ │ │ │ ├── __init__.py │ │ │ │ ├── p2p_communication.py │ │ │ │ └── schedules.py │ │ │ ├── requirements.txt │ │ │ ├── tensor_parallel │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy.py │ │ │ │ ├── data.py │ │ │ │ ├── layers.py │ │ │ │ ├── mappings.py │ │ │ │ ├── mappings_group.py │ │ │ │ ├── random.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── data │ │ │ ├── Makefile │ │ │ ├── __init__.py │ │ │ ├── autoaugment.py │ │ │ ├── bert_dataset.py │ │ │ ├── biencoder_dataset_utils.py │ │ │ ├── blendable_dataset.py │ │ │ ├── data_samplers.py │ │ │ ├── dataset_utils.py │ │ │ ├── gpt_dataset.py │ │ │ ├── helpers.cpp │ │ │ ├── ict_dataset.py │ │ │ ├── indexed_dataset.py │ │ │ ├── orqa_wiki_dataset.py │ │ │ ├── realm_dataset_utils.py │ │ │ ├── realm_index.py │ │ │ ├── t5_dataset.py │ │ │ ├── test │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ └── test_preprocess_data.sh │ │ │ └── vit_dataset.py │ │ ├── dist_signal_handler.py │ │ ├── fp16_deprecated │ │ │ └── loss_scaler.py │ │ ├── fused_kernels │ │ │ ├── __init__.py │ │ │ ├── compat.h │ │ │ ├── scaled_masked_softmax.cpp │ │ │ ├── scaled_masked_softmax.h │ │ │ ├── scaled_masked_softmax_cuda.cu │ │ │ ├── scaled_softmax.cpp │ │ │ ├── scaled_softmax_cuda.cu │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ ├── scaled_upper_triang_masked_softmax.h │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ └── test_fused_kernels.py │ │ │ └── type_shim.h │ │ ├── global_vars.py │ │ ├── indexer.py │ │ ├── initialize.py │ │ ├── memory.py │ │ ├── microbatches.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── bert_model.py │ │ │ ├── biencoder_model.py │ │ │ ├── classification.py │ │ │ ├── distributed.py │ │ │ ├── enums.py │ │ │ ├── fused_bias_gelu.py │ │ │ ├── fused_layer_norm.py │ │ │ ├── fused_softmax.py │ │ │ ├── gpt_model.py │ │ │ ├── language_model.py │ │ │ ├── module.py │ │ │ ├── multiple_choice.py │ │ │ ├── realm_model.py │ │ │ ├── retro_transformer.py │ │ │ ├── rotary_pos_embedding.py │ │ │ ├── t5_model.py │ │ │ ├── transformer.py │ │ │ ├── utils.py │ │ │ └── vision │ │ │ │ ├── classification.py │ │ │ │ ├── dino.py │ │ │ │ ├── esvit_swin_backbone.py │ │ │ │ ├── inpainting.py │ │ │ │ ├── knn_monitor.py │ │ │ │ ├── mit_backbone.py │ │ │ │ ├── swin_backbone.py │ │ │ │ ├── utils.py │ │ │ │ └── vit_backbone.py │ │ ├── 
mpu │ │ │ └── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── commons.py │ │ │ │ ├── test_cross_entropy.py │ │ │ │ ├── test_data.py │ │ │ │ ├── test_initialize.py │ │ │ │ ├── test_layers.py │ │ │ │ └── test_random.py │ │ ├── optimizer │ │ │ ├── __init__.py │ │ │ ├── clip_grads.py │ │ │ ├── distrib_optimizer.py │ │ │ ├── grad_scaler.py │ │ │ └── optimizer.py │ │ ├── optimizer_param_scheduler.py │ │ ├── static │ │ │ └── index.html │ │ ├── text_generation │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── beam_utils.py │ │ │ ├── communication.py │ │ │ ├── forward_step.py │ │ │ ├── generation.py │ │ │ ├── sampling.py │ │ │ └── tokenization.py │ │ ├── text_generation_server.py │ │ ├── timers.py │ │ ├── tokenizer │ │ │ ├── __init__.py │ │ │ ├── bert_tokenization.py │ │ │ ├── gpt2_tokenization.py │ │ │ └── tokenizer.py │ │ ├── training.py │ │ └── utils.py │ └── nccl-tests │ │ ├── .gitignore │ │ ├── LICENSE.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── doc │ │ └── PERFORMANCE.md │ │ ├── src │ │ ├── Makefile │ │ ├── all_gather.cu │ │ ├── all_reduce.cu │ │ ├── alltoall.cu │ │ ├── broadcast.cu │ │ ├── common.cu │ │ ├── common.h │ │ ├── gather.cu │ │ ├── hypercube.cu │ │ ├── nccl1_compat.h │ │ ├── reduce.cu │ │ ├── reduce_scatter.cu │ │ ├── scatter.cu │ │ ├── sendrecv.cu │ │ ├── timer.cc │ │ └── timer.h │ │ └── verifiable │ │ ├── Makefile │ │ ├── inexact_regress.cu │ │ ├── verifiable.cu │ │ ├── verifiable.h │ │ └── verifiable.mk └── utils │ ├── __init__.py │ ├── config_utils.py │ ├── memory_utils.py │ ├── strategy_utils.py │ └── training_utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.csv 3 | *.tar.gz* 4 | *.pkl.gz 5 | .dataset 6 | build/ 7 | test_time/ 8 | .vscode/ 9 | CIFAR_10/ 10 | CIFAR_100/ 11 | update.sh 12 | cmake/config.cmake -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/GraphMix"] 2 | path = third_party/GraphMix 3 | url = https://github.com/nox-410/GraphMix.git 4 | [submodule "third_party/HetuML"] 5 | path = third_party/HetuML 6 | url = https://github.com/ccchengff/HetuML.git 7 | -------------------------------------------------------------------------------- /bin/heturun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python $(cd $(dirname $0); pwd)/../python/runner.py "$@" 3 | -------------------------------------------------------------------------------- /cmake/Modules/FindCUB.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find CUB 2 | # Once done this will define 3 | # CUB_FOUND - System has CUB 4 | # CUB_INCLUDE_DIR - The CUB include directories 5 | 6 | find_path ( CUB_INCLUDE_DIR cub HINTS ${CUB_ROOT}/include ) 7 | 8 | include ( FindPackageHandleStandardArgs ) 9 | find_package_handle_standard_args ( 10 | CUB 11 | REQUIRED_VARS CUB_INCLUDE_DIR) 12 | -------------------------------------------------------------------------------- /cmake/Modules/FindMKL.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find DNNL(MKL-DNN) 2 | # Once done this will define 3 | # DNNL_FOUND - System has DNNL 4 | # DNNL_INCLUDE_DIR - The DNNL include directories 5 | # DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build 6 | # DNNL_LIBRARY - The libraries needed to use DNNL 7 | # DNNL_DEFINITIONS - Compiler switches required for using DNNL 8
| 9 | find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include ) 10 | find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include ) 11 | find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src ) 12 | 13 | include ( FindPackageHandleStandardArgs ) 14 | find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR ) 15 | -------------------------------------------------------------------------------- /cmake/Modules/FindTHRUST.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find THRUST 2 | # Once done this will define 3 | # THRUST_FOUND - System has THRUST 4 | # THRUST_INCLUDE_DIR - The THRUST include directories 5 | 6 | find_path ( THRUST_INCLUDE_DIR thrust HINTS ${THRUST_ROOT}/include ) 7 | 8 | include ( FindPackageHandleStandardArgs ) 9 | find_package_handle_standard_args ( 10 | THRUST 11 | REQUIRED_VARS THRUST_INCLUDE_DIR) 12 | -------------------------------------------------------------------------------- /examples/auto_parallel/.gitignore: -------------------------------------------------------------------------------- 1 | scripts/ 2 | test_strategy/ 3 | *.json -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/experiment_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | alexnet/ 2 | resnet101/ 3 | wresnet101/ 4 | inceptionv3/ 5 | prev/ 6 | 7 | run*.py 8 | 9 | *.png 10 | -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/experiment_scripts/w16.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 8 4 | chief: true 5 | - host: node2 6 | workers: 8 7 | -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .alexnet import AlexNet 2 | from .inception_v3 import InceptionV3 3 | from .resnet101 import ResNet101 4 | from .vgg19 import VGG19 5 | from .wide_resnet import WideResNet50, WideResNet101 6 | -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/torch_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .alexnet import AlexNet 2 | from .resnet101 import ResNet101 3 | from .vgg19 import VGG19 4 | from .inception_v3 import InceptionV3 5 | from .wide_resnet import wide_resnet50_2, wide_resnet101_2 6 | -------------------------------------------------------------------------------- /examples/auto_parallel/transformer/.gitignore: -------------------------------------------------------------------------------- 1 | preprocessed_data 2 | cached_data -------------------------------------------------------------------------------- /examples/auto_parallel/transformer/experiment_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | bert* 2 | gpt2* 3 | prev/ 4 | 5 | run*.py 6 | *.png 7 | -------------------------------------------------------------------------------- /examples/auto_parallel/transformer/experiment_scripts/w16.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 8 4 | chief: true 5 | - host: node2 6 | workers: 8 7 |
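The `w16.yml` files above are `heturun` cluster specifications: each `nodes` entry names a host, the number of worker processes to launch there, and whether that node is the chief. As a minimal illustration (not part of the repository, and assuming PyYAML is installed), such a spec can be loaded and the overall job layout derived like this:

    import yaml  # PyYAML, assumed available

    with open("w16.yml") as f:
        spec = yaml.safe_load(f)

    # 8 workers on node1 (the chief) plus 8 workers on node2 -> 16 processes.
    world_size = sum(node["workers"] for node in spec["nodes"])
    chief = next(node["host"] for node in spec["nodes"] if node.get("chief"))
    print(world_size, chief)  # -> 16 node1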
-------------------------------------------------------------------------------- /examples/auto_parallel/transformer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .hetu_bert import BertForPreTraining as HetuBert 2 | from .torch_bert import BertForPreTraining as TorchBert 3 | from .bert_config import BertConfig 4 | 5 | from .hetu_gpt2 import GPT2LMHeadModel as HetuGPT2 6 | from .torch_gpt2 import GPT2LMHeadModel as TorchGPT2 7 | from .gpt2_config import GPT2Config 8 | -------------------------------------------------------------------------------- /examples/cnn/local_s1.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13030 4 | DMLC_NUM_WORKER : 2 5 | DMLC_NUM_SERVER : 1 6 | DMLC_PS_VAN_TYPE : p3 7 | launch : 8 | worker : 0 9 | server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/cnn/models/LogReg.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | from hetu import init 3 | 4 | 5 | def logreg(x, y_): 6 | ''' 7 | Logistic Regression model, for MNIST dataset. 8 | 9 | Parameters: 10 | x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) 11 | y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) 12 | Return: 13 | loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) 14 | y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) 15 | ''' 16 | 17 | print("Build logistic regression model...") 18 | weight = init.zeros((784, 10), name='logreg_weight') 19 | bias = init.zeros((10,), name='logreg_bias') 20 | x = ht.matmul_op(x, weight) 21 | y = x + ht.broadcastto_op(bias, x) 22 | loss = ht.softmaxcrossentropy_op(y, y_) 23 | loss = ht.reduce_mean_op(loss, [0]) 24 | return loss, y 25 | -------------------------------------------------------------------------------- /examples/cnn/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .VGG import vgg, vgg16, vgg19 2 | from .LogReg import logreg 3 | from .CNN import cnn_3_layers 4 | from .AlexNet import alexnet 5 | from .LeNet import lenet 6 | from .MLP import mlp 7 | from .RNN import rnn 8 | from .LSTM import lstm 9 | from .ResNet import resnet, resnet18, resnet34 10 | -------------------------------------------------------------------------------- /examples/cnn/pytorch_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import mlp 2 | from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152 3 | from .vgg import vgg16, vgg19 4 | from .rnn import rnn 5 | -------------------------------------------------------------------------------- /examples/cnn/pytorch_models/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | import torch.nn as nn 3 | 4 | 5 | class MLP(nn.Module): 6 | def __init__(self): 7 | super(MLP, self).__init__() 8 | self.fc1 = nn.Linear(3072, 256) 9 | self.fc2 = nn.Linear(256, 256) 10 | self.fc3 = nn.Linear(256, 10) 11 | 12 | def forward(self, x): 13 | x = F.relu(self.fc1(x)) 14 | x = F.relu(self.fc2(x)) 15 | out = self.fc3(x) 16 | return out 17 | 18 | 19 | def mlp(): 20 | return MLP() 21 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_16gpu.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | ### validate and timing 7 | heturun -c ${workdir}/hetu_config16allreduce.yml python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce 8 | 9 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_16gpu_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | ### validate and timing 7 | heturun -c ${workdir}/hetu_config16ps.yml python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode PS 8 | 9 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | 7 | # usage: 8 | # e.g. bash hetu_1gpu.sh mlp CIFAR10 9 | 10 | ### validate and timing 11 | python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 12 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_2gpu_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | ### validate and timing 7 | heturun -s 1 -w 2 python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS 8 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_8gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | workdir=$(cd $(dirname $0); pwd) 3 | mainpy=${workdir}/../main.py 4 | depsdir=${workdir}/../../..
5 | 6 | ### validate and timing 7 | heturun -w 8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce 8 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_config16allreduce.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 8 4 | chief: true 5 | - host: node2 6 | workers: 8 7 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_config16ps.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | servers: 1 4 | workers: 8 5 | chief: true 6 | - host: node2 7 | servers: 1 8 | workers: 8 9 | -------------------------------------------------------------------------------- /examples/cnn/scripts/horovod_16gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_tf_horovod.py 5 | 6 | # horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing 7 | 8 | horovodrun -np 16 --start-timeout 3000 -H node1:8,node2:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 9 | 10 | # ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ 11 | # -x NCCL_SOCKET_IFNAME=enp97s0f0 -H node1:8,node2:8 --allow-run-as-root python run_tf_horovod.py --model 12 | -------------------------------------------------------------------------------- /examples/cnn/scripts/horovod_8gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_tf_horovod.py 5 | 6 | horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing 7 | -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_16gpu_0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=162.105.146.117 6 | MASTER_PORT=6000 7 | NNODES=2 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | workdir=$(cd $(dirname $0); pwd) 12 | mainpy=${workdir}/../torch_main.py 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | ${mainpy} \ 18 | --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_16gpu_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=162.105.146.117 6 | MASTER_PORT=39575 7 | NNODES=2 8 | NODE_RANK=1 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | workdir=$(cd $(dirname $0); pwd) 12 | mainpy=${workdir}/../torch_main.py 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch
$DISTRIBUTED_ARGS \ 17 | ${mainpy} \ 18 | --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../torch_main.py 5 | 6 | ## validate and timing 7 | python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 8 | -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_8gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | workdir=$(cd $(dirname $0); pwd) 12 | mainpy=${workdir}/../torch_main.py 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | ${mainpy} \ 18 | --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed -------------------------------------------------------------------------------- /examples/cnn/scripts/tf_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../tf_main.py 5 | 6 | ### validate and timing 7 | python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 8 | 9 | ### run in cpu 10 | # python ${mainpy} --model tf_mlp --gpu -1 --validate --timing 11 | -------------------------------------------------------------------------------- /examples/cnn/settings/tf_dist_s1_w16.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "162.105.146.117:34569", 4 | "162.105.146.117:34568", 5 | "162.105.146.117:34567", 6 | "162.105.146.117:34566", 7 | "162.105.146.117:34565", 8 | "162.105.146.117:34564", 9 | "162.105.146.117:34563", 10 | "162.105.146.117:34562", 11 | "162.105.146.118:34779", 12 | "162.105.146.118:34778", 13 | "162.105.146.118:34777", 14 | "162.105.146.118:34776", 15 | "162.105.146.118:34775", 16 | "162.105.146.118:34774", 17 | "162.105.146.118:34773", 18 | "162.105.146.118:34772" 19 | ], 20 | "ps": [ 21 | "162.105.146.117:34575" 22 | ] 23 | } -------------------------------------------------------------------------------- /examples/cnn/settings/tf_dist_s1_w4.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "162.105.146.119:34569", 4 | "162.105.146.119:34568", 5 | "162.105.146.119:34567", 6 | "162.105.146.119:34566" 7 | ], 8 | "ps": [ 9 | "162.105.146.119:34575" 10 | ] 11 | } -------------------------------------------------------------------------------- /examples/cnn/settings/tf_dist_s1_w8.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "162.105.146.119:34569", 4 | "162.105.146.119:34568", 5 | "162.105.146.119:34567", 6 | "162.105.146.119:34566", 7 | "162.105.146.119:34565", 8 | "162.105.146.119:34564", 9 | "162.105.146.119:34563", 10 | "162.105.146.119:34562" 11 | ], 12 | "ps": [ 13 | "162.105.146.119:34575" 14 | ] 15 | } 
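The `tf_dist_s1_w*.json` settings above follow TensorFlow's cluster-spec layout: a list of `worker` addresses plus a `ps` (parameter-server) address. A sketch of how a launcher in the spirit of `tf_launch_server.py`/`tf_launch_worker.py` can bring up one process from such a file (TF1-style API to match the examples here; the repository's actual launchers may differ in detail):

    import json
    import tensorflow as tf  # TF1-style API, as in the examples above

    with open("settings/tf_dist_s1_w8.json") as f:
        cluster_conf = json.load(f)  # {"worker": [...], "ps": [...]}

    cluster = tf.train.ClusterSpec(cluster_conf)
    # The parameter-server process serves variables and blocks here; a worker
    # would instead pass job_name="worker" and run its training session.
    server = tf.train.Server(cluster, job_name="ps", task_index=0)
    server.join()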
-------------------------------------------------------------------------------- /examples/cnn/tf_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tf_LogReg import tf_logreg 2 | from .tf_CNN import tf_cnn_3_layers 3 | from .tf_LeNet import tf_lenet 4 | from .tf_MLP import tf_mlp 5 | from .tf_RNN import tf_rnn 6 | from .tf_LSTM import tf_lstm 7 | from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34 8 | from .tf_VGG import tf_vgg16, tf_vgg19 9 | -------------------------------------------------------------------------------- /examples/cnn/tf_models/tf_LogReg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def tf_logreg(x, y_): 6 | ''' 7 | Logistic Regression model in TensorFlow, for MNIST dataset. 8 | 9 | Parameters: 10 | x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) 11 | y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) 12 | Return: 13 | loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) 14 | y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) 15 | ''' 16 | 17 | print("Build logistic regression model in tensorflow...") 18 | weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32)) 19 | bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32)) 20 | y = tf.matmul(x, weight) + bias 21 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) 22 | loss = tf.reduce_mean(loss) 23 | return loss, y 24 | -------------------------------------------------------------------------------- /examples/cnn/worker_conf0.json: -------------------------------------------------------------------------------- 1 | { 2 | "DMLC_ROLE":"worker", 3 | "WORKER_ID":"0", 4 | "DMLC_PS_ROOT_URI":"127.0.0.1", 5 | "DMLC_PS_ROOT_PORT":"13030", 6 | "DMLC_NUM_WORKER":"2", 7 | "DMLC_NUM_SERVER":"1", 8 | "DMLC_PS_VAN_TYPE":"p3" 9 | } 10 | -------------------------------------------------------------------------------- /examples/cnn/worker_conf1.json: -------------------------------------------------------------------------------- 1 | { 2 | "DMLC_ROLE":"worker", 3 | "WORKER_ID":"1", 4 | "DMLC_PS_ROOT_URI":"127.0.0.1", 5 | "DMLC_PS_ROOT_PORT":"13030", 6 | "DMLC_NUM_WORKER":"2", 7 | "DMLC_NUM_SERVER":"1", 8 | "DMLC_PS_VAN_TYPE":"p3" 9 | } 10 | -------------------------------------------------------------------------------- /examples/ctr/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | scripts.sh 4 | wandb/ 5 | ckpts/ 6 | -------------------------------------------------------------------------------- /examples/ctr/kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #pkill -f mnist_mlp_ps.py 3 | kill -9 $(pidof python) 4 | -------------------------------------------------------------------------------- /examples/ctr/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wdl_adult import wdl_adult 2 | from .dcn_criteo import dcn_criteo 3 | from .dc_criteo import dc_criteo 4 | from .wdl_criteo import wdl_criteo 5 | from .deepfm_criteo import dfm_criteo 6 | -------------------------------------------------------------------------------- /examples/ctr/settings/dist_s2_w4.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host:
hostname1 3 | servers: 1 4 | workers: 2 5 | chief: true 6 | - host: hostname2 7 | servers: 1 8 | workers: 2 9 | chief: false 10 | -------------------------------------------------------------------------------- /examples/ctr/settings/local_s1_w2.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 1 4 | workers: 2 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/ctr/settings/plx_local_spec.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - address: localhost 3 | cpus: [0] 4 | gpus: [0,1,2,3,4,5,6,7] 5 | -------------------------------------------------------------------------------- /examples/ctr/settings/tf_local_s1_w2.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:12349", 4 | "127.0.0.1:12348" 5 | ], 6 | "ps": [ 7 | "127.0.0.1:12345" 8 | ] 9 | } -------------------------------------------------------------------------------- /examples/ctr/settings/tf_local_s1_w4.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:23459", 4 | "127.0.0.1:23458", 5 | "127.0.0.1:23457", 6 | "127.0.0.1:23456" 7 | ], 8 | "ps": [ 9 | "127.0.0.1:23455" 10 | ] 11 | } -------------------------------------------------------------------------------- /examples/ctr/settings/tf_local_s1_w8.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:34569", 4 | "127.0.0.1:34568", 5 | "127.0.0.1:34567", 6 | "127.0.0.1:34566", 7 | "127.0.0.1:34565", 8 | "127.0.0.1:34564", 9 | "127.0.0.1:34563", 10 | "127.0.0.1:34562" 11 | ], 12 | "ps": [ 13 | "127.0.0.1:34575" 14 | ] 15 | } -------------------------------------------------------------------------------- /examples/ctr/tests/README.md: -------------------------------------------------------------------------------- 1 | * The scripts in this directory, except for the `*_wdl_adult.sh` scripts, are deprecated. We have implemented the `heturun` command to launch distributed deep learning tasks; please refer to the `*_wdl_adult.sh` scripts for the usage of `heturun`.
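For the `heturun -s 1 -w 4` invocations used in the `*_wdl_adult.sh` scripts below, the flags stand in for a single-node spec of the same shape as `local_s1_w2.yml` above. Illustratively (this literal is hypothetical; `heturun` builds the layout internally from its arguments):

    # `heturun -s 1 -w 4 python run_hetu.py ...` corresponds to a spec like:
    spec = {
        "nodes": [
            {"host": "localhost", "servers": 1, "workers": 4, "chief": True},
        ]
    }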
-------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_dcn_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & 7 | mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml 8 | -------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_dfm_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & 7 | mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml 8 | -------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_wdl_adult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | heturun -s 1 -w 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_wdl_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${workdir}/../models/load_data.py # download and preprocess the Criteo dataset 7 | heturun -s 1 -w 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 8 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_dcn_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dcn_criteo --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_dfm_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dfm_criteo --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_wdl_adult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | heturun -w 1 python ${mainpy} --model wdl_adult --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_wdl_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model wdl_criteo --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_dcn_criteo.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_dfm_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_wdl_adult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | heturun -s 1 -w 4 python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_wdl_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/tf_2workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../tf_launch_worker.py 5 | 6 | rm -f ${workdir}/../logs/temp*.log 7 | CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 0 > ${workdir}/../logs/temp0.log & 8 | CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 1 > ${workdir}/../logs/temp1.log & 9 | wait 10 | -------------------------------------------------------------------------------- /examples/ctr/tests/tf_4workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../tf_launch_worker.py 5 | 6 | rm -f ${workdir}/../logs/temp*.log 7 | CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 0 > ${workdir}/../logs/temp0.log & 8 | CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 1 > ${workdir}/../logs/temp1.log & 9 | CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 2 > ${workdir}/../logs/temp2.log & 10 | CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 3 > ${workdir}/../logs/temp3.log & 11 | wait 12 | -------------------------------------------------------------------------------- /examples/ctr/tf_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tf_dcn_criteo import dcn_criteo 2 | from .tf_deepfm_criteo import dfm_criteo 3 | from .tf_wdl_criteo import wdl_criteo 4 | from .tf_wdl_adult
import wdl_adult 5 | -------------------------------------------------------------------------------- /examples/gnn/config/local_w2.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 2 5 | DMLC_NUM_SERVER : 1 6 | launch : 7 | worker : 2 8 | server : 1 9 | graph_server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/config/local_w4.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 4 5 | DMLC_NUM_SERVER : 1 6 | launch : 7 | worker : 4 8 | server : 1 9 | graph_server : 4 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/config/local_w8.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 8 5 | DMLC_NUM_SERVER : 1 6 | launch : 7 | worker : 8 8 | server : 1 9 | graph_server : 4 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/config/single.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 1 5 | DMLC_NUM_SERVER : 0 6 | launch : 7 | worker : 1 8 | server : 0 9 | graph_server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/gnn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/gnn/gnn_model/__init__.py -------------------------------------------------------------------------------- /examples/gnn/gnn_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/gnn/gnn_tools/__init__.py -------------------------------------------------------------------------------- /examples/moe/README.md: -------------------------------------------------------------------------------- 1 | ## Structure 2 | ``` 3 | - moe 4 | - scripts/ Test scripts 5 | - test_moe_top.py TopK MoE 6 | - test_moe_hash.py Hash Layer MoE 7 | - test_moe_ktop1.py KTop1 MoE 8 | - test_moe_base.py BASE Layer MoE 9 | - test_moe_sam.py Switch and Mixture MoE 10 | 11 | ``` 12 | ## Usage 13 | Here are some examples of running the scripts. 14 | ```bash 15 | bash scripts/run_top1.sh 16 | ``` 17 | Change ht.alltoall\_op to ht.halltoall\_op in the model definition (located in Hetu/python/hetu/layers) to use Hierarchical AllToAll, as in the sketch below.
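A minimal sketch of the swap described above. The real call site is in Hetu/python/hetu/layers (e.g. moe_layer.py); the surrounding function and the exact signature of `ht.alltoall_op` are illustrative assumptions, not the file's actual contents:

```python
import hetu as ht

def dispatch_to_experts(dispatched_input):
    # Default: flat AllToAll across all workers (op name from the README above).
    expert_input = ht.alltoall_op(dispatched_input)
    # Hierarchical AllToAll: replace the line above with the h-prefixed op:
    #   expert_input = ht.halltoall_op(dispatched_input)
    return expert_input
```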
18 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_2node_comm.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=DEBUG mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=/home/Hetu/python -H node1:8,node2:8 /root/anaconda3/envs/moe/bin/python /home/Hetu/tests/test_ha2agather.py 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_base.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_base.py --num_local_experts=1 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_hash.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_hash.py --num_local_experts=2 --batch_size=4 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_ktop1.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_ktop1.py --k=2 --num_local_experts=2 --batch_size=64 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_mnist.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 2 -x PYTHONPATH=/home/Hetu/python python ../test_mnist.py --top=1 --num_local_experts=2 --batch_size=16 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_sam.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_sam.py --k=1 --num_local_experts=4 --batch_size=4 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_top1.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=DEBUG mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/jizhicfs/pinxuezhao/Hetu_newest/python python3 /jizhicfs/pinxuezhao/Hetu_newest/examples/moe/test_moe_top.py --top=1 --num_local_experts=2 --batch_size=16 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_top1_16gpus.sh: -------------------------------------------------------------------------------- 1 | # change ht.alltoall_op to ht.halltoall_op in Hetu/python/hetu/layers/moe_layer.py if you want to use 2 | # hierarchical AllToAll.
3 | 4 | NCCL_DEBUG=DEBUG mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=/home/Hetu/python -H node1:8,node2:8 /root/anaconda3/envs/moe/bin/python /home/Hetu/tests/test_moe_top.py --top=1 --num_local_experts=1 --batch_size=1 5 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_top2.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_top.py --top=2 --num_local_experts=2 --batch_size=64 2 | -------------------------------------------------------------------------------- /examples/nlp/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | iwslt2016/ 3 | logs/ 4 | cached_data 5 | preprocessed_data -------------------------------------------------------------------------------- /examples/nlp/README.md: -------------------------------------------------------------------------------- 1 | # NLP Examples 2 | In this directory we provide simple implementations of the Transformer model. We use the IWSLT2016 de-en dataset. 3 | ## Structure 4 | ``` 5 | - nlp 6 | - hparams.py Hyperparameters 7 | - prepare_data.py Downloading and preparing data 8 | - data_load.py Dataloader 9 | - hetu_transformer.py Transformer model in hetu 10 | - tf_transformer.py Transformer model in tensorflow 11 | - train_hetu_transformer.py Trainer for hetu 12 | - train_tf_transformer.py Trainer for tensorflow 13 | ``` 14 | ## Usage 15 | ```bash 16 | python train_{framework}_transformer.py 17 | ``` 18 | To change the hyperparameters, please modify the `hparams.py` file. -------------------------------------------------------------------------------- /examples/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/nlp/__init__.py -------------------------------------------------------------------------------- /examples/nlp/bert/.gitignore: -------------------------------------------------------------------------------- 1 | preprocessed_data 2 | cached_data -------------------------------------------------------------------------------- /examples/nlp/bert/config1.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 1 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/config2.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 2 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/config4.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/config8.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 8 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/data/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | -------------------------------------------------------------------------------- /examples/nlp/bert/data/wikiextractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/nlp/bert/data/wikiextractor/__init__.py -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_hetu_bert_base.sh: -------------------------------------------------------------------------------- 1 | python test_glue_hetu_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 64 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 768 \ 7 | --num_hidden_layers 12 \ 8 | --num_attention_heads 12 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_hetu_bert_large.sh: -------------------------------------------------------------------------------- 1 | python test_glue_hetu_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 32 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 1024 \ 7 | --num_hidden_layers 24 \ 8 | --num_attention_heads 16 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_pytorch_bert_base.sh: -------------------------------------------------------------------------------- 1 | python test_glue_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 64 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 768 \ 7 | --num_hidden_layers 12 \ 8 | --num_attention_heads 12 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_pytorch_bert_large.sh: -------------------------------------------------------------------------------- 1 | python test_glue_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 32 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 1024 \ 7 | --num_hidden_layers 24 \ 8 | --num_attention_heads 16 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base.sh: 
-------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert.py 3 | data_path=${workdir}/../data 4 | 5 | python ${mainpy} \ 6 | --gpu_id 0 \ 7 | --train_batch_size 64 \ 8 | --data_path ${data_path} \ 9 | --dataset wikicorpus_en \ 10 | --vocab_size 30522 \ 11 | --hidden_size 768 \ 12 | --num_hidden_layers 12 \ 13 | --num_attention_heads 12 \ 14 | --seq_length 128 \ 15 | --epochs 20 \ 16 | --lr 1e-5 \ 17 | --adam_weight_decay 0.01 \ 18 | --hidden_act relu \ 19 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base_dp.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert_dp.py 3 | config=${workdir}/../config4.yml 4 | data_path=${workdir}/../data 5 | export PYTHONPATH=$HETU_PATH 6 | heturun -c ${config} python ${mainpy} \ 7 | --num_gpus 4 \ 8 | --train_batch_size 64 \ 9 | --data_path ${data_path} \ 10 | --dataset wikicorpus_en \ 11 | --vocab_size 30522 \ 12 | --hidden_size 768 \ 13 | --num_hidden_layers 12 \ 14 | --num_attention_heads 12 \ 15 | --seq_length 512 \ 16 | --epochs 80 \ 17 | --lr 1e-5 \ 18 | --adam_weight_decay 0.01 \ 19 | --hidden_act relu \ 20 | --dropout_prob 0.1 21 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base_moe.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert_dp_moe.py 3 | config=${workdir}/../config4.yml 4 | data_path=${workdir}/../data 5 | export PYTHONPATH=$HETU_PATH 6 | heturun -c ${config} python3 ${mainpy} \ 7 | --num_gpus 4 \ 8 | --train_batch_size 64 \ 9 | --data_path ${data_path} \ 10 | --dataset wikicorpus_en \ 11 | --vocab_size 30522 \ 12 | --hidden_size 768 \ 13 | --num_hidden_layers 12 \ 14 | --num_attention_heads 12 \ 15 | --seq_length 512 \ 16 | --epochs 80 \ 17 | --lr 1e-5 \ 18 | --adam_weight_decay 0.01 \ 19 | --hidden_act relu \ 20 | --dropout_prob 0.1 21 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | heturun -s 1 -w 4 python train_hetu_bert_ps.py \ 4 | --train_batch_size 32 \ 5 | --dataset wikicorpus_en \ 6 | --vocab_size 30522 \ 7 | --hidden_size 768 \ 8 | --num_hidden_layers 12 \ 9 | --num_attention_heads 12 \ 10 | --seq_length 128 \ 11 | --epochs 20 \ 12 | --lr 1e-5 \ 13 | --adam_weight_decay 0.01 \ 14 | --hidden_act relu \ 15 | --dropout_prob 0.1 16 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_large.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert.py 3 | data_path=${workdir}/../data 4 | 5 | python ${mainpy} \ 6 | --gpu_id 0 \ 7 | --train_batch_size 32 \ 8 | --data_path ${data_path} \ 9 | --dataset wikicorpus_en \ 10 | --vocab_size 30522 \ 11 | --hidden_size 1024 \ 12 | --num_hidden_layers 24 \ 13 | --num_attention_heads 16 \ 14 | --seq_length 128 \ 15 | --epochs 20 \ 16 | --lr 1e-5 \ 17 | --adam_weight_decay 0.01 \ 18 | --hidden_act relu \ 19 | --dropout_prob 0.1 
-------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_large_dp.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert_dp.py 3 | config=${workdir}/../config4.yml 4 | data_path=${workdir}/../data 5 | 6 | heturun -c ${config} python ${mainpy} \ 7 | --num_gpus 4 \ 8 | --train_batch_size 32 \ 9 | --data_path ${data_path} \ 10 | --dataset wikicorpus_en \ 11 | --vocab_size 30522 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 128 \ 16 | --epochs 20 \ 17 | --lr 1e-5 \ 18 | --adam_weight_decay 0.01 \ 19 | --hidden_act relu \ 20 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_large_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | heturun -s 1 -w 4 python train_hetu_bert_ps.py \ 4 | --train_batch_size 32 \ 5 | --dataset wikicorpus_en \ 6 | --vocab_size 30522 \ 7 | --hidden_size 1024 \ 8 | --num_hidden_layers 24 \ 9 | --num_attention_heads 16 \ 10 | --seq_length 128 \ 11 | --epochs 20 \ 12 | --lr 1e-5 \ 13 | --adam_weight_decay 0.01 \ 14 | --hidden_act relu \ 15 | --dropout_prob 0.1 16 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_pytorch_bert_base.sh: -------------------------------------------------------------------------------- 1 | python train_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 64 \ 4 | --dataset wikicorpus_en \ 5 | --vocab_size 30522 \ 6 | --hidden_size 768 \ 7 | --num_hidden_layers 12 \ 8 | --num_attention_heads 12 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 1e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_pytorch_bert_large.sh: -------------------------------------------------------------------------------- 1 | python train_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 32 \ 4 | --dataset wikicorpus_en \ 5 | --vocab_size 30522 \ 6 | --hidden_size 1024 \ 7 | --num_hidden_layers 24 \ 8 | --num_attention_heads 16 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 1e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/rec/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | ckpts/ 4 | -------------------------------------------------------------------------------- /examples/rec/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import RatingModel_Head 2 | from .mf import MF_Head 3 | from .neumf import NeuMF_Head 4 | from .gmf import GMF_Head 5 | from .mlp import MLP_Head 6 | -------------------------------------------------------------------------------- /examples/rec/models/gmf.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | import hetu.layers as htl 3 | from .base import RatingModel_Head 4 | 5 | 6 | class GMF_Head(RatingModel_Head): 7 | def __init__(self, embed_dim, nsparse=2, ndense=0): 8 | # fixed 2 layers 9 | 
super().__init__(embed_dim) 10 | self.predict_layer = htl.Linear( 11 | self.embed_dim, 1, initializer=ht.init.GenXavierNormal(), activation=None, name=f'predict') 12 | 13 | def __call__(self, embeddings, dense, label): 14 | gmf_embs = ht.array_reshape_op(embeddings, [-1, 2, self.embed_dim]) 15 | output_gmf = ht.reduce_mul_op(gmf_embs, [1]) 16 | prediction = self.predict_layer(output_gmf) 17 | prediction = ht.array_reshape_op(prediction, (-1,)) 18 | return self.output(prediction, label) 19 | -------------------------------------------------------------------------------- /examples/rec/models/mf.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | from .base import RatingModel_Head 3 | 4 | 5 | class MF_Head(RatingModel_Head): 6 | def __call__(self, embeddings, dense, label): 7 | embeddings = ht.array_reshape_op(embeddings, [-1, 2, self.embed_dim]) 8 | output = ht.reduce_mul_op(embeddings, [1]) 9 | prediction = ht.reduce_sum_op(output, [-1]) 10 | return self.output(prediction, label) 11 | -------------------------------------------------------------------------------- /examples/rec/models/mlp.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | import hetu.layers as htl 3 | from .base import RatingModel_Head 4 | 5 | 6 | class MLP_Head(RatingModel_Head): 7 | def __init__(self, embed_dim, nsparse=2, ndense=0): 8 | # fixed 2 layers 9 | assert embed_dim % 4 == 0 10 | super().__init__(embed_dim) 11 | self.factor_num = embed_dim // 4 12 | self.mlp_layers = self.create_mlp( 13 | [8 * self.factor_num, 4 * self.factor_num, 2 * self.factor_num, self.factor_num]) 14 | self.predict_layer = htl.Linear( 15 | self.factor_num, 1, initializer=ht.init.GenXavierNormal(), activation=None, name=f'predict') 16 | 17 | def __call__(self, embeddings, dense, label): 18 | input_mlp = ht.array_reshape_op(embeddings, [-1, 2 * self.embed_dim]) 19 | output_mlp = self.mlp_layers(input_mlp) 20 | prediction = self.predict_layer(output_mlp) 21 | prediction = ht.array_reshape_op(prediction, (-1,)) 22 | return self.output(prediction, label) 23 | -------------------------------------------------------------------------------- /examples/rec/test/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | models/ 4 | -------------------------------------------------------------------------------- /examples/rec/test/config.py: -------------------------------------------------------------------------------- 1 | # dataset name 2 | dataset = 'ml-1m' 3 | assert dataset in ['ml-1m', 'pinterest-20'] 4 | 5 | # model name 6 | model = 'NeuMF-end' 7 | assert model in ['MLP', 'GMF', 'NeuMF-end', 'NeuMF-pre'] 8 | 9 | # paths 10 | main_path = './datasets/' 11 | 12 | train_rating = main_path + '{}.train.rating'.format(dataset) 13 | test_rating = main_path + '{}.test.rating'.format(dataset) 14 | test_negative = main_path + '{}.test.negative'.format(dataset) 15 | 16 | model_path = './models/' 17 | GMF_model_path = model_path + 'GMF.pth' 18 | MLP_model_path = model_path + 'MLP.pth' 19 | NeuMF_model_path = model_path + 'NeuMF.pth' 20 | -------------------------------------------------------------------------------- /examples/runner/local_allreduce.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | 
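Several of the ctr, bert, and runner sections above share this small launcher-config schema (per-node host, server/worker counts, one chief). As a hedged illustration, the sketch below reads such a file with plain PyYAML; hetu's real consumer is `DistConfig` from `hetu.context`, whose constructor signature is not shown in this dump, so it is deliberately not used here:

```python
import yaml

# load one of the launcher configs shown above
with open('examples/runner/local_ps.yml') as f:
    cfg = yaml.safe_load(f)

for node in cfg['nodes']:
    # each node lists its host plus per-node server/worker process counts;
    # exactly one node is marked chief: true
    print(node['host'], node.get('servers', 0),
          node['workers'], node.get('chief', False))
```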
-------------------------------------------------------------------------------- /examples/runner/local_ps.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 1 4 | workers: 4 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/runner/models/__init__.py -------------------------------------------------------------------------------- /examples/runner/parallel/config1.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 1 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config2.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 2 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config3.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 3 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config4.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config6.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 6 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config8.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 8 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/dist_config8.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | - host: node2 7 | servers: 0 8 | workers: 4 -------------------------------------------------------------------------------- /examples/runner/parallel/validate_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path as osp 3 | import argparse 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('number', default=None) 8 | parser.add_argument('--rtol', default='1e-6') 9 | args = parser.parse_args() 10 | 11 | directory = 'results' 12 | base = np.load(osp.join(directory, 'base.npy')) 13 | print('Ground truth:', base) 14 | for i in range(int(args.number)): 15 | res = np.load(osp.join(directory, 'res%d.npy' % i)) 16 | np.testing.assert_allclose(base, res, rtol=float(args.rtol)) 17 | print('Result id %d passed test.' 
% i, res) 18 | -------------------------------------------------------------------------------- /examples/runner/remote_allreduce.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 4 4 | chief: true 5 | - host: node2 6 | workers: 2 7 | -------------------------------------------------------------------------------- /examples/runner/remote_ps.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | servers: 1 4 | workers: 4 5 | chief: true 6 | - host: node2 7 | servers: 1 8 | workers: 2 9 | -------------------------------------------------------------------------------- /hetu.exp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | echo "Hetu root is" $path 4 | export PATH="$path/bin:$PATH" 5 | export PYTHONPATH="$path/python:$path/build/lib:$path/third_party/GraphMix/python:$PYTHONPATH:$path/third_party/HetuML/hetuml/python" 6 | -------------------------------------------------------------------------------- /img/alibabacloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/alibabacloud.png -------------------------------------------------------------------------------- /img/features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/features.png -------------------------------------------------------------------------------- /img/hetu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/hetu.png -------------------------------------------------------------------------------- /img/kuaishou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/kuaishou.png -------------------------------------------------------------------------------- /img/tencent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/tencent.png -------------------------------------------------------------------------------- /ps-lite/.gitignore: -------------------------------------------------------------------------------- 1 | src/meta.pb.cc 2 | src/meta.pb.h 3 | -------------------------------------------------------------------------------- /ps-lite/include/ps/base.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015 by Contributors 3 | */ 4 | #ifndef PS_BASE_H_ 5 | #define PS_BASE_H_ 6 | #include <limits> 7 | #include "ps/internal/utils.h" 8 | namespace ps { 9 | 10 | #if USE_KEY32 11 | /*! \brief Use unsigned 32-bit int as the key type */ 12 | using Key = uint32_t; 13 | #else 14 | /*! \brief Use unsigned 64-bit int as the key type */ 15 | using Key = uint64_t; 16 | #endif 17 | /*!
\brief The maximal allowed key value */ 18 | static const Key kMaxKey = std::numeric_limits<Key>::max(); 19 | /** \brief node ID for the scheduler */ 20 | static const int kScheduler = 1; 21 | /** 22 | * \brief the server node group ID 23 | * 24 | * group id can be combined: 25 | * - kServerGroup + kScheduler means all server nodes and the scheduler 26 | * - kServerGroup + kWorkerGroup means all server and worker nodes 27 | */ 28 | static const int kServerGroup = 2; 29 | /** \brief the worker node group ID */ 30 | static const int kWorkerGroup = 4; 31 | 32 | } // namespace ps 33 | #endif // PS_BASE_H_ 34 | -------------------------------------------------------------------------------- /ps-lite/include/ps/psf/preduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "PSFunc.h" 4 | 5 | namespace ps { 6 | 7 | template <> 8 | struct PSFData<kPReduceGetPartner> { 9 | static constexpr PsfGroup group = PsfGroup::kPReduceScheduler; 10 | static constexpr const char* name = "PReduceGetPartner"; 11 | using Request = tuple< 12 | Key, // reduce group key, each pipeline stage has a unique key 13 | int, // worker rank 14 | size_t, // desired worker num 15 | float // max wait time (ms) 16 | >; 17 | using Response = tuple< 18 | SArray<int> // worker ids of all the partners to reduce with 19 | >; 20 | static void _callback(const Response &response, int* tgt) { 21 | auto &val = get<0>(response); 22 | std::copy(val.begin(), val.end(), tgt); 23 | tgt[val.size()] = -1; 24 | } 25 | }; 26 | 27 | } // namespace ps 28 | -------------------------------------------------------------------------------- /ps-lite/include/ps/range.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015 by Contributors 3 | */ 4 | #ifndef PS_RANGE_H_ 5 | #define PS_RANGE_H_ 6 | #include "ps/internal/utils.h" 7 | namespace ps { 8 | 9 | /** 10 | * \brief a range [begin, end) 11 | */ 12 | class Range { 13 | public: 14 | Range() : Range(0, 0) { 15 | } 16 | Range(uint64_t begin, uint64_t end) : begin_(begin), end_(end) { 17 | } 18 | 19 | uint64_t begin() const { 20 | return begin_; 21 | } 22 | uint64_t end() const { 23 | return end_; 24 | } 25 | uint64_t size() const { 26 | return end_ - begin_; 27 | } 28 | 29 | private: 30 | uint64_t begin_; 31 | uint64_t end_; 32 | }; 33 | 34 | } // namespace ps 35 | #endif // PS_RANGE_H_ 36 | -------------------------------------------------------------------------------- /ps-lite/src/PSFunc.cc: -------------------------------------------------------------------------------- 1 | #include "ps/psf/PSFunc.h" 2 | 3 | #include <unordered_map> 4 | #include <mutex> 5 | 6 | namespace ps { 7 | 8 | static std::unordered_map<PsfType, const char*> psfunction_names; 9 | 10 | static void initnames(PSFData<kNumPSfunction>) {} 11 | 12 | template <PsfType ftype> static void initnames(PSFData<ftype>) { 13 | psfunction_names[ftype] = PSFData<ftype>::name; 14 | initnames(PSFData<PsfType(ftype + 1)>()); 15 | } 16 | 17 | const char* 18 | getPSFunctionName(const PsfType &ftype) { 19 | static std::once_flag flag; 20 | std::call_once(flag, []() { 21 | initnames(PSFData<PsfType(0)>()); 22 | }); 23 | return psfunction_names[ftype]; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /python/graphboard/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/python/graphboard/__init__.py --------------------------------------------------------------------------------
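The node-group arithmetic described in the `ps/base.h` comment above is easy to check by hand; the snippet below is a plain-Python mirror of the constants (values taken directly from base.h), not part of the library itself:

```python
K_SCHEDULER = 1      # node ID of the scheduler
K_SERVER_GROUP = 2   # the server node group ID
K_WORKER_GROUP = 4   # the worker node group ID

# the IDs are disjoint bits, so sums address combined groups:
assert K_SERVER_GROUP + K_SCHEDULER == 3      # all servers plus the scheduler
assert K_SERVER_GROUP + K_WORKER_GROUP == 6   # all server and worker nodes
```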
/python/graphboard/graph2fig.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from graphviz import Digraph 4 | import subprocess 5 | import os 6 | import signal 7 | 8 | pid = None 9 | 10 | 11 | def show(executor, port=9997): 12 | print("Generating graph figure") 13 | dot = Digraph() 14 | dot.format = 'png' 15 | for node in executor.topo_order: 16 | dot.node(str(node.id), node.name) 17 | print(node.desc) 18 | if node.inputs: 19 | for n in node.inputs: 20 | dot.edge(str(n.id), str(node.id)) 21 | print(dot.source) 22 | dot.render('python/graphboard/output') 23 | print("Starting server..") 24 | cmd = 'cd python/graphboard; python -m SimpleHTTPServer '+str(port) 25 | pro = subprocess.Popen(cmd, shell=True, preexec_fn=os.setsid) 26 | global pid 27 | pid = pro.pid 28 | 29 | 30 | def close(): 31 | global pid 32 | os.killpg(pid, signal.SIGTERM) 33 | -------------------------------------------------------------------------------- /python/graphboard/index.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <title>Hetu</title> 4 | </head> 5 | <body> 6 | 7 | <h2> 8 | Hetu Graph Board: 9 | </h2>
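A minimal usage sketch for the graphboard module above. `executor` is assumed to be an already-built hetu executor exposing `topo_order` (which is what `show` iterates over), and the import assumes `python/graphboard`'s parent directory is on `sys.path`; both are assumptions, not documented API:

```python
from graphboard import graph2fig  # assumes python/ is importable

# executor: a built hetu executor with a topo_order of graph nodes
graph2fig.show(executor, port=9997)   # renders output.png, serves index.html
# ...browse http://localhost:9997/ to inspect the dataflow graph...
graph2fig.close()                     # kill the spawned HTTP server process
```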
10 | <img src="output.png" alt="dataflow_graph"> 11 | 12 | </body> 13 | </html> -------------------------------------------------------------------------------- /python/hetu/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .gpu_ops import * 3 | from .context import context, get_current_context, DistConfig 4 | from .dataloader import dataloader_op, Dataloader, GNNDataLoaderOp 5 | from .ndarray import cpu, gpu, rcpu, rgpu, array, sparse_array, empty, is_gpu_ctx, IndexedSlices 6 | from . import optimizer as optim 7 | from . import lr_scheduler as lr 8 | from . import initializers as init 9 | from . import data 10 | from . import layers 11 | from . import random 12 | from . import distributed_strategies as dist 13 | from .profiler import HetuProfiler, NCCLProfiler, HetuSimulator 14 | from .tokenizers import * 15 | -------------------------------------------------------------------------------- /python/hetu/communicator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/python/hetu/communicator/__init__.py -------------------------------------------------------------------------------- /python/hetu/communicator/test.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=$PYTHONPATH:/home/Hetu/python 2 | export NCCL_DEBUG=INFO 3 | mpirun --allow-run-as-root -np 2 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -H node1:1,node2:1 /root/anaconda3/envs/moe/bin/python mpi_nccl_comm.py 4 | -------------------------------------------------------------------------------- /python/hetu/cpu_links/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dnnl_op import * 3 | -------------------------------------------------------------------------------- /python/hetu/distributed_strategies/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Strategy, BaseSearchingStrategy 2 | from .simple import DataParallel, ModelParallel4CNN, ModelParallel4LM, OneWeirdTrick4CNN, MegatronLM 3 | from .flexflow import FlexFlowSearching 4 | from .optcnn import OptCNNSearching 5 | from .gpipe import GPipeSearching 6 | from .pipedream import PipeDreamSearching 7 | from .pipeopt import PipeOptSearching 8 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AbsLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from ..
import ndarray as _nd 5 | 6 | 7 | def abs_val(in_mat, out_mat, stream=None): 8 | assert isinstance(in_mat, _nd.NDArray) 9 | assert isinstance(out_mat, _nd.NDArray) 10 | _LIB.DLGpuAbs(in_mat.handle, out_mat.handle, stream.handle if stream else None) 11 | 12 | def abs_gradient(in_mat, grad_mat, out_mat, stream=None): 13 | assert isinstance(in_mat, _nd.NDArray) 14 | assert isinstance(grad_mat, _nd.NDArray) 15 | assert isinstance(out_mat, _nd.NDArray) 16 | _LIB.DLGpuAbsGradient(grad_mat.handle, in_mat.handle, out_mat.handle, stream.handle if stream else None) 17 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AddConstLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_elementwise_add_by_const(in_mat, val, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | _LIB.DLGpuMatrixElementwiseAddByConst( 12 | in_mat.handle, ctypes.c_float(val), out_mat.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AddmmLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def addmm(input_mat, matA, matB, matC, alpha, beta, stream=None): 9 | assert isinstance(input_mat, _nd.NDArray) 10 | assert isinstance(matA, _nd.NDArray) 11 | assert isinstance(matB, _nd.NDArray) 12 | assert isinstance(matC, _nd.NDArray) 13 | _LIB.DLGpuAddmm(input_mat.handle, matA.handle, matB.handle, ctypes.c_float( 14 | alpha), ctypes.c_float(beta), matC.handle, stream.handle if stream else None) 15 | 16 | 17 | def addmm_gradient(input_mat, output_mat, axis, beta, stream=None): 18 | assert isinstance(input_mat, _nd.NDArray) 19 | assert isinstance(output_mat, _nd.NDArray) 20 | _LIB.DLGpuAddmmGradient(input_mat.handle, output_mat.handle, ctypes.c_int( 21 | axis), ctypes.c_float(beta), stream.handle if stream else None) 22 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArangeLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def arange(start, end, step, out_mat, stream=None): 9 | assert isinstance(out_mat, _nd.NDArray) 10 | _LIB.DLGpuArange(ctypes.c_float(start), ctypes.c_float(end), ctypes.c_float( 11 | step), out_mat.handle, stream.handle if stream else None) 12 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArgmaxLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def argmax(in_mat, out_mat, dim, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | _LIB.DLGpuArgmax( 12 | in_mat.handle, out_mat.handle, ctypes.c_int(dim), stream.handle if stream else None) 13 | 14 | 15 | def argmax_partial(in_mat, full_mask, out_mat, dim, topk, stream=None): 16 | assert isinstance(in_mat, _nd.NDArray) 17 | assert isinstance(full_mask, _nd.NDArray) 18 | assert isinstance(out_mat, _nd.NDArray) 19 | _LIB.DLGpuArgmaxPartial( 20 | in_mat.handle, full_mask.handle, out_mat.handle, ctypes.c_int(dim), ctypes.c_int(topk), stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArgsortLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def argsort(input, output, index, output_index, dim, descending, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | assert isinstance(index, _nd.NDArray) 12 | assert isinstance(output_index, _nd.NDArray) 13 | 14 | _LIB.DLGpuArgsort( 15 | input.handle, output.handle, index.handle, output_index.handle, ctypes.c_int(dim), descending, stream.handle if stream else None) 16 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArraySetLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def array_set(arr, value, stream=None): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuArraySet(arr.handle, ctypes.c_float( 11 | value), stream.handle if stream else None) 12 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AvgPoolLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def average_pooling2d(in_arr, kernel_H, kernel_W, pooled_layer, padding=0, stride=1, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(pooled_layer, _nd.NDArray) 11 | _LIB.DLGpuAvgerage_Pooling2d( 12 | in_arr.handle, kernel_H, kernel_W, pooled_layer.handle, padding, stride, stream.handle if stream else None) 13 | 14 | 15 | def average_pooling2d_gradient(in_gradient_y, kernel_H, kernel_W, out_gradient_x, padding=0, stride=1, stream=None): 16 | assert isinstance(in_gradient_y, _nd.NDArray) 17 | assert isinstance(out_gradient_x, _nd.NDArray) 18 | _LIB.DLGpuAvgerage_Pooling2d_gradient( 19 | in_gradient_y.handle, kernel_H, kernel_W, out_gradient_x.handle, padding, stride, stream.handle if stream else None) 20 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/BaddbmmLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def baddbmm(input_mat, matA, matB, matC, alpha, beta, stream=None): 9 | assert isinstance(input_mat, _nd.NDArray) 10 | assert isinstance(matA, _nd.NDArray) 11 | assert isinstance(matB, _nd.NDArray) 12 | assert isinstance(matC, _nd.NDArray) 13 | _LIB.DLGpuBaddbmm(input_mat.handle, matA.handle, matB.handle, ctypes.c_float( 14 | alpha), ctypes.c_float(beta), matC.handle, stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/BatchMatrixMultLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. import ndarray as _nd 5 | 6 | 7 | def batch_matrix_multiply(matA, transA, matB, transB, matC, stream=None): 8 | assert isinstance(matA, _nd.NDArray) 9 | assert isinstance(matB, _nd.NDArray) 10 | assert isinstance(matC, _nd.NDArray) 11 | _LIB.DLGpuBatchMatrixMultiply( 12 | matA.handle, transA, matB.handle, transB, matC.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/BroadcastLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def broadcast_to(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuBroadcastTo(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CloneLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | def clone(input_mat, output_mat, stream=None): 8 | assert isinstance(input_mat, _nd.NDArray) 9 | assert isinstance(output_mat, _nd.NDArray) 10 | 11 | _LIB.DLGpuClone( 12 | input_mat.handle, output_mat.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ConcatLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def concat(in_arr1, in_arr2, out_arr, axis=0, stream=None): 9 | assert isinstance(in_arr1, _nd.NDArray) 10 | assert isinstance(in_arr2, _nd.NDArray) 11 | assert isinstance(out_arr, _nd.NDArray) 12 | _LIB.DLGpuConcat(in_arr1.handle, in_arr2.handle, 13 | out_arr.handle, axis, stream.handle if stream else None) 14 | 15 | 16 | def concat_gradient(out_grad_arr, in_arr, axis=0, idx=0, stream=None): 17 | assert isinstance(out_grad_arr, _nd.NDArray) 18 | assert isinstance(in_arr, _nd.NDArray) 19 | _LIB.DLGpuConcat_gradient( 20 | out_grad_arr.handle, in_arr.handle, axis, idx, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ConcatenateLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def concatenate(in_arrs, out_arr, axis=0, stream=None): 9 | assert isinstance(out_arr, _nd.NDArray) 10 | offset = 0 11 | for arr in in_arrs: 12 | assert isinstance(arr, _nd.NDArray) 13 | _LIB.DLGpuConcatenate( 14 | arr.handle, out_arr.handle, 15 | ctypes.c_int(axis), ctypes.c_int(offset), 16 | stream.handle if stream else None) 17 | offset += arr.handle.contents.shape[axis] 18 | 19 | 20 | def concatenate_gradient(out_grad_arr, in_arr, axis, offset, stream=None): 21 | assert isinstance(out_grad_arr, _nd.NDArray) 22 | assert isinstance(in_arr, _nd.NDArray) 23 | _LIB.DLGpuConcatenate_gradient( 24 | out_grad_arr.handle, in_arr.handle, 25 | ctypes.c_int(axis), ctypes.c_int(offset), 26 | stream.handle if stream else None) 27 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ConstPowLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def const_pow(in_arr, out_arr, val, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuConstPow(in_arr.handle, ctypes.c_float(val), 12 | out_arr.handle, stream.handle if stream else None) 13 | 14 | 15 | def const_pow_gradient(in_arr, grad_arr, out_arr, val, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuConstPowGradient( 20 | in_arr.handle, grad_arr.handle, ctypes.c_float(val), out_arr.handle, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/Conv2dBroadcastLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def conv2d_broadcast_to(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuConv2d_broadcast_to( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/Conv2dReduceSumLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def conv2d_reduce_sum(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuConv2d_reduce_sum( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CrossEntropyLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def cross_entropy(y, y_, out, stream=None): 9 | assert isinstance(y, _nd.NDArray) 10 | assert isinstance(y_, _nd.NDArray) 11 | assert isinstance(out, _nd.NDArray) 12 | _LIB.DLGpuCrossEntropy( 13 | y.handle, y_.handle, out.handle, stream.handle if stream else None) 14 | 15 | 16 | def cross_entropy_gradient(grad_arr, y_arr, label, out_arr, stream=None): 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(y_arr, _nd.NDArray) 19 | assert isinstance(label, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.DLGpuCrossEntropyGradient( 22 | grad_arr.handle, y_arr.handle, label.handle, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CrossEntropySparseLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def cross_entropy_sparse(y, y_, ignored_index, out, stream=None): 9 | assert isinstance(y, _nd.NDArray) 10 | assert isinstance(y_, _nd.NDArray) 11 | assert isinstance(out, _nd.NDArray) 12 | _LIB.DLGpuCrossEntropySparse( 13 | y.handle, y_.handle, ignored_index, out.handle, stream.handle if stream else None) 14 | 15 | 16 | def cross_entropy_sparse_gradient(grad_arr, y_arr, label, ignored_index, out_arr, stream=None): 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(y_arr, _nd.NDArray) 19 | assert isinstance(label, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.DLGpuCrossEntropySparseGradient( 22 | grad_arr.handle, y_arr.handle, label.handle, ignored_index, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CudnnConv2dAddBiasLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def CuDNN_conv2d_with_bias(in_arr_x, in_arr_f, bias, out_arr, padding=(0, 0), stride=(1, 1), stream=None): 9 | assert isinstance(in_arr_x, _nd.NDArray) 10 | assert isinstance(in_arr_f, _nd.NDArray) 11 | assert isinstance(bias, _nd.NDArray) 12 | assert isinstance(out_arr, _nd.NDArray) 13 | _LIB.Cudnn_Conv2dAddBias(in_arr_x.handle, in_arr_f.handle, bias.handle, 14 | out_arr.handle, padding[0], padding[1], stride[0], stride[1], stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CudnnSoftmaxCrossEntropyLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def CuDNN_softmax_cross_entropy(y, y_, out, stream=None): 9 | assert isinstance(y, _nd.NDArray) 10 | assert isinstance(y_, _nd.NDArray) 11 | assert isinstance(out, _nd.NDArray) 12 | _LIB.CuDNN_DLGpuSoftmaxEntropy( 13 | y.handle, y_.handle, out.handle, stream.handle if stream else None) 14 | 15 | 16 | def CuDNN_softmax_cross_entropy_gradient(grad_arr, y_arr, label, out_arr, stream=None): 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(y_arr, _nd.NDArray) 19 | assert isinstance(label, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.CuDNN_DLGpuSoftmaxEntropyGradient( 22 | grad_arr.handle, y_arr.handle, label.handle, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CumSumLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def cumsum_with_bias(input, output, bias, dim, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | _LIB.DLGpuCumsumWithBias( 12 | input.handle, output.handle, ctypes.c_float(bias), ctypes.c_int(dim), stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/DotLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_dot(matA, matB, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(matC, _nd.NDArray) 12 | _LIB.DLGpuDot( 13 | matA.handle, matB.handle, matC.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/EmbeddingLookUpLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def embedding_lookup(in_mat, ids, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(ids, _nd.NDArray) 11 | assert isinstance(out_mat, _nd.NDArray) 12 | _LIB.DLGpuEmbeddingLookUp( 13 | in_mat.handle, ids.handle, out_mat.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ExpLink.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def exp(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuExp(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/FloorLink.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | 4 | import ctypes 5 | from .._base import _LIB 6 | from .. 
import ndarray as _nd 7 | 8 | 9 | def floor(in_arr, out_arr, stream=None): 10 | assert isinstance(in_arr, _nd.NDArray) 11 | assert isinstance(out_arr, _nd.NDArray) 12 | _LIB.DLGpuFloor(in_arr.handle, out_arr.handle, 13 | stream.handle if stream else None) 14 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/GatherLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def gather(in_arr, index, out_arr, dim, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(index, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGather(in_arr.handle, index.handle, out_arr.handle,
                     ctypes.c_int(dim), stream.handle if stream else None)


def gather_gradient(in_arr, index, out_arr, dim, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(index, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGatherGradient(in_arr.handle, index.handle, out_arr.handle,
                             ctypes.c_int(dim), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/GeluLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def gelu(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGelu(in_arr.handle, out_arr.handle,
                   stream.handle if stream else None)


def gelu_gradient(in_arr, in_grad_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(in_grad_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGeluGradient(in_arr.handle, in_grad_arr.handle,
                           out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/GroupTopKIdxLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def group_topk_idx(in_mat, top1_group, out_mat_idx, k, num_local_gpus, stream=None):
    assert isinstance(in_mat, _nd.NDArray)
    assert isinstance(top1_group, _nd.NDArray)
    assert isinstance(out_mat_idx, _nd.NDArray)
    _LIB.DLGpuGroupTopKIdx(
        in_mat.handle, top1_group.handle, out_mat_idx.handle, ctypes.c_int(k), ctypes.c_int(num_local_gpus), stream.handle if stream else None)
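Every wrapper in this package follows the same calling convention: assert that the arguments are device NDArrays, then hand raw handles (and ctypes-wrapped scalars) to the C kernel, with `stream.handle if stream else None` selecting the stream. A minimal usage sketch, assuming the package re-exports its wrappers and that the ndarray helpers gpu/array/empty behave as in the repo's examples:

# Sketch only: invoking a gpu_links wrapper by hand (helper names assumed, not verified).
import numpy as np
from hetu import ndarray
from hetu.gpu_links import relu

ctx = ndarray.gpu(0)                                        # target device
x = ndarray.array(np.random.randn(4, 8).astype(np.float32), ctx=ctx)
out = ndarray.empty((4, 8), ctx=ctx)                        # preallocated output buffer
relu(x, out, stream=None)                                   # stream=None -> default stream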
-------------------------------------------------------------------------------- /python/hetu/gpu_links/HA2ALayoutTransform.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def ha2a_layout_transform(input, output, num_nodes, num_local_gpus, stream=None):
    assert isinstance(input, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuHA2ALayoutTransform(
        input.handle, output.handle, ctypes.c_int(num_nodes), ctypes.c_int(num_local_gpus), stream.handle if stream else None)


def ha2a_reverse_layout_transform(input, output, num_nodes, num_local_gpus, stream=None):
    assert isinstance(input, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuHA2AReverseLayoutTransform(
        input.handle, output.handle, ctypes.c_int(num_nodes), ctypes.c_int(num_local_gpus), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/IndexingLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def indexing(input_mat, index_mat, output_mat, stream=None):
    assert isinstance(input_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(output_mat, _nd.NDArray)
    _LIB.DLGpuIndexing(
        input_mat.handle, index_mat.handle, output_mat.handle, stream.handle if stream else None)


def indexing_grad(output_grad, index, input_grad, stream=None):
    assert isinstance(output_grad, _nd.NDArray)
    assert isinstance(index, _nd.NDArray)
    assert isinstance(input_grad, _nd.NDArray)
    # guard the stream like every other wrapper, so stream=None does not crash
    _LIB.DLGpuIndexingGrad(output_grad.handle, index.handle, input_grad.handle,
                           stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/InterpolateLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def bicubic_interpolate(in_arr, out_arr, align_corners, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuBicubicInterpolate(
        in_arr.handle, out_arr.handle, align_corners, stream.handle if stream else None)


def bicubic_interpolate_gradient(input_grad, output, align_corners, stream=None):
    assert isinstance(input_grad, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuBicubicInterpolateGradient(
        output.handle, input_grad.handle, align_corners, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/LeakyReluLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def leaky_relu(in_arr, alpha, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuLeakyRelu(in_arr.handle, ctypes.c_float( 12 | alpha), out_arr.handle, stream.handle if stream else None) 13 | 14 | 15 | def leaky_relu_gradient(in_arr, in_grad_arr, alpha, out_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuLeakyReluGradient(in_arr.handle, in_grad_arr.handle, ctypes.c_float( 20 | alpha), out_arr.handle, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/LinearLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matmul_with_bias(matA, transA, matB, transB, bias, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(bias, _nd.NDArray) 12 | assert isinstance(matC, _nd.NDArray) 13 | _LIB.DLGpuLinear( 14 | matA.handle, transA, matB.handle, transB, bias.handle, matC.handle, stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/LogLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def log_link(input, output, eps, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | _LIB.DLGpuLog(input.handle, output.handle, ctypes.c_float( 12 | eps), stream.handle if stream else None) 13 | 14 | 15 | def log_grad_link(output_grad, input, input_grad, eps, stream=None): 16 | assert isinstance(output_grad, _nd.NDArray) 17 | assert isinstance(input, _nd.NDArray) 18 | assert isinstance(input_grad, _nd.NDArray) 19 | _LIB.DLGpuLogGrad(output_grad.handle, input.handle, input_grad.handle, 20 | ctypes.c_float(eps), stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaskLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. import ndarray as _nd 5 | 6 | 7 | def mask_func(input, mask, output, stream=None): 8 | assert isinstance(input, _nd.NDArray) 9 | assert isinstance(mask, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | _LIB.DLGpuMask(input.handle, mask.handle, output.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaskedFillLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def masked_fill(input, mask, output, val, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(mask, _nd.NDArray) 11 | assert isinstance(output, _nd.NDArray) 12 | _LIB.DLGpuMaskedFill(input.handle, mask.handle, ctypes.c_float( 13 | val), output.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixDivideConstLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_elementwise_divide_const(val, in_mat, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | _LIB.DLGpuMatrixDivConst( 12 | ctypes.c_float(val), in_mat.handle, out_mat.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixDivideLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_elementwise_divide(matA, matB, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(matC, _nd.NDArray) 12 | _LIB.DLGpuMatrixElementwiseDivide( 13 | matA.handle, matB.handle, matC.handle, stream.handle if stream else None) 14 | 15 | 16 | def matrix_elementwise_divide_handle_zero(matA, matB, matC, stream=None): 17 | assert isinstance(matA, _nd.NDArray) 18 | assert isinstance(matB, _nd.NDArray) 19 | assert isinstance(matC, _nd.NDArray) 20 | _LIB.DLGpuMatrixElementwiseDivideHandleZero( 21 | matA.handle, matB.handle, matC.handle, stream.handle if stream else None) 22 | 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixMultLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_multiply(matA, transA, matB, transB, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(matC, _nd.NDArray) 12 | _LIB.DLGpuMatrixMultiply( 13 | matA.handle, transA, matB.handle, transB, matC.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixRsqrtLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def matrix_rsqrt(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuReciprocalSqrt( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixSqrtLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_sqrt(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuSqrt(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixTransLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_transpose(in_mat, out_mat, perm, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(perm) 12 | pointer = pointer_func(*list(perm)) 13 | _LIB.DLGpuTranspose(in_mat.handle, out_mat.handle, 14 | pointer, stream.handle if stream else None) 15 | 16 | 17 | def matrix_transpose_simple(in_mat, out_mat, gpu_buf, stream=None): 18 | assert isinstance(in_mat, _nd.NDArray) 19 | assert isinstance(out_mat, _nd.NDArray) 20 | assert isinstance(gpu_buf, _nd.NDArray) 21 | _LIB.DLGpuTransposeSimple( 22 | in_mat.handle, out_mat.handle, gpu_buf.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaxLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def max(in_mat, out_mat_val, dim, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat_val, _nd.NDArray) 11 | _LIB.DLGpuMax(in_mat.handle, out_mat_val.handle, 12 | dim, stream.handle if stream else None) 13 | 14 | 15 | def max_mat(matA, matB, out_mat, stream=None): 16 | assert isinstance(matA, _nd.NDArray) 17 | assert isinstance(matB, _nd.NDArray) 18 | assert isinstance(out_mat, _nd.NDArray) 19 | _LIB.DLGpuMaxMat(matA.handle, matB.handle, out_mat.handle, 20 | stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaxPoolLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def max_pooling2d(in_arr, kernel_H, kernel_W, pooled_layer, padding=0, stride=1, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(pooled_layer, _nd.NDArray) 11 | _LIB.DLGpuMax_Pooling2d(in_arr.handle, kernel_H, 12 | kernel_W, pooled_layer.handle, padding, stride, stream.handle if stream else None) 13 | 14 | 15 | def max_pooling2d_gradient(in_arr, in_grad_arr, kernel_H, kernel_W, out_grad_arr, padding=0, stride=1, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_grad_arr, _nd.NDArray) 19 | _LIB.DLGpuMax_Pooling2d_gradient( 20 | in_arr.handle, in_grad_arr.handle, kernel_H, kernel_W, out_grad_arr.handle, padding, stride, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinDistLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from ctypes import c_bool 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def minimum_distance_vector(lookup, key, codebook, indices, output, mode, stream=None): 9 | assert isinstance(lookup, _nd.NDArray) 10 | assert isinstance(key, _nd.NDArray) 11 | assert isinstance(codebook, _nd.NDArray) 12 | assert isinstance(indices, _nd.NDArray) 13 | assert isinstance(output, _nd.NDArray) 14 | if mode == 'eu': 15 | cmode = True 16 | else: 17 | cmode = False 18 | _LIB.DLGpuMinDist(lookup.handle, key.handle, codebook.handle, 19 | indices.handle, output.handle, c_bool(cmode), stream.handle if stream else None) 20 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def min(in_mat, out_mat_val, dim, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat_val, _nd.NDArray) 11 | _LIB.DLGpuMin(in_mat.handle, out_mat_val.handle, 12 | dim, stream.handle if stream else None) 13 | 14 | 15 | def min_mat(matA, matB, out_mat, stream=None): 16 | assert isinstance(matA, _nd.NDArray) 17 | assert isinstance(matB, _nd.NDArray) 18 | assert isinstance(out_mat, _nd.NDArray) 19 | _LIB.DLGpuMinMat(matA.handle, matB.handle, out_mat.handle, 20 | stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinusByConstLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | def minus_by_const(input, output, val, stream=None): 8 | assert isinstance(input, _nd.NDArray) 9 | assert isinstance(output, _nd.NDArray) 10 | _LIB.DLGpuMinusByConst(input.handle, output.handle, ctypes.c_float(val), stream.handle if stream else None) 11 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinusElewiseLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | def matrix_elementwise_minus(input1, input2, output, stream=None): 8 | assert isinstance(input1, _nd.NDArray) 9 | assert isinstance(input2, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | 12 | _LIB.DLGpuMinusElewise(input1.handle, input2.handle, output.handle, stream.handle if stream else None) 13 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/MultiplyConstLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
import numpy as np
from .._base import _LIB
from .. import ndarray as _nd


def matrix_elementwise_multiply_by_const(in_mat, val, out_mat, stream=None):
    assert isinstance(in_mat, (_nd.NDArray, _nd.IndexedSlices))
    assert isinstance(out_mat, (_nd.NDArray, _nd.IndexedSlices))

    if in_mat.dtype == np.float32:
        cval = ctypes.c_float(val)
        func = _LIB.DLGpuMatrixMultiplyByConst
    elif in_mat.dtype == np.int32:
        cval = ctypes.c_int(val)
        func = _LIB.DLGpuMatrixMultiplyByConstInt
    else:
        # fail early instead of hitting a NameError on unsupported dtypes
        raise ValueError('Unsupported dtype: %s' % in_mat.dtype)

    if isinstance(in_mat, _nd.NDArray):
        func(
            in_mat.handle, cval, out_mat.handle, stream.handle if stream else None)
    else:
        # isinstance(in_mat, _nd.IndexedSlices)
        func(
            in_mat.values.handle, cval, out_mat.values.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/MultiplyElewiseLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def matrix_elementwise_multiply(matA, matB, matC, stream=None):
    assert isinstance(matA, _nd.NDArray)
    assert isinstance(matB, _nd.NDArray)
    assert isinstance(matC, _nd.NDArray)
    _LIB.DLGpuMatrixElementwiseMultiply(
        matA.handle, matB.handle, matC.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/NllLossLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def nll_loss_link(input, target, output, stream=None):
    assert isinstance(input, _nd.NDArray)
    assert isinstance(target, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuNllLoss(input.handle, target.handle, output.handle,
                      stream.handle if stream else None)


def nll_loss_grad_link(output_grad, target, input_grad, stream=None):
    assert isinstance(output_grad, _nd.NDArray)
    assert isinstance(target, _nd.NDArray)
    assert isinstance(input_grad, _nd.NDArray)
    _LIB.DLGpuNllLossGrad(output_grad.handle, target.handle, input_grad.handle,
                          stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/NormLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def norm(in_arr, out_arr, axis, p, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuNorm(in_arr.handle, out_arr.handle, ctypes.c_int(axis), ctypes.c_int(p), 12 | stream.handle if stream else None) 13 | 14 | 15 | def norm_gradient(input, input_y, grad_y, output, axis, p, stream=None): 16 | assert isinstance(input, _nd.NDArray) 17 | assert isinstance(input_y, _nd.NDArray) 18 | assert isinstance(grad_y, _nd.NDArray) 19 | assert isinstance(output, _nd.NDArray) 20 | _LIB.DLGpuNormGradient(input.handle, input_y.handle, grad_y.handle, output.handle, ctypes.c_int(axis), ctypes.c_int(p), 21 | stream.handle if stream else None) 22 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/OneHotLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def one_hot(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuOneHot(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/OppositeLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_opposite(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuOpposite(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/OptEmbedBinaryStepLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. import ndarray as _nd 5 | 6 | 7 | def binary_step_forward(in_arr, out_arr, stream=None): 8 | assert isinstance(in_arr, _nd.NDArray) 9 | assert isinstance(out_arr, _nd.NDArray) 10 | _LIB.DLGpuIsPositive(in_arr.handle, out_arr.handle, 11 | stream.handle if stream else None) 12 | 13 | 14 | def binary_step_backward(in_arr, out_arr, stream=None): 15 | assert isinstance(in_arr, _nd.NDArray) 16 | assert isinstance(out_arr, _nd.NDArray) 17 | _LIB.DLGpuBinaryStepBackward(in_arr.handle, out_arr.handle, 18 | stream.handle if stream else None) 19 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ParamClipLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def param_clip_func(arr, min_value, max_value, stream=None): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuClipping(arr.handle, ctypes.c_float(min_value), ctypes.c_float( 11 | max_value), stream.handle if stream else None) 12 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/PowLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def pow_matrix(in_arr, out_arr, eps, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuPow(in_arr.handle, out_arr.handle, ctypes.c_float(eps), 12 | stream.handle if stream else None) 13 | 14 | 15 | def pow_gradient(in_arr, in_grad_arr, out_arr, eps, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuPowGradient(in_arr.handle, in_grad_arr.handle, out_arr.handle, 20 | ctypes.c_float(eps), stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/PowerLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_power(in_arr, out_arr, p, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuPower(in_arr.handle, out_arr.handle, ctypes.c_float(p), 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceMeanLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_mean(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceMean( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceMinLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_min(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceMin( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceMulLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def reduce_mul(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceMul( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceNormLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_norm1(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceNorm1( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | 16 | 17 | def reduce_norm2(in_arr, out_arr, axes, stream=None): 18 | assert isinstance(in_arr, _nd.NDArray) 19 | assert isinstance(out_arr, _nd.NDArray) 20 | pointer_func = ctypes.c_int * len(axes) 21 | pointer = pointer_func(*list(axes)) 22 | _LIB.DLGpuReduceNorm2( 23 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 24 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceSumAxisZeroLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_sum_axis_zero(in_arr, out_arr, workspace_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuReduceSumAxisZero( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | 14 | 15 | def _reduce_sum_axis_zero(in_arr, out_arr, workspace_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(out_arr, _nd.NDArray) 18 | assert isinstance(workspace_arr, _nd.NDArray) 19 | _LIB._DLGpuReduceSumAxisZero( 20 | in_arr.handle, out_arr.handle, workspace_arr.handle, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceSumLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_sum(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceSum( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReluLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def relu(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuRelu(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | 14 | 15 | def relu_gradient(in_arr, in_grad_arr, out_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuReluGradient(in_arr.handle, in_grad_arr.handle, 20 | out_arr.handle, stream.handle if stream else None) 21 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/RepeatLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def repeat(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuRepeat(in_arr.handle, out_arr.handle,
                     stream.handle if stream else None)


def repeat_gradient(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuRepeatGradient(
        in_arr.handle, out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/ReshapeLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def array_reshape(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuReshape(in_arr.handle, out_arr.handle,
                      stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/RollLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def roll(input_mat, output_mat, shift, axis, stream=None):
    assert isinstance(input_mat, _nd.NDArray)
    assert isinstance(output_mat, _nd.NDArray)

    nums = len(shift)
    shift_func = ctypes.c_int * len(shift)
    pointer_shift = shift_func(*list(shift))

    if axis:
        axis_func = ctypes.c_int * len(axis)
        pointer_axis = axis_func(*list(axis))
    else:
        # no axis given: the kernel receives a null pointer
        pointer_axis = None

    _LIB.DLGpuRoll(input_mat.handle, pointer_shift, pointer_axis,
                   nums, output_mat.handle, stream.handle if stream else None)
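roll above, like matrix_transpose and the reduce_* wrappers elsewhere in this package, marshals a Python list into a C int array before crossing the ctypes boundary. The idiom in isolation:

# Pure-ctypes sketch of the marshalling used by roll, reduce_*, and matrix_transpose.
import ctypes

shift = [1, -2]
IntArray = ctypes.c_int * len(shift)   # an array *type* of len(shift) C ints
c_shift = IntArray(*shift)             # an instance, passable where the C side expects int*
assert list(c_shift) == [1, -2]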
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SamGroupSumLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def sam_group_sum_link(gate_mat, out_mat, num_local_gpus, stream=None):
    assert isinstance(gate_mat, _nd.NDArray)
    assert isinstance(out_mat, _nd.NDArray)
    _LIB.DLGpuSamGroupSum(
        gate_mat.handle, out_mat.handle, ctypes.c_int(num_local_gpus), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/Scatter1DLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def scatter1d(input_mat, index_mat, output_mat, stream=None):
    assert isinstance(input_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(output_mat, _nd.NDArray)
    _LIB.DLGpuScatter1D(
        input_mat.handle, index_mat.handle, output_mat.handle, stream.handle if stream else None)


def scatter1d_grad(output_grad_mat, index_mat, input_grad_mat, stream=None):
    assert isinstance(output_grad_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(input_grad_mat, _nd.NDArray)
    _LIB.DLGpuScatter1DGrad(
        output_grad_mat.handle, index_mat.handle, input_grad_mat.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/ScatterLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def scatter(target_mat, dim, index_mat, src_mat, stream=None):
    assert isinstance(target_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(src_mat, _nd.NDArray)
    _LIB.DLGpuScatter(
        target_mat.handle, dim, index_mat.handle, src_mat.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SigmoidLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def sigmoid(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuSigmoid(in_arr.handle, out_arr.handle,
                      stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SignLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

from .._base import _LIB
from .. import ndarray as _nd


def sign_func(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuSign(in_arr.handle, out_arr.handle,
                   stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SinLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def sin(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuSin(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | 14 | 15 | def cos(in_arr, out_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(out_arr, _nd.NDArray) 18 | _LIB.DLGpuCos(in_arr.handle, out_arr.handle, 19 | stream.handle if stream else None) 20 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SoftmaxCrossEntropyLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def softmax_cross_entropy(in_arr_a, in_arr_b, out_arr, stream=None): 9 | assert isinstance(in_arr_a, _nd.NDArray) 10 | assert isinstance(in_arr_b, _nd.NDArray) 11 | assert isinstance(out_arr, _nd.NDArray) 12 | _LIB.DLGpuSoftmaxCrossEntropy( 13 | in_arr_a.handle, in_arr_b.handle, out_arr.handle, stream.handle if stream else None) 14 | 15 | 16 | def softmax_cross_entropy_gradient(in_arr_a, in_arr_b, in_arr_c, out_arr, stream=None): 17 | assert isinstance(in_arr_a, _nd.NDArray) 18 | assert isinstance(in_arr_b, _nd.NDArray) 19 | assert isinstance(in_arr_c, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.DLGpuSoftmaxCrossEntropy_Gradient( 22 | in_arr_a.handle, in_arr_b.handle, in_arr_c.handle, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SoftmaxLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def softmax(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuSoftmax(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SparseEmbeddingLookUpLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def sparse_embedding_lookup(in_mat, ids, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.ND_Sparse_Array) 10 | assert isinstance(ids, _nd.NDArray) 11 | assert isinstance(out_mat, _nd.NDArray) 12 | if in_mat.form == 'csr': 13 | _LIB.DLGpuCSREmbeddingLookUp(in_mat.data.handle, in_mat.row.handle, in_mat.col.handle, 14 | ids.handle, out_mat.handle, stream.handle if stream else None) 15 | else: 16 | _LIB.DLGpuCOOEmbeddingLookUp(in_mat.data.handle, in_mat.row.handle, in_mat.col.handle, 17 | ids.handle, out_mat.handle, stream.handle if stream else None) 18 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SparseSetLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. 
import ndarray as _nd 5 | 6 | 7 | def sparse_set(table, indices, data, stream=None): 8 | assert isinstance(table, _nd.NDArray) 9 | assert isinstance(indices, _nd.NDArray) 10 | assert isinstance(data, _nd.NDArray) 11 | _LIB.DLGpuSparseSet(table.handle, indices.handle, 12 | data.handle, stream.handle if stream else None) 13 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TanhLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def tanh(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTanh(in_arr.handle, out_arr.handle,
                   stream.handle if stream else None)


def tanh_gradient(forward_arr, grad_arr, out_arr, stream=None):
    assert isinstance(forward_arr, _nd.NDArray)
    assert isinstance(grad_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTanhGradient(forward_arr.handle, grad_arr.handle,
                           out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TopKIdxLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def topk_idx(in_mat, out_mat_idx, k, stream=None):
    assert isinstance(in_mat, _nd.NDArray)
    assert isinstance(out_mat_idx, _nd.NDArray)
    _LIB.DLGpuTopKIdx(
        in_mat.handle, out_mat_idx.handle, ctypes.c_int(k), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TopKValLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def topk_val(in_mat, out_mat_idx, out_mat_val, k, stream=None):
    assert isinstance(in_mat, _nd.NDArray)
    assert isinstance(out_mat_idx, _nd.NDArray)
    assert isinstance(out_mat_val, _nd.NDArray)
    # wrap k explicitly, matching topk_idx above
    _LIB.DLGpuTopKVal(
        in_mat.handle, out_mat_idx.handle, out_mat_val.handle, ctypes.c_int(k), stream.handle if stream else None)
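The two wrappers above only shuttle handles; the row-wise semantics they are expected to implement can be stated in NumPy. A CPU reference sketch (the expected result, not the kernel itself):

# NumPy reference for what DLGpuTopKIdx is expected to compute.
import numpy as np

x = np.array([[0.1, 0.9, 0.4],
              [0.7, 0.2, 0.5]], dtype=np.float32)
k = 2
idx = np.argsort(-x, axis=1)[:, :k]    # per-row indices of the k largest values
# idx == [[1, 2], [0, 2]]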
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TrilLookupLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def tril_lookup(in_arr, out_arr, offset, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTrilLookup(
        in_arr.handle, out_arr.handle, ctypes.c_int(offset), stream.handle if stream else None)


def tril_lookup_gradient(in_arr, out_arr, offset, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTrilLookupGradient(
        in_arr.handle, out_arr.handle, ctypes.c_int(offset), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/WhereLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def where(cond, arr1, arr2, out_arr, stream=None):
    assert isinstance(cond, _nd.NDArray)
    assert isinstance(arr1, _nd.NDArray)
    assert isinstance(arr2, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuWhere(cond.handle, arr1.handle, arr2.handle,
                    out_arr.handle, stream.handle if stream else None)


def where_const(cond, arr1, const_attr, out_arr, stream=None):
    assert isinstance(cond, _nd.NDArray)
    assert isinstance(arr1, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuWhereConst(cond.handle, arr1.handle, ctypes.c_float(const_attr),
                         out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/SamGroupSum.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
from .Node import Op
from .. import ndarray
from ..gpu_links import sam_group_sum_link


class SamGroupSumOp(Op):
    def __init__(self, node_A, num_local_gpus=8, ctx=None):
        super().__init__(SamGroupSumOp, [node_A], ctx)
        self.num_local_gpus = num_local_gpus

    def compute(self, input_val, output_val, stream_handle=None):
        if self.on_cpu:
            raise NotImplementedError
        else:
            sam_group_sum_link(input_val[0], output_val,
                               self.num_local_gpus, stream_handle)

    def gradient(self, output_grad):
        return [None]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return (input_shapes[0][0], self.num_local_gpus)


def sam_group_sum_op(node, num_local_gpus, ctx=None):
    return SamGroupSumOp(node, num_local_gpus, ctx=ctx)
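The gpu_ops modules below share one contract: compute takes the list of input NDArrays plus a preallocated output, gradient returns one node (or None) per input, and infer_shape maps input shapes to an output shape. A hypothetical op, shown only to make that contract explicit (ScaleOp is not part of Hetu), written as it would appear as a module in gpu_ops:

# Hypothetical ScaleOp sketch illustrating the Op contract.
from .Node import Op
from ..gpu_links import matrix_elementwise_multiply_by_const


class ScaleOp(Op):
    def __init__(self, node, factor, ctx=None):
        super().__init__(ScaleOp, [node], ctx)
        self.factor = factor

    def compute(self, input_vals, output_val, stream_handle=None):
        # forward: output = factor * input, written in place into output_val
        matrix_elementwise_multiply_by_const(
            input_vals[0], self.factor, output_val, stream_handle)

    def gradient(self, output_grad):
        # d(out)/d(in) = factor; returning None instead would mark the
        # input as non-differentiable, as SignOp and TopKIdxOp below do
        return [scale_op(output_grad, self.factor)]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return input_shapes[0]


def scale_op(node, factor, ctx=None):
    return ScaleOp(node, factor, ctx=ctx)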
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/Scatter.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
from .Node import Op
from .. import ndarray
from ..gpu_links import scatter


class ScatterOp(Op):
    def __init__(self, node_target, node_index, node_src, dim=0, ctx=None):
        super().__init__(ScatterOp, [node_target, node_index, node_src], ctx)
        # dim is stored on the op so that compute can follow the standard
        # (input_vals, output_val, stream_handle) signature used by all ops
        self.dim = dim

    def compute(self, input_vals, output_val, stream_handle=None):
        # in-place write of src values into target along self.dim
        scatter(input_vals[0], self.dim, input_vals[1],
                input_vals[2], stream_handle)

    def gradient(self, output_grad):
        # not implemented yet
        pass

    def infer_shape(self, input_shapes):
        # not implemented yet
        pass


def scatter_op(node_target, node_index, node_src, dim=0, ctx=None):
    return ScatterOp(node_target, node_index, node_src, dim=dim, ctx=ctx)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/Sign.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
import numpy as np
from .Node import Op
from ..gpu_links import sign_func


class SignOp(Op):
    def __init__(self, node, ctx=None):
        super().__init__(SignOp, [node], ctx)

    def compute(self, input_vals, output_val, stream_handle=None):
        if self.on_cpu:
            output_val[:] = np.sign(input_vals[0].asnumpy())
        else:
            sign_func(input_vals[0], output_val, stream_handle)

    def gradient(self, output_grad):
        return [None]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return input_shapes[0]


def sign_op(node, ctx=None):
    return SignOp(node, ctx=ctx)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/SparseSet.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
import numpy as np
from .Node import Op
from .._base import DNNL_LIB
from ..gpu_links import sparse_set


class SparseSetOp(Op):
    def __init__(self, table, ind, data, ctx=None):
        super().__init__(SparseSetOp, [table, ind, data], ctx)
        assert table.dtype == ind.dtype == data.dtype == np.int32

    def compute(self, input_vals, output_val, stream_handle=None):
        if self.on_cpu:
            raise NotImplementedError
        else:
            sparse_set(input_vals[0], input_vals[1],
                       input_vals[2], stream_handle)

    def gradient(self, output_grad):
        return [None, None, None]

    def infer_shape(self, input_shapes):
        return None


def sparse_set_op(table, ind, data, ctx=None):
    return SparseSetOp(table, ind, data, ctx=ctx)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/StopGradient.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
from .Node import Op


class StopGradientOp(Op):
    def __init__(self, node, ctx=None):
        super().__init__(StopGradientOp, [node], ctx)

    def compute(self, input_vals, output_val, stream_handle=None):
        raise NotImplementedError

    def gradient(self, output_grad):
        return [None]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return input_shapes[0]


def stop_gradient_op(node, ctx=None):
    return StopGradientOp(node, ctx=ctx)
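StopGradientOp is the gradient-barrier pattern: gradient() returns [None], so autodiff treats everything behind the node as constant (compute raising NotImplementedError suggests the node is never executed directly). A hedged usage sketch, assuming the usual Variable front end:

# Sketch (front-end API assumed): cutting the gradient at a subgraph boundary.
import hetu as ht

x = ht.Variable(name='x')
h = ht.relu_op(x)
h_detached = ht.stop_gradient_op(h)   # no gradient reaches x through h_detached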
import ndarray 4 | from ..gpu_links import topk_idx 5 | 6 | 7 | class TopKIdxOp(Op): 8 | def __init__(self, node_A, topk=1, ctx=None): 9 | super().__init__(TopKIdxOp, [node_A], ctx) 10 | self.k = topk 11 | 12 | def compute(self, input_val, output_val, stream_handle=None): 13 | if self.on_cpu: 14 | raise NotImplementedError 15 | else: 16 | topk_idx(input_val[0], output_val, self.k, stream_handle) 17 | def gradient(self, output_grad): 18 | return [None] 19 | 20 | def infer_shape(self, input_shapes): 21 | assert len(input_shapes) == 1 22 | return (input_shapes[0][0], self.k) 23 | 24 | 25 | def topk_idx_op(node, topk, ctx=None): 26 | return TopKIdxOp(node, topk, ctx=ctx) 27 | -------------------------------------------------------------------------------- /python/hetu/gpu_ops/TopKVal.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .Node import Op 3 | from .. import ndarray 4 | from ..gpu_links import topk_val 5 | 6 | 7 | class TopKValOp(Op): 8 | def __init__(self, node_A, node_B, ctx=None): 9 | super().__init__(TopKValOp, [node_A, node_B], ctx) 10 | 11 | def compute(self, input_val, output_val, stream_handle=None): 12 | if self.on_cpu: 13 | raise NotImplementedError 14 | else: 15 | topk_val(input_val[0], input_val[1], output_val, stream_handle) 16 | 17 | def gradient(self, output_grad): 18 | return [None, None] 19 | 20 | def infer_shape(self, input_shapes): 21 | assert len(input_shapes) == 2 22 | return input_shapes[1] 23 | 24 | def topk_val_op(nodeA, nodeB, ctx=None): 25 | return TopKValOp(nodeA, nodeB, ctx=ctx) 26 | -------------------------------------------------------------------------------- /python/hetu/layers/concatenate.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Concatenate(BaseLayer): 6 | def __init__(self, axis): 7 | self.axis = axis 8 | 9 | def __call__(self, *args): 10 | if len(args) == 1: 11 | return args[0] 12 | else: 13 | return ht.concatenate_op(args, axis=self.axis) 14 | 15 | 16 | class ConcatenateLayers(BaseLayer): 17 | def __init__(self, layers, axis=0): 18 | self.layers = layers 19 | self.axis = axis 20 | 21 | def __call__(self, x): 22 | if len(self.layers) == 1: 23 | return self.layers[0](x) 24 | else: 25 | return ht.concatenate_op([layer(x) for layer in self.layers], axis=self.axis) 26 | -------------------------------------------------------------------------------- /python/hetu/layers/dropout.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class DropOut(BaseLayer): 6 | def __init__(self, p=0.5): 7 | self.p = p 8 | 9 | def __call__(self, x): 10 | if self.p == 0: 11 | return x 12 | return ht.dropout_op(x, 1-self.p) 13 | -------------------------------------------------------------------------------- /python/hetu/layers/embedding.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Embedding(BaseLayer): 6 | def __init__(self, num_embeddings, embedding_dim, initializer=ht.init.GenXavierNormal(), name='embedding', ctx=None, **kargs): 7 | self.num_embeddings = num_embeddings 8 | self.embedding_dim = embedding_dim 9 | self.name = name 10 | self.ctx = ctx 11 | self.kargs = kargs 12 | self.embedding_table = initializer( 13 | shape=(self.num_embeddings, self.embedding_dim), 
name=self.name, ctx=ctx) 14 | 15 | def __call__(self, x): 16 | return ht.embedding_lookup_op(self.embedding_table, x, ctx=self.ctx) 17 | 18 | def __repr__(self): 19 | return f'{self.name}({self.num_embeddings},{self.embedding_dim})' 20 | -------------------------------------------------------------------------------- /python/hetu/layers/gates/base_gate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base gate 3 | """ 4 | import hetu as ht 5 | 6 | class BaseGate(object): 7 | def __init__(self, num_expert, world_size): 8 | super().__init__() 9 | self.world_size = world_size 10 | self.num_expert = num_expert 11 | self.tot_expert = world_size * num_expert 12 | self.loss = None 13 | 14 | def __call__(self): 15 | raise NotImplementedError("Base gate cannot be directly used.") 16 | 17 | -------------------------------------------------------------------------------- /python/hetu/layers/gates/gshard_gate.py: -------------------------------------------------------------------------------- 1 | from .naive_gate import NaiveGate 2 | import hetu as ht 3 | 4 | class GshardGate(NaiveGate): 5 | def __init__(self, d_model, num_expert, world_size, topk=2, capacity=(1.2, 2.4), random_routing=True): 6 | assert topk == 2, 'topk should be 2 in gshard' 7 | super().__init__(d_model, num_expert, world_size, topk=2) 8 | self.capa 9 | -------------------------------------------------------------------------------- /python/hetu/layers/gelu.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Gelu(BaseLayer): 6 | def __call__(self, x): 7 | return ht.gelu_op(x) -------------------------------------------------------------------------------- /python/hetu/layers/identity.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | 3 | 4 | class Identity(BaseLayer): 5 | def __call__(self, x): 6 | return x 7 | -------------------------------------------------------------------------------- /python/hetu/layers/mish.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Mish(BaseLayer): 6 | def __call__(self, x): 7 | # mish(x) = x * tanh(softplus(x)), with softplus(x) = log(1 + exp(x)) 8 | return ht.mul_op(x, ht.tanh_op(ht.log_op(ht.addbyconst_op(ht.exp_op(x), 1)))) 9 | -------------------------------------------------------------------------------- /python/hetu/layers/pooling.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class MaxPool2d(BaseLayer): 6 | def __init__(self, kernel_size, stride, padding=0): 7 | self.kernel_size = kernel_size 8 | self.stride = stride 9 | self.padding = padding 10 | 11 | def __call__(self, x): 12 | return ht.max_pool2d_op( 13 | x, self.kernel_size, self.kernel_size, self.padding, self.stride) 14 | 15 | 16 | class AvgPool2d(BaseLayer): 17 | def __init__(self, kernel_size, stride, padding=0): 18 | self.kernel_size = kernel_size 19 | self.stride = stride 20 | self.padding = padding 21 | 22 | def __call__(self, x): 23 | return ht.avg_pool2d_op( 24 | x, self.kernel_size, self.kernel_size, self.padding, self.stride) 25 | -------------------------------------------------------------------------------- /python/hetu/layers/relu.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3
| 4 | 5 | class Relu(BaseLayer): 6 | def __call__(self, x): 7 | return ht.relu_op(x) 8 | -------------------------------------------------------------------------------- /python/hetu/layers/reshape.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Reshape(BaseLayer): 6 | def __init__(self, shape): 7 | self.shape = shape 8 | 9 | def __call__(self, x): 10 | return ht.array_reshape_op(x, self.shape) 11 | -------------------------------------------------------------------------------- /python/hetu/layers/sequence.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | 3 | 4 | class Sequence(BaseLayer): 5 | def __init__(self, *args): 6 | self.layers = args 7 | 8 | def __call__(self, x): 9 | for layer in self.layers: 10 | x = layer(x) 11 | return x 12 | -------------------------------------------------------------------------------- /python/hetu/layers/slice.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Slice(BaseLayer): 6 | def __init__(self, begin, size): 7 | self.begin = begin 8 | self.size = size 9 | 10 | def __call__(self, x): 11 | return ht.slice_op(x, self.begin, self.size) 12 | -------------------------------------------------------------------------------- /python/hetu/layers/sum.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class SumLayers(BaseLayer): 6 | def __init__(self, layers): 7 | self.layers = layers 8 | 9 | def __call__(self, xs): 10 | if not isinstance(xs, list): 11 | xs = [xs] * len(self.layers) 12 | assert len(xs) == len(self.layers) 13 | if len(self.layers) == 1: 14 | return self.layers[0](xs[0]) 15 | else: 16 | return ht.sum_op([layer(x) for layer, x in zip(self.layers, xs)]) 17 | -------------------------------------------------------------------------------- /python/hetu/onnx/X2hetu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/python/hetu/onnx/X2hetu/__init__.py -------------------------------------------------------------------------------- /python/hetu/onnx/X2hetu/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pkgutil 3 | 4 | __all__ = [ 5 | modname 6 | for _, modname, _ in pkgutil.walk_packages(path=[os.path.split(__file__)[0]]) 7 | ] 8 | -------------------------------------------------------------------------------- /python/hetu/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from .hetu2onnx import export 6 | 7 | __all__ = ["hetu2onnx", "util", "constants", "handler", "graph", "onnx2hetu"] 8 | 9 | from hetu.onnx import (hetu2onnx, util, constants, graph, handler, onnx2hetu) 10 | -------------------------------------------------------------------------------- /python/hetu/onnx/constants.py: -------------------------------------------------------------------------------- 1 | NEEDLESS_ATTRS = ['op', 'desc', 'id', 'swap', 'trainable', 'ctx', 'event', 2 |
'inplace', 'lazy_execution', 'on_cpu', 'on_gpu', 'compute', 'middle_result', 'gpu_buffer', 3 | 4 | 5 | ] 6 | 7 | 8 | ONNX_DOMAIN = "" 9 | AI_ONNX_ML_DOMAIN = "ai.onnx.ml" 10 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Concat.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["ConcatOp"], onnx_op=["Concat"]) 14 | class Concat: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | pass 18 | 19 | # todo:opset < 8: might need to wrap concat in casts since only float is supported 20 | # if ctx.opset < 8: 21 | 22 | @classmethod 23 | def version_11(cls, ctx, node, **kwargs): 24 | # Opset 11 supports negative axis, but core logic is same 25 | cls.version_1(ctx, node, **kwargs) 26 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Conv2d.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["Conv2dOp"], onnx_op=["Conv"]) 14 | class Conv: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | kernel_shape = ctx._shapes[node._inputs[1]][2:] 18 | pads = [node.get_attr_value('padding', 0)]*4 19 | strides = [node.get_attr_value('stride', 1)]*2 20 | node.set_attr('kernel_shape', kernel_shape) 21 | node.set_attr('pads', pads) 22 | node.set_attr('strides', strides) 23 | 24 | @classmethod 25 | def version_11(cls, ctx, node, **kwargs): 26 | 27 | cls.version_1(ctx, node, **kwargs) 28 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Identity.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["Identity"]) 14 | class Identity(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/MatrixMult.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["MatMulOp"], onnx_op=["MatMul"]) 14 | class MatMul: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | trans_a = node.get_attr_value('matmul_attr_trans_A', 0) 18 | trans_b = 
node.get_attr_value('matmul_attr_trans_B', 0) 19 | # fixme: only matrices with two dims are supported now 20 | if trans_a != 0: 21 | ctx.insert_new_node_on_input( 22 | node, 'Transpose', node._inputs[0], perm=[1, 0]) 23 | if trans_b != 0: 24 | ctx.insert_new_node_on_input( 25 | node, 'Transpose', node._inputs[1], perm=[1, 0]) 26 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Opposite.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["OppositeOp"], onnx_op=["Neg"]) 14 | class Neg(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Reduces.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["ReduceMeanOp"], onnx_op=["ReduceMean"]) 14 | @hetu_op(["ReduceSumOp"], onnx_op=["ReduceSum"]) 15 | class ReduceMean(general.PassOp): 16 | @classmethod 17 | def version_1(cls, ctx, node, **kwargs): 18 | keepdims = node.get_attr_value('keepdims', None) 19 | assert keepdims is not None 20 | node.set_attr("keepdims", keepdims[0]) 21 | 22 | @classmethod 23 | def version_11(cls, ctx, node, **kwargs): 24 | # Opset 11 supports negative axis, but core logic is same 25 | cls.version_1(ctx, node, **kwargs) 26 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Relu.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["ReluOp"], onnx_op=["Relu"]) 14 | class Relu(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Softmax.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["SoftmaxOp"], onnx_op=["Softmax"]) 14 | class Softmax: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | pass 18 | # logits_rank = len(ctx.get_shape(node.input_tensor_names[0])) 19 | # node.set_attr("axis",logits_rank - 1) 20 | 21 | @classmethod 22 | def version_11(cls, ctx, node, **kwargs): 23 | cls.version_1(ctx, node, **kwargs) 24 | 
-------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Sqrt.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["SqrtOp"], onnx_op=["Sqrt"]) 14 | class Sqrt(general.PassOp): 15 | pass 16 | 17 | 18 | @hetu_op(["ReciprocalSqrtOp"], onnx_op=["Sqrt"]) 19 | class rSqrt: 20 | @classmethod 21 | def version_1(cls, ctx, node, **kwargs): 22 | op_name = util.make_name(node.name) 23 | reciprocal = ctx.insert_new_node_on_output( 24 | "Reciprocal", node.output_tensor_names[0], name=op_name 25 | ) 26 | ctx.copy_shape( 27 | node.output_tensor_names[0], reciprocal.output_tensor_names[0]) 28 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Tanh.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["TanhOp"], onnx_op=["Tanh"]) 14 | class Tanh(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Transpose.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["TransposeOp"], onnx_op=["Transpose"]) 14 | class Transpose(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Variable.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | from hetu.onnx.handler import hetu_op 7 | 8 | from onnx import helper 9 | from hetu.onnx.onnx_opset import general 10 | 11 | 12 | @hetu_op(["PlaceholderOp"], onnx_op=["Placeholder"]) 13 | class PlaceholderOp: 14 | @classmethod 15 | def version_1(cls, ctx, node, **kwargs): 16 | val = node.get_attr_value('value') 17 | if val is not None: 18 | node.op_type = "Const" 19 | 20 | 21 | @hetu_op(["defined_in"]) 22 | class Defined_In(general.PassOp): 23 | pass 24 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Where.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 
| from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["WhereOp"], onnx_op=["Where"]) 14 | class Where(): 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | assert False, "This version of the operator has been available since version 9 of the default ONNX operator set" 18 | pass 19 | 20 | @classmethod 21 | def version_9(cls, ctx, node, **kwargs): 22 | pass 23 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | MatrixMult, 3 | MultiplyConst, 4 | Variable, 5 | AddElewise, 6 | Relu, 7 | Identity, 8 | Conv2d, 9 | Pool, 10 | Reshape, 11 | AddConst, 12 | Concat, 13 | Sqrt, 14 | Tanh, 15 | BatchNorm, 16 | Pad, 17 | Division, 18 | OneHot, 19 | Opposite, 20 | Softmax, 21 | general, 22 | Reduces, 23 | Dropout, 24 | Transpose, 25 | Where, 26 | Slice, 27 | ) 28 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/general.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | 6 | 7 | class PassOp: 8 | @classmethod 9 | def version_1(cls, ctx, node, **kwargs): 10 | pass 11 | 12 | @classmethod 13 | def version_6(cls, ctx, node, **kwargs): 14 | 15 | cls.version_1(ctx, node, **kwargs) 16 | -------------------------------------------------------------------------------- /python/hetu/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .bert_tokenizer import BertTokenizer 2 | -------------------------------------------------------------------------------- /src/common/cpu_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 by Contributors 3 | * \file device_api.h 4 | * \brief Device specific API 5 | */ 6 | #ifndef HETUSYS_RUNTIME_CPU_DEVICE_API_H_ 7 | #define HETUSYS_RUNTIME_CPU_DEVICE_API_H_ 8 | 9 | #include "c_runtime_api.h" 10 | #include "device_api.h" 11 | #include 12 | #include 13 | 14 | namespace hetusys { namespace runtime { 15 | 16 | class CPUDeviceAPI : public DeviceAPI { 17 | public: 18 | void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final; 19 | 20 | void FreeDataSpace(DLContext ctx, void *ptr) final; 21 | 22 | void CopyDataFromTo(const void *from, void *to, size_t size, 23 | DLContext ctx_from, DLContext ctx_to, 24 | DLStreamHandle stream) final; 25 | 26 | void StreamSync(DLContext ctx, DLStreamHandle stream) final; 27 | }; 28 | 29 | }} // namespace hetusys::runtime 30 | #endif // HETUSYS_RUNTIME_CPU_DEVICE_API_H_ 31 | -------------------------------------------------------------------------------- /src/common/cuda_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 by Contributors 3 | * \file device_api.h 4 | * \brief Device specific API 5 | */ 6 | #ifndef HETUSYS_RUNTIME_CUDA_DEVICE_API_H_ 7 | #define HETUSYS_RUNTIME_CUDA_DEVICE_API_H_ 8 | 9 | #include "c_runtime_api.h" 10 | #include "device_api.h" 11 | 12 | #include 13 | #include 14 | 15 | namespace hetusys { namespace runtime { 16 | 17 | class CUDADeviceAPI : public DeviceAPI { 18 | public: 19 | void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final; 20 | 21 | void FreeDataSpace(DLContext ctx, void *ptr) final; 22 | 23 | void CopyDataFromTo(const void *from, void *to, size_t size, 24 | DLContext ctx_from, DLContext ctx_to, 25 | DLStreamHandle stream) final; 26 | 27 | void StreamSync(DLContext ctx, DLStreamHandle stream) final; 28 | }; 29 | 30 | }} // namespace hetusys::runtime 31 | #endif // HETUSYS_RUNTIME_CUDA_DEVICE_API_H_ 32 | -------------------------------------------------------------------------------- /src/common/random.cc: -------------------------------------------------------------------------------- 1 | #include "random.h" 2 | #include <mutex> 3 | 4 | std::mutex random_state_mutex; 5 | HetuRandomState hetu_random_state(0); 6 | 7 | int SetRandomSeed(uint64_t seed) { 8 | std::lock_guard<std::mutex> lock(random_state_mutex); 9 | hetu_random_state.seed = seed; 10 | return 0; 11 | } 12 | 13 | uint64_t GetSeed() { 14 | return hetu_random_state.seed; 15 | } 16 | 17 | uint64_t GetSeedSeqNum() { 18 | return hetu_random_state.seqnum; 19 | } 20 | 21 | int StepSeqNum(uint64_t num_minimum_calls) { 22 | std::lock_guard<std::mutex> lock(random_state_mutex); 23 | hetu_random_state.seqnum += num_minimum_calls; 24 | return 0; 25 | } 26 | 27 | HetuRandomState NewRandomState(uint64_t seqnum) { 28 | return HetuRandomState(hetu_random_state.seed, seqnum); 29 | } 30 | 31 | HetuRandomState &GetRandomState(uint64_t num_minimum_calls) { 32 | StepSeqNum(num_minimum_calls); 33 | return hetu_random_state; 34 | } 35 | -------------------------------------------------------------------------------- /src/common/random.h: -------------------------------------------------------------------------------- 1 | #ifndef HETUSYS_SRC_SEED_H 2 | #define HETUSYS_SRC_SEED_H 3 | 4 | #include "c_runtime_api.h" 5 | 6 | struct HetuRandomState { 7 | HetuRandomState(uint64_t seed_ = 0, uint64_t seqnum_ = 0) : 8 | seed(seed_), seqnum(seqnum_) { 9 | } 10 | 11 | uint64_t seed; 12 | uint64_t seqnum; 13 | }; 14 | 15 | HETUSYS_EXTERN_C { 16 | int SetRandomSeed(uint64_t seed); 17 | uint64_t GetSeed(); 18 | uint64_t GetSeedSeqNum(); 19 | int StepSeqNum(uint64_t num_minimum_calls); 20 | } 21 | 22 | HetuRandomState NewRandomState(uint64_t seqnum); 23 | HetuRandomState &GetRandomState(uint64_t num_minimum_calls); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/dnnl_ops/ArraySet.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "dnnl.hpp" 14 | 15 | #include "../common/c_runtime_api.h" 16 | #include "dnnl_runtime.h" 17 | 18 | using namespace dnnl; 19 | using namespace std; 20 | 21 | extern "C" int cpu_ArraySet(DLArrayHandle input, float value) { 22 | int num = 1; 23 | for (int i = 0; i < input->ndim; i++) 24 | num *= input->shape[i]; 25 | float *data = (float *)(input->data); 26 | #pragma omp parallel for 27 | for (int i = 0; i < num; i++) 28 | data[i] = value; 29 | return 0; 30 | 
} 31 | -------------------------------------------------------------------------------- /src/dnnl_ops/Gelu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "dnnl.hpp" 13 | 14 | #include "../common/c_runtime_api.h" 15 | #include "dnnl_runtime.h" 16 | using namespace dnnl; 17 | using namespace std; 18 | 19 | extern "C" int DnnlGelu(const DLArrayHandle input, DLArrayHandle output) { 20 | printf("DnnlGelu is not implemented yet.\n"); 21 | return 0; 22 | } 23 | 24 | extern "C" int DnnlGelu_Gradient(const DLArrayHandle input, 25 | const DLArrayHandle in_grad, 26 | DLArrayHandle output) { 27 | printf("DnnlGelu_Gradient is not implemented yet.\n"); 28 | return 0; 29 | } -------------------------------------------------------------------------------- /src/dnnl_ops/Reshape.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "dnnl.hpp" 11 | 12 | #include "../common/c_runtime_api.h" 13 | #include "dnnl_runtime.h" 14 | 15 | extern "C" int cpu_Reshape(const DLArrayHandle in_arr, DLArrayHandle out_arr) { 16 | int input_size = 1; 17 | int output_size = 1; 18 | float *input = (float *)(in_arr->data); 19 | float *output = (float *)(out_arr->data); 20 | for (int i = 0; i < in_arr->ndim; i++) 21 | input_size *= in_arr->shape[i]; 22 | for (int i = 0; i < out_arr->ndim; i++) 23 | output_size *= out_arr->shape[i]; 24 | 25 | assert(input_size == output_size); 26 | #pragma omp parallel for 27 | for (int i = 0; i < input_size; i++) 28 | output[i] = input[i]; 29 | return 0; 30 | } -------------------------------------------------------------------------------- /src/dnnl_ops/dnnl_runtime.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "dnnl.hpp" 13 | #include "../common/c_runtime_api.h" 14 | 15 | using namespace dnnl; 16 | 17 | extern bool is_dnnl_stream_init; 18 | extern engine eng; 19 | extern stream engine_stream; 20 | 21 | void dnnl_stream_init(); 22 | void print_dlarray(DLArrayHandle mat); 23 | void read_from_dnnl_memory(void *handle, dnnl::memory &mem); 24 | -------------------------------------------------------------------------------- /src/header/types.h: -------------------------------------------------------------------------------- 1 | #ifndef HETUSYS_DEFAULT_TYPES_H 2 | #define HETUSYS_DEFAULT_TYPES_H 3 | 4 | #include <atomic> 5 | 6 | typedef signed char int8; 7 | typedef short int16; 8 | typedef int int32; 9 | typedef long long int64; 10 | 11 | typedef unsigned char uint8; 12 | typedef unsigned short uint16; 13 | typedef unsigned int uint32; 14 | typedef unsigned long long uint64; 15 | 16 | class SharedCounter { 17 | public: 18 | int64 get() { 19 | return cnt; 20 | } 21 | int64 next() { 22 | return ++cnt; 23 | } 24 | 25 | private: 26 | std::atomic<int64> cnt{0}; 27 | }; 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/hetu_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | aux_source_directory(src HETU_SRC_LIST) 2 | 3 | find_package(pybind11 2.6.0 CONFIG) 4 | 5 | if (NOT 
pybind11_FOUND) 6 | message(FATAL_ERROR "pybind11 not found") 7 | else() 8 | pybind11_add_module(hetu_cache ${HETU_SRC_LIST}) 9 | target_include_directories(hetu_cache PUBLIC include) 10 | target_link_libraries(hetu_cache PUBLIC ps) 11 | endif() 12 | -------------------------------------------------------------------------------- /src/hetu_cache/include/lru_cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cache.h" 4 | 5 | #include <list> 6 | #include <unordered_map> 7 | 8 | namespace hetu { 9 | 10 | /* 11 | LRUCache: 12 | use LRU policy 13 | Implemented with a double-linked list and a hash map 14 | O(1) insert, lookup (a Python sketch of this policy follows Exp.cu below) 15 | */ 16 | 17 | class LRUCache : public CacheBase { 18 | private: 19 | std::unordered_map<cache_key_t, std::list<EmbeddingPT>::iterator> hash_; 20 | std::list<EmbeddingPT> list_; 21 | 22 | public: 23 | using CacheBase::CacheBase; 24 | size_t size() final { 25 | return hash_.size(); 26 | } 27 | int count(cache_key_t k) final; 28 | void insert(EmbeddingPT e) final; 29 | EmbeddingPT lookup(cache_key_t k) final; 30 | 31 | // python debug function 32 | py::array_t<cache_key_t> PyAPI_keys(); 33 | }; // class LRUCache 34 | 35 | } // namespace hetu 36 | -------------------------------------------------------------------------------- /src/hetu_cache/src/embedding.cc: -------------------------------------------------------------------------------- 1 | #include "embedding.h" 2 | namespace hetu { 3 | 4 | EmbeddingPT makeEmbedding(cache_key_t k, version_t version, 5 | py::array_t<float> val) { 6 | assert(val.ndim() == 1); 7 | PYTHON_CHECK_ARRAY(val); 8 | auto res = make_shared<Embedding>(k, val.data(), val.shape(0)); 9 | res->setVersion(version); 10 | return res; 11 | } 12 | 13 | } // namespace hetu 14 | -------------------------------------------------------------------------------- /src/memory_pool/BFC_allocator.cc: -------------------------------------------------------------------------------- 1 | #include "BFC_allocator.h" 2 | -------------------------------------------------------------------------------- /src/ops/Clone.cu: -------------------------------------------------------------------------------- 1 | #include "gpu_runtime.h" 2 | 3 | int DLGpuClone(const DLArrayHandle input, DLArrayHandle output, DLStreamHandle stream_handle=NULL){ 4 | float* input_data=(float*)input->data; 5 | float* output_data=(float*)output->data; 6 | int size = 1; 7 | for(int i=0; i<input->ndim; i++){ 8 | size*=input->shape[i]; 9 | } 10 | cudaMemcpy((void*)output_data, (void*)input_data, size*sizeof(float),cudaMemcpyDeviceToDevice); 11 | return 0; 12 | 13 | } 14 | 15 | -------------------------------------------------------------------------------- /src/ops/Exp.cu: -------------------------------------------------------------------------------- 1 | #include "gpu_runtime.h" 2 | __global__ void exp_kernel(const float *input, float *output, size_t size) { 3 | size_t ind = blockIdx.x * blockDim.x + threadIdx.x; 4 | if (ind >= size) 5 | return; 6 | output[ind] = expf(input[ind]); 7 | } 8 | 9 | int DLGpuExp(const DLArrayHandle input, DLArrayHandle output, 10 | DLStreamHandle stream_handle = NULL) { 11 | size_t size = ArrSize(input); 12 | const float *input_data = (const float *)input->data; 13 | float *output_data = (float *)output->data; 14 | 15 | dim3 blocks; 16 | dim3 threads; 17 | ThreadBlock1D(threads, blocks, size); 18 | if (stream_handle) { 19 | exp_kernel<<<blocks, threads, 0, 20 | *(cudaStream_t *)stream_handle->handle>>>( 21 | input_data, output_data, size); 22 | } else { 23 | exp_kernel<<<blocks, threads>>>(input_data, output_data, size); 24 | } 25 | return 0; 26 | } 27 | 
-------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Tests 2 | ### Need to validate: 3 | * test_dnnl_op.py 4 | * test_transformer_ops.py 5 | * onnx/* 6 | * test_DistGCN/test_model_distGCN15d.py 7 | -------------------------------------------------------------------------------- /tests/get_gpu_memory.py: -------------------------------------------------------------------------------- 1 | from pynvml import smi as nvidia_smi 2 | 3 | nvidia_smi.nvmlInit() 4 | handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) 5 | # card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate 6 | 7 | ans = 0 8 | while True: 9 | mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle) 10 | # print(mem_res.used / (1024**2)) # usage in MiB 11 | if (mem_res.used / (1024**2) > ans): 12 | ans = mem_res.used / (1024**2) 13 | print(ans) 14 | # print(f'mem: {100 * (mem_res.used / mem_res.total):.3f}%') # percentage usage 15 | -------------------------------------------------------------------------------- /tests/hetu_cache/hetu_cache_config.yml: -------------------------------------------------------------------------------- 1 | shared: &shared 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13200 4 | DMLC_NUM_WORKER : 1 5 | DMLC_NUM_SERVER : 1 6 | DMLC_PS_VAN_TYPE : p3 7 | launch : 8 | worker : 1 9 | server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /tests/onnx/README.md: -------------------------------------------------------------------------------- 1 | ## Attention 2 | This part has not been validated in the latest version. The "onnx_tf" package cannot be used with Python 3.7, and there are some bugs in Hetu's onnx part. 
3 | -------------------------------------------------------------------------------- /tests/pstests/local_s2_w1.yml: -------------------------------------------------------------------------------- 1 | shared: &shared 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13200 4 | DMLC_NUM_WORKER : 1 5 | DMLC_NUM_SERVER : 2 6 | DMLC_PS_VAN_TYPE : p3 7 | sched: 8 | <<: *shared 9 | DMLC_ROLE : scheduler 10 | s0: 11 | <<: *shared 12 | DMLC_ROLE : server 13 | SERVER_ID : 0 14 | DMLC_PS_SERVER_URI : 127.0.0.1 15 | DMLC_PS_SERVER_PORT : 13201 16 | s1: 17 | <<: *shared 18 | DMLC_ROLE : server 19 | SERVER_ID : 1 20 | DMLC_PS_SERVER_URI : 127.0.0.1 21 | DMLC_PS_SERVER_PORT : 13203 22 | w0: 23 | <<: *shared 24 | DMLC_ROLE : worker 25 | WORKER_ID : 0 26 | DMLC_PS_WORKER_URI : 127.0.0.1 27 | DMLC_PS_WORKER_PORT : 13210 -------------------------------------------------------------------------------- /tests/pstests/local_s2_w2.yml: -------------------------------------------------------------------------------- 1 | shared: &shared 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13200 4 | DMLC_NUM_WORKER : 2 5 | DMLC_NUM_SERVER : 2 6 | DMLC_PS_VAN_TYPE : p3 7 | sched: 8 | <<: *shared 9 | DMLC_ROLE : scheduler 10 | s0: 11 | <<: *shared 12 | DMLC_ROLE : server 13 | SERVER_ID : 0 14 | DMLC_PS_SERVER_URI : 127.0.0.1 15 | DMLC_PS_SERVER_PORT : 13201 16 | s1: 17 | <<: *shared 18 | DMLC_ROLE : server 19 | SERVER_ID : 1 20 | DMLC_PS_SERVER_URI : 127.0.0.1 21 | DMLC_PS_SERVER_PORT : 13203 22 | w0: 23 | <<: *shared 24 | DMLC_ROLE : worker 25 | WORKER_ID : 0 26 | DMLC_PS_WORKER_URI : 127.0.0.1 27 | DMLC_PS_WORKER_PORT : 13210 28 | w1: 29 | <<: *shared 30 | DMLC_ROLE : worker 31 | WORKER_ID : 1 32 | DMLC_PS_WORKER_URI : 127.0.0.1 33 | DMLC_PS_WORKER_PORT : 13211 -------------------------------------------------------------------------------- /tests/pstests/tf_local_s1_w2.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:12349", 4 | "127.0.0.1:12348" 5 | ], 6 | "ps": [ 7 | "127.0.0.1:22345" 8 | ] 9 | } -------------------------------------------------------------------------------- /tests/test_encode_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tests/test_encode_decode.py -------------------------------------------------------------------------------- /tests/test_ha2agather.py: -------------------------------------------------------------------------------- 1 | from ctypes import * 2 | from hetu import ndarray 3 | from hetu.stream import * 4 | from hetu.context import DeviceGroup 5 | import numpy as np 6 | import hetu as ht 7 | from hetu.communicator import mpi_nccl_comm 8 | 9 | if __name__ == "__main__": 10 | t = ht.wrapped_mpi_nccl_init() 11 | send_arr = np.ones(16)*t.localRank.value 12 | recv_arr = np.ones(16*8)*t.localRank.value 13 | print("before: send_arr = "+str(send_arr)+" recv_arr = "+str(recv_arr)) 14 | send_arr = ndarray.array(send_arr, ctx=ndarray.gpu(t.device_id.value)) 15 | recv_arr = ndarray.array(recv_arr, ctx=ndarray.gpu(t.device_id.value)) 16 | t.dlarrayHA2AGather(send_arr, recv_arr, mpi_nccl_comm.ncclDataType_t.ncclFloat32, t.localRank.value, 8) 17 | print("after: send_arr = "+str(send_arr.asnumpy())+" recv_arr = "+str(recv_arr.asnumpy())) 18 | 19 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/.gitignore: 
-------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | scripts.sh 4 | wandb/ 5 | ckpts/ 6 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import get_trainer 2 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .hash import HashEmbedding 2 | from .compo import CompositionalEmbedding 3 | from .tensortrain import TensorTrainEmbedding 4 | from .dhe import DeepHashEmbedding 5 | from .robe import RobeEmbedding 6 | from .dpq import DPQEmbedding 7 | from .mgqe import MGQEmbedding 8 | from .adapt import AdaptiveEmbedding 9 | from .mde import MDEmbedding 10 | from .autodim import AutoDimEmbedding, AutoDimRetrainEmbedding 11 | from .optembed import OptEmbedding, OptEmbeddingAfterRowPruning 12 | from .sparse import SparseEmbedding 13 | from .deeplight import DeepLightEmbedding 14 | from .pep import PEPEmbedding, PEPRetrainEmbedding 15 | from .autosrh import AutoSrhEmbedding, AutoSrhRetrainEmbedding 16 | from .quantize import QuantizedEmbedding 17 | from .alpt import ALPTEmbedding 18 | from .deduplication import DedupEmbedding 19 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/layers/hash.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | from hetu.layers import Embedding 3 | 4 | 5 | class HashEmbedding(Embedding): 6 | def __call__(self, x): 7 | # ref MLSys20, HierPS 8 | with ht.context(self.ctx): 9 | sparse_input = ht.mod_hash_op(x, self.num_embeddings) 10 | return ht.embedding_lookup_op(self.embedding_table, sparse_input) 11 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/layers/primes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/EmbeddingMemoryCompression/methods/layers/primes.npy -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/scheduler/robe.py: -------------------------------------------------------------------------------- 1 | from .base import EmbeddingTrainer 2 | from ..layers import RobeEmbedding 3 | from hetu.random import get_np_rand, set_random_seed 4 | import math 5 | 6 | 7 | class ROBETrainer(EmbeddingTrainer): 8 | def assert_use_multi(self): 9 | assert self.use_multi == 0 10 | 11 | def get_embed_layer(self): 12 | assert self.num_embed < 2038074743 13 | set_random_seed(self.seed) 14 | nprs = get_np_rand(1) 15 | emb = RobeEmbedding( 16 | math.floor(self.num_embed * self.embedding_dim * 17 | self.compress_rate), 18 | self.embedding_dim, 19 | self.embedding_args['Z'], 20 | nprs, 21 | use_slot_coef=bool(self.separate_fields), 22 | initializer=self.initializer, 23 | name=f'RobeEmb{self.compress_rate}', 24 | ctx=self.ectx, 25 | ) 26 | return emb 27 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .base import CTRModel_Head 2 | from .dcn import DCN_Head 3 | from .deepfm import DeepFM_Head 4 | from .dlrm import DLRM_Head 5 | from .wdl import WDL_Head 6 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.bin 3 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/supplements/static_encoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/EmbeddingMemoryCompression/supplements/static_encoding.png -------------------------------------------------------------------------------- /tools/Galvatron/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/.DS_Store -------------------------------------------------------------------------------- /tools/Galvatron/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include galvatron *.json -------------------------------------------------------------------------------- /tools/Galvatron/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -O3 -Wall -shared -std=c++11 -fPIC 3 | PYTHON_INCLUDES = $(shell python3 -m pybind11 --includes) 4 | PYTHON_EXTENSION_SUFFIX = $(shell python3-config --extension-suffix) 5 | SOURCE_DIR = csrc 6 | SOURCE_FILE = dp_core.cpp 7 | BUILD_DIR = galvatron/build 8 | LIB_DIR = $(BUILD_DIR)/lib 9 | OUTPUT_FILE = $(LIB_DIR)/galvatron_dp_core$(PYTHON_EXTENSION_SUFFIX) 10 | CURRENT_DIR = $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 11 | 12 | all: $(OUTPUT_FILE) 13 | 14 | $(OUTPUT_FILE): $(SOURCE_DIR)/$(SOURCE_FILE) 15 | @mkdir -p $(LIB_DIR) 16 | $(CXX) $(CXXFLAGS) $(PYTHON_INCLUDES) $< -o $@ 17 | 18 | clean: 19 | rm -rf $(BUILD_DIR) 20 | 21 | .PHONY: clean -------------------------------------------------------------------------------- /tools/Galvatron/figs/api.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/figs/api.jpg -------------------------------------------------------------------------------- /tools/Galvatron/galvatron.exp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | echo "Galvatron root is" $path 4 | export GalvatronRoot="$path" 5 | export PATH="$path:$PATH" 6 | export PYTHONPATH="$path:$PYTHONPATH" -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/.DS_Store -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include 
galvatron *.json -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | for p in ['site_package', 'build/lib']: 4 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), p)) -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .redistribute import split_to_group, gather_from_group 2 | from .comm_groups import gen_comm_groups 3 | from .initialize import init_empty_weights 4 | from .parallel import * 5 | from .arguments import initialize_galvatron, get_args 6 | from .hybrid_parallel_config import * 7 | from .hybrid_parallel_model import * 8 | from .profiler import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/core/dataloader.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import PipelineParallel, PipeSequential -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import ParallelMLP, ParallelAttention 2 | from .utils import init_method_normal, scaled_init_method_normal 3 | from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear 4 | from megatron.core.tensor_parallel import get_cuda_rng_tracker, split_tensor_along_last_dim 5 | from megatron.model.enums import AttnMaskType, LayerType, AttnType -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/tensor_parallel/utils.py: -------------------------------------------------------------------------------- 1 | """Megatron-LM Utilities for models.""" 2 | 3 | import math 4 | import torch 5 | 6 | def init_method_normal(sigma): 7 | """Init method based on N(0, sigma).""" 8 | def init_(tensor): 9 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 10 | 11 | return init_ 12 | 13 | def scaled_init_method_normal(sigma, num_layers): 14 | """Init method based on N(0, sigma/sqrt(2*num_layers)).""" 15 | std = sigma / math.sqrt(2.0 * num_layers) 16 | 17 | def init_(tensor): 18 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 19 | 20 | return init_ -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/models/__init__.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/BaiChuanModel_tensor_parallel.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import get_args 2 | from 
flash_attn.models.gpt import create_mixer_cls, create_mlp_cls 3 | 4 | def construct_tensor_parallel_model(model, config, tp_groups_enc): 5 | args=get_args() 6 | factory_kwargs = { 7 | 'device': 'meta' if hasattr(args, 'initialize_on_meta') and args.initialize_on_meta else 'cpu', 8 | 'dtype': None 9 | } 10 | for i in range(config.num_hidden_layers): 11 | layer = model.transformer.layers[i] 12 | setattr(layer, 'mixer', create_mixer_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 13 | setattr(layer, 'mlp', create_mlp_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 14 | return model -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/__init__.py: -------------------------------------------------------------------------------- 1 | from .BaiChuanModel_hybrid_parallel import get_hybrid_parallel_configs, construct_hybrid_parallel_model, baichuan_model_hp -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/configs/computation_profiling_bf16_hidden4096_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 69.02671432495117, 3 | "layernum[12]_bsz2": 123.64290746053062, 4 | "layertype_0": 4.551349427964954 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/configs/galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 256, 7 | "chunks": 32, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/configs/galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 6 | "global_bsz": 48, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/hf_configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/hf_configs/baichuan-7b/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BaiChuanForCausalLM" 4 | ], 
5 | "auto_map": { 6 | "AutoConfig": "configuration_baichuan.BaiChuanConfig", 7 | "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM" 8 | }, 9 | "bos_token_id": 1, 10 | "eos_token_id": 2, 11 | "hidden_act": "silu", 12 | "hidden_size": 4096, 13 | "initializer_range": 0.02, 14 | "intermediate_size": null, 15 | "max_position_embeddings": 2048, 16 | "model_type": "baichuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 32, 19 | "pad_token_id": 0, 20 | "rms_norm_eps": 1e-06, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float32", 23 | "transformers_version": "4.29.1", 24 | "use_cache": true, 25 | "vocab_size": 64000 26 | } 27 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/hf_configs/baichuan-7b/config_ori.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BaiChuanForCausalLM" 4 | ], 5 | "auto_map": { 6 | "AutoConfig": "configuration_baichuan.BaiChuanConfig", 7 | "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM" 8 | }, 9 | "bos_token_id": 1, 10 | "eos_token_id": 2, 11 | "hidden_act": "silu", 12 | "hidden_size": 4096, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 11008, 15 | "max_position_embeddings": 4096, 16 | "model_type": "baichuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 32, 19 | "pad_token_id": 0, 20 | "rms_norm_eps": 1e-06, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float32", 23 | "transformers_version": "4.29.1", 24 | "use_cache": true, 25 | "vocab_size": 64000 26 | } 27 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/scripts/train.sh: -------------------------------------------------------------------------------- 1 | LAUNCHER="python3" 2 | 3 | TRAINER="train.py" 4 | 5 | ${LAUNCHER} ${TRAINER} \ 6 | --gpu_id 0 \ 7 | --global_train_batch_size 1 \ 8 | --model_size baichuan-7b \ 9 | --set_model_config_manually 1 \ 10 | --set_layernum_manually 0 \ 11 | --vocab_size 32000 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 1024 \ 16 | --epochs 10 \ 17 | --lr 1e-4 \ 18 | --adam_weight_decay 0.01 \ 19 | --dropout_prob 0.1 \ 20 | --check_loss 0 \ 21 | --profile 1 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/GPTModel_tensor_parallel.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import get_args 2 | from flash_attn.models.gpt import create_mixer_cls, create_mlp_cls 3 | 4 | def construct_tensor_parallel_model(model, config, tp_groups_enc): 5 | args=get_args() 6 | factory_kwargs = { 7 | 'device': 'meta' if hasattr(args, 'initialize_on_meta') and args.initialize_on_meta else 'cpu', 8 | 'dtype': None 9 | } 10 | for i in range(config.num_hidden_layers): 11 | layer = model.transformer.layers[i] 12 | setattr(layer, 'mixer', create_mixer_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 13 | setattr(layer, 'mlp', create_mlp_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 14 | return model -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 
| from .GPTModel_hybrid_parallel import get_hybrid_parallel_configs, construct_hybrid_parallel_model, gpt_model_hp -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/computation_profiling_bf16_hidden1600_head32_seqlen1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[12]_bsz4": 41.484313583374025, 3 | "layernum[24]_bsz4": 70.19392623901368, 4 | "layertype_0": 0.5981169303258261 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/computation_profiling_bf16_hidden2560_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 43.418639373779286, 3 | "layernum[12]_bsz2": 68.19261474609375, 4 | "layertype_0": 2.0644979476928724 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/computation_profiling_bf16_hidden4096_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 83.93114852905275, 3 | "layernum[12]_bsz2": 139.4233337402344, 4 | "layertype_0": 4.6243487675984705 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 448, 7 | "chunks": 8, 8 | "pp_division": "24,24", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 144, 7 | "chunks": 1, 8 | "pp_division": "48", 9 | "checkpoint": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": 
"1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 320, 7 | "chunks": 16, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 64, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 256, 7 | "chunks": 32, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 6 | "global_bsz": 48, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-0.3b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 24, 3 | "n_embd": 1024, 4 | "n_head": 16, 5 | "head_dim": 64, 6 | "vocab_size": 50257, 7 | "n_positions": 1024 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-1.5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 48, 3 | "n_embd": 1600, 4 | "n_head": 32, 5 | 
"head_dim": 50, 6 | "vocab_size": 50257, 7 | "n_positions": 1024 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-2.7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 32, 3 | "n_embd": 2560, 4 | "n_head": 32, 5 | "head_dim": 80, 6 | "vocab_size": 50257, 7 | "n_positions": 2048 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-6.7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 32, 3 | "n_embd": 4096, 4 | "n_head": 32, 5 | "head_dim": 128, 6 | "vocab_size": 50257, 7 | "n_positions": 2048 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/scripts/train.sh: -------------------------------------------------------------------------------- 1 | LAUNCHER="python3" 2 | 3 | TRAINER="train.py" 4 | 5 | ${LAUNCHER} ${TRAINER} \ 6 | --gpu_id 0 \ 7 | --global_train_batch_size 1 \ 8 | --model_size gpt-0.3b \ 9 | --set_model_config_manually 0 \ 10 | --set_layernum_manually 0 \ 11 | --vocab_size 50257 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 1024 \ 16 | --epochs 10 \ 17 | --lr 1e-4 \ 18 | --adam_weight_decay 0.01 \ 19 | --dropout_prob 0.1 \ 20 | --check_loss 0 \ 21 | --profile 1 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/LlamaModel_tensor_parallel.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import get_args 2 | from flash_attn.models.gpt import create_mixer_cls, create_mlp_cls 3 | 4 | def construct_tensor_parallel_model(model, config, tp_groups_enc): 5 | args=get_args() 6 | factory_kwargs = { 7 | 'device': 'meta' if hasattr(args, 'initialize_on_meta') and args.initialize_on_meta else 'cpu', 8 | 'dtype': None 9 | } 10 | for i in range(config.num_hidden_layers): 11 | layer = model.transformer.layers[i] 12 | setattr(layer, 'mixer', create_mixer_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 13 | setattr(layer, 'mlp', create_mlp_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 14 | return model -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .LlamaModel_hybrid_parallel import get_hybrid_parallel_configs, construct_hybrid_parallel_model, llama_model_hp -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/configs/computation_profiling_bf16_hidden4096_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 63.300140380859375, 3 | "layernum[12]_bsz2": 119.02136306762694, 4 | "layertype_0": 4.6434352238972965 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/configs/galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 
2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 256, 7 | "chunks": 32, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/configs/galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 6 | "global_bsz": 48, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/llama-13b.json: -------------------------------------------------------------------------------- 1 | { 2 | "dim": 5120, 3 | "multiple_of": 256, 4 | "n_heads": 40, 5 | "n_layers": 40, 6 | "norm_eps": 1e-06, 7 | "vocab_size": 32000 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/llama-30b.json: -------------------------------------------------------------------------------- 1 | { 2 | "dim": 6656, 3 | "multiple_of": 256, 4 | "n_heads": 52, 5 | "n_layers": 60, 6 | "norm_eps": 1e-06, 7 | "vocab_size": 32000 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/llama-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "dim": 4096, 3 | "multiple_of": 256, 4 | "n_heads": 32, 5 | "n_layers": 32, 6 | "norm_eps": 1e-06, 7 | "vocab_size": 32000 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/scripts/train.sh: -------------------------------------------------------------------------------- 1 | LAUNCHER="python3" 2 | 3 | TRAINER="train.py" 4 | 5 | ${LAUNCHER} ${TRAINER} \ 6 | --gpu_id 0 \ 7 | --global_train_batch_size 1 \ 8 | --model_size llama-7b \ 9 | --set_model_config_manually 1 \ 10 | --set_layernum_manually 0 \ 11 | --vocab_size 32000 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 1024 \ 16 | --epochs 10 \ 17 | --lr 1e-4 \ 18 | --adam_weight_decay 0.01 \ 19 | --dropout_prob 0.1 \ 20 | --check_loss 0 \ 21 | --profile 1 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/allreduce_bandwidth_1nodes_4gpus_per_node.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "allreduce_size_4_consec_1": 158.018, 3 | "allreduce_size_2_consec_1": 149.158, 4 | "allreduce_size_2_consec_0": 149.317 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/allreduce_bandwidth_1nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "allreduce_size_8_consec_1": 154.203, 3 | "allreduce_size_4_consec_1": 159.119, 4 | "allreduce_size_4_consec_0": 155.815, 5 | "allreduce_size_2_consec_1": 138.156, 6 | "allreduce_size_2_consec_0": 151.344 7 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/allreduce_bandwidth_2nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "allreduce_size_16_consec_1": 44.682, 3 | "allreduce_size_8_consec_1": 155.658, 4 | "allreduce_size_8_consec_0": 20.7724, 5 | "allreduce_size_4_consec_1": 157.984, 6 | "allreduce_size_4_consec_0": 16.22, 7 | "allreduce_size_2_consec_1": 149.666, 8 | "allreduce_size_2_consec_0": 8.13007 9 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/overlap_coefficient.json: -------------------------------------------------------------------------------- 1 | { 2 | "overlap_coe": 1.125552573612729 3 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/p2p_bandwidth_1nodes_4gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_size_2": 162.118, 3 | "pp_size_4": 140.185 4 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/p2p_bandwidth_1nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_size_2": 163.671, 3 | "pp_size_4": 138.581, 4 | "pp_size_8": 109.45 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/p2p_bandwidth_2nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_size_2": 7.65998, 3 | "pp_size_4": 8.02132, 4 | "pp_size_8": 8.76278, 5 | "pp_size_16": 8.13177 6 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hostfile: -------------------------------------------------------------------------------- 1 | job-a23c7db3-67e5-45e4-9419-20270dd89a8f-master-0 2 | job-a23c7db3-67e5-45e4-9419-20270dd89a8f-worker-0 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/profile_hardware.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import GalvatronProfiler, initialize_galvatron 2 | import os 3 | 4 | if __name__ == '__main__': 5 | args = initialize_galvatron(mode='profile_hardware') 6 | print(args) 7 | profiler = GalvatronProfiler(args) 8 | path = os.path.dirname(os.path.abspath(__file__)) 9 | profiler.set_path(path) 10 | 11 | # profile allreduce & p2p bandwidth 
12 | profiler.profile_bandwidth() 13 | 14 | # profile overlapping slowdown coefficient 15 | profiler.profile_overlap() -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/scripts/build_nccl_test.sh: -------------------------------------------------------------------------------- 1 | if [ "$USE_EXPORT_VARIABLE" = "1" ]; then 2 | echo "USE_EXPORT_VARIABLE is set to 1, using the exported variables." 3 | else 4 | echo "USE_EXPORT_VARIABLE is not set to 1, using the variables defined in script." 5 | MPI_PATH=/usr/local/mpi/ 6 | MAKE_MPI=1 7 | fi 8 | 9 | cd ../site_package/nccl-tests 10 | if [ "$MAKE_MPI" = "1" ]; then 11 | echo 'Building nccl-tests with MPI.' 12 | make MPI=1 MPI_HOME=${MPI_PATH} 13 | else 14 | echo 'Building nccl-tests without MPI.' 15 | make 16 | fi -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/scripts/profile_hardware.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=2 2 | NUM_GPUS_PER_NODE=8 3 | NCCLTEST_DIR="../site_package/nccl-tests" 4 | MPI_PATH=/usr/local/mpi/ 5 | START_MB=16 6 | END_MB=256 7 | SCALE=2 8 | HOSTFILE="hostfile" 9 | 10 | # These args will be directly added to nccl-test arguments 11 | export NCCLTEST_OTHER_ARGS="-x NCCL_IB_DISABLE=0 -x NCCL_IB_HCA=mlx5_2,mlx5_5" 12 | 13 | PROFILE_ARGS=" 14 | --num_nodes ${NUM_NODES} \ 15 | --num_gpus_per_node ${NUM_GPUS_PER_NODE} \ 16 | --nccl_test_dir ${NCCLTEST_DIR} \ 17 | --mpi_path ${MPI_PATH} \ 18 | --start_mb ${START_MB} \ 19 | --end_mb ${END_MB} \ 20 | --scale ${SCALE} \ 21 | --hostfile ${HOSTFILE} \ 22 | --avg_or_min_or_first first \ 23 | --max_pp_deg 16 \ 24 | --overlap_time_multiply 4" 25 | python3 profile_hardware.py ${PROFILE_ARGS} -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/scripts/profile_overlap.sh: -------------------------------------------------------------------------------- 1 | if [ "$USE_EXPORT_VARIABLE" = "1" ]; then 2 | echo "USE_EXPORT_VARIABLE is set to 1, using the exported variables." 3 | else 4 | echo "USE_EXPORT_VARIABLE is not set to 1, using the variables defined in script." 
5 | NUM_GPUS_PER_NODE=8 6 | OVERLAP_TIME_MULTIPLY=4 7 | fi 8 | 9 | ARGS="-m torch.distributed.launch \ 10 | --nproc_per_node=${NUM_GPUS_PER_NODE} \ 11 | --master_port 9999 \ 12 | profile_overlap.py \ 13 | --overlap_time_multiply ${OVERLAP_TIME_MULTIPLY}" 14 | 15 | echo "Running: python3 ${ARGS}" 16 | python3 ${ARGS} -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/scripts/flash_attn_ops_install.sh: -------------------------------------------------------------------------------- 1 | git clone --recursive https://github.com/Dao-AILab/flash-attention.git 2 | pip3 install flash-attention/csrc/fused_dense_lib 3 | pip3 install flash-attention/csrc/layer_norm 4 | pip3 install flash-attention/csrc/rotary 5 | pip3 install flash-attention/csrc/xentropy 6 | rm -rf flash-attention -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/scripts/prepare_env.sh: -------------------------------------------------------------------------------- 1 | pip3 install -r ../requirements.txt -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/__init__.py: -------------------------------------------------------------------------------- 1 | # import os 2 | # import sys 3 | # sys.path.insert(0, os.path.dirname(__file__)) -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_adlr_autoresume 13 | from .global_vars import get_timers 14 | from .initialize import initialize_megatron 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | # Alias parallel_state as mpu, its legacy name 6 | mpu = parallel_state 7 | 8 | __all__ = [ 9 | "parallel_state", 10 | "tensor_parallel", 11 | "utils", 12 | ] 13 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import enum 4 | 5 | class ModelType(enum.Enum): 6 | encoder_or_decoder = 1 7 | encoder_and_decoder = 2 8 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/site_package/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | 5 | from .distributed import DistributedDataParallel 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module, MegatronModule 11 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | 9 | class AttnType(enum.Enum): 10 | self_attn = 1 11 | cross_attn = 2 12 | 13 | class AttnMaskType(enum.Enum): 14 | padding = 1 15 | causal = 2 16 | 17 | # For backward compatibility with old model checkpoints 18 | from megatron.core.enums import ModelType 19 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/site_package/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # See LICENCE.txt for license information 4 | /build 5 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENCE.txt for license information 5 | # 6 | 7 | BUILDDIR ?= build 8 | override BUILDDIR := $(abspath $(BUILDDIR)) 9 | 10 | .PHONY: all clean 11 | 12 | default: src.build 13 | 14 | TARGETS=src 15 | 16 | all: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.build: 20 | ${MAKE} -C $* build BUILDDIR=${BUILDDIR} 21 | 22 | %.clean: 23 | ${MAKE} -C $* clean BUILDDIR=${BUILDDIR} 24 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/src/timer.cc: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | 3 | // Make sure to compile this translation unit with the host compiler and not 4 | // nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0 5 | #include <chrono> 6 | 7 | namespace { 8 | std::uint64_t now() { 9 | using clock = std::chrono::steady_clock; 10 | return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count(); 11 | } 12 | } 13 | 14 | timer::timer() { 15 | t0 = now(); 16 | } 17 | 18 | double timer::elapsed() const { 19 | std::uint64_t t1 = now(); 20 | return 1.e-9*(t1 - t0); 21 | } 22 | 23 | double timer::reset() { 24 | std::uint64_t t1 = now(); 25 | double ans = 1.e-9*(t1 - t0); 26 | t0 = t1; 27 | return ans; 28 | } 29 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/src/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef _408319ecdd5b47b28bf8f511c4fdf816 2 | #define _408319ecdd5b47b28bf8f511c4fdf816 3 | 4 | #include <cstdint> 5 | 6 | // Can't include <chrono> because of bug with gcc 10.3.0 7 | class timer { 8 | std::uint64_t t0; 9 | public: 10 | timer(); 11 | double elapsed() const; 12 | double reset(); 13 | }; 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/verifiable/Makefile: -------------------------------------------------------------------------------- 1 | include ../../makefiles/common.mk 2 | 3 | .PHONY: all clean 4 | 5 | BUILDDIR := $(abspath ../../build) 6 | NCCLDIR := $(BUILDDIR) 7 | NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include 8 | DST_DIR := $(BUILDDIR)/test/verifiable 9 | 10 | all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o 11 | 12 | clean: 13 | rm -rf $(DST_DIR) 14 | 15 | TEST_VERIFIABLE_SRCDIR := .
16 | TEST_VERIFIABLE_BUILDDIR := $(DST_DIR) 17 | include verifiable.mk 18 | 19 | self_test: $(DST_DIR)/self_test 20 | 21 | $(DST_DIR)/self_test: verifiable.cu verifiable.h 22 | @printf "Linking %s\n" $@ 23 | @mkdir -p $(DST_DIR) 24 | $(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS) 25 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/verifiable/verifiable.mk: -------------------------------------------------------------------------------- 1 | # We require both of the following paths to be set upon including this makefile 2 | # TEST_VERIFIABLE_SRCDIR = 3 | # TEST_VERIFIABLE_BUILDDIR = 4 | 5 | TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h 6 | TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o 7 | 8 | $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS) 9 | @printf "Compiling %s\n" $@ 10 | @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) 11 | $(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu 12 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * 2 | from .strategy_utils import form_strategy, strategy_str2list, print_strategies 3 | from .memory_utils import print_peak_memory, print_param_num 4 | from .training_utils import * 5 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/utils/memory_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def print_peak_memory(prefix, device, type='allocated'): 4 | if type == 'allocated': 5 | print(prefix, '[Allocated]') 6 | max_mem = torch.cuda.max_memory_allocated(device)/2**20 7 | cur_mem = torch.cuda.memory_allocated(device)/2**20 8 | print("\tMax memory: %.2f MB\tCurrent memory: %.2f MB"%(max_mem, cur_mem)) 9 | elif type == 'reserved': 10 | print(prefix, '[Reserved]') 11 | max_mem = torch.cuda.max_memory_reserved(device)/2**20 12 | cur_mem = torch.cuda.memory_reserved(device)/2**20 13 | print("\tMax memory: %.2f MB\tCurrent memory: %.2f MB"%(max_mem, cur_mem)) 14 | return max_mem, cur_mem 15 | 16 | def print_param_num(model): 17 | print("Total number of parameters in network is {}".format(sum(x.numel() for x in model.parameters()))) 18 | -------------------------------------------------------------------------------- /tools/Galvatron/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision==0.15.2 3 | transformers>=4.31.0 4 | flash_attn>=2.0.8 5 | h5py>=3.6.0 6 | attrs>=21.4.0 7 | yacs>=0.1.8 8 | six>=1.15.0 9 | sentencepiece>=0.1.95 10 | pybind11>=2.9.1 --------------------------------------------------------------------------------