├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── COMMITTERS.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── bin └── heturun ├── cmake ├── Modules │ ├── FindCUB.cmake │ ├── FindCUDNN.cmake │ ├── FindMETIS.cmake │ ├── FindMKL.cmake │ ├── FindNCCL.cmake │ ├── FindTHRUST.cmake │ └── FindZMQ.cmake └── config.example.cmake ├── environment.yml ├── examples ├── auto_parallel │ ├── .gitignore │ ├── cnn │ │ ├── analyze.py │ │ ├── experiment_scripts │ │ │ ├── .gitignore │ │ │ ├── compare.py │ │ │ ├── gen_nooverlap_scripts.py │ │ │ ├── gen_pipeopt_scripts.py │ │ │ ├── parse.py │ │ │ └── w16.yml │ │ ├── gen_configs.py │ │ ├── main.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── alexnet.py │ │ │ ├── inception_v3.py │ │ │ ├── resnet101.py │ │ │ ├── vgg19.py │ │ │ └── wide_resnet.py │ │ ├── test_models.py │ │ ├── test_scripts │ │ │ ├── compare.py │ │ │ ├── test_flexflow.py │ │ │ ├── test_gpipe.py │ │ │ ├── test_optcnn.py │ │ │ └── test_pipedream.py │ │ ├── test_simple_strategies.py │ │ └── torch_models │ │ │ ├── __init__.py │ │ │ ├── alexnet.py │ │ │ ├── inception_v3.py │ │ │ ├── resnet101.py │ │ │ ├── vgg19.py │ │ │ └── wide_resnet.py │ └── transformer │ │ ├── .gitignore │ │ ├── bert_main.py │ │ ├── experiment_scripts │ │ ├── .gitignore │ │ ├── compare.py │ │ ├── gen_large_noover_scripts.py │ │ ├── gen_pipeopt_scripts.py │ │ ├── parse.py │ │ └── w16.yml │ │ ├── gpt2_main.py │ │ ├── load_data.py │ │ ├── models │ │ ├── __init__.py │ │ ├── bert_config.py │ │ ├── gpt2_config.py │ │ ├── hetu_bert.py │ │ ├── hetu_gpt2.py │ │ ├── torch_bert.py │ │ └── torch_gpt2.py │ │ ├── test_bert.py │ │ ├── test_gpt2.py │ │ └── test_megatronlm.py ├── cnn │ ├── README.md │ ├── local_s1.yml │ ├── main.py │ ├── models │ │ ├── AlexNet.py │ │ ├── CNN.py │ │ ├── LSTM.py │ │ ├── LeNet.py │ │ ├── LogReg.py │ │ ├── MLP.py │ │ ├── RNN.py │ │ ├── ResNet.py │ │ ├── VGG.py │ │ └── __init__.py │ ├── pytorch_models │ │ ├── __init__.py │ │ ├── mlp.py │ │ ├── resnet.py │ │ ├── rnn.py │ │ └── vgg.py │ ├── run_tf_horovod.py │ ├── scripts │ │ ├── hetu_16gpu.sh │ │ ├── hetu_16gpu_ps.sh │ │ ├── hetu_1gpu.sh │ │ ├── hetu_2gpu_ps.sh │ │ ├── hetu_8gpu.sh │ │ ├── hetu_config16allreduce.yml │ │ ├── hetu_config16ps.yml │ │ ├── horovod_16gpu.sh │ │ ├── horovod_8gpu.sh │ │ ├── pytorch_16gpu_0.sh │ │ ├── pytorch_16gpu_1.sh │ │ ├── pytorch_1gpu.sh │ │ ├── pytorch_8gpu.sh │ │ ├── tf_16gpu_worker0.sh │ │ ├── tf_16gpu_worker1.sh │ │ ├── tf_1gpu.sh │ │ └── tf_8gpu.sh │ ├── settings │ │ ├── tf_dist_s1_w16.json │ │ ├── tf_dist_s1_w4.json │ │ └── tf_dist_s1_w8.json │ ├── tf_launch_server.py │ ├── tf_launch_worker.py │ ├── tf_main.py │ ├── tf_models │ │ ├── __init__.py │ │ ├── tf_CNN.py │ │ ├── tf_LSTM.py │ │ ├── tf_LeNet.py │ │ ├── tf_LogReg.py │ │ ├── tf_MLP.py │ │ ├── tf_RNN.py │ │ ├── tf_ResNet.py │ │ └── tf_VGG.py │ ├── torch_main.py │ ├── worker_conf0.json │ └── worker_conf1.json ├── ctr │ ├── .gitignore │ ├── README.md │ ├── data_utils.py │ ├── kill.sh │ ├── models │ │ ├── __init__.py │ │ ├── bce_test.py │ │ ├── dc_criteo.py │ │ ├── dcn_criteo.py │ │ ├── deepfm_criteo.py │ │ ├── load_data.py │ │ ├── wdl_adult.py │ │ └── wdl_criteo.py │ ├── run_hetu.py │ ├── run_tf_horovod.py │ ├── run_tf_local.py │ ├── run_tf_parallax.py │ ├── settings │ │ ├── dist_s2_w4.yml │ │ ├── local_s1_w2.yml │ │ ├── plx_local_spec.yml │ │ ├── tf_local_s1_w2.json │ │ ├── tf_local_s1_w4.json │ │ └── tf_local_s1_w8.json │ ├── tests │ │ ├── README.md │ │ ├── hybrid_dcn_criteo.sh │ │ ├── hybrid_dfm_criteo.sh │ │ ├── hybrid_wdl_adult.sh │ │ ├── 
hybrid_wdl_criteo.sh │ │ ├── local_dcn_criteo.sh │ │ ├── local_dfm_criteo.sh │ │ ├── local_wdl_adult.sh │ │ ├── local_wdl_criteo.sh │ │ ├── ps_dcn_criteo.sh │ │ ├── ps_dfm_criteo.sh │ │ ├── ps_wdl_adult.sh │ │ ├── ps_wdl_criteo.sh │ │ ├── tf_2workers.sh │ │ ├── tf_4workers.sh │ │ └── tf_8workers.sh │ ├── tf_launch_server.py │ ├── tf_launch_worker.py │ └── tf_models │ │ ├── __init__.py │ │ ├── tf_dcn_criteo.py │ │ ├── tf_deepfm_criteo.py │ │ ├── tf_wdl_adult.py │ │ └── tf_wdl_criteo.py ├── gnn │ ├── README.md │ ├── config │ │ ├── local_w2.yml │ │ ├── local_w4.yml │ │ ├── local_w8.yml │ │ └── single.yml │ ├── gnn_model │ │ ├── __init__.py │ │ ├── layer.py │ │ ├── model.py │ │ └── utils.py │ ├── gnn_tools │ │ ├── __init__.py │ │ ├── launcher.py │ │ ├── log.py │ │ ├── part_graph.py │ │ ├── prepare_amazon_data.py │ │ └── sparse_datasets.py │ ├── run_dist.py │ ├── run_dist_hybrid.py │ └── run_single.py ├── moe │ ├── README.md │ ├── scripts │ │ ├── run_2node_comm.sh │ │ ├── run_base.sh │ │ ├── run_hash.sh │ │ ├── run_ktop1.sh │ │ ├── run_mnist.sh │ │ ├── run_sam.sh │ │ ├── run_top1.sh │ │ ├── run_top1_16gpus.sh │ │ └── run_top2.sh │ ├── test_mnist.py │ ├── test_moe_base.py │ ├── test_moe_hash.py │ ├── test_moe_ktop1.py │ ├── test_moe_sam.py │ └── test_moe_top.py ├── nlp │ ├── .gitignore │ ├── README.md │ ├── __init__.py │ ├── bert │ │ ├── .gitignore │ │ ├── README.md │ │ ├── bert_config.py │ │ ├── config1.yml │ │ ├── config2.yml │ │ ├── config4.yml │ │ ├── config8.yml │ │ ├── create_pretraining_data.py │ │ ├── data │ │ │ ├── BooksDownloader.py │ │ │ ├── BookscorpusTextFormatting.py │ │ │ ├── Downloader.py │ │ │ ├── GLUEDownloader.py │ │ │ ├── GooglePretrainedWeightDownloader.py │ │ │ ├── NVIDIAPretrainedWeightDownloader.py │ │ │ ├── SquadDownloader.py │ │ │ ├── TextSharding.py │ │ │ ├── WikiDownloader.py │ │ │ ├── WikicorpusTextFormatting.py │ │ │ ├── __init__.py │ │ │ ├── bert-base-uncased-vocab.txt │ │ │ ├── bertPrep.py │ │ │ ├── preprocess_glue_data.py │ │ │ ├── squad │ │ │ │ └── squad_download.sh │ │ │ └── wikiextractor │ │ │ │ ├── WikiExtractor.py │ │ │ │ ├── __init__.py │ │ │ │ ├── cirrus-extract.py │ │ │ │ ├── clean.py │ │ │ │ ├── extract.py │ │ │ │ └── extractPage.py │ │ ├── file_utils.py │ │ ├── glue_processor │ │ │ └── glue.py │ │ ├── hetu_bert.py │ │ ├── hetu_bert_moe.py │ │ ├── load_data.py │ │ ├── pytorch_bert.py │ │ ├── scripts │ │ │ ├── create_datasets_from_start.sh │ │ │ ├── test_glue_hetu_bert_base.sh │ │ │ ├── test_glue_hetu_bert_large.sh │ │ │ ├── test_glue_pytorch_bert_base.sh │ │ │ ├── test_glue_pytorch_bert_large.sh │ │ │ ├── train_hetu_bert_base.sh │ │ │ ├── train_hetu_bert_base_dp.sh │ │ │ ├── train_hetu_bert_base_moe.sh │ │ │ ├── train_hetu_bert_base_ps.sh │ │ │ ├── train_hetu_bert_large.sh │ │ │ ├── train_hetu_bert_large_dp.sh │ │ │ ├── train_hetu_bert_large_ps.sh │ │ │ ├── train_pytorch_bert_base.sh │ │ │ └── train_pytorch_bert_large.sh │ │ ├── test_glue_hetu_bert.py │ │ ├── test_glue_pytorch_bert.py │ │ ├── tokenization.py │ │ ├── train_hetu_bert.py │ │ ├── train_hetu_bert_dp.py │ │ ├── train_hetu_bert_dp_moe.py │ │ ├── train_hetu_bert_ps.py │ │ └── train_pytorch_bert.py │ ├── data_load.py │ ├── hetu_transformer.py │ ├── hparams.py │ ├── prepare_data.py │ ├── tf_transformer.py │ ├── train_hetu_transformer.py │ └── train_tf_transformer.py ├── rec │ ├── .gitignore │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ ├── gmf.py │ │ ├── load_data.py │ │ ├── mf.py │ │ ├── mlp.py │ │ └── neumf.py │ ├── run_compressed.py │ └── test │ │ ├── .gitignore │ │ ├── compare.py │ │ 
├── config.py │ │ ├── hetu_data.py │ │ ├── hetu_main.py │ │ ├── hetu_ncf.py │ │ ├── torch_data.py │ │ ├── torch_main.py │ │ └── torch_ncf.py └── runner │ ├── README.md │ ├── local_allreduce.yml │ ├── local_ps.yml │ ├── models │ ├── MLP.py │ ├── __init__.py │ ├── load_data.py │ └── wdl_adult.py │ ├── parallel │ ├── README.md │ ├── all_cnn_tests.sh │ ├── all_mlp_tests.sh │ ├── complex_pipeline_mlp.py │ ├── config1.yml │ ├── config2.yml │ ├── config3.yml │ ├── config4.yml │ ├── config6.yml │ ├── config8.yml │ ├── data_model_pipeline_mlp.py │ ├── data_pipeline.py │ ├── data_pipeline_mlp.py │ ├── dist_config8.yml │ ├── dist_data_pipeline_mlp.py │ ├── gpipe.py │ ├── gpipe_multi.py │ ├── pipedream.py │ ├── ps_pipeline.py │ ├── simple_pipeline_mlp.py │ ├── test_mlp_base.py │ ├── test_mlp_mp.py │ ├── test_mlp_mp_pp.py │ ├── test_mlp_pp.py │ ├── test_model_cnn.py │ ├── test_model_cnn_base.py │ ├── test_model_cnn_complex.py │ └── validate_results.py │ ├── remote_allreduce.yml │ ├── remote_ps.yml │ ├── run_mlp.py │ └── run_wdl.py ├── hetu.exp ├── img ├── alibabacloud.png ├── features.png ├── hetu.png ├── kuaishou.png └── tencent.png ├── ps-lite ├── .gitignore ├── CMakeLists.txt ├── README.md ├── include │ ├── common │ │ ├── dmlc_base.h │ │ ├── logging.h │ │ ├── sarray.h │ │ ├── shared_mutex.h │ │ ├── thread_pool.h │ │ └── thread_safe_hash_map.h │ └── ps │ │ ├── base.h │ │ ├── internal │ │ ├── assign_op.h │ │ ├── customer.h │ │ ├── env.h │ │ ├── message.h │ │ ├── parallel_kv_match.h │ │ ├── parallel_sort.h │ │ ├── postoffice.h │ │ ├── threadsafe_pqueue.h │ │ ├── threadsafe_queue.h │ │ ├── utils.h │ │ └── van.h │ │ ├── kvapp.h │ │ ├── partitioner.h │ │ ├── ps.h │ │ ├── psf │ │ ├── PSFunc.h │ │ ├── cachetable.h │ │ ├── dense.h │ │ ├── misc.h │ │ ├── preduce.h │ │ ├── serializer.h │ │ ├── sparse.h │ │ └── ssp.h │ │ ├── range.h │ │ ├── server │ │ ├── PSFHandle.h │ │ ├── kvserver.h │ │ ├── optimizer.h │ │ ├── param.h │ │ ├── preduce_handler.h │ │ └── ssp_handler.h │ │ └── worker │ │ ├── PSAgent.h │ │ ├── callback_store.h │ │ ├── hetu_binding.h │ │ ├── kvworker.h │ │ └── worker.h ├── proto │ └── meta.proto └── src │ ├── PSFhandle_embedding.cc │ ├── PSFunc.cc │ ├── customer.cc │ ├── hetu_binding.cc │ ├── ibverbs_van.h │ ├── network_utils.h │ ├── p3_van.h │ ├── postoffice.cc │ ├── preduce_handler.cc │ ├── python_binding.cc │ ├── resender.h │ ├── thread_pool.cc │ ├── van.cc │ ├── worker.cc │ └── zmq_van.h ├── python ├── graphboard │ ├── __init__.py │ ├── graph2fig.py │ └── index.html ├── hetu │ ├── __init__.py │ ├── _base.py │ ├── communicator │ │ ├── __init__.py │ │ ├── mpi_comm.py │ │ ├── mpi_nccl_comm.py │ │ ├── nccl_comm.py │ │ └── test.sh │ ├── context.py │ ├── cpu_links │ │ ├── __init__.py │ │ └── dnnl_op.py │ ├── cstable.py │ ├── data.py │ ├── dataloader.py │ ├── distributed_strategies │ │ ├── __init__.py │ │ ├── base.py │ │ ├── flexflow.py │ │ ├── gpipe.py │ │ ├── optcnn.py │ │ ├── pipedream.py │ │ ├── pipeopt.py │ │ └── simple.py │ ├── gpu_links │ │ ├── AbsLink.py │ │ ├── AddConstLink.py │ │ ├── AddElewiseLink.py │ │ ├── AddmmLink.py │ │ ├── ArangeLink.py │ │ ├── ArgmaxLink.py │ │ ├── ArgsortLink.py │ │ ├── ArraySetLink.py │ │ ├── AssignWithIndexedSlicesLink.py │ │ ├── AutoDimLink.py │ │ ├── AvgPoolLink.py │ │ ├── BaddbmmLink.py │ │ ├── BatchMatrixMultLink.py │ │ ├── BinaryCrossEntropyLink.py │ │ ├── BoolLink.py │ │ ├── BroadcastLink.py │ │ ├── BroadcastShapeLink.py │ │ ├── ClampLink.py │ │ ├── CloneLink.py │ │ ├── CompressedEmbeddingLink.py │ │ ├── ConcatLink.py │ │ ├── ConcatenateLink.py │ │ ├── 
ConstPowLink.py │ │ ├── Conv2dBroadcastLink.py │ │ ├── Conv2dLink.py │ │ ├── Conv2dReduceSumLink.py │ │ ├── CrossEntropyLink.py │ │ ├── CrossEntropySparseLink.py │ │ ├── CuSparseLink.py │ │ ├── CudnnAvgPoolLink.py │ │ ├── CudnnBnLink.py │ │ ├── CudnnConv2d.py │ │ ├── CudnnConv2dAddBiasLink.py │ │ ├── CudnnDropoutLink.py │ │ ├── CudnnMaxPoolLink.py │ │ ├── CudnnSoftmaxCrossEntropyLink.py │ │ ├── CudnnSoftmaxLink.py │ │ ├── CumSumLink.py │ │ ├── DotLink.py │ │ ├── DropoutLink.py │ │ ├── EmbeddingLookUpLink.py │ │ ├── ExpLink.py │ │ ├── FloorLink.py │ │ ├── GatherLink.py │ │ ├── GeluLink.py │ │ ├── GroupTopKIdxLink.py │ │ ├── HA2ALayoutTransform.py │ │ ├── IndexedSliceLink.py │ │ ├── IndexingLink.py │ │ ├── InitializersLink.py │ │ ├── InstanceNorm2dLink.py │ │ ├── InterpolateLink.py │ │ ├── LayerNormLink.py │ │ ├── LayoutTransform.py │ │ ├── LeakyReluLink.py │ │ ├── LinearLink.py │ │ ├── LogLink.py │ │ ├── MaskLink.py │ │ ├── MaskedFillLink.py │ │ ├── MatrixDivideConstLink.py │ │ ├── MatrixDivideLink.py │ │ ├── MatrixMultLink.py │ │ ├── MatrixRsqrtLink.py │ │ ├── MatrixSqrtLink.py │ │ ├── MatrixTransLink.py │ │ ├── MaxLink.py │ │ ├── MaxPoolLink.py │ │ ├── MinDistLink.py │ │ ├── MinLink.py │ │ ├── MinusByConstLink.py │ │ ├── MinusElewiseLink.py │ │ ├── MultiplyConstLink.py │ │ ├── MultiplyElewiseLink.py │ │ ├── NllLossLink.py │ │ ├── NormLink.py │ │ ├── OneHotLink.py │ │ ├── OppositeLink.py │ │ ├── OptEmbedBinaryStepLink.py │ │ ├── OptimizerLink.py │ │ ├── PadLink.py │ │ ├── ParamClipLink.py │ │ ├── PowLink.py │ │ ├── PowerLink.py │ │ ├── PruneLink.py │ │ ├── QuantizeEmbeddingLink.py │ │ ├── QuantizeLink.py │ │ ├── ReduceMeanLink.py │ │ ├── ReduceMinLink.py │ │ ├── ReduceMulLink.py │ │ ├── ReduceNormLink.py │ │ ├── ReduceSumAxisZeroLink.py │ │ ├── ReduceSumLink.py │ │ ├── ReluLink.py │ │ ├── RepeatLink.py │ │ ├── ReshapeLink.py │ │ ├── ReverseLayoutTransform.py │ │ ├── RollLink.py │ │ ├── SamGroupSumLink.py │ │ ├── SamMaxLink.py │ │ ├── Scatter1DLink.py │ │ ├── ScatterLink.py │ │ ├── SigmoidLink.py │ │ ├── SignLink.py │ │ ├── SinLink.py │ │ ├── SliceAssignLink.py │ │ ├── SliceByMatrixLink.py │ │ ├── SliceLink.py │ │ ├── SoftmaxCrossEntropyLink.py │ │ ├── SoftmaxCrossEntropySparseLink.py │ │ ├── SoftmaxLink.py │ │ ├── SparseEmbeddingLookUpLink.py │ │ ├── SparseSetLink.py │ │ ├── TanhLink.py │ │ ├── TopKIdxLink.py │ │ ├── TopKValLink.py │ │ ├── TrilLookupLink.py │ │ ├── UniqueIndicesLink.py │ │ ├── WhereLink.py │ │ └── __init__.py │ ├── gpu_ops │ │ ├── Abs.py │ │ ├── AddConst.py │ │ ├── AddElewise.py │ │ ├── Addmm.py │ │ ├── AllGatherCommunicate.py │ │ ├── AllReduceCommunicate.py │ │ ├── AllToAll.py │ │ ├── Arange.py │ │ ├── Argmax.py │ │ ├── ArgmaxPartial.py │ │ ├── Argsort.py │ │ ├── AssignWithIndexedSlices.py │ │ ├── AvgPool.py │ │ ├── Baddbmm.py │ │ ├── BalanceAssignment.py │ │ ├── BatchMatrixMult.py │ │ ├── BatchNorm.py │ │ ├── BinaryCrossEntropy.py │ │ ├── BinaryCrossEntropyWithLogits.py │ │ ├── Bool.py │ │ ├── Broadcast.py │ │ ├── BroadcastCommunicate.py │ │ ├── BroadcastShape.py │ │ ├── Clamp.py │ │ ├── CompressedEmbedding.py │ │ ├── Concat.py │ │ ├── Concatenate.py │ │ ├── ConstPow.py │ │ ├── Conv2d.py │ │ ├── Conv2dAddBias.py │ │ ├── Conv2dBroadcast.py │ │ ├── Conv2dReduceSum.py │ │ ├── CrossEntropy.py │ │ ├── CrossEntropySparse.py │ │ ├── CuSparse.py │ │ ├── Cumsum.py │ │ ├── DataTransfer.py │ │ ├── Dispatch.py │ │ ├── DistGCN_15d.py │ │ ├── Division.py │ │ ├── Dropout.py │ │ ├── EmbeddingLookUp.py │ │ ├── Exp.py │ │ ├── Floor.py │ │ ├── Full.py │ │ ├── Gather.py │ │ ├── Gelu.py │ │ 
├── GroupTopKIdx.py │ │ ├── HAllToAll.py │ │ ├── Indexing.py │ │ ├── InstanceNorm2d.py │ │ ├── Interpolate.py │ │ ├── LayerNorm.py │ │ ├── LayoutTransform.py │ │ ├── LeakyRelu.py │ │ ├── Linear.py │ │ ├── LogElewise.py │ │ ├── LogSoftmax.py │ │ ├── Mask.py │ │ ├── MaskedFill.py │ │ ├── MatrixDot.py │ │ ├── MatrixMult.py │ │ ├── Max.py │ │ ├── MaxPool.py │ │ ├── Min.py │ │ ├── MinDist.py │ │ ├── MinusByConst.py │ │ ├── MinusElewise.py │ │ ├── MultiplyConst.py │ │ ├── MultiplyElewise.py │ │ ├── NllLoss.py │ │ ├── Node.py │ │ ├── Norm.py │ │ ├── OneHot.py │ │ ├── OnesLike.py │ │ ├── Opposite.py │ │ ├── OptEmbedBinaryStep.py │ │ ├── Pad.py │ │ ├── ParamClip.py │ │ ├── ParameterServerCommunicate.py │ │ ├── PipelineReceive.py │ │ ├── PipelineSend.py │ │ ├── Pow.py │ │ ├── Power.py │ │ ├── Prune.py │ │ ├── Quantize.py │ │ ├── QuantizeALPTEmb.py │ │ ├── QuantizeEmbedding.py │ │ ├── README.md │ │ ├── Rand.py │ │ ├── ReduceCommunicate.py │ │ ├── ReduceMean.py │ │ ├── ReduceMin.py │ │ ├── ReduceMul.py │ │ ├── ReduceNorm1.py │ │ ├── ReduceNorm2.py │ │ ├── ReduceScatterCommunicate.py │ │ ├── ReduceSum.py │ │ ├── ReduceSumAxisZero.py │ │ ├── Relu.py │ │ ├── Repeat.py │ │ ├── Reshape.py │ │ ├── ReshapeTo.py │ │ ├── ReverseLayoutTransform.py │ │ ├── ReverseLayoutTransformNoGate.py │ │ ├── Roll.py │ │ ├── SamGroupSum.py │ │ ├── SamMax.py │ │ ├── Sample.py │ │ ├── Scatter.py │ │ ├── Scatter1D.py │ │ ├── Sigmoid.py │ │ ├── Sign.py │ │ ├── Sin.py │ │ ├── Slice.py │ │ ├── SliceAssign.py │ │ ├── SliceByMatrix.py │ │ ├── Softmax.py │ │ ├── SoftmaxCrossEntropy.py │ │ ├── SoftmaxCrossEntropySparse.py │ │ ├── SparseEmbeddingLookUp.py │ │ ├── SparseSet.py │ │ ├── Split.py │ │ ├── Sqrt.py │ │ ├── StopGradient.py │ │ ├── Sum.py │ │ ├── SumSparseGradient.py │ │ ├── Tanh.py │ │ ├── Tile.py │ │ ├── TopKIdx.py │ │ ├── TopKVal.py │ │ ├── Transpose.py │ │ ├── TrilLookup.py │ │ ├── Unique.py │ │ ├── Variable.py │ │ ├── Where.py │ │ ├── ZerosLike.py │ │ ├── __init__.py │ │ ├── executor.py │ │ ├── gpipe_subexecutor.py │ │ ├── pipedream_subexecutor.py │ │ ├── pipeline_subexecutor.py │ │ └── timer_subexecutor.py │ ├── initializers.py │ ├── launcher.py │ ├── layers │ │ ├── BalanceGate.py │ │ ├── HashGate.py │ │ ├── KTop1Gate.py │ │ ├── SAMGate.py │ │ ├── TopGate.py │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── base.py │ │ ├── batch_split_layer.py │ │ ├── concatenate.py │ │ ├── conv.py │ │ ├── dropout.py │ │ ├── embedding.py │ │ ├── gates │ │ │ ├── base_gate.py │ │ │ ├── gshard_gate.py │ │ │ └── naive_gate.py │ │ ├── gelu.py │ │ ├── hash_layer.py │ │ ├── identity.py │ │ ├── ktop1_layer.py │ │ ├── linear.py │ │ ├── loss.py │ │ ├── mish.py │ │ ├── moe_layer.py │ │ ├── normalization.py │ │ ├── pooling.py │ │ ├── relu.py │ │ ├── reshape.py │ │ ├── sam_layer.py │ │ ├── sequence.py │ │ ├── slice.py │ │ └── sum.py │ ├── logger.py │ ├── lr_scheduler.py │ ├── memory_pool.py │ ├── metrics.py │ ├── ndarray.py │ ├── onnx │ │ ├── X2hetu │ │ │ ├── __init__.py │ │ │ ├── handler.py │ │ │ └── handlers │ │ │ │ ├── __init__.py │ │ │ │ ├── array.py │ │ │ │ ├── math.py │ │ │ │ └── nn.py │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── graph.py │ │ ├── handler.py │ │ ├── hetu2onnx.py │ │ ├── onnx2hetu.py │ │ ├── onnx_opset │ │ │ ├── AddConst.py │ │ │ ├── AddElewise.py │ │ │ ├── BatchNorm.py │ │ │ ├── Concat.py │ │ │ ├── Conv2d.py │ │ │ ├── Division.py │ │ │ ├── Dropout.py │ │ │ ├── Identity.py │ │ │ ├── MatrixMult.py │ │ │ ├── MultiplyConst.py │ │ │ ├── OneHot.py │ │ │ ├── Opposite.py │ │ │ ├── Pad.py │ │ │ ├── Pool.py │ │ │ ├── Reduces.py │ │ │ ├── 
Relu.py │ │ │ ├── Reshape.py │ │ │ ├── Slice.py │ │ │ ├── Softmax.py │ │ │ ├── Sqrt.py │ │ │ ├── Tanh.py │ │ │ ├── Transpose.py │ │ │ ├── Variable.py │ │ │ ├── Where.py │ │ │ ├── __init__.py │ │ │ └── general.py │ │ └── util.py │ ├── optimizer.py │ ├── preduce.py │ ├── profiler.py │ ├── random.py │ ├── stream.py │ └── tokenizers │ │ ├── __init__.py │ │ ├── bert_tokenizer.py │ │ └── utils.py └── runner.py ├── src ├── CMakeLists.txt ├── common │ ├── c_runtime_api.cc │ ├── c_runtime_api.h │ ├── cpu_device_api.cc │ ├── cpu_device_api.h │ ├── cuda_device_api.h │ ├── device_api.h │ ├── dispatch.h │ ├── dlarray.h │ ├── random.cc │ ├── random.h │ └── runtime_base.h ├── communication │ ├── CMakeLists.txt │ ├── c_communication_nthread.cc │ ├── mpi_communication.cc │ ├── mpi_nccl_communication.cu │ └── nccl_communication.cu ├── cuda_common │ ├── cuda_device_api.cc │ ├── gpu_chunk.cc │ ├── gpu_functions.cuh │ ├── gpu_reduce.h │ ├── gpu_runtime.cc │ └── gpu_runtime.h ├── dnnl_ops │ ├── AddConst.cpp │ ├── AddElewise.cpp │ ├── ArraySet.cpp │ ├── AvgPool.cpp │ ├── BatchNorm.cpp │ ├── BroadcastTo.cpp │ ├── Concat.cpp │ ├── Conv2d.cpp │ ├── DivideConst.cpp │ ├── DivideElewise.cpp │ ├── Dropout.cpp │ ├── EmbeddingLookup.cpp │ ├── Gelu.cpp │ ├── Initializers.cpp │ ├── MatrixMult.cpp │ ├── MaxPool.cpp │ ├── MultiplyConst.cpp │ ├── MultiplyElewise.cpp │ ├── Opposite.cpp │ ├── Optimizers.cpp │ ├── Pad.cpp │ ├── ReduceIndexedSlice.cpp │ ├── ReduceSumAxisZero.cpp │ ├── Relu.cpp │ ├── Reshape.cpp │ ├── Sigmoid.cpp │ ├── Softmax.cpp │ ├── SoftmaxCrossEntropy.cpp │ ├── Sqrt.cpp │ ├── Tanh.cpp │ ├── Transpose.cpp │ ├── UniqueIndices.cpp │ ├── dnnl_runtime.cc │ └── dnnl_runtime.h ├── header │ ├── mpi_communication.h │ ├── mpi_nccl_communication.h │ ├── nccl_communication.h │ └── types.h ├── hetu_cache │ ├── CMakeLists.txt │ ├── include │ │ ├── binding.h │ │ ├── cache.h │ │ ├── embedding.h │ │ ├── hetu_client.h │ │ ├── lfu_cache.h │ │ ├── lfuopt_cache.h │ │ ├── lru_cache.h │ │ └── unqiue_tools.h │ └── src │ │ ├── cache.cc │ │ ├── embedding.cc │ │ ├── hetu_client.cc │ │ ├── lfu_cache.cc │ │ ├── lfuopt_cache.cc │ │ ├── lru_cache.cc │ │ └── python_api.cc ├── memory_pool │ ├── BFC_allocator.cc │ ├── BFC_allocator.h │ ├── allocator.cc │ └── allocator.h └── ops │ ├── Abs.cu │ ├── AddConst.cu │ ├── AddElewise.cu │ ├── Addmm.cu │ ├── Arange.cu │ ├── Argmax.cu │ ├── ArgmaxPartial.cu │ ├── Argsort.cu │ ├── ArrayLazyCallback.cu │ ├── ArraySet.cu │ ├── AssignWithIndexedSlices.cu │ ├── AutoDimOps.cu │ ├── AvgPool.cu │ ├── Baddbmm.cu │ ├── BatchMatrixMult.cu │ ├── BinaryCrossEntropy.cu │ ├── BinaryCrossEntropyWithLogits.cu │ ├── Bool.cu │ ├── Broadcast.cu │ ├── BroadcastShape.cu │ ├── Clamp.cu │ ├── Clip.cu │ ├── Clone.cu │ ├── CompressedEmbedding.cu │ ├── Concat.cu │ ├── Concatenate.cu │ ├── ConstPow.cu │ ├── Conv2d.cu │ ├── Conv2dBroadcast.cu │ ├── Conv2dReduceSum.cu │ ├── CrossEntropy.cu │ ├── CrossEntropySparse.cu │ ├── CuSparseCsrmm.cu │ ├── CuSparseCsrmv.cu │ ├── CudnnAvgPool.cu │ ├── CudnnBn.cu │ ├── CudnnConv2d.cu │ ├── CudnnConv2dAddBias.cu │ ├── CudnnDropout.cu │ ├── CudnnMaxPool.cu │ ├── CudnnSoftmax.cu │ ├── CudnnSoftmaxEntropy.cu │ ├── CumSum.cu │ ├── DivideConst.cu │ ├── DivideElewise.cu │ ├── Dot.cu │ ├── Dropout.cu │ ├── EmbeddingLookup.cu │ ├── Exp.cu │ ├── Floor.cu │ ├── Gather.cu │ ├── Gelu.cu │ ├── GroupTopKIdx.cu │ ├── H_A2A_LayoutTransform.cu │ ├── IndexedSlices.cu │ ├── Indexing.cu │ ├── Initializers.cu │ ├── InstanceNorm2d.cu │ ├── Interpolate.cu │ ├── LayerNorm.cu │ ├── LayoutTransform.cu │ ├── LeakyRelu.cu │ 
├── Linear.cu │ ├── Log.cu │ ├── Mask.cu │ ├── MaskedFill.cu │ ├── MatrixMult.cu │ ├── Max.cu │ ├── MaxPool.cu │ ├── Min.cu │ ├── MinDist.cu │ ├── MinusByConst.cu │ ├── MinusElewise.cu │ ├── MultiplyConst.cu │ ├── MultiplyElewise.cu │ ├── NllLoss.cu │ ├── Norm.cu │ ├── OneHot.cu │ ├── Opposite.cu │ ├── OptEmbedBinaryStep.cu │ ├── Optimizers.cu │ ├── OptimizersSparse.cu │ ├── Outer.cu │ ├── Pad.cu │ ├── Pow.cu │ ├── Power.cu │ ├── PruneMask.cu │ ├── Quantize.cu │ ├── QuantizeEmbedding.cu │ ├── ReduceGeneral.cu │ ├── ReduceIndexedSlice.cu │ ├── ReduceSum.cu │ ├── ReduceSumAxisZero.cu │ ├── Relu.cu │ ├── Repeat.cu │ ├── Reshape.cu │ ├── Roll.cu │ ├── SamGroupSum.cu │ ├── SamMax.cu │ ├── Scatter.cu │ ├── Scatter1D.cu │ ├── Sigmoid.cu │ ├── Sign.cu │ ├── SignedQuantize.cu │ ├── Sin.cu │ ├── Slice.cu │ ├── SliceAssign.cu │ ├── SliceByMatrix.cu │ ├── Softmax.cu │ ├── SoftmaxCrossEntropy.cu │ ├── SoftmaxCrossEntropySparse.cu │ ├── SparseEmbeddingLookup.cu │ ├── SparseSet.cu │ ├── Sqrt.cu │ ├── Tanh.cu │ ├── TopKIdx.cu │ ├── TopKVal.cu │ ├── Transpose.cu │ ├── TrilLookup.cu │ ├── UniqueIndices.cu │ └── Where.cu ├── tests ├── README.md ├── balanced.py ├── draft.py ├── get_gpu_memory.py ├── hetu_cache │ ├── hetu_cache_config.yml │ └── hetu_cache_test.py ├── onnx │ ├── README.md │ ├── cnn_hetu_onnx_tf.py │ ├── cnn_tf_onnx_hetu.py │ ├── dnn_hetu_onnx_tf.py │ ├── dnn_tf_onnx_hetu.py │ ├── rnn_hetu_onnx_tf.py │ ├── rnn_tf_onnx_hetu.py │ ├── test_cnn.py │ ├── test_mlp.py │ └── test_nodes.py ├── pstests │ ├── local_s2_w1.yml │ ├── local_s2_w2.yml │ ├── test_apis.py │ ├── test_bandwidth.py │ ├── test_push_data.py │ ├── test_tf_bandwidth.py │ └── tf_local_s1_w2.json ├── test_DistGCN │ ├── prepare_data_GCN15d.py │ ├── prepare_data_GCN15d_reorder.py │ ├── test_group_comm.py │ └── test_model_distGCN15d.py ├── test_comm.py ├── test_datatransfer_op.py ├── test_dnnl_op.py ├── test_embedding_op.py ├── test_encode_decode.py ├── test_gpu_initializers.py ├── test_gpu_op.py ├── test_ha2agather.py ├── test_lr_scheduler.py ├── test_nccl_bandwidth.py ├── test_ops.py ├── test_optimizer.py ├── test_profiler.py ├── test_ps_preduce.py ├── test_reorder_lookup.py ├── test_resnet_block.py ├── test_simple_version_ops.py ├── test_sparse.py ├── test_sparse_op.py ├── test_split.py ├── test_sum_sparse_grad.py ├── test_transformer_ops.py ├── test_unique.py ├── tester.py └── torch_balance.py └── tools ├── EmbeddingMemoryCompression ├── .gitignore ├── README.md ├── config.cmake ├── methods │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── adapt.py │ │ ├── alpt.py │ │ ├── autodim.py │ │ ├── autosrh.py │ │ ├── compo.py │ │ ├── deduplication.py │ │ ├── deeplight.py │ │ ├── dhe.py │ │ ├── dpq.py │ │ ├── hash.py │ │ ├── mde.py │ │ ├── mgqe.py │ │ ├── optembed.py │ │ ├── pep.py │ │ ├── primes.npy │ │ ├── quantize.py │ │ ├── robe.py │ │ ├── sparse.py │ │ └── tensortrain.py │ └── scheduler │ │ ├── __init__.py │ │ ├── adapt.py │ │ ├── alpt.py │ │ ├── autodim.py │ │ ├── autosrh.py │ │ ├── base.py │ │ ├── compo.py │ │ ├── compressor.py │ │ ├── deduplication.py │ │ ├── deeplight.py │ │ ├── dhe.py │ │ ├── dpq.py │ │ ├── hash.py │ │ ├── md.py │ │ ├── mgqe.py │ │ ├── multistage.py │ │ ├── optembed.py │ │ ├── pep.py │ │ ├── quantize.py │ │ ├── robe.py │ │ ├── switchinference.py │ │ └── tensortrain.py ├── models │ ├── __init__.py │ ├── base.py │ ├── bench │ │ └── dlrm_s_criteo_kaggle.sh │ ├── dcn.py │ ├── deepfm.py │ ├── dlrm.py │ ├── dlrm_data_pytorch.py │ ├── dlrm_s_pytorch.py │ ├── dlrm_test.py │ ├── dlrm_torch.py │ ├── load_data.py │ ├── 
load_data_avazu_variants.py │ ├── load_data_criteo_variants.py │ └── wdl.py ├── run_compressed.py ├── scripts │ ├── .gitignore │ ├── fit_scaling_law.py │ ├── plot_dimension_autosrh.py │ ├── plot_dimension_mde.py │ └── plot_powerlaw.py ├── supplements │ ├── Explain_TTRec.md │ └── static_encoding.png └── test_rag.py └── Galvatron ├── .DS_Store ├── MANIFEST.in ├── Makefile ├── README.md ├── csrc └── dp_core.cpp ├── figs └── api.jpg ├── galvatron.exp ├── galvatron ├── .DS_Store ├── MANIFEST.in ├── __init__.py ├── core │ ├── __init__.py │ ├── arguments.py │ ├── comm_groups.py │ ├── dataloader.py │ ├── hybrid_parallel_config.py │ ├── hybrid_parallel_model.py │ ├── initialize.py │ ├── parallel.py │ ├── pipeline │ │ ├── __init__.py │ │ ├── grad_reduce.py │ │ ├── pipeline.py │ │ └── utils.py │ ├── profiler.py │ ├── redistribute.py │ └── tensor_parallel │ │ ├── __init__.py │ │ ├── transformer.py │ │ └── utils.py ├── models │ ├── README.md │ ├── __init__.py │ ├── baichuan │ │ ├── BaiChuanModel_hybrid_parallel.py │ │ ├── BaiChuanModel_sequential.py │ │ ├── BaiChuanModel_tensor_parallel.py │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configs │ │ │ ├── computation_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ │ ├── galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ └── memory_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── dataloader.py │ │ ├── hf_configs │ │ │ ├── __init__.py │ │ │ ├── baichuan-7b │ │ │ │ ├── config.json │ │ │ │ ├── config_ori.json │ │ │ │ └── configuration_baichuan.py │ │ │ └── config_utils.py │ │ ├── scripts │ │ │ ├── train.sh │ │ │ └── train_dist.sh │ │ ├── train.py │ │ └── train_dist.py │ ├── gpt │ │ ├── GPTModel_hybrid_parallel.py │ │ ├── GPTModel_sequential.py │ │ ├── GPTModel_tensor_parallel.py │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configs │ │ │ ├── computation_profiling_bf16_hidden1600_head32_seqlen1024.json │ │ │ ├── computation_profiling_bf16_hidden2560_head32_seqlen2048.json │ │ │ ├── computation_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ │ ├── galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ ├── galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ ├── galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ │ ├── galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ │ ├── memory_profiling_bf16_hidden1600_head32_seqlen1024.json │ │ │ ├── memory_profiling_bf16_hidden2560_head32_seqlen2048.json │ │ │ └── memory_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── dataloader.py │ │ ├── meta_configs │ │ │ ├── __init__.py │ │ │ ├── config_utils.py │ │ │ ├── gpt-0.3b.json │ │ │ ├── gpt-1.5b.json │ │ │ ├── gpt-2.7b.json │ │ │ └── gpt-6.7b.json │ │ ├── scripts │ │ │ ├── train.sh │ │ │ └── train_dist.sh │ │ ├── train.py │ │ └── train_dist.py │ └── llama │ │ ├── LlamaModel_hybrid_parallel.py │ │ ├── LlamaModel_sequential.py │ │ ├── LlamaModel_tensor_parallel.py │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configs │ │ ├── computation_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json │ │ ├── galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_example.json │ │ └── 
memory_profiling_bf16_hidden4096_head32_seqlen2048.json │ │ ├── dataloader.py │ │ ├── meta_configs │ │ ├── __init__.py │ │ ├── config_utils.py │ │ ├── llama-13b.json │ │ ├── llama-30b.json │ │ └── llama-7b.json │ │ ├── scripts │ │ ├── train.sh │ │ └── train_dist.sh │ │ ├── train.py │ │ └── train_dist.py ├── profile_hardware │ ├── hardware_configs │ │ ├── allreduce_bandwidth_1nodes_4gpus_per_node.json │ │ ├── allreduce_bandwidth_1nodes_8gpus_per_node.json │ │ ├── allreduce_bandwidth_2nodes_8gpus_per_node.json │ │ ├── overlap_coefficient.json │ │ ├── p2p_bandwidth_1nodes_4gpus_per_node.json │ │ ├── p2p_bandwidth_1nodes_8gpus_per_node.json │ │ └── p2p_bandwidth_2nodes_8gpus_per_node.json │ ├── hostfile │ ├── profile_hardware.py │ ├── profile_overlap.py │ └── scripts │ │ ├── build_nccl_test.sh │ │ ├── profile_hardware.sh │ │ ├── profile_overlap.sh │ │ └── run_nccl_test.sh ├── scripts │ ├── flash_attn_ops_install.sh │ └── prepare_env.sh ├── site_package │ ├── __init__.py │ ├── megatron │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── checkpointing.py │ │ ├── core │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── enums.py │ │ │ ├── package_info.py │ │ │ ├── parallel_state.py │ │ │ ├── pipeline_parallel │ │ │ │ ├── __init__.py │ │ │ │ ├── p2p_communication.py │ │ │ │ └── schedules.py │ │ │ ├── requirements.txt │ │ │ ├── tensor_parallel │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy.py │ │ │ │ ├── data.py │ │ │ │ ├── layers.py │ │ │ │ ├── mappings.py │ │ │ │ ├── mappings_group.py │ │ │ │ ├── random.py │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── data │ │ │ ├── Makefile │ │ │ ├── __init__.py │ │ │ ├── autoaugment.py │ │ │ ├── bert_dataset.py │ │ │ ├── biencoder_dataset_utils.py │ │ │ ├── blendable_dataset.py │ │ │ ├── data_samplers.py │ │ │ ├── dataset_utils.py │ │ │ ├── gpt_dataset.py │ │ │ ├── helpers.cpp │ │ │ ├── ict_dataset.py │ │ │ ├── indexed_dataset.py │ │ │ ├── orqa_wiki_dataset.py │ │ │ ├── realm_dataset_utils.py │ │ │ ├── realm_index.py │ │ │ ├── t5_dataset.py │ │ │ ├── test │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ └── test_preprocess_data.sh │ │ │ └── vit_dataset.py │ │ ├── dist_signal_handler.py │ │ ├── fp16_deprecated │ │ │ └── loss_scaler.py │ │ ├── fused_kernels │ │ │ ├── __init__.py │ │ │ ├── compat.h │ │ │ ├── scaled_masked_softmax.cpp │ │ │ ├── scaled_masked_softmax.h │ │ │ ├── scaled_masked_softmax_cuda.cu │ │ │ ├── scaled_softmax.cpp │ │ │ ├── scaled_softmax_cuda.cu │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ ├── scaled_upper_triang_masked_softmax.h │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ └── test_fused_kernels.py │ │ │ └── type_shim.h │ │ ├── global_vars.py │ │ ├── indexer.py │ │ ├── initialize.py │ │ ├── memory.py │ │ ├── microbatches.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── bert_model.py │ │ │ ├── biencoder_model.py │ │ │ ├── classification.py │ │ │ ├── distributed.py │ │ │ ├── enums.py │ │ │ ├── fused_bias_gelu.py │ │ │ ├── fused_layer_norm.py │ │ │ ├── fused_softmax.py │ │ │ ├── gpt_model.py │ │ │ ├── language_model.py │ │ │ ├── module.py │ │ │ ├── multiple_choice.py │ │ │ ├── realm_model.py │ │ │ ├── retro_transformer.py │ │ │ ├── rotary_pos_embedding.py │ │ │ ├── t5_model.py │ │ │ ├── transformer.py │ │ │ ├── utils.py │ │ │ └── vision │ │ │ │ ├── classification.py │ │ │ │ ├── dino.py │ │ │ │ ├── esvit_swin_backbone.py │ │ │ │ ├── inpainting.py │ │ │ │ ├── knn_monitor.py │ │ │ │ ├── mit_backbone.py │ │ │ │ ├── swin_backbone.py │ │ │ │ ├── utils.py │ │ │ │ └── vit_backbone.py │ │ ├── 
mpu │ │ │ └── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── commons.py │ │ │ │ ├── test_cross_entropy.py │ │ │ │ ├── test_data.py │ │ │ │ ├── test_initialize.py │ │ │ │ ├── test_layers.py │ │ │ │ └── test_random.py │ │ ├── optimizer │ │ │ ├── __init__.py │ │ │ ├── clip_grads.py │ │ │ ├── distrib_optimizer.py │ │ │ ├── grad_scaler.py │ │ │ └── optimizer.py │ │ ├── optimizer_param_scheduler.py │ │ ├── static │ │ │ └── index.html │ │ ├── text_generation │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── beam_utils.py │ │ │ ├── communication.py │ │ │ ├── forward_step.py │ │ │ ├── generation.py │ │ │ ├── sampling.py │ │ │ └── tokenization.py │ │ ├── text_generation_server.py │ │ ├── timers.py │ │ ├── tokenizer │ │ │ ├── __init__.py │ │ │ ├── bert_tokenization.py │ │ │ ├── gpt2_tokenization.py │ │ │ └── tokenizer.py │ │ ├── training.py │ │ └── utils.py │ └── nccl-tests │ │ ├── .gitignore │ │ ├── LICENSE.txt │ │ ├── Makefile │ │ ├── README.md │ │ ├── doc │ │ └── PERFORMANCE.md │ │ ├── src │ │ ├── Makefile │ │ ├── all_gather.cu │ │ ├── all_reduce.cu │ │ ├── alltoall.cu │ │ ├── broadcast.cu │ │ ├── common.cu │ │ ├── common.h │ │ ├── gather.cu │ │ ├── hypercube.cu │ │ ├── nccl1_compat.h │ │ ├── reduce.cu │ │ ├── reduce_scatter.cu │ │ ├── scatter.cu │ │ ├── sendrecv.cu │ │ ├── timer.cc │ │ └── timer.h │ │ └── verifiable │ │ ├── Makefile │ │ ├── inexact_regress.cu │ │ ├── verifiable.cu │ │ ├── verifiable.h │ │ └── verifiable.mk └── utils │ ├── __init__.py │ ├── config_utils.py │ ├── memory_utils.py │ ├── strategy_utils.py │ └── training_utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.csv 3 | *.tar.gz* 4 | *.pkl.gz 5 | .dataset 6 | build/ 7 | test_time/ 8 | .vscode/ 9 | CIFAR_10/ 10 | CIFAR_100/ 11 | update.sh 12 | cmake/config.cmake -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/GraphMix"] 2 | path = third_party/GraphMix 3 | url = https://github.com/nox-410/GraphMix.git 4 | [submodule "third_party/HetuML"] 5 | path = third_party/HetuML 6 | url = https://github.com/ccchengff/HetuML.git 7 | -------------------------------------------------------------------------------- /bin/heturun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python $(cd $(dirname $0); pwd)/../python/runner.py "$@" 3 | -------------------------------------------------------------------------------- /cmake/Modules/FindCUB.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find CUB 2 | # Once done this will define 3 | # CUB_FOUND - System has CUB 4 | # CUB_INCLUDE_DIR - The CUB include directories 5 | 6 | find_path ( CUB_INCLUDE_DIR cub HINTS ${CUB_ROOT}/include ) 7 | 8 | include ( FindPackageHandleStandardArgs ) 9 | find_package_handle_standard_args ( 10 | CUB 11 | REQUIRED_VARS CUB_INCLUDE_DIR) 12 | -------------------------------------------------------------------------------- /cmake/Modules/FindMKL.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find DNNL(MKL-DNN) 2 | # Once done this will define 3 | # DNNL_FOUND - System has DNNL 4 | # DNNL_INCLUDE_DIR - The DNNL include directories 5 | # DNNL_BUILD_INCLUDE_DIR - DNNL include directories in build 6 | # DNNL_LIBRARY - The libraries needed to use DNNL 7 | # DNNL_DEFINITIONS - Compiler switches required for using DNNL 8
| 9 | find_path ( DNNL_INCLUDE_DIR dnnl.h HINTS ${MKL_ROOT}/include ) 10 | find_path ( DNNL_BUILD_INCLUDE_DIR dnnl_config.h HINTS ${MKL_BUILD}/include ) 11 | find_library ( DNNL_LIBRARY NAMES dnnl mkldnn HINTS ${MKL_BUILD}/src ) 12 | 13 | include ( FindPackageHandleStandardArgs ) 14 | find_package_handle_standard_args ( MKL DEFAULT_MSG DNNL_LIBRARY DNNL_INCLUDE_DIR DNNL_BUILD_INCLUDE_DIR ) 15 | -------------------------------------------------------------------------------- /cmake/Modules/FindTHRUST.cmake: -------------------------------------------------------------------------------- 1 | # - Try to find THRUST 2 | # Once done this will define 3 | # THRUST_FOUND - System has THRUST 4 | # THRUST_INCLUDE_DIR - The THRUST include directories 5 | 6 | find_path ( THRUST_INCLUDE_DIR thrust HINTS ${THRUST_ROOT}/include ) 7 | 8 | include ( FindPackageHandleStandardArgs ) 9 | find_package_handle_standard_args ( 10 | THRUST 11 | REQUIRED_VARS THRUST_INCLUDE_DIR) 12 | -------------------------------------------------------------------------------- /examples/auto_parallel/.gitignore: -------------------------------------------------------------------------------- 1 | scripts/ 2 | test_strategy/ 3 | *.json -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/experiment_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | alexnet/ 2 | resnet101/ 3 | wresnet101/ 4 | inceptionv3/ 5 | prev/ 6 | 7 | run*.py 8 | 9 | *.png 10 | -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/experiment_scripts/w16.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 8 4 | chief: true 5 | - host: node2 6 | workers: 8 7 | -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .alexnet import AlexNet 2 | from .inception_v3 import InceptionV3 3 | from .resnet101 import ResNet101 4 | from .vgg19 import VGG19 5 | from .wide_resnet import WideResNet50, WideResNet101 6 | -------------------------------------------------------------------------------- /examples/auto_parallel/cnn/torch_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .alexnet import AlexNet 2 | from .resnet101 import ResNet101 3 | from .vgg19 import VGG19 4 | from .inception_v3 import InceptionV3 5 | from .wide_resnet import wide_resnet50_2, wide_resnet101_2 6 | -------------------------------------------------------------------------------- /examples/auto_parallel/transformer/.gitignore: -------------------------------------------------------------------------------- 1 | preprocessed_data 2 | cached_data -------------------------------------------------------------------------------- /examples/auto_parallel/transformer/experiment_scripts/.gitignore: -------------------------------------------------------------------------------- 1 | bert* 2 | gpt2* 3 | prev/ 4 | 5 | run*.py 6 | *.png 7 | -------------------------------------------------------------------------------- /examples/auto_parallel/transformer/experiment_scripts/w16.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 8 4 | chief: true 5 | - host: node2 6 | workers: 8 7 |
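The `w16.yml` files above are `heturun` cluster specifications: each `nodes` entry names a host, the number of worker processes to launch there, and whether that node is the chief. As a minimal illustration (not part of the repository, and assuming PyYAML is installed), such a spec can be loaded and the overall job layout derived like this:

    import yaml  # PyYAML, assumed available

    with open("w16.yml") as f:
        spec = yaml.safe_load(f)

    # 8 workers on node1 (the chief) plus 8 workers on node2 -> 16 processes.
    world_size = sum(node["workers"] for node in spec["nodes"])
    chief = next(node["host"] for node in spec["nodes"] if node.get("chief"))
    print(world_size, chief)  # -> 16 node1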
-------------------------------------------------------------------------------- /examples/auto_parallel/transformer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .hetu_bert import BertForPreTraining as HetuBert 2 | from .torch_bert import BertForPreTraining as TorchBert 3 | from .bert_config import BertConfig 4 | 5 | from .hetu_gpt2 import GPT2LMHeadModel as HetuGPT2 6 | from .torch_gpt2 import GPT2LMHeadModel as TorchGPT2 7 | from .gpt2_config import GPT2Config 8 | -------------------------------------------------------------------------------- /examples/cnn/local_s1.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13030 4 | DMLC_NUM_WORKER : 2 5 | DMLC_NUM_SERVER : 1 6 | DMLC_PS_VAN_TYPE : p3 7 | launch : 8 | worker : 0 9 | server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/cnn/models/LogReg.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | from hetu import init 3 | 4 | 5 | def logreg(x, y_): 6 | ''' 7 | Logistic Regression model, for MNIST dataset. 8 | 9 | Parameters: 10 | x: Variable(hetu.gpu_ops.Node.Node), shape (N, dims) 11 | y_: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) 12 | Return: 13 | loss: Variable(hetu.gpu_ops.Node.Node), shape (1,) 14 | y: Variable(hetu.gpu_ops.Node.Node), shape (N, num_classes) 15 | ''' 16 | 17 | print("Build logistic regression model...") 18 | weight = init.zeros((784, 10), name='logreg_weight') 19 | bias = init.zeros((10,), name='logreg_bias') 20 | x = ht.matmul_op(x, weight) 21 | y = x + ht.broadcastto_op(bias, x) 22 | loss = ht.softmaxcrossentropy_op(y, y_) 23 | loss = ht.reduce_mean_op(loss, [0]) 24 | return loss, y 25 | -------------------------------------------------------------------------------- /examples/cnn/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .VGG import vgg, vgg16, vgg19 2 | from .LogReg import logreg 3 | from .CNN import cnn_3_layers 4 | from .AlexNet import alexnet 5 | from .LeNet import lenet 6 | from .MLP import mlp 7 | from .RNN import rnn 8 | from .LSTM import lstm 9 | from .ResNet import resnet, resnet18, resnet34 10 | -------------------------------------------------------------------------------- /examples/cnn/pytorch_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp import mlp 2 | from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152 3 | from .vgg import vgg16, vgg19 4 | from .rnn import rnn 5 | -------------------------------------------------------------------------------- /examples/cnn/pytorch_models/mlp.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | import torch.nn as nn 3 | 4 | 5 | class MLP(nn.Module): 6 | def __init__(self): 7 | super(MLP, self).__init__() 8 | self.fc1 = nn.Linear(3072, 256) 9 | self.fc2 = nn.Linear(256, 256) 10 | self.fc3 = nn.Linear(256, 10) 11 | 12 | def forward(self, x): 13 | x = F.relu(self.fc1(x)) 14 | x = F.relu(self.fc2(x)) 15 | out = self.fc3(x) 16 | return out 17 | 18 | 19 | def mlp(): 20 | return MLP() 21 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_16gpu.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | ### validate and timing 7 | heturun -c ${workdir}/hetu_config16allreduce.yml python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode AllReduce 8 | 9 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_16gpu_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | ### validate and timing 7 | heturun -c ${workdir}/hetu_config16ps.yml python ${mainpy} --model $1 --dataset $2 --learning-rate 0.000625 --validate --timing --comm-mode PS 8 | 9 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | 7 | # usage: 8 | # e.g. bash hetu_1gpu.sh mlp CIFAR10 9 | 10 | ### validate and timing 11 | python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 12 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_2gpu_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../main.py 5 | 6 | ### validate and timing 7 | heturun -s 1 -w 2 python ${mainpy} --model $1 --dataset $2 --validate --timing --comm-mode PS 8 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_8gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | workdir=$(cd $(dirname $0); pwd) 3 | mainpy=${workdir}/../main.py 4 | depsdir=${workdir}/../../..
5 | 6 | ### validate and timing 7 | heturun -w 8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing --comm-mode AllReduce 8 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_config16allreduce.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 8 4 | chief: true 5 | - host: node2 6 | workers: 8 7 | -------------------------------------------------------------------------------- /examples/cnn/scripts/hetu_config16ps.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | servers: 1 4 | workers: 8 5 | chief: true 6 | - host: node2 7 | servers: 1 8 | workers: 8 9 | -------------------------------------------------------------------------------- /examples/cnn/scripts/horovod_16gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_tf_horovod.py 5 | 6 | # horovodrun -np 8 -H localhost:8 python ${mainpy} --model tf_mlp --dataset CIFAR10 --learning-rate 0.00125 --validate --timing 7 | 8 | horovodrun -np 16 --start-timeout 3000 -H node1:8,node2:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 9 | 10 | # ../build/_deps/openmpi-build/bin/mpirun -mca btl_tcp_if_include enp97s0f0 --bind-to none --map-by slot\ 11 | # -x NCCL_SOCKET_IFNAME=enp97s0f0 -H node1:8,node2:8 --allow-run-as-root python run_tf_horovod.py --model 12 | -------------------------------------------------------------------------------- /examples/cnn/scripts/horovod_8gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_tf_horovod.py 5 | 6 | horovodrun -np 8 -H localhost:8 python ${mainpy} --model $1 --dataset $2 --learning-rate 0.00125 --validate --timing 7 | -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_16gpu_0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=162.105.146.117 6 | MASTER_PORT=6000 7 | NNODES=2 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | workdir=$(cd $(dirname $0); pwd) 12 | mainpy=${workdir}/../torch_main.py 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | ${mainpy} \ 18 | --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_16gpu_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=162.105.146.117 6 | MASTER_PORT=39575 7 | NNODES=2 8 | NODE_RANK=1 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | workdir=$(cd $(dirname $0); pwd) 12 | mainpy=${workdir}/../torch_main.py 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch
$DISTRIBUTED_ARGS \ 17 | ${mainpy} \ 18 | --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../torch_main.py 5 | 6 | ## validate and timing 7 | python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 8 | -------------------------------------------------------------------------------- /examples/cnn/scripts/pytorch_8gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GPUS_PER_NODE=8 4 | # Change for multinode config 5 | MASTER_ADDR=localhost 6 | MASTER_PORT=6000 7 | NNODES=1 8 | NODE_RANK=0 9 | WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) 10 | 11 | workdir=$(cd $(dirname $0); pwd) 12 | mainpy=${workdir}/../torch_main.py 13 | 14 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT" 15 | 16 | python -m torch.distributed.launch $DISTRIBUTED_ARGS \ 17 | ${mainpy} \ 18 | --model $1 --dataset $2 --learning-rate 0.01 --validate --timing --distributed -------------------------------------------------------------------------------- /examples/cnn/scripts/tf_1gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../tf_main.py 5 | 6 | ### validate and timing 7 | python ${mainpy} --model $1 --dataset $2 --learning-rate 0.01 --validate --timing 8 | 9 | ### run in cpu 10 | # python ${mainpy} --model tf_mlp --gpu -1 --validate --timing 11 | -------------------------------------------------------------------------------- /examples/cnn/settings/tf_dist_s1_w16.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "162.105.146.117:34569", 4 | "162.105.146.117:34568", 5 | "162.105.146.117:34567", 6 | "162.105.146.117:34566", 7 | "162.105.146.117:34565", 8 | "162.105.146.117:34564", 9 | "162.105.146.117:34563", 10 | "162.105.146.117:34562", 11 | "162.105.146.118:34779", 12 | "162.105.146.118:34778", 13 | "162.105.146.118:34777", 14 | "162.105.146.118:34776", 15 | "162.105.146.118:34775", 16 | "162.105.146.118:34774", 17 | "162.105.146.118:34773", 18 | "162.105.146.118:34772" 19 | ], 20 | "ps": [ 21 | "162.105.146.117:34575" 22 | ] 23 | } -------------------------------------------------------------------------------- /examples/cnn/settings/tf_dist_s1_w4.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "162.105.146.119:34569", 4 | "162.105.146.119:34568", 5 | "162.105.146.119:34567", 6 | "162.105.146.119:34566" 7 | ], 8 | "ps": [ 9 | "162.105.146.119:34575" 10 | ] 11 | } -------------------------------------------------------------------------------- /examples/cnn/settings/tf_dist_s1_w8.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "162.105.146.119:34569", 4 | "162.105.146.119:34568", 5 | "162.105.146.119:34567", 6 | "162.105.146.119:34566", 7 | "162.105.146.119:34565", 8 | "162.105.146.119:34564", 9 | "162.105.146.119:34563", 10 | "162.105.146.119:34562" 11 | ], 12 | "ps": [ 13 | "162.105.146.119:34575" 14 | ] 15 | } 
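The `tf_dist_s1_w*.json` settings above follow TensorFlow's cluster-spec layout: a list of `worker` addresses plus a `ps` (parameter-server) address. A sketch of how a launcher in the spirit of `tf_launch_server.py`/`tf_launch_worker.py` can bring up one process from such a file (TF1-style API to match the examples here; the repository's actual launchers may differ in detail):

    import json
    import tensorflow as tf  # TF1-style API, as in the examples above

    with open("settings/tf_dist_s1_w8.json") as f:
        cluster_conf = json.load(f)  # {"worker": [...], "ps": [...]}

    cluster = tf.train.ClusterSpec(cluster_conf)
    # The parameter-server process serves variables and blocks here; a worker
    # would instead pass job_name="worker" and run its training session.
    server = tf.train.Server(cluster, job_name="ps", task_index=0)
    server.join()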
-------------------------------------------------------------------------------- /examples/cnn/tf_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tf_LogReg import tf_logreg 2 | from .tf_CNN import tf_cnn_3_layers 3 | from .tf_LeNet import tf_lenet 4 | from .tf_MLP import tf_mlp 5 | from .tf_RNN import tf_rnn 6 | from .tf_LSTM import tf_lstm 7 | from .tf_ResNet import tf_resnet, tf_resnet18, tf_resnet34 8 | from .tf_VGG import tf_vgg16, tf_vgg19 9 | -------------------------------------------------------------------------------- /examples/cnn/tf_models/tf_LogReg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def tf_logreg(x, y_): 6 | ''' 7 | Logistic Regression model in TensorFlow, for MNIST dataset. 8 | 9 | Parameters: 10 | x: Variable(tensorflow.python.framework.ops.Tensor), shape (N, dims) 11 | y_: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) 12 | Return: 13 | loss: Variable(tensorflow.python.framework.ops.Tensor), shape (1,) 14 | y: Variable(tensorflow.python.framework.ops.Tensor), shape (N, num_classes) 15 | ''' 16 | 17 | print("Build logistic regression model in tensorflow...") 18 | weight = tf.Variable(np.zeros(shape=(784, 10)).astype(np.float32)) 19 | bias = tf.Variable(np.zeros(shape=(10, )).astype(np.float32)) 20 | y = tf.matmul(x, weight) + bias 21 | loss = tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=y_) 22 | loss = tf.reduce_mean(loss) 23 | return loss, y 24 | -------------------------------------------------------------------------------- /examples/cnn/worker_conf0.json: -------------------------------------------------------------------------------- 1 | { 2 | "DMLC_ROLE":"worker", 3 | "WORKER_ID":"0", 4 | "DMLC_PS_ROOT_URI":"127.0.0.1", 5 | "DMLC_PS_ROOT_PORT":"13030", 6 | "DMLC_NUM_WORKER":"2", 7 | "DMLC_NUM_SERVER":"1", 8 | "DMLC_PS_VAN_TYPE":"p3" 9 | } 10 | -------------------------------------------------------------------------------- /examples/cnn/worker_conf1.json: -------------------------------------------------------------------------------- 1 | { 2 | "DMLC_ROLE":"worker", 3 | "WORKER_ID":"1", 4 | "DMLC_PS_ROOT_URI":"127.0.0.1", 5 | "DMLC_PS_ROOT_PORT":"13030", 6 | "DMLC_NUM_WORKER":"2", 7 | "DMLC_NUM_SERVER":"1", 8 | "DMLC_PS_VAN_TYPE":"p3" 9 | } 10 | -------------------------------------------------------------------------------- /examples/ctr/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | scripts.sh 4 | wandb/ 5 | ckpts/ 6 | -------------------------------------------------------------------------------- /examples/ctr/kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #pkill -f mnist_mlp_ps.py 3 | kill -9 $(pidof python) 4 | -------------------------------------------------------------------------------- /examples/ctr/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .wdl_adult import wdl_adult 2 | from .dcn_criteo import dcn_criteo 3 | from .dc_criteo import dc_criteo 4 | from .wdl_criteo import wdl_criteo 5 | from .deepfm_criteo import dfm_criteo 6 | -------------------------------------------------------------------------------- /examples/ctr/settings/dist_s2_w4.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host:
hostname1 3 | servers: 1 4 | workers: 2 5 | chief: true 6 | - host: hostname2 7 | servers: 1 8 | workers: 2 9 | chief: false 10 | -------------------------------------------------------------------------------- /examples/ctr/settings/local_s1_w2.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 1 4 | workers: 2 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/ctr/settings/plx_local_spec.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - address: localhost 3 | cpus: [0] 4 | gpus: [0,1,2,3,4,5,6,7] 5 | -------------------------------------------------------------------------------- /examples/ctr/settings/tf_local_s1_w2.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:12349", 4 | "127.0.0.1:12348" 5 | ], 6 | "ps": [ 7 | "127.0.0.1:12345" 8 | ] 9 | } -------------------------------------------------------------------------------- /examples/ctr/settings/tf_local_s1_w4.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:23459", 4 | "127.0.0.1:23458", 5 | "127.0.0.1:23457", 6 | "127.0.0.1:23456" 7 | ], 8 | "ps": [ 9 | "127.0.0.1:23455" 10 | ] 11 | } -------------------------------------------------------------------------------- /examples/ctr/settings/tf_local_s1_w8.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:34569", 4 | "127.0.0.1:34568", 5 | "127.0.0.1:34567", 6 | "127.0.0.1:34566", 7 | "127.0.0.1:34565", 8 | "127.0.0.1:34564", 9 | "127.0.0.1:34563", 10 | "127.0.0.1:34562" 11 | ], 12 | "ps": [ 13 | "127.0.0.1:34575" 14 | ] 15 | } -------------------------------------------------------------------------------- /examples/ctr/tests/README.md: -------------------------------------------------------------------------------- 1 | * The scripts in this directory, except for the `*_wdl_adult.sh` scripts, are deprecated. We have implemented the `heturun` command to launch distributed deep learning tasks; please refer to the `*_wdl_adult.sh` scripts for the usage of `heturun`.
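For the `heturun -s 1 -w 4` invocations used in the `*_wdl_adult.sh` scripts below, the flags stand in for a single-node spec of the same shape as `local_s1_w2.yml` above. Illustratively (this literal is hypothetical; `heturun` builds the layout internally from its arguments):

    # `heturun -s 1 -w 4 python run_hetu.py ...` corresponds to a spec like:
    spec = {
        "nodes": [
            {"host": "localhost", "servers": 1, "workers": 4, "chief": True},
        ]
    }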
-------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_dcn_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & 7 | mpirun --allow-run-as-root -np 4 python ${mainpy} --model dcn_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml 8 | -------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_dfm_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python -m hetu.launcher ${workdir}/../settings/local_s1.yml -n 1 --sched & 7 | mpirun --allow-run-as-root -np 4 python ${mainpy} --model dfm_criteo --val --comm Hybrid --cache lfuopt --bound 3 --config ${workdir}/../settings/local_w4.yml 8 | -------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_wdl_adult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | heturun -s 1 -w 4 python ${mainpy} --model wdl_adult --val --comm Hybrid --cache lfuopt --bound 3 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/hybrid_wdl_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${workdir}/../models/load_data.py # download and preprocess the Criteo dataset 7 | heturun -s 1 -w 4 python ${mainpy} --model wdl_criteo --val --comm Hybrid --cache lfuopt --bound 3 8 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_dcn_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dcn_criteo --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_dfm_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dfm_criteo --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_wdl_adult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | heturun -w 1 python ${mainpy} --model wdl_adult --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/local_wdl_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model wdl_criteo --val 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_dcn_criteo.sh:
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dcn_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_dfm_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model dfm_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_wdl_adult.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | heturun -s 1 -w 4 python ${mainpy} --model wdl_adult --val --comm PS --cache lfuopt --bound 3 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/ps_wdl_criteo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../run_hetu.py 5 | 6 | python ${mainpy} --model wdl_criteo --val --comm PS --cache lfuopt --bound 3 --config ${workdir}/../settings/local_s1_w4.yml 7 | -------------------------------------------------------------------------------- /examples/ctr/tests/tf_2workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../tf_launch_worker.py 5 | 6 | rm -f ${workdir}/../logs/temp*.log 7 | CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 0 > ${workdir}/../logs/temp0.log & 8 | CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w2.json --rank 1 > ${workdir}/../logs/temp1.log & 9 | wait 10 | -------------------------------------------------------------------------------- /examples/ctr/tests/tf_4workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | workdir=$(cd $(dirname $0); pwd) 4 | mainpy=${workdir}/../tf_launch_worker.py 5 | 6 | rm -f ${workdir}/../logs/temp*.log 7 | CUDA_VISIBLE_DEVICES=0 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 0 > ${workdir}/../logs/temp0.log & 8 | CUDA_VISIBLE_DEVICES=1 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 1 > ${workdir}/../logs/temp1.log & 9 | CUDA_VISIBLE_DEVICES=2 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 2 > ${workdir}/../logs/temp2.log & 10 | CUDA_VISIBLE_DEVICES=3 python ${mainpy} --model wdl_criteo --config ${workdir}/../settings/tf_local_s1_w4.json --rank 3 > ${workdir}/../logs/temp3.log & 11 | wait 12 | -------------------------------------------------------------------------------- /examples/ctr/tf_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .tf_dcn_criteo import dcn_criteo 2 | from .tf_deepfm_criteo import dfm_criteo 3 | from .tf_wdl_criteo import wdl_criteo 4 | from .tf_wdl_adult
import wdl_adult 5 | -------------------------------------------------------------------------------- /examples/gnn/config/local_w2.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 2 5 | DMLC_NUM_SERVER : 1 6 | launch : 7 | worker : 2 8 | server : 1 9 | graph_server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/config/local_w4.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 4 5 | DMLC_NUM_SERVER : 1 6 | launch : 7 | worker : 4 8 | server : 1 9 | graph_server : 4 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/config/local_w8.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 8 5 | DMLC_NUM_SERVER : 1 6 | launch : 7 | worker : 8 8 | server : 1 9 | graph_server : 4 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/config/single.yml: -------------------------------------------------------------------------------- 1 | shared : 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13100 4 | DMLC_NUM_WORKER : 1 5 | DMLC_NUM_SERVER : 0 6 | launch : 7 | worker : 1 8 | server : 0 9 | graph_server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /examples/gnn/gnn_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/gnn/gnn_model/__init__.py -------------------------------------------------------------------------------- /examples/gnn/gnn_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/gnn/gnn_tools/__init__.py -------------------------------------------------------------------------------- /examples/moe/README.md: -------------------------------------------------------------------------------- 1 | ## Structure 2 | ``` 3 | - moe 4 | - scripts/ Test scripts 5 | - test_moe_top.py TopK MoE 6 | - test_moe_hash.py Hash Layer MoE 7 | - test_moe_ktop1.py KTop1 MoE 8 | - test_moe_base.py BASE Layer MoE 9 | - test_moe_sam.py Switch and Mixture MoE 10 | 11 | ``` 12 | ## Usage 13 | Here are some examples of running the scripts. 14 | ```bash 15 | bash scripts/run_top1.sh 16 | ``` 17 | Change ht.alltoall\_op to ht.halltoall\_op in the model definition (located in Hetu/python/hetu/layers) to use Hierarchical AllToAll, as in the sketch below.
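A minimal sketch of the swap described above. The real call site is in Hetu/python/hetu/layers (e.g. moe_layer.py); the surrounding function and the exact signature of `ht.alltoall_op` are illustrative assumptions, not the file's actual contents:

```python
import hetu as ht

def dispatch_to_experts(dispatched_input):
    # Default: flat AllToAll across all workers (op name from the README above).
    expert_input = ht.alltoall_op(dispatched_input)
    # Hierarchical AllToAll: replace the line above with the h-prefixed op:
    #   expert_input = ht.halltoall_op(dispatched_input)
    return expert_input
```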
18 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_2node_comm.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=DEBUG mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=/home/Hetu/python -H node1:8,node2:8 /root/anaconda3/envs/moe/bin/python /home/Hetu/tests/test_ha2agather.py 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_base.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_base.py --num_local_experts=1 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_hash.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_hash.py --num_local_experts=2 --batch_size=4 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_ktop1.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_ktop1.py --k=2 --num_local_experts=2 --batch_size=64 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_mnist.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 2 -x PYTHONPATH=/home/Hetu/python python ../test_mnist.py --top=1 --num_local_experts=2 --batch_size=16 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_sam.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_sam.py --k=1 --num_local_experts=4 --batch_size=4 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_top1.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=DEBUG mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/jizhicfs/pinxuezhao/Hetu_newest/python python3 /jizhicfs/pinxuezhao/Hetu_newest/examples/moe/test_moe_top.py --top=1 --num_local_experts=2 --batch_size=16 2 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_top1_16gpus.sh: -------------------------------------------------------------------------------- 1 | # change ht.alltoall_op to ht.halltoall_op in Hetu/python/hetu/layers/moe_layer.py if you want to use 2 | # hierarchical AllToAll.
3 | 4 | NCCL_DEBUG=DEBUG mpirun --allow-run-as-root -np 16 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -x PYTHONPATH=/home/Hetu/python -H node1:8,node2:8 /root/anaconda3/envs/moe/bin/python /home/Hetu/tests/test_moe_top.py --top=1 --num_local_experts=1 --batch_size=1 5 | -------------------------------------------------------------------------------- /examples/moe/scripts/run_top2.sh: -------------------------------------------------------------------------------- 1 | NCCL_DEBUG=INFO mpirun --allow-run-as-root -np 8 -x PYTHONPATH=/home/Hetu/python python test_moe_top.py --top=2 --num_local_experts=2 --batch_size=64 2 | -------------------------------------------------------------------------------- /examples/nlp/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | iwslt2016/ 3 | logs/ 4 | cached_data 5 | preprocessed_data -------------------------------------------------------------------------------- /examples/nlp/README.md: -------------------------------------------------------------------------------- 1 | # NLP Examples 2 | In this directory we provide simple implementations of the Transformer model. We use the IWSLT2016 de-en dataset. 3 | ## Structure 4 | ``` 5 | - nlp 6 | - hparams.py Hyperparameters 7 | - prepare_data.py Downloading and preparing data 8 | - data_load.py Dataloader 9 | - hetu_transformer.py Transformer model in hetu 10 | - tf_transformer.py Transformer model in tensorflow 11 | - train_hetu_transformer.py Trainer for hetu 12 | - train_tf_transformer.py Trainer for tensorflow 13 | ``` 14 | ## Usage 15 | ```bash 16 | python train_{framework}_transformer.py 17 | ``` 18 | To change the hyperparameters, please modify the `hparams.py` file. -------------------------------------------------------------------------------- /examples/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/nlp/__init__.py -------------------------------------------------------------------------------- /examples/nlp/bert/.gitignore: -------------------------------------------------------------------------------- 1 | preprocessed_data 2 | cached_data -------------------------------------------------------------------------------- /examples/nlp/bert/config1.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 1 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/config2.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 2 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/config4.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/config8.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 8 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/nlp/bert/data/__init__.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | -------------------------------------------------------------------------------- /examples/nlp/bert/data/wikiextractor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/nlp/bert/data/wikiextractor/__init__.py -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_hetu_bert_base.sh: -------------------------------------------------------------------------------- 1 | python test_glue_hetu_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 64 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 768 \ 7 | --num_hidden_layers 12 \ 8 | --num_attention_heads 12 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_hetu_bert_large.sh: -------------------------------------------------------------------------------- 1 | python test_glue_hetu_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 32 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 1024 \ 7 | --num_hidden_layers 24 \ 8 | --num_attention_heads 16 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_pytorch_bert_base.sh: -------------------------------------------------------------------------------- 1 | python test_glue_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 64 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 768 \ 7 | --num_hidden_layers 12 \ 8 | --num_attention_heads 12 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/test_glue_pytorch_bert_large.sh: -------------------------------------------------------------------------------- 1 | python test_glue_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 32 \ 4 | --task_name sst-2 \ 5 | --vocab_size 30522 \ 6 | --hidden_size 1024 \ 7 | --num_hidden_layers 24 \ 8 | --num_attention_heads 16 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 2e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base.sh: 
-------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert.py 3 | data_path=${workdir}/../data 4 | 5 | python ${mainpy} \ 6 | --gpu_id 0 \ 7 | --train_batch_size 64 \ 8 | --data_path ${data_path} \ 9 | --dataset wikicorpus_en \ 10 | --vocab_size 30522 \ 11 | --hidden_size 768 \ 12 | --num_hidden_layers 12 \ 13 | --num_attention_heads 12 \ 14 | --seq_length 128 \ 15 | --epochs 20 \ 16 | --lr 1e-5 \ 17 | --adam_weight_decay 0.01 \ 18 | --hidden_act relu \ 19 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base_dp.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert_dp.py 3 | config=${workdir}/../config4.yml 4 | data_path=${workdir}/../data 5 | export PYTHONPATH=$HETU_PATH 6 | heturun -c ${config} python ${mainpy} \ 7 | --num_gpus 4 \ 8 | --train_batch_size 64 \ 9 | --data_path ${data_path} \ 10 | --dataset wikicorpus_en \ 11 | --vocab_size 30522 \ 12 | --hidden_size 768 \ 13 | --num_hidden_layers 12 \ 14 | --num_attention_heads 12 \ 15 | --seq_length 512 \ 16 | --epochs 80 \ 17 | --lr 1e-5 \ 18 | --adam_weight_decay 0.01 \ 19 | --hidden_act relu \ 20 | --dropout_prob 0.1 21 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base_moe.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert_dp_moe.py 3 | config=${workdir}/../config4.yml 4 | data_path=${workdir}/../data 5 | export PYTHONPATH=$HETU_PATH 6 | heturun -c ${config} python3 ${mainpy} \ 7 | --num_gpus 4 \ 8 | --train_batch_size 64 \ 9 | --data_path ${data_path} \ 10 | --dataset wikicorpus_en \ 11 | --vocab_size 30522 \ 12 | --hidden_size 768 \ 13 | --num_hidden_layers 12 \ 14 | --num_attention_heads 12 \ 15 | --seq_length 512 \ 16 | --epochs 80 \ 17 | --lr 1e-5 \ 18 | --adam_weight_decay 0.01 \ 19 | --hidden_act relu \ 20 | --dropout_prob 0.1 21 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_base_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | heturun -s 1 -w 4 python train_hetu_bert_ps.py \ 4 | --train_batch_size 32 \ 5 | --dataset wikicorpus_en \ 6 | --vocab_size 30522 \ 7 | --hidden_size 768 \ 8 | --num_hidden_layers 12 \ 9 | --num_attention_heads 12 \ 10 | --seq_length 128 \ 11 | --epochs 20 \ 12 | --lr 1e-5 \ 13 | --adam_weight_decay 0.01 \ 14 | --hidden_act relu \ 15 | --dropout_prob 0.1 16 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_large.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert.py 3 | data_path=${workdir}/../data 4 | 5 | python ${mainpy} \ 6 | --gpu_id 0 \ 7 | --train_batch_size 32 \ 8 | --data_path ${data_path} \ 9 | --dataset wikicorpus_en \ 10 | --vocab_size 30522 \ 11 | --hidden_size 1024 \ 12 | --num_hidden_layers 24 \ 13 | --num_attention_heads 16 \ 14 | --seq_length 128 \ 15 | --epochs 20 \ 16 | --lr 1e-5 \ 17 | --adam_weight_decay 0.01 \ 18 | --hidden_act relu \ 19 | --dropout_prob 0.1 
-------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_large_dp.sh: -------------------------------------------------------------------------------- 1 | workdir=$(cd $(dirname $0); pwd) 2 | mainpy=${workdir}/../train_hetu_bert_dp.py 3 | config=${workdir}/../config4.yml 4 | data_path=${workdir}/../data 5 | 6 | heturun -c ${config} python ${mainpy} \ 7 | --num_gpus 4 \ 8 | --train_batch_size 32 \ 9 | --data_path ${data_path} \ 10 | --dataset wikicorpus_en \ 11 | --vocab_size 30522 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 128 \ 16 | --epochs 20 \ 17 | --lr 1e-5 \ 18 | --adam_weight_decay 0.01 \ 19 | --hidden_act relu \ 20 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_hetu_bert_large_ps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | heturun -s 1 -w 4 python train_hetu_bert_ps.py \ 4 | --train_batch_size 32 \ 5 | --dataset wikicorpus_en \ 6 | --vocab_size 30522 \ 7 | --hidden_size 1024 \ 8 | --num_hidden_layers 24 \ 9 | --num_attention_heads 16 \ 10 | --seq_length 128 \ 11 | --epochs 20 \ 12 | --lr 1e-5 \ 13 | --adam_weight_decay 0.01 \ 14 | --hidden_act relu \ 15 | --dropout_prob 0.1 16 | -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_pytorch_bert_base.sh: -------------------------------------------------------------------------------- 1 | python train_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 64 \ 4 | --dataset wikicorpus_en \ 5 | --vocab_size 30522 \ 6 | --hidden_size 768 \ 7 | --num_hidden_layers 12 \ 8 | --num_attention_heads 12 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 1e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/nlp/bert/scripts/train_pytorch_bert_large.sh: -------------------------------------------------------------------------------- 1 | python train_pytorch_bert.py \ 2 | --gpu_id 1 \ 3 | --train_batch_size 32 \ 4 | --dataset wikicorpus_en \ 5 | --vocab_size 30522 \ 6 | --hidden_size 1024 \ 7 | --num_hidden_layers 24 \ 8 | --num_attention_heads 16 \ 9 | --seq_length 128 \ 10 | --epochs 20 \ 11 | --lr 1e-5 \ 12 | --adam_weight_decay 0.01 \ 13 | --hidden_act relu \ 14 | --dropout_prob 0.1 -------------------------------------------------------------------------------- /examples/rec/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | ckpts/ 4 | -------------------------------------------------------------------------------- /examples/rec/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import RatingModel_Head 2 | from .mf import MF_Head 3 | from .neumf import NeuMF_Head 4 | from .gmf import GMF_Head 5 | from .mlp import MLP_Head 6 | -------------------------------------------------------------------------------- /examples/rec/models/gmf.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | import hetu.layers as htl 3 | from .base import RatingModel_Head 4 | 5 | 6 | class GMF_Head(RatingModel_Head): 7 | def __init__(self, embed_dim, nsparse=2, ndense=0): 8 | # fixed 2 layers 9 | 
super().__init__(embed_dim) 10 | self.predict_layer = htl.Linear( 11 | self.embed_dim, 1, initializer=ht.init.GenXavierNormal(), activation=None, name=f'predict') 12 | 13 | def __call__(self, embeddings, dense, label): 14 | gmf_embs = ht.array_reshape_op(embeddings, [-1, 2, self.embed_dim]) 15 | output_gmf = ht.reduce_mul_op(gmf_embs, [1]) 16 | prediction = self.predict_layer(output_gmf) 17 | prediction = ht.array_reshape_op(prediction, (-1,)) 18 | return self.output(prediction, label) 19 | -------------------------------------------------------------------------------- /examples/rec/models/mf.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | from .base import RatingModel_Head 3 | 4 | 5 | class MF_Head(RatingModel_Head): 6 | def __call__(self, embeddings, dense, label): 7 | embeddings = ht.array_reshape_op(embeddings, [-1, 2, self.embed_dim]) 8 | output = ht.reduce_mul_op(embeddings, [1]) 9 | prediction = ht.reduce_sum_op(output, [-1]) 10 | return self.output(prediction, label) 11 | -------------------------------------------------------------------------------- /examples/rec/models/mlp.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | import hetu.layers as htl 3 | from .base import RatingModel_Head 4 | 5 | 6 | class MLP_Head(RatingModel_Head): 7 | def __init__(self, embed_dim, nsparse=2, ndense=0): 8 | # fixed 2 layers 9 | assert embed_dim % 4 == 0 10 | super().__init__(embed_dim) 11 | self.factor_num = embed_dim // 4 12 | self.mlp_layers = self.create_mlp( 13 | [8 * self.factor_num, 4 * self.factor_num, 2 * self.factor_num, self.factor_num]) 14 | self.predict_layer = htl.Linear( 15 | self.factor_num, 1, initializer=ht.init.GenXavierNormal(), activation=None, name=f'predict') 16 | 17 | def __call__(self, embeddings, dense, label): 18 | input_mlp = ht.array_reshape_op(embeddings, [-1, 2 * self.embed_dim]) 19 | output_mlp = self.mlp_layers(input_mlp) 20 | prediction = self.predict_layer(output_mlp) 21 | prediction = ht.array_reshape_op(prediction, (-1,)) 22 | return self.output(prediction, label) 23 | -------------------------------------------------------------------------------- /examples/rec/test/.gitignore: -------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | models/ 4 | -------------------------------------------------------------------------------- /examples/rec/test/config.py: -------------------------------------------------------------------------------- 1 | # dataset name 2 | dataset = 'ml-1m' 3 | assert dataset in ['ml-1m', 'pinterest-20'] 4 | 5 | # model name 6 | model = 'NeuMF-end' 7 | assert model in ['MLP', 'GMF', 'NeuMF-end', 'NeuMF-pre'] 8 | 9 | # paths 10 | main_path = './datasets/' 11 | 12 | train_rating = main_path + '{}.train.rating'.format(dataset) 13 | test_rating = main_path + '{}.test.rating'.format(dataset) 14 | test_negative = main_path + '{}.test.negative'.format(dataset) 15 | 16 | model_path = './models/' 17 | GMF_model_path = model_path + 'GMF.pth' 18 | MLP_model_path = model_path + 'MLP.pth' 19 | NeuMF_model_path = model_path + 'NeuMF.pth' 20 | -------------------------------------------------------------------------------- /examples/runner/local_allreduce.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | 
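Several of the ctr, bert, and runner sections above share this small launcher-config schema (per-node host, server/worker counts, one chief). As a hedged illustration, the sketch below reads such a file with plain PyYAML; hetu's real consumer is `DistConfig` from `hetu.context`, whose constructor signature is not shown in this dump, so it is deliberately not used here:

```python
import yaml

# load one of the launcher configs shown above
with open('examples/runner/local_ps.yml') as f:
    cfg = yaml.safe_load(f)

for node in cfg['nodes']:
    # each node lists its host plus per-node server/worker process counts;
    # exactly one node is marked chief: true
    print(node['host'], node.get('servers', 0),
          node['workers'], node.get('chief', False))
```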
-------------------------------------------------------------------------------- /examples/runner/local_ps.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 1 4 | workers: 4 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/examples/runner/models/__init__.py -------------------------------------------------------------------------------- /examples/runner/parallel/config1.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 1 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config2.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 2 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config3.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 3 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config4.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config6.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 6 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/config8.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: localhost 3 | servers: 0 4 | workers: 8 5 | chief: true 6 | -------------------------------------------------------------------------------- /examples/runner/parallel/dist_config8.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | servers: 0 4 | workers: 4 5 | chief: true 6 | - host: node2 7 | servers: 0 8 | workers: 4 -------------------------------------------------------------------------------- /examples/runner/parallel/validate_results.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os.path as osp 3 | import argparse 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('number', default=None) 8 | parser.add_argument('--rtol', default='1e-6') 9 | args = parser.parse_args() 10 | 11 | directory = 'results' 12 | base = np.load(osp.join(directory, 'base.npy')) 13 | print('Ground truth:', base) 14 | for i in range(int(args.number)): 15 | res = np.load(osp.join(directory, 'res%d.npy' % i)) 16 | np.testing.assert_allclose(base, res, rtol=float(args.rtol)) 17 | print('Result id %d passed test.' 
% i, res) 18 | -------------------------------------------------------------------------------- /examples/runner/remote_allreduce.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | workers: 4 4 | chief: true 5 | - host: node2 6 | workers: 2 7 | -------------------------------------------------------------------------------- /examples/runner/remote_ps.yml: -------------------------------------------------------------------------------- 1 | nodes: 2 | - host: node1 3 | servers: 1 4 | workers: 4 5 | chief: true 6 | - host: node2 7 | servers: 1 8 | workers: 2 9 | -------------------------------------------------------------------------------- /hetu.exp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | echo "Hetu root is" $path 4 | export PATH="$path/bin:$PATH" 5 | export PYTHONPATH="$path/python:$path/build/lib:$path/third_party/GraphMix/python:$PYTHONPATH:$path/third_party/HetuML/hetuml/python" 6 | -------------------------------------------------------------------------------- /img/alibabacloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/alibabacloud.png -------------------------------------------------------------------------------- /img/features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/features.png -------------------------------------------------------------------------------- /img/hetu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/hetu.png -------------------------------------------------------------------------------- /img/kuaishou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/kuaishou.png -------------------------------------------------------------------------------- /img/tencent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/img/tencent.png -------------------------------------------------------------------------------- /ps-lite/.gitignore: -------------------------------------------------------------------------------- 1 | src/meta.pb.cc 2 | src/meta.pb.h 3 | -------------------------------------------------------------------------------- /ps-lite/include/ps/base.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015 by Contributors 3 | */ 4 | #ifndef PS_BASE_H_ 5 | #define PS_BASE_H_ 6 | #include <limits> 7 | #include "ps/internal/utils.h" 8 | namespace ps { 9 | 10 | #if USE_KEY32 11 | /*! \brief Use unsigned 32-bit int as the key type */ 12 | using Key = uint32_t; 13 | #else 14 | /*! \brief Use unsigned 64-bit int as the key type */ 15 | using Key = uint64_t; 16 | #endif 17 | /*!
\brief The maximal allowed key value */ 18 | static const Key kMaxKey = std::numeric_limits<Key>::max(); 19 | /** \brief node ID for the scheduler */ 20 | static const int kScheduler = 1; 21 | /** 22 | * \brief the server node group ID 23 | * 24 | * group id can be combined: 25 | * - kServerGroup + kScheduler means all server nodes and the scheduler 26 | * - kServerGroup + kWorkerGroup means all server and worker nodes 27 | */ 28 | static const int kServerGroup = 2; 29 | /** \brief the worker node group ID */ 30 | static const int kWorkerGroup = 4; 31 | 32 | } // namespace ps 33 | #endif // PS_BASE_H_ 34 | -------------------------------------------------------------------------------- /ps-lite/include/ps/psf/preduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "PSFunc.h" 4 | 5 | namespace ps { 6 | 7 | template <> 8 | struct PSFData<kPReduceGetPartner> { 9 | static constexpr PsfGroup group = PsfGroup::kPReduceScheduler; 10 | static constexpr const char* name = "PReduceGetPartner"; 11 | using Request = tuple< 12 | Key, // reduce group key, each pipeline stage has a unique key 13 | int, // worker rank 14 | size_t, // desired worker num 15 | float // max wait time (ms) 16 | >; 17 | using Response = tuple< 18 | SArray<int> // worker ids of all the partners to reduce with 19 | >; 20 | static void _callback(const Response &response, int* tgt) { 21 | auto &val = get<0>(response); 22 | std::copy(val.begin(), val.end(), tgt); 23 | tgt[val.size()] = -1; 24 | } 25 | }; 26 | 27 | } // namespace ps 28 | -------------------------------------------------------------------------------- /ps-lite/include/ps/range.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2015 by Contributors 3 | */ 4 | #ifndef PS_RANGE_H_ 5 | #define PS_RANGE_H_ 6 | #include "ps/internal/utils.h" 7 | namespace ps { 8 | 9 | /** 10 | * \brief a range [begin, end) 11 | */ 12 | class Range { 13 | public: 14 | Range() : Range(0, 0) { 15 | } 16 | Range(uint64_t begin, uint64_t end) : begin_(begin), end_(end) { 17 | } 18 | 19 | uint64_t begin() const { 20 | return begin_; 21 | } 22 | uint64_t end() const { 23 | return end_; 24 | } 25 | uint64_t size() const { 26 | return end_ - begin_; 27 | } 28 | 29 | private: 30 | uint64_t begin_; 31 | uint64_t end_; 32 | }; 33 | 34 | } // namespace ps 35 | #endif // PS_RANGE_H_ 36 | -------------------------------------------------------------------------------- /ps-lite/src/PSFunc.cc: -------------------------------------------------------------------------------- 1 | #include "ps/psf/PSFunc.h" 2 | 3 | #include <unordered_map> 4 | #include <mutex> 5 | 6 | namespace ps { 7 | 8 | static std::unordered_map<PsfType, const char*> psfunction_names; 9 | 10 | static void initnames(PSFData<kNumPSfunction>) {} 11 | 12 | template <PsfType ftype> static void initnames(PSFData<ftype>) { 13 | psfunction_names[ftype] = PSFData<ftype>::name; 14 | initnames(PSFData<PsfType(ftype + 1)>()); 15 | } 16 | 17 | const char* 18 | getPSFunctionName(const PsfType &ftype) { 19 | static std::once_flag flag; 20 | std::call_once(flag, []() { 21 | initnames(PSFData<PsfType(0)>()); 22 | }); 23 | return psfunction_names[ftype]; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /python/graphboard/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/python/graphboard/__init__.py --------------------------------------------------------------------------------
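The node-group arithmetic described in the `ps/base.h` comment above is easy to check by hand; the snippet below is a plain-Python mirror of the constants (values taken directly from base.h), not part of the library itself:

```python
K_SCHEDULER = 1      # node ID of the scheduler
K_SERVER_GROUP = 2   # the server node group ID
K_WORKER_GROUP = 4   # the worker node group ID

# the IDs are disjoint bits, so sums address combined groups:
assert K_SERVER_GROUP + K_SCHEDULER == 3      # all servers plus the scheduler
assert K_SERVER_GROUP + K_WORKER_GROUP == 6   # all server and worker nodes
```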
/python/graphboard/graph2fig.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from graphviz import Digraph 4 | import subprocess 5 | import os 6 | import signal 7 | 8 | pid = None 9 | 10 | 11 | def show(executor, port=9997): 12 | print("Generating graph figure") 13 | dot = Digraph() 14 | dot.format = 'png' 15 | for node in executor.topo_order: 16 | dot.node(str(node.id), node.name) 17 | print(node.desc) 18 | if node.inputs: 19 | for n in node.inputs: 20 | dot.edge(str(n.id), str(node.id)) 21 | print(dot.source) 22 | dot.render('python/graphboard/output') 23 | print("Starting server..") 24 | cmd = 'cd python/graphboard; python -m SimpleHTTPServer '+str(port) 25 | pro = subprocess.Popen(cmd, shell=True, preexec_fn=os.setsid) 26 | global pid 27 | pid = pro.pid 28 | 29 | 30 | def close(): 31 | global pid 32 | os.killpg(pid, signal.SIGTERM) 33 | -------------------------------------------------------------------------------- /python/graphboard/index.html: -------------------------------------------------------------------------------- 1 | <html> 2 | <head> 3 | <title>Hetu</title> 4 | </head> 5 | <body> 6 | 7 | <h2> 8 | Hetu Graph Board: 9 | </h2>
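A minimal usage sketch for the graphboard module above. `executor` is assumed to be an already-built hetu executor exposing `topo_order` (which is what `show` iterates over), and the import assumes `python/graphboard`'s parent directory is on `sys.path`; both are assumptions, not documented API:

```python
from graphboard import graph2fig  # assumes python/ is importable

# executor: a built hetu executor with a topo_order of graph nodes
graph2fig.show(executor, port=9997)   # renders output.png, serves index.html
# ...browse http://localhost:9997/ to inspect the dataflow graph...
graph2fig.close()                     # kill the spawned HTTP server process
```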
10 | <img src="output.png" alt="dataflow_graph"> 11 | 12 | </body> 13 | </html> -------------------------------------------------------------------------------- /python/hetu/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .gpu_ops import * 3 | from .context import context, get_current_context, DistConfig 4 | from .dataloader import dataloader_op, Dataloader, GNNDataLoaderOp 5 | from .ndarray import cpu, gpu, rcpu, rgpu, array, sparse_array, empty, is_gpu_ctx, IndexedSlices 6 | from . import optimizer as optim 7 | from . import lr_scheduler as lr 8 | from . import initializers as init 9 | from . import data 10 | from . import layers 11 | from . import random 12 | from . import distributed_strategies as dist 13 | from .profiler import HetuProfiler, NCCLProfiler, HetuSimulator 14 | from .tokenizers import * 15 | -------------------------------------------------------------------------------- /python/hetu/communicator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/python/hetu/communicator/__init__.py -------------------------------------------------------------------------------- /python/hetu/communicator/test.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH=$PYTHONPATH:/home/Hetu/python 2 | export NCCL_DEBUG=INFO 3 | mpirun --allow-run-as-root -np 2 -mca btl_tcp_if_include enp97s0f0 -x NCCL_SOCKET_IFNAME=enp97s0f0 -H node1:1,node2:1 /root/anaconda3/envs/moe/bin/python mpi_nccl_comm.py 4 | -------------------------------------------------------------------------------- /python/hetu/cpu_links/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dnnl_op import * 3 | -------------------------------------------------------------------------------- /python/hetu/distributed_strategies/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import Strategy, BaseSearchingStrategy 2 | from .simple import DataParallel, ModelParallel4CNN, ModelParallel4LM, OneWeirdTrick4CNN, MegatronLM 3 | from .flexflow import FlexFlowSearching 4 | from .optcnn import OptCNNSearching 5 | from .gpipe import GPipeSearching 6 | from .pipedream import PipeDreamSearching 7 | from .pipeopt import PipeOptSearching 8 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AbsLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from ..
import ndarray as _nd 5 | 6 | 7 | def abs_val(in_mat, out_mat, stream=None): 8 | assert isinstance(in_mat, _nd.NDArray) 9 | assert isinstance(out_mat, _nd.NDArray) 10 | _LIB.DLGpuAbs(in_mat.handle, out_mat.handle, stream.handle if stream else None) 11 | 12 | def abs_gradient(in_mat, grad_mat, out_mat, stream=None): 13 | assert isinstance(in_mat, _nd.NDArray) 14 | assert isinstance(grad_mat, _nd.NDArray) 15 | assert isinstance(out_mat, _nd.NDArray) 16 | _LIB.DLGpuAbsGradient(grad_mat.handle, in_mat.handle, out_mat.handle, stream.handle if stream else None) 17 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AddConstLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_elementwise_add_by_const(in_mat, val, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | _LIB.DLGpuMatrixElementwiseAddByConst( 12 | in_mat.handle, ctypes.c_float(val), out_mat.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AddmmLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def addmm(input_mat, matA, matB, matC, alpha, beta, stream=None): 9 | assert isinstance(input_mat, _nd.NDArray) 10 | assert isinstance(matA, _nd.NDArray) 11 | assert isinstance(matB, _nd.NDArray) 12 | assert isinstance(matC, _nd.NDArray) 13 | _LIB.DLGpuAddmm(input_mat.handle, matA.handle, matB.handle, ctypes.c_float( 14 | alpha), ctypes.c_float(beta), matC.handle, stream.handle if stream else None) 15 | 16 | 17 | def addmm_gradient(input_mat, output_mat, axis, beta, stream=None): 18 | assert isinstance(input_mat, _nd.NDArray) 19 | assert isinstance(output_mat, _nd.NDArray) 20 | _LIB.DLGpuAddmmGradient(input_mat.handle, output_mat.handle, ctypes.c_int( 21 | axis), ctypes.c_float(beta), stream.handle if stream else None) 22 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArangeLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def arange(start, end, step, out_mat, stream=None): 9 | assert isinstance(out_mat, _nd.NDArray) 10 | _LIB.DLGpuArange(ctypes.c_float(start), ctypes.c_float(end), ctypes.c_float( 11 | step), out_mat.handle, stream.handle if stream else None) 12 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArgmaxLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def argmax(in_mat, out_mat, dim, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | _LIB.DLGpuArgmax( 12 | in_mat.handle, out_mat.handle, ctypes.c_int(dim), stream.handle if stream else None) 13 | 14 | 15 | def argmax_partial(in_mat, full_mask, out_mat, dim, topk, stream=None): 16 | assert isinstance(in_mat, _nd.NDArray) 17 | assert isinstance(full_mask, _nd.NDArray) 18 | assert isinstance(out_mat, _nd.NDArray) 19 | _LIB.DLGpuArgmaxPartial( 20 | in_mat.handle, full_mask.handle, out_mat.handle, ctypes.c_int(dim), ctypes.c_int(topk), stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArgsortLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def argsort(input, output, index, output_index, dim, descending, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | assert isinstance(index, _nd.NDArray) 12 | assert isinstance(output_index, _nd.NDArray) 13 | 14 | _LIB.DLGpuArgsort( 15 | input.handle, output.handle, index.handle, output_index.handle, ctypes.c_int(dim), descending, stream.handle if stream else None) 16 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ArraySetLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def array_set(arr, value, stream=None): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuArraySet(arr.handle, ctypes.c_float( 11 | value), stream.handle if stream else None) 12 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/AvgPoolLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def average_pooling2d(in_arr, kernel_H, kernel_W, pooled_layer, padding=0, stride=1, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(pooled_layer, _nd.NDArray) 11 | _LIB.DLGpuAvgerage_Pooling2d( 12 | in_arr.handle, kernel_H, kernel_W, pooled_layer.handle, padding, stride, stream.handle if stream else None) 13 | 14 | 15 | def average_pooling2d_gradient(in_gradient_y, kernel_H, kernel_W, out_gradient_x, padding=0, stride=1, stream=None): 16 | assert isinstance(in_gradient_y, _nd.NDArray) 17 | assert isinstance(out_gradient_x, _nd.NDArray) 18 | _LIB.DLGpuAvgerage_Pooling2d_gradient( 19 | in_gradient_y.handle, kernel_H, kernel_W, out_gradient_x.handle, padding, stride, stream.handle if stream else None) 20 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/BaddbmmLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def baddbmm(input_mat, matA, matB, matC, alpha, beta, stream=None): 9 | assert isinstance(input_mat, _nd.NDArray) 10 | assert isinstance(matA, _nd.NDArray) 11 | assert isinstance(matB, _nd.NDArray) 12 | assert isinstance(matC, _nd.NDArray) 13 | _LIB.DLGpuBaddbmm(input_mat.handle, matA.handle, matB.handle, ctypes.c_float( 14 | alpha), ctypes.c_float(beta), matC.handle, stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/BatchMatrixMultLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. import ndarray as _nd 5 | 6 | 7 | def batch_matrix_multiply(matA, transA, matB, transB, matC, stream=None): 8 | assert isinstance(matA, _nd.NDArray) 9 | assert isinstance(matB, _nd.NDArray) 10 | assert isinstance(matC, _nd.NDArray) 11 | _LIB.DLGpuBatchMatrixMultiply( 12 | matA.handle, transA, matB.handle, transB, matC.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/BroadcastLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def broadcast_to(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuBroadcastTo(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CloneLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | def clone(input_mat, output_mat, stream=None): 8 | assert isinstance(input_mat, _nd.NDArray) 9 | assert isinstance(output_mat, _nd.NDArray) 10 | 11 | _LIB.DLGpuClone( 12 | input_mat.handle, output_mat.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ConcatLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def concat(in_arr1, in_arr2, out_arr, axis=0, stream=None): 9 | assert isinstance(in_arr1, _nd.NDArray) 10 | assert isinstance(in_arr2, _nd.NDArray) 11 | assert isinstance(out_arr, _nd.NDArray) 12 | _LIB.DLGpuConcat(in_arr1.handle, in_arr2.handle, 13 | out_arr.handle, axis, stream.handle if stream else None) 14 | 15 | 16 | def concat_gradient(out_grad_arr, in_arr, axis=0, idx=0, stream=None): 17 | assert isinstance(out_grad_arr, _nd.NDArray) 18 | assert isinstance(in_arr, _nd.NDArray) 19 | _LIB.DLGpuConcat_gradient( 20 | out_grad_arr.handle, in_arr.handle, axis, idx, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ConcatenateLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def concatenate(in_arrs, out_arr, axis=0, stream=None): 9 | assert isinstance(out_arr, _nd.NDArray) 10 | offset = 0 11 | for arr in in_arrs: 12 | assert isinstance(arr, _nd.NDArray) 13 | _LIB.DLGpuConcatenate( 14 | arr.handle, out_arr.handle, 15 | ctypes.c_int(axis), ctypes.c_int(offset), 16 | stream.handle if stream else None) 17 | offset += arr.handle.contents.shape[axis] 18 | 19 | 20 | def concatenate_gradient(out_grad_arr, in_arr, axis, offset, stream=None): 21 | assert isinstance(out_grad_arr, _nd.NDArray) 22 | assert isinstance(in_arr, _nd.NDArray) 23 | _LIB.DLGpuConcatenate_gradient( 24 | out_grad_arr.handle, in_arr.handle, 25 | ctypes.c_int(axis), ctypes.c_int(offset), 26 | stream.handle if stream else None) 27 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ConstPowLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def const_pow(in_arr, out_arr, val, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuConstPow(in_arr.handle, ctypes.c_float(val), 12 | out_arr.handle, stream.handle if stream else None) 13 | 14 | 15 | def const_pow_gradient(in_arr, grad_arr, out_arr, val, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuConstPowGradient( 20 | in_arr.handle, grad_arr.handle, ctypes.c_float(val), out_arr.handle, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/Conv2dBroadcastLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def conv2d_broadcast_to(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuConv2d_broadcast_to( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/Conv2dReduceSumLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def conv2d_reduce_sum(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuConv2d_reduce_sum( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CrossEntropyLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def cross_entropy(y, y_, out, stream=None): 9 | assert isinstance(y, _nd.NDArray) 10 | assert isinstance(y_, _nd.NDArray) 11 | assert isinstance(out, _nd.NDArray) 12 | _LIB.DLGpuCrossEntropy( 13 | y.handle, y_.handle, out.handle, stream.handle if stream else None) 14 | 15 | 16 | def cross_entropy_gradient(grad_arr, y_arr, label, out_arr, stream=None): 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(y_arr, _nd.NDArray) 19 | assert isinstance(label, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.DLGpuCrossEntropyGradient( 22 | grad_arr.handle, y_arr.handle, label.handle, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CrossEntropySparseLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def cross_entropy_sparse(y, y_, ignored_index, out, stream=None): 9 | assert isinstance(y, _nd.NDArray) 10 | assert isinstance(y_, _nd.NDArray) 11 | assert isinstance(out, _nd.NDArray) 12 | _LIB.DLGpuCrossEntropySparse( 13 | y.handle, y_.handle, ignored_index, out.handle, stream.handle if stream else None) 14 | 15 | 16 | def cross_entropy_sparse_gradient(grad_arr, y_arr, label, ignored_index, out_arr, stream=None): 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(y_arr, _nd.NDArray) 19 | assert isinstance(label, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.DLGpuCrossEntropySparseGradient( 22 | grad_arr.handle, y_arr.handle, label.handle, ignored_index, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CudnnConv2dAddBiasLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def CuDNN_conv2d_with_bias(in_arr_x, in_arr_f, bias, out_arr, padding=(0, 0), stride=(1, 1), stream=None): 9 | assert isinstance(in_arr_x, _nd.NDArray) 10 | assert isinstance(in_arr_f, _nd.NDArray) 11 | assert isinstance(bias, _nd.NDArray) 12 | assert isinstance(out_arr, _nd.NDArray) 13 | _LIB.Cudnn_Conv2dAddBias(in_arr_x.handle, in_arr_f.handle, bias.handle, 14 | out_arr.handle, padding[0], padding[1], stride[0], stride[1], stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CudnnSoftmaxCrossEntropyLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def CuDNN_softmax_cross_entropy(y, y_, out, stream=None): 9 | assert isinstance(y, _nd.NDArray) 10 | assert isinstance(y_, _nd.NDArray) 11 | assert isinstance(out, _nd.NDArray) 12 | _LIB.CuDNN_DLGpuSoftmaxEntropy( 13 | y.handle, y_.handle, out.handle, stream.handle if stream else None) 14 | 15 | 16 | def CuDNN_softmax_cross_entropy_gradient(grad_arr, y_arr, label, out_arr, stream=None): 17 | assert isinstance(grad_arr, _nd.NDArray) 18 | assert isinstance(y_arr, _nd.NDArray) 19 | assert isinstance(label, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.CuDNN_DLGpuSoftmaxEntropyGradient( 22 | grad_arr.handle, y_arr.handle, label.handle, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/CumSumLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def cumsum_with_bias(input, output, bias, dim, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | _LIB.DLGpuCumsumWithBias( 12 | input.handle, output.handle, ctypes.c_float(bias), ctypes.c_int(dim), stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/DotLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_dot(matA, matB, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(matC, _nd.NDArray) 12 | _LIB.DLGpuDot( 13 | matA.handle, matB.handle, matC.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/EmbeddingLookUpLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def embedding_lookup(in_mat, ids, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(ids, _nd.NDArray) 11 | assert isinstance(out_mat, _nd.NDArray) 12 | _LIB.DLGpuEmbeddingLookUp( 13 | in_mat.handle, ids.handle, out_mat.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ExpLink.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def exp(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuExp(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/FloorLink.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import absolute_import 3 | 4 | import ctypes 5 | from .._base import _LIB 6 | from .. 
import ndarray as _nd 7 | 8 | 9 | def floor(in_arr, out_arr, stream=None): 10 | assert isinstance(in_arr, _nd.NDArray) 11 | assert isinstance(out_arr, _nd.NDArray) 12 | _LIB.DLGpuFloor(in_arr.handle, out_arr.handle, 13 | stream.handle if stream else None) 14 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/GatherLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def gather(in_arr, index, out_arr, dim, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(index, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGather(in_arr.handle, index.handle, out_arr.handle,
                     ctypes.c_int(dim), stream.handle if stream else None)


def gather_gradient(in_arr, index, out_arr, dim, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(index, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGatherGradient(in_arr.handle, index.handle, out_arr.handle,
                             ctypes.c_int(dim), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/GeluLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def gelu(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGelu(in_arr.handle, out_arr.handle,
                   stream.handle if stream else None)


def gelu_gradient(in_arr, in_grad_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(in_grad_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuGeluGradient(in_arr.handle, in_grad_arr.handle,
                           out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/GroupTopKIdxLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def group_topk_idx(in_mat, top1_group, out_mat_idx, k, num_local_gpus, stream=None):
    assert isinstance(in_mat, _nd.NDArray)
    assert isinstance(top1_group, _nd.NDArray)
    assert isinstance(out_mat_idx, _nd.NDArray)
    _LIB.DLGpuGroupTopKIdx(
        in_mat.handle, top1_group.handle, out_mat_idx.handle, ctypes.c_int(k), ctypes.c_int(num_local_gpus), stream.handle if stream else None)
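Every wrapper in this package follows the same calling convention: assert that the arguments are device NDArrays, then hand raw handles (and ctypes-wrapped scalars) to the C kernel, with `stream.handle if stream else None` selecting the stream. A minimal usage sketch, assuming the package re-exports its wrappers and that the ndarray helpers gpu/array/empty behave as in the repo's examples:

# Sketch only: invoking a gpu_links wrapper by hand (helper names assumed, not verified).
import numpy as np
from hetu import ndarray
from hetu.gpu_links import relu

ctx = ndarray.gpu(0)                                        # target device
x = ndarray.array(np.random.randn(4, 8).astype(np.float32), ctx=ctx)
out = ndarray.empty((4, 8), ctx=ctx)                        # preallocated output buffer
relu(x, out, stream=None)                                   # stream=None -> default stream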
-------------------------------------------------------------------------------- /python/hetu/gpu_links/HA2ALayoutTransform.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def ha2a_layout_transform(input, output, num_nodes, num_local_gpus, stream=None):
    assert isinstance(input, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuHA2ALayoutTransform(
        input.handle, output.handle, ctypes.c_int(num_nodes), ctypes.c_int(num_local_gpus), stream.handle if stream else None)


def ha2a_reverse_layout_transform(input, output, num_nodes, num_local_gpus, stream=None):
    assert isinstance(input, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuHA2AReverseLayoutTransform(
        input.handle, output.handle, ctypes.c_int(num_nodes), ctypes.c_int(num_local_gpus), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/IndexingLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def indexing(input_mat, index_mat, output_mat, stream=None):
    assert isinstance(input_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(output_mat, _nd.NDArray)
    _LIB.DLGpuIndexing(
        input_mat.handle, index_mat.handle, output_mat.handle, stream.handle if stream else None)


def indexing_grad(output_grad, index, input_grad, stream=None):
    assert isinstance(output_grad, _nd.NDArray)
    assert isinstance(index, _nd.NDArray)
    assert isinstance(input_grad, _nd.NDArray)
    # guard the stream like every other wrapper, so stream=None does not crash
    _LIB.DLGpuIndexingGrad(output_grad.handle, index.handle, input_grad.handle,
                           stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/InterpolateLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def bicubic_interpolate(in_arr, out_arr, align_corners, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuBicubicInterpolate(
        in_arr.handle, out_arr.handle, align_corners, stream.handle if stream else None)


def bicubic_interpolate_gradient(input_grad, output, align_corners, stream=None):
    assert isinstance(input_grad, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuBicubicInterpolateGradient(
        output.handle, input_grad.handle, align_corners, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/LeakyReluLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def leaky_relu(in_arr, alpha, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuLeakyRelu(in_arr.handle, ctypes.c_float( 12 | alpha), out_arr.handle, stream.handle if stream else None) 13 | 14 | 15 | def leaky_relu_gradient(in_arr, in_grad_arr, alpha, out_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuLeakyReluGradient(in_arr.handle, in_grad_arr.handle, ctypes.c_float( 20 | alpha), out_arr.handle, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/LinearLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matmul_with_bias(matA, transA, matB, transB, bias, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(bias, _nd.NDArray) 12 | assert isinstance(matC, _nd.NDArray) 13 | _LIB.DLGpuLinear( 14 | matA.handle, transA, matB.handle, transB, bias.handle, matC.handle, stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/LogLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def log_link(input, output, eps, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | _LIB.DLGpuLog(input.handle, output.handle, ctypes.c_float( 12 | eps), stream.handle if stream else None) 13 | 14 | 15 | def log_grad_link(output_grad, input, input_grad, eps, stream=None): 16 | assert isinstance(output_grad, _nd.NDArray) 17 | assert isinstance(input, _nd.NDArray) 18 | assert isinstance(input_grad, _nd.NDArray) 19 | _LIB.DLGpuLogGrad(output_grad.handle, input.handle, input_grad.handle, 20 | ctypes.c_float(eps), stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaskLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. import ndarray as _nd 5 | 6 | 7 | def mask_func(input, mask, output, stream=None): 8 | assert isinstance(input, _nd.NDArray) 9 | assert isinstance(mask, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | _LIB.DLGpuMask(input.handle, mask.handle, output.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaskedFillLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def masked_fill(input, mask, output, val, stream=None): 9 | assert isinstance(input, _nd.NDArray) 10 | assert isinstance(mask, _nd.NDArray) 11 | assert isinstance(output, _nd.NDArray) 12 | _LIB.DLGpuMaskedFill(input.handle, mask.handle, ctypes.c_float( 13 | val), output.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixDivideConstLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_elementwise_divide_const(val, in_mat, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | _LIB.DLGpuMatrixDivConst( 12 | ctypes.c_float(val), in_mat.handle, out_mat.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixDivideLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_elementwise_divide(matA, matB, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(matC, _nd.NDArray) 12 | _LIB.DLGpuMatrixElementwiseDivide( 13 | matA.handle, matB.handle, matC.handle, stream.handle if stream else None) 14 | 15 | 16 | def matrix_elementwise_divide_handle_zero(matA, matB, matC, stream=None): 17 | assert isinstance(matA, _nd.NDArray) 18 | assert isinstance(matB, _nd.NDArray) 19 | assert isinstance(matC, _nd.NDArray) 20 | _LIB.DLGpuMatrixElementwiseDivideHandleZero( 21 | matA.handle, matB.handle, matC.handle, stream.handle if stream else None) 22 | 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixMultLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_multiply(matA, transA, matB, transB, matC, stream=None): 9 | assert isinstance(matA, _nd.NDArray) 10 | assert isinstance(matB, _nd.NDArray) 11 | assert isinstance(matC, _nd.NDArray) 12 | _LIB.DLGpuMatrixMultiply( 13 | matA.handle, transA, matB.handle, transB, matC.handle, stream.handle if stream else None) 14 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixRsqrtLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def matrix_rsqrt(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuReciprocalSqrt( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixSqrtLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_sqrt(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuSqrt(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MatrixTransLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_transpose(in_mat, out_mat, perm, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(perm) 12 | pointer = pointer_func(*list(perm)) 13 | _LIB.DLGpuTranspose(in_mat.handle, out_mat.handle, 14 | pointer, stream.handle if stream else None) 15 | 16 | 17 | def matrix_transpose_simple(in_mat, out_mat, gpu_buf, stream=None): 18 | assert isinstance(in_mat, _nd.NDArray) 19 | assert isinstance(out_mat, _nd.NDArray) 20 | assert isinstance(gpu_buf, _nd.NDArray) 21 | _LIB.DLGpuTransposeSimple( 22 | in_mat.handle, out_mat.handle, gpu_buf.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaxLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def max(in_mat, out_mat_val, dim, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat_val, _nd.NDArray) 11 | _LIB.DLGpuMax(in_mat.handle, out_mat_val.handle, 12 | dim, stream.handle if stream else None) 13 | 14 | 15 | def max_mat(matA, matB, out_mat, stream=None): 16 | assert isinstance(matA, _nd.NDArray) 17 | assert isinstance(matB, _nd.NDArray) 18 | assert isinstance(out_mat, _nd.NDArray) 19 | _LIB.DLGpuMaxMat(matA.handle, matB.handle, out_mat.handle, 20 | stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MaxPoolLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def max_pooling2d(in_arr, kernel_H, kernel_W, pooled_layer, padding=0, stride=1, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(pooled_layer, _nd.NDArray) 11 | _LIB.DLGpuMax_Pooling2d(in_arr.handle, kernel_H, 12 | kernel_W, pooled_layer.handle, padding, stride, stream.handle if stream else None) 13 | 14 | 15 | def max_pooling2d_gradient(in_arr, in_grad_arr, kernel_H, kernel_W, out_grad_arr, padding=0, stride=1, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_grad_arr, _nd.NDArray) 19 | _LIB.DLGpuMax_Pooling2d_gradient( 20 | in_arr.handle, in_grad_arr.handle, kernel_H, kernel_W, out_grad_arr.handle, padding, stride, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinDistLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from ctypes import c_bool 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def minimum_distance_vector(lookup, key, codebook, indices, output, mode, stream=None): 9 | assert isinstance(lookup, _nd.NDArray) 10 | assert isinstance(key, _nd.NDArray) 11 | assert isinstance(codebook, _nd.NDArray) 12 | assert isinstance(indices, _nd.NDArray) 13 | assert isinstance(output, _nd.NDArray) 14 | if mode == 'eu': 15 | cmode = True 16 | else: 17 | cmode = False 18 | _LIB.DLGpuMinDist(lookup.handle, key.handle, codebook.handle, 19 | indices.handle, output.handle, c_bool(cmode), stream.handle if stream else None) 20 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def min(in_mat, out_mat_val, dim, stream=None): 9 | assert isinstance(in_mat, _nd.NDArray) 10 | assert isinstance(out_mat_val, _nd.NDArray) 11 | _LIB.DLGpuMin(in_mat.handle, out_mat_val.handle, 12 | dim, stream.handle if stream else None) 13 | 14 | 15 | def min_mat(matA, matB, out_mat, stream=None): 16 | assert isinstance(matA, _nd.NDArray) 17 | assert isinstance(matB, _nd.NDArray) 18 | assert isinstance(out_mat, _nd.NDArray) 19 | _LIB.DLGpuMinMat(matA.handle, matB.handle, out_mat.handle, 20 | stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinusByConstLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | def minus_by_const(input, output, val, stream=None): 8 | assert isinstance(input, _nd.NDArray) 9 | assert isinstance(output, _nd.NDArray) 10 | _LIB.DLGpuMinusByConst(input.handle, output.handle, ctypes.c_float(val), stream.handle if stream else None) 11 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/MinusElewiseLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | def matrix_elementwise_minus(input1, input2, output, stream=None): 8 | assert isinstance(input1, _nd.NDArray) 9 | assert isinstance(input2, _nd.NDArray) 10 | assert isinstance(output, _nd.NDArray) 11 | 12 | _LIB.DLGpuMinusElewise(input1.handle, input2.handle, output.handle, stream.handle if stream else None) 13 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/MultiplyConstLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
import numpy as np
from .._base import _LIB
from .. import ndarray as _nd


def matrix_elementwise_multiply_by_const(in_mat, val, out_mat, stream=None):
    assert isinstance(in_mat, (_nd.NDArray, _nd.IndexedSlices))
    assert isinstance(out_mat, (_nd.NDArray, _nd.IndexedSlices))

    if in_mat.dtype == np.float32:
        cval = ctypes.c_float(val)
        func = _LIB.DLGpuMatrixMultiplyByConst
    elif in_mat.dtype == np.int32:
        cval = ctypes.c_int(val)
        func = _LIB.DLGpuMatrixMultiplyByConstInt
    else:
        # fail early instead of hitting a NameError on unsupported dtypes
        raise ValueError('Unsupported dtype: %s' % in_mat.dtype)

    if isinstance(in_mat, _nd.NDArray):
        func(
            in_mat.handle, cval, out_mat.handle, stream.handle if stream else None)
    else:
        # isinstance(in_mat, _nd.IndexedSlices)
        func(
            in_mat.values.handle, cval, out_mat.values.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/MultiplyElewiseLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def matrix_elementwise_multiply(matA, matB, matC, stream=None):
    assert isinstance(matA, _nd.NDArray)
    assert isinstance(matB, _nd.NDArray)
    assert isinstance(matC, _nd.NDArray)
    _LIB.DLGpuMatrixElementwiseMultiply(
        matA.handle, matB.handle, matC.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/NllLossLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def nll_loss_link(input, target, output, stream=None):
    assert isinstance(input, _nd.NDArray)
    assert isinstance(target, _nd.NDArray)
    assert isinstance(output, _nd.NDArray)
    _LIB.DLGpuNllLoss(input.handle, target.handle, output.handle,
                      stream.handle if stream else None)


def nll_loss_grad_link(output_grad, target, input_grad, stream=None):
    assert isinstance(output_grad, _nd.NDArray)
    assert isinstance(target, _nd.NDArray)
    assert isinstance(input_grad, _nd.NDArray)
    _LIB.DLGpuNllLossGrad(output_grad.handle, target.handle, input_grad.handle,
                          stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/NormLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def norm(in_arr, out_arr, axis, p, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuNorm(in_arr.handle, out_arr.handle, ctypes.c_int(axis), ctypes.c_int(p), 12 | stream.handle if stream else None) 13 | 14 | 15 | def norm_gradient(input, input_y, grad_y, output, axis, p, stream=None): 16 | assert isinstance(input, _nd.NDArray) 17 | assert isinstance(input_y, _nd.NDArray) 18 | assert isinstance(grad_y, _nd.NDArray) 19 | assert isinstance(output, _nd.NDArray) 20 | _LIB.DLGpuNormGradient(input.handle, input_y.handle, grad_y.handle, output.handle, ctypes.c_int(axis), ctypes.c_int(p), 21 | stream.handle if stream else None) 22 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/OneHotLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def one_hot(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuOneHot(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/OppositeLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_opposite(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuOpposite(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/OptEmbedBinaryStepLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. import ndarray as _nd 5 | 6 | 7 | def binary_step_forward(in_arr, out_arr, stream=None): 8 | assert isinstance(in_arr, _nd.NDArray) 9 | assert isinstance(out_arr, _nd.NDArray) 10 | _LIB.DLGpuIsPositive(in_arr.handle, out_arr.handle, 11 | stream.handle if stream else None) 12 | 13 | 14 | def binary_step_backward(in_arr, out_arr, stream=None): 15 | assert isinstance(in_arr, _nd.NDArray) 16 | assert isinstance(out_arr, _nd.NDArray) 17 | _LIB.DLGpuBinaryStepBackward(in_arr.handle, out_arr.handle, 18 | stream.handle if stream else None) 19 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ParamClipLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def param_clip_func(arr, min_value, max_value, stream=None): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuClipping(arr.handle, ctypes.c_float(min_value), ctypes.c_float( 11 | max_value), stream.handle if stream else None) 12 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/PowLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def pow_matrix(in_arr, out_arr, eps, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuPow(in_arr.handle, out_arr.handle, ctypes.c_float(eps), 12 | stream.handle if stream else None) 13 | 14 | 15 | def pow_gradient(in_arr, in_grad_arr, out_arr, eps, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuPowGradient(in_arr.handle, in_grad_arr.handle, out_arr.handle, 20 | ctypes.c_float(eps), stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/PowerLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def matrix_power(in_arr, out_arr, p, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuPower(in_arr.handle, out_arr.handle, ctypes.c_float(p), 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceMeanLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_mean(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceMean( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceMinLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_min(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceMin( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceMulLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def reduce_mul(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceMul( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceNormLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_norm1(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceNorm1( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | 16 | 17 | def reduce_norm2(in_arr, out_arr, axes, stream=None): 18 | assert isinstance(in_arr, _nd.NDArray) 19 | assert isinstance(out_arr, _nd.NDArray) 20 | pointer_func = ctypes.c_int * len(axes) 21 | pointer = pointer_func(*list(axes)) 22 | _LIB.DLGpuReduceNorm2( 23 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 24 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceSumAxisZeroLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_sum_axis_zero(in_arr, out_arr, workspace_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuReduceSumAxisZero( 12 | in_arr.handle, out_arr.handle, stream.handle if stream else None) 13 | 14 | 15 | def _reduce_sum_axis_zero(in_arr, out_arr, workspace_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(out_arr, _nd.NDArray) 18 | assert isinstance(workspace_arr, _nd.NDArray) 19 | _LIB._DLGpuReduceSumAxisZero( 20 | in_arr.handle, out_arr.handle, workspace_arr.handle, stream.handle if stream else None) 21 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReduceSumLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def reduce_sum(in_arr, out_arr, axes, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | pointer_func = ctypes.c_int * len(axes) 12 | pointer = pointer_func(*list(axes)) 13 | _LIB.DLGpuReduceSum( 14 | in_arr.handle, out_arr.handle, pointer, ctypes.c_int(len(axes)), stream.handle if stream else None) 15 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/ReluLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. 
import ndarray as _nd 6 | 7 | 8 | def relu(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuRelu(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | 14 | 15 | def relu_gradient(in_arr, in_grad_arr, out_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(in_grad_arr, _nd.NDArray) 18 | assert isinstance(out_arr, _nd.NDArray) 19 | _LIB.DLGpuReluGradient(in_arr.handle, in_grad_arr.handle, 20 | out_arr.handle, stream.handle if stream else None) 21 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/RepeatLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def repeat(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuRepeat(in_arr.handle, out_arr.handle,
                     stream.handle if stream else None)


def repeat_gradient(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuRepeatGradient(
        in_arr.handle, out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/ReshapeLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def array_reshape(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuReshape(in_arr.handle, out_arr.handle,
                      stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/RollLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def roll(input_mat, output_mat, shift, axis, stream=None):
    assert isinstance(input_mat, _nd.NDArray)
    assert isinstance(output_mat, _nd.NDArray)

    nums = len(shift)
    shift_func = ctypes.c_int * len(shift)
    pointer_shift = shift_func(*list(shift))

    if axis:
        axis_func = ctypes.c_int * len(axis)
        pointer_axis = axis_func(*list(axis))
    else:
        # no axis given: the kernel receives a null pointer
        pointer_axis = None

    _LIB.DLGpuRoll(input_mat.handle, pointer_shift, pointer_axis,
                   nums, output_mat.handle, stream.handle if stream else None)
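roll above, like matrix_transpose and the reduce_* wrappers elsewhere in this package, marshals a Python list into a C int array before crossing the ctypes boundary. The idiom in isolation:

# Pure-ctypes sketch of the marshalling used by roll, reduce_*, and matrix_transpose.
import ctypes

shift = [1, -2]
IntArray = ctypes.c_int * len(shift)   # an array *type* of len(shift) C ints
c_shift = IntArray(*shift)             # an instance, passable where the C side expects int*
assert list(c_shift) == [1, -2]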
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SamGroupSumLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def sam_group_sum_link(gate_mat, out_mat, num_local_gpus, stream=None):
    assert isinstance(gate_mat, _nd.NDArray)
    assert isinstance(out_mat, _nd.NDArray)
    _LIB.DLGpuSamGroupSum(
        gate_mat.handle, out_mat.handle, ctypes.c_int(num_local_gpus), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/Scatter1DLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def scatter1d(input_mat, index_mat, output_mat, stream=None):
    assert isinstance(input_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(output_mat, _nd.NDArray)
    _LIB.DLGpuScatter1D(
        input_mat.handle, index_mat.handle, output_mat.handle, stream.handle if stream else None)


def scatter1d_grad(output_grad_mat, index_mat, input_grad_mat, stream=None):
    assert isinstance(output_grad_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(input_grad_mat, _nd.NDArray)
    _LIB.DLGpuScatter1DGrad(
        output_grad_mat.handle, index_mat.handle, input_grad_mat.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/ScatterLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def scatter(target_mat, dim, index_mat, src_mat, stream=None):
    assert isinstance(target_mat, _nd.NDArray)
    assert isinstance(index_mat, _nd.NDArray)
    assert isinstance(src_mat, _nd.NDArray)
    _LIB.DLGpuScatter(
        target_mat.handle, dim, index_mat.handle, src_mat.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SigmoidLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def sigmoid(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuSigmoid(in_arr.handle, out_arr.handle,
                      stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SignLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

from .._base import _LIB
from .. import ndarray as _nd


def sign_func(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuSign(in_arr.handle, out_arr.handle,
                   stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/SinLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from ..
import ndarray as _nd 6 | 7 | 8 | def sin(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuSin(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | 14 | 15 | def cos(in_arr, out_arr, stream=None): 16 | assert isinstance(in_arr, _nd.NDArray) 17 | assert isinstance(out_arr, _nd.NDArray) 18 | _LIB.DLGpuCos(in_arr.handle, out_arr.handle, 19 | stream.handle if stream else None) 20 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SoftmaxCrossEntropyLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def softmax_cross_entropy(in_arr_a, in_arr_b, out_arr, stream=None): 9 | assert isinstance(in_arr_a, _nd.NDArray) 10 | assert isinstance(in_arr_b, _nd.NDArray) 11 | assert isinstance(out_arr, _nd.NDArray) 12 | _LIB.DLGpuSoftmaxCrossEntropy( 13 | in_arr_a.handle, in_arr_b.handle, out_arr.handle, stream.handle if stream else None) 14 | 15 | 16 | def softmax_cross_entropy_gradient(in_arr_a, in_arr_b, in_arr_c, out_arr, stream=None): 17 | assert isinstance(in_arr_a, _nd.NDArray) 18 | assert isinstance(in_arr_b, _nd.NDArray) 19 | assert isinstance(in_arr_c, _nd.NDArray) 20 | assert isinstance(out_arr, _nd.NDArray) 21 | _LIB.DLGpuSoftmaxCrossEntropy_Gradient( 22 | in_arr_a.handle, in_arr_b.handle, in_arr_c.handle, out_arr.handle, stream.handle if stream else None) 23 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SoftmaxLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def softmax(in_arr, out_arr, stream=None): 9 | assert isinstance(in_arr, _nd.NDArray) 10 | assert isinstance(out_arr, _nd.NDArray) 11 | _LIB.DLGpuSoftmax(in_arr.handle, out_arr.handle, 12 | stream.handle if stream else None) 13 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SparseEmbeddingLookUpLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from .._base import _LIB 5 | from .. import ndarray as _nd 6 | 7 | 8 | def sparse_embedding_lookup(in_mat, ids, out_mat, stream=None): 9 | assert isinstance(in_mat, _nd.ND_Sparse_Array) 10 | assert isinstance(ids, _nd.NDArray) 11 | assert isinstance(out_mat, _nd.NDArray) 12 | if in_mat.form == 'csr': 13 | _LIB.DLGpuCSREmbeddingLookUp(in_mat.data.handle, in_mat.row.handle, in_mat.col.handle, 14 | ids.handle, out_mat.handle, stream.handle if stream else None) 15 | else: 16 | _LIB.DLGpuCOOEmbeddingLookUp(in_mat.data.handle, in_mat.row.handle, in_mat.col.handle, 17 | ids.handle, out_mat.handle, stream.handle if stream else None) 18 | -------------------------------------------------------------------------------- /python/hetu/gpu_links/SparseSetLink.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from .._base import _LIB 4 | from .. 
import ndarray as _nd 5 | 6 | 7 | def sparse_set(table, indices, data, stream=None): 8 | assert isinstance(table, _nd.NDArray) 9 | assert isinstance(indices, _nd.NDArray) 10 | assert isinstance(data, _nd.NDArray) 11 | _LIB.DLGpuSparseSet(table.handle, indices.handle, 12 | data.handle, stream.handle if stream else None) 13 |
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TanhLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def tanh(in_arr, out_arr, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTanh(in_arr.handle, out_arr.handle,
                   stream.handle if stream else None)


def tanh_gradient(forward_arr, grad_arr, out_arr, stream=None):
    assert isinstance(forward_arr, _nd.NDArray)
    assert isinstance(grad_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTanhGradient(forward_arr.handle, grad_arr.handle,
                           out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TopKIdxLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def topk_idx(in_mat, out_mat_idx, k, stream=None):
    assert isinstance(in_mat, _nd.NDArray)
    assert isinstance(out_mat_idx, _nd.NDArray)
    _LIB.DLGpuTopKIdx(
        in_mat.handle, out_mat_idx.handle, ctypes.c_int(k), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TopKValLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def topk_val(in_mat, out_mat_idx, out_mat_val, k, stream=None):
    assert isinstance(in_mat, _nd.NDArray)
    assert isinstance(out_mat_idx, _nd.NDArray)
    assert isinstance(out_mat_val, _nd.NDArray)
    # wrap k explicitly, matching topk_idx above
    _LIB.DLGpuTopKVal(
        in_mat.handle, out_mat_idx.handle, out_mat_val.handle, ctypes.c_int(k), stream.handle if stream else None)
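The two wrappers above only shuttle handles; the row-wise semantics they are expected to implement can be stated in NumPy. A CPU reference sketch (the expected result, not the kernel itself):

# NumPy reference for what DLGpuTopKIdx is expected to compute.
import numpy as np

x = np.array([[0.1, 0.9, 0.4],
              [0.7, 0.2, 0.5]], dtype=np.float32)
k = 2
idx = np.argsort(-x, axis=1)[:, :k]    # per-row indices of the k largest values
# idx == [[1, 2], [0, 2]]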
-------------------------------------------------------------------------------- /python/hetu/gpu_links/TrilLookupLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def tril_lookup(in_arr, out_arr, offset, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTrilLookup(
        in_arr.handle, out_arr.handle, ctypes.c_int(offset), stream.handle if stream else None)


def tril_lookup_gradient(in_arr, out_arr, offset, stream=None):
    assert isinstance(in_arr, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuTrilLookupGradient(
        in_arr.handle, out_arr.handle, ctypes.c_int(offset), stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_links/WhereLink.py: --------------------------------------------------------------------------------
from __future__ import absolute_import

import ctypes
from .._base import _LIB
from .. import ndarray as _nd


def where(cond, arr1, arr2, out_arr, stream=None):
    assert isinstance(cond, _nd.NDArray)
    assert isinstance(arr1, _nd.NDArray)
    assert isinstance(arr2, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuWhere(cond.handle, arr1.handle, arr2.handle,
                    out_arr.handle, stream.handle if stream else None)


def where_const(cond, arr1, const_attr, out_arr, stream=None):
    assert isinstance(cond, _nd.NDArray)
    assert isinstance(arr1, _nd.NDArray)
    assert isinstance(out_arr, _nd.NDArray)
    _LIB.DLGpuWhereConst(cond.handle, arr1.handle, ctypes.c_float(const_attr),
                         out_arr.handle, stream.handle if stream else None)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/SamGroupSum.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
from .Node import Op
from .. import ndarray
from ..gpu_links import sam_group_sum_link


class SamGroupSumOp(Op):
    def __init__(self, node_A, num_local_gpus=8, ctx=None):
        super().__init__(SamGroupSumOp, [node_A], ctx)
        self.num_local_gpus = num_local_gpus

    def compute(self, input_val, output_val, stream_handle=None):
        if self.on_cpu:
            raise NotImplementedError
        else:
            sam_group_sum_link(input_val[0], output_val,
                               self.num_local_gpus, stream_handle)

    def gradient(self, output_grad):
        return [None]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return (input_shapes[0][0], self.num_local_gpus)


def sam_group_sum_op(node, num_local_gpus, ctx=None):
    return SamGroupSumOp(node, num_local_gpus, ctx=ctx)
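The gpu_ops modules below share one contract: compute takes the list of input NDArrays plus a preallocated output, gradient returns one node (or None) per input, and infer_shape maps input shapes to an output shape. A hypothetical op, shown only to make that contract explicit (ScaleOp is not part of Hetu), written as it would appear as a module in gpu_ops:

# Hypothetical ScaleOp sketch illustrating the Op contract.
from .Node import Op
from ..gpu_links import matrix_elementwise_multiply_by_const


class ScaleOp(Op):
    def __init__(self, node, factor, ctx=None):
        super().__init__(ScaleOp, [node], ctx)
        self.factor = factor

    def compute(self, input_vals, output_val, stream_handle=None):
        # forward: output = factor * input, written in place into output_val
        matrix_elementwise_multiply_by_const(
            input_vals[0], self.factor, output_val, stream_handle)

    def gradient(self, output_grad):
        # d(out)/d(in) = factor; returning None instead would mark the
        # input as non-differentiable, as SignOp and TopKIdxOp below do
        return [scale_op(output_grad, self.factor)]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return input_shapes[0]


def scale_op(node, factor, ctx=None):
    return ScaleOp(node, factor, ctx=ctx)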
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/Scatter.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
from .Node import Op
from .. import ndarray
from ..gpu_links import scatter


class ScatterOp(Op):
    def __init__(self, node_target, node_index, node_src, dim=0, ctx=None):
        super().__init__(ScatterOp, [node_target, node_index, node_src], ctx)
        # dim is stored on the op so that compute can follow the standard
        # (input_vals, output_val, stream_handle) signature used by all ops
        self.dim = dim

    def compute(self, input_vals, output_val, stream_handle=None):
        # in-place write of src values into target along self.dim
        scatter(input_vals[0], self.dim, input_vals[1],
                input_vals[2], stream_handle)

    def gradient(self, output_grad):
        # not implemented yet
        pass

    def infer_shape(self, input_shapes):
        # not implemented yet
        pass


def scatter_op(node_target, node_index, node_src, dim=0, ctx=None):
    return ScatterOp(node_target, node_index, node_src, dim=dim, ctx=ctx)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/Sign.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
import numpy as np
from .Node import Op
from ..gpu_links import sign_func


class SignOp(Op):
    def __init__(self, node, ctx=None):
        super().__init__(SignOp, [node], ctx)

    def compute(self, input_vals, output_val, stream_handle=None):
        if self.on_cpu:
            output_val[:] = np.sign(input_vals[0].asnumpy())
        else:
            sign_func(input_vals[0], output_val, stream_handle)

    def gradient(self, output_grad):
        return [None]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return input_shapes[0]


def sign_op(node, ctx=None):
    return SignOp(node, ctx=ctx)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/SparseSet.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
import numpy as np
from .Node import Op
from .._base import DNNL_LIB
from ..gpu_links import sparse_set


class SparseSetOp(Op):
    def __init__(self, table, ind, data, ctx=None):
        super().__init__(SparseSetOp, [table, ind, data], ctx)
        assert table.dtype == ind.dtype == data.dtype == np.int32

    def compute(self, input_vals, output_val, stream_handle=None):
        if self.on_cpu:
            raise NotImplementedError
        else:
            sparse_set(input_vals[0], input_vals[1],
                       input_vals[2], stream_handle)

    def gradient(self, output_grad):
        return [None, None, None]

    def infer_shape(self, input_shapes):
        return None


def sparse_set_op(table, ind, data, ctx=None):
    return SparseSetOp(table, ind, data, ctx=ctx)
-------------------------------------------------------------------------------- /python/hetu/gpu_ops/StopGradient.py: --------------------------------------------------------------------------------
from __future__ import absolute_import
from .Node import Op


class StopGradientOp(Op):
    def __init__(self, node, ctx=None):
        super().__init__(StopGradientOp, [node], ctx)

    def compute(self, input_vals, output_val, stream_handle=None):
        raise NotImplementedError

    def gradient(self, output_grad):
        return [None]

    def infer_shape(self, input_shapes):
        assert len(input_shapes) == 1
        return input_shapes[0]


def stop_gradient_op(node, ctx=None):
    return StopGradientOp(node, ctx=ctx)
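StopGradientOp is the gradient-barrier pattern: gradient() returns [None], so autodiff treats everything behind the node as constant (compute raising NotImplementedError suggests the node is never executed directly). A hedged usage sketch, assuming the usual Variable front end:

# Sketch (front-end API assumed): cutting the gradient at a subgraph boundary.
import hetu as ht

x = ht.Variable(name='x')
h = ht.relu_op(x)
h_detached = ht.stop_gradient_op(h)   # no gradient reaches x through h_detached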
import ndarray 4 | from ..gpu_links import topk_idx 5 | 6 | 7 | class TopKIdxOp(Op): 8 | def __init__(self, node_A, topk=1, ctx=None): 9 | super().__init__(TopKIdxOp, [node_A], ctx) 10 | self.k = topk 11 | 12 | def compute(self, input_val, output_val, stream_handle=None): 13 | if self.on_cpu: 14 | raise NotImplementedError 15 | else: 16 | topk_idx(input_val[0], output_val, self.k, stream_handle) 17 | def gradient(self, output_grad): 18 | return [None] 19 | 20 | def infer_shape(self, input_shapes): 21 | assert len(input_shapes) == 1 22 | return (input_shapes[0][0], self.k) 23 | 24 | 25 | def topk_idx_op(node, topk, ctx=None): 26 | return TopKIdxOp(node, topk, ctx=ctx) 27 | -------------------------------------------------------------------------------- /python/hetu/gpu_ops/TopKVal.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .Node import Op 3 | from .. import ndarray 4 | from ..gpu_links import topk_val 5 | 6 | 7 | class TopKValOp(Op): 8 | def __init__(self, node_A, node_B, ctx=None): 9 | super().__init__(TopKValOp, [node_A, node_B], ctx) 10 | 11 | def compute(self, input_val, output_val, stream_handle=None): 12 | if self.on_cpu: 13 | raise NotImplementedError 14 | else: 15 | topk_val(input_val[0], input_val[1], output_val, stream_handle) 16 | 17 | def gradient(self, output_grad): 18 | return [None, None] 19 | 20 | def infer_shape(self, input_shapes): 21 | assert len(input_shapes) == 2 22 | return input_shapes[1] 23 | 24 | def topk_val_op(nodeA, nodeB, ctx=None): 25 | return TopKValOp(nodeA, nodeB, ctx=ctx) 26 | -------------------------------------------------------------------------------- /python/hetu/layers/concatenate.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Concatenate(BaseLayer): 6 | def __init__(self, axis): 7 | self.axis = axis 8 | 9 | def __call__(self, *args): 10 | if len(args) == 1: 11 | return args[0] 12 | else: 13 | return ht.concatenate_op(args, axis=self.axis) 14 | 15 | 16 | class ConcatenateLayers(BaseLayer): 17 | def __init__(self, layers, axis=0): 18 | self.layers = layers 19 | self.axis = axis 20 | 21 | def __call__(self, x): 22 | if len(self.layers) == 1: 23 | return self.layers[0](x) 24 | else: 25 | return ht.concatenate_op([layer(x) for layer in self.layers], axis=self.axis) 26 | -------------------------------------------------------------------------------- /python/hetu/layers/dropout.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class DropOut(BaseLayer): 6 | def __init__(self, p=0.5): 7 | self.p = p 8 | 9 | def __call__(self, x): 10 | if self.p == 0: 11 | return x 12 | return ht.dropout_op(x, 1-self.p) 13 | -------------------------------------------------------------------------------- /python/hetu/layers/embedding.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Embedding(BaseLayer): 6 | def __init__(self, num_embeddings, embedding_dim, initializer=ht.init.GenXavierNormal(), name='embedding', ctx=None, **kargs): 7 | self.num_embeddings = num_embeddings 8 | self.embedding_dim = embedding_dim 9 | self.name = name 10 | self.ctx = ctx 11 | self.kargs = kargs 12 | self.embedding_table = initializer( 13 | shape=(self.num_embeddings, self.embedding_dim), 
name=self.name, ctx=ctx) 14 | 15 | def __call__(self, x): 16 | return ht.embedding_lookup_op(self.embedding_table, x, ctx=self.ctx) 17 | 18 | def __repr__(self): 19 | return f'{self.name}({self.num_embeddings},{self.embedding_dim})' 20 | -------------------------------------------------------------------------------- /python/hetu/layers/gates/base_gate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base gate 3 | """ 4 | import hetu as ht 5 | 6 | class BaseGate(object): 7 | def __init__(self, num_expert, world_size): 8 | super().__init__() 9 | self.world_size = world_size 10 | self.num_expert = num_expert 11 | self.tot_expert = world_size * num_expert 12 | self.loss = None 13 | 14 | def __call__(self): 15 | raise NotImplementedError("Base gate cannot be directly used.") 16 | 17 | -------------------------------------------------------------------------------- /python/hetu/layers/gates/gshard_gate.py: -------------------------------------------------------------------------------- 1 | from .naive_gate import NaiveGate 2 | import hetu as ht 3 | 4 | class GshardGate(NaiveGate): 5 | def __init__(self, d_model, num_expert, world_size, topk=2, capacity=(1.2, 2.4), random_routing=True): 6 | assert topk == 2, 'topk should be 2 in gshard' 7 | super().__init__(d_model, num_expert, world_size, topk=2) 8 | self.capa 9 | -------------------------------------------------------------------------------- /python/hetu/layers/gelu.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Gelu(BaseLayer): 6 | def __call__(self, x): 7 | return ht.gelu_op(x) -------------------------------------------------------------------------------- /python/hetu/layers/identity.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | 3 | 4 | class Identity(BaseLayer): 5 | def __call__(self, x): 6 | return x 7 | -------------------------------------------------------------------------------- /python/hetu/layers/mish.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Mish(BaseLayer): 6 | def __call__(self, x): 7 | # mish(x) = x * tanh(softplus(x)), with softplus(x) = log(1 + exp(x)) 8 | return ht.mul_op(x, ht.tanh_op(ht.log_op(ht.addbyconst_op(ht.exp_op(x), 1)))) 9 | -------------------------------------------------------------------------------- /python/hetu/layers/pooling.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class MaxPool2d(BaseLayer): 6 | def __init__(self, kernel_size, stride, padding=0): 7 | self.kernel_size = kernel_size 8 | self.stride = stride 9 | self.padding = padding 10 | 11 | def __call__(self, x): 12 | return ht.max_pool2d_op( 13 | x, self.kernel_size, self.kernel_size, self.padding, self.stride) 14 | 15 | 16 | class AvgPool2d(BaseLayer): 17 | def __init__(self, kernel_size, stride, padding=0): 18 | self.kernel_size = kernel_size 19 | self.stride = stride 20 | self.padding = padding 21 | 22 | def __call__(self, x): 23 | return ht.avg_pool2d_op( 24 | x, self.kernel_size, self.kernel_size, self.padding, self.stride) 25 | -------------------------------------------------------------------------------- /python/hetu/layers/relu.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3
| 4 | 5 | class Relu(BaseLayer): 6 | def __call__(self, x): 7 | return ht.relu_op(x) 8 | -------------------------------------------------------------------------------- /python/hetu/layers/reshape.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Reshape(BaseLayer): 6 | def __init__(self, shape): 7 | self.shape = shape 8 | 9 | def __call__(self, x): 10 | return ht.array_reshape_op(x, self.shape) 11 | -------------------------------------------------------------------------------- /python/hetu/layers/sequence.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | 3 | 4 | class Sequence(BaseLayer): 5 | def __init__(self, *args): 6 | self.layers = args 7 | 8 | def __call__(self, x): 9 | for layer in self.layers: 10 | x = layer(x) 11 | return x 12 | -------------------------------------------------------------------------------- /python/hetu/layers/slice.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class Slice(BaseLayer): 6 | def __init__(self, begin, size): 7 | self.begin = begin 8 | self.size = size 9 | 10 | def __call__(self, x): 11 | return ht.slice_op(x, self.begin, self.size) 12 | -------------------------------------------------------------------------------- /python/hetu/layers/sum.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLayer 2 | import hetu as ht 3 | 4 | 5 | class SumLayers(BaseLayer): 6 | def __init__(self, layers): 7 | self.layers = layers 8 | 9 | def __call__(self, xs): 10 | if not isinstance(xs, list): 11 | xs = [xs] * len(self.layers) 12 | assert len(xs) == len(self.layers) 13 | if len(self.layers) == 1: 14 | return self.layers[0](xs[0]) 15 | else: 16 | return ht.sum_op([layer(x) for layer, x in zip(self.layers, xs)]) 17 | -------------------------------------------------------------------------------- /python/hetu/onnx/X2hetu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/python/hetu/onnx/X2hetu/__init__.py -------------------------------------------------------------------------------- /python/hetu/onnx/X2hetu/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pkgutil 3 | 4 | __all__ = [ 5 | modname 6 | for _, modname, _ in pkgutil.walk_packages(path=[os.path.split(__file__)[0]]) 7 | ] 8 | -------------------------------------------------------------------------------- /python/hetu/onnx/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | from .hetu2onnx import export 6 | 7 | __all__ = ["hetu2onnx", "util", "constants", "handler", "graph", "onnx2hetu"] 8 | 9 | from hetu.onnx import (hetu2onnx, util, constants, graph, handler, onnx2hetu) 10 | -------------------------------------------------------------------------------- /python/hetu/onnx/constants.py: -------------------------------------------------------------------------------- 1 | NEEDLESS_ATTRS = ['op', 'desc', 'id', 'swap', 'trainable', 'ctx', 'event', 2 |
'inplace', 'lazy_execution', 'on_cpu', 'on_gpu', 'compute', 'middle_result', 'gpu_buffer', 3 | 4 | 5 | ] 6 | 7 | 8 | ONNX_DOMAIN = "" 9 | AI_ONNX_ML_DOMAIN = "ai.onnx.ml" 10 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Concat.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["ConcatOp"], onnx_op=["Concat"]) 14 | class Concat: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | pass 18 | 19 | # todo:opset < 8: might need to wrap concat in casts since only float is supported 20 | # if ctx.opset < 8: 21 | 22 | @classmethod 23 | def version_11(cls, ctx, node, **kwargs): 24 | # Opset 11 supports negative axis, but core logic is same 25 | cls.version_1(ctx, node, **kwargs) 26 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Conv2d.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["Conv2dOp"], onnx_op=["Conv"]) 14 | class Conv: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | kernel_shape = ctx._shapes[node._inputs[1]][2:] 18 | pads = [node.get_attr_value('padding', 0)]*4 19 | strides = [node.get_attr_value('stride', 1)]*2 20 | node.set_attr('kernel_shape', kernel_shape) 21 | node.set_attr('pads', pads) 22 | node.set_attr('strides', strides) 23 | 24 | @classmethod 25 | def version_11(cls, ctx, node, **kwargs): 26 | 27 | cls.version_1(ctx, node, **kwargs) 28 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Identity.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["Identity"]) 14 | class Identity(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/MatrixMult.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["MatMulOp"], onnx_op=["MatMul"]) 14 | class MatMul: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | trans_a = node.get_attr_value('matmul_attr_trans_A', 0) 18 | trans_b = 
node.get_attr_value('matmul_attr_trans_B', 0) 19 | # fixme: only matrices with two dims are supported now 20 | if trans_a != 0: 21 | ctx.insert_new_node_on_input( 22 | node, 'Transpose', node._inputs[0], perm=[1, 0]) 23 | if trans_b != 0: 24 | ctx.insert_new_node_on_input( 25 | node, 'Transpose', node._inputs[1], perm=[1, 0]) 26 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Opposite.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["OppositeOp"], onnx_op=["Neg"]) 14 | class Neg(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Reduces.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["ReduceMeanOp"], onnx_op=["ReduceMean"]) 14 | @hetu_op(["ReduceSumOp"], onnx_op=["ReduceSum"]) 15 | class ReduceMean(general.PassOp): 16 | @classmethod 17 | def version_1(cls, ctx, node, **kwargs): 18 | keepdims = node.get_attr_value('keepdims', None) 19 | assert keepdims is not None 20 | node.set_attr("keepdims", keepdims[0]) 21 | 22 | @classmethod 23 | def version_11(cls, ctx, node, **kwargs): 24 | # Opset 11 supports negative axis, but core logic is same 25 | cls.version_1(ctx, node, **kwargs) 26 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Relu.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["ReluOp"], onnx_op=["Relu"]) 14 | class Relu(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Softmax.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["SoftmaxOp"], onnx_op=["Softmax"]) 14 | class Softmax: 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | pass 18 | # logits_rank = len(ctx.get_shape(node.input_tensor_names[0])) 19 | # node.set_attr("axis",logits_rank - 1) 20 | 21 | @classmethod 22 | def version_11(cls, ctx, node, **kwargs): 23 | cls.version_1(ctx, node, **kwargs) 24 | 
-------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Sqrt.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["SqrtOp"], onnx_op=["Sqrt"]) 14 | class Sqrt(general.PassOp): 15 | pass 16 | 17 | 18 | @hetu_op(["ReciprocalSqrtOp"], onnx_op=["Sqrt"]) 19 | class rSqrt: 20 | @classmethod 21 | def version_1(cls, ctx, node, **kwargs): 22 | op_name = util.make_name(node.name) 23 | reciprocal = ctx.insert_new_node_on_output( 24 | "Reciprocal", node.output_tensor_names[0], name=op_name 25 | ) 26 | ctx.copy_shape( 27 | node.output_tensor_names[0], reciprocal.output_tensor_names[0]) 28 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Tanh.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["TanhOp"], onnx_op=["Tanh"]) 14 | class Tanh(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Transpose.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 | from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["TransposeOp"], onnx_op=["Transpose"]) 14 | class Transpose(general.PassOp): 15 | pass 16 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Variable.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | from hetu.onnx.handler import hetu_op 7 | 8 | from onnx import helper 9 | from hetu.onnx.onnx_opset import general 10 | 11 | 12 | @hetu_op(["PlaceholderOp"], onnx_op=["Placeholder"]) 13 | class PlaceholderOp: 14 | @classmethod 15 | def version_1(cls, ctx, node, **kwargs): 16 | val = node.get_attr_value('value') 17 | if val is not None: 18 | node.op_type = "Const" 19 | 20 | 21 | @hetu_op(["defined_in"]) 22 | class Defined_In(general.PassOp): 23 | pass 24 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/Where.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | import numpy as np 7 | from onnx import onnx_pb 8 | from hetu.onnx import constants, util, graph 9 | from hetu.onnx.handler import hetu_op 10 
| from hetu.onnx.onnx_opset import general 11 | 12 | 13 | @hetu_op(["WhereOp"], onnx_op=["Where"]) 14 | class Where(): 15 | @classmethod 16 | def version_1(cls, ctx, node, **kwargs): 17 | assert False, "This version of the operator has been available since version 9 of the default ONNX operator set" 18 | pass 19 | 20 | @classmethod 21 | def version_9(cls, ctx, node, **kwargs): 22 | pass 23 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | MatrixMult, 3 | MultiplyConst, 4 | Variable, 5 | AddElewise, 6 | Relu, 7 | Identity, 8 | Conv2d, 9 | Pool, 10 | Reshape, 11 | AddConst, 12 | Concat, 13 | Sqrt, 14 | Tanh, 15 | BatchNorm, 16 | Pad, 17 | Division, 18 | OneHot, 19 | Opposite, 20 | Softmax, 21 | general, 22 | Reduces, 23 | Dropout, 24 | Transpose, 25 | Where, 26 | Slice, 27 | ) 28 | -------------------------------------------------------------------------------- /python/hetu/onnx/onnx_opset/general.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | from __future__ import absolute_import 5 | 6 | 7 | class PassOp: 8 | @classmethod 9 | def version_1(cls, ctx, node, **kwargs): 10 | pass 11 | 12 | @classmethod 13 | def version_6(cls, ctx, node, **kwargs): 14 | 15 | cls.version_1(ctx, node, **kwargs) 16 | -------------------------------------------------------------------------------- /python/hetu/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from .bert_tokenizer import BertTokenizer 2 | -------------------------------------------------------------------------------- /src/common/cpu_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 by Contributors 3 | * \file device_api.h 4 | * \brief Device specific API 5 | */ 6 | #ifndef HETUSYS_RUNTIME_CPU_DEVICE_API_H_ 7 | #define HETUSYS_RUNTIME_CPU_DEVICE_API_H_ 8 | 9 | #include "c_runtime_api.h" 10 | #include "device_api.h" 11 | #include 12 | #include 13 | 14 | namespace hetusys { namespace runtime { 15 | 16 | class CPUDeviceAPI : public DeviceAPI { 17 | public: 18 | void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final; 19 | 20 | void FreeDataSpace(DLContext ctx, void *ptr) final; 21 | 22 | void CopyDataFromTo(const void *from, void *to, size_t size, 23 | DLContext ctx_from, DLContext ctx_to, 24 | DLStreamHandle stream) final; 25 | 26 | void StreamSync(DLContext ctx, DLStreamHandle stream) final; 27 | }; 28 | 29 | }} // namespace hetusys::runtime 30 | #endif // HETUSYS_RUNTIME_CPU_DEVICE_API_H_ 31 | -------------------------------------------------------------------------------- /src/common/cuda_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Copyright (c) 2017 by Contributors 3 | * \file device_api.h 4 | * \brief Device specific API 5 | */ 6 | #ifndef HETUSYS_RUNTIME_CUDA_DEVICE_API_H_ 7 | #define HETUSYS_RUNTIME_CUDA_DEVICE_API_H_ 8 | 9 | #include "c_runtime_api.h" 10 | #include "device_api.h" 11 | 12 | #include 13 | #include 14 | 15 | namespace hetusys { namespace runtime { 16 | 17 | class CUDADeviceAPI : public DeviceAPI { 18 | public: 19 | void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final; 20 | 21 | void FreeDataSpace(DLContext ctx, void *ptr) final; 22 | 23 | void CopyDataFromTo(const void *from, void *to, size_t size, 24 | DLContext ctx_from, DLContext ctx_to, 25 | DLStreamHandle stream) final; 26 | 27 | void StreamSync(DLContext ctx, DLStreamHandle stream) final; 28 | }; 29 | 30 | }} // namespace hetusys::runtime 31 | #endif // HETUSYS_RUNTIME_CUDA_DEVICE_API_H_ 32 | -------------------------------------------------------------------------------- /src/common/random.cc: -------------------------------------------------------------------------------- 1 | #include "random.h" 2 | #include <mutex> 3 | 4 | std::mutex random_state_mutex; 5 | HetuRandomState hetu_random_state(0); 6 | 7 | int SetRandomSeed(uint64_t seed) { 8 | std::lock_guard<std::mutex> lock(random_state_mutex); 9 | hetu_random_state.seed = seed; 10 | return 0; 11 | } 12 | 13 | uint64_t GetSeed() { 14 | return hetu_random_state.seed; 15 | } 16 | 17 | uint64_t GetSeedSeqNum() { 18 | return hetu_random_state.seqnum; 19 | } 20 | 21 | int StepSeqNum(uint64_t num_minimum_calls) { 22 | std::lock_guard<std::mutex> lock(random_state_mutex); 23 | hetu_random_state.seqnum += num_minimum_calls; 24 | return 0; 25 | } 26 | 27 | HetuRandomState NewRandomState(uint64_t seqnum) { 28 | return HetuRandomState(hetu_random_state.seed, seqnum); 29 | } 30 | 31 | HetuRandomState &GetRandomState(uint64_t num_minimum_calls) { 32 | StepSeqNum(num_minimum_calls); 33 | return hetu_random_state; 34 | } 35 | -------------------------------------------------------------------------------- /src/common/random.h: -------------------------------------------------------------------------------- 1 | #ifndef HETUSYS_SRC_SEED_H 2 | #define HETUSYS_SRC_SEED_H 3 | 4 | #include "c_runtime_api.h" 5 | 6 | struct HetuRandomState { 7 | HetuRandomState(uint64_t seed_ = 0, uint64_t seqnum_ = 0) : 8 | seed(seed_), seqnum(seqnum_) { 9 | } 10 | 11 | uint64_t seed; 12 | uint64_t seqnum; 13 | }; 14 | 15 | HETUSYS_EXTERN_C { 16 | int SetRandomSeed(uint64_t seed); 17 | uint64_t GetSeed(); 18 | uint64_t GetSeedSeqNum(); 19 | int StepSeqNum(uint64_t num_minimum_calls); 20 | } 21 | 22 | HetuRandomState NewRandomState(uint64_t seqnum); 23 | HetuRandomState &GetRandomState(uint64_t num_minimum_calls); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/dnnl_ops/ArraySet.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "dnnl.hpp" 14 | 15 | #include "../common/c_runtime_api.h" 16 | #include "dnnl_runtime.h" 17 | 18 | using namespace dnnl; 19 | using namespace std; 20 | 21 | extern "C" int cpu_ArraySet(DLArrayHandle input, float value) { 22 | int num = 1; 23 | for (int i = 0; i < input->ndim; i++) 24 | num *= input->shape[i]; 25 | float *data = (float *)(input->data); 26 | #pragma omp parallel for 27 | for (int i = 0; i < num; i++) 28 | data[i] = value; 29 | return 0; 30 | 
} 31 | -------------------------------------------------------------------------------- /src/dnnl_ops/Gelu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "dnnl.hpp" 13 | 14 | #include "../common/c_runtime_api.h" 15 | #include "dnnl_runtime.h" 16 | using namespace dnnl; 17 | using namespace std; 18 | 19 | extern "C" int DnnlGelu(const DLArrayHandle input, DLArrayHandle output) { 20 | printf("DnnlGelu is not implemented yet.\n"); 21 | return 0; 22 | } 23 | 24 | extern "C" int DnnlGelu_Gradient(const DLArrayHandle input, 25 | const DLArrayHandle in_grad, 26 | DLArrayHandle output) { 27 | printf("DnnlGelu_Gradient is not implemented yet.\n"); 28 | return 0; 29 | } -------------------------------------------------------------------------------- /src/dnnl_ops/Reshape.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "dnnl.hpp" 11 | 12 | #include "../common/c_runtime_api.h" 13 | #include "dnnl_runtime.h" 14 | 15 | extern "C" int cpu_Reshape(const DLArrayHandle in_arr, DLArrayHandle out_arr) { 16 | int input_size = 1; 17 | int output_size = 1; 18 | float *input = (float *)(in_arr->data); 19 | float *output = (float *)(out_arr->data); 20 | for (int i = 0; i < in_arr->ndim; i++) 21 | input_size *= in_arr->shape[i]; 22 | for (int i = 0; i < out_arr->ndim; i++) 23 | output_size *= out_arr->shape[i]; 24 | 25 | assert(input_size == output_size); 26 | #pragma omp parallel for 27 | for (int i = 0; i < input_size; i++) 28 | output[i] = input[i]; 29 | return 0; 30 | } -------------------------------------------------------------------------------- /src/dnnl_ops/dnnl_runtime.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "dnnl.hpp" 13 | #include "../common/c_runtime_api.h" 14 | 15 | using namespace dnnl; 16 | 17 | extern bool is_dnnl_stream_init; 18 | extern engine eng; 19 | extern stream engine_stream; 20 | 21 | void dnnl_stream_init(); 22 | void print_dlarray(DLArrayHandle mat); 23 | void read_from_dnnl_memory(void *handle, dnnl::memory &mem); 24 | -------------------------------------------------------------------------------- /src/header/types.h: -------------------------------------------------------------------------------- 1 | #ifndef HETUSYS_DEFAULT_TYPES_H 2 | #define HETUSYS_DEFAULT_TYPES_H 3 | 4 | #include <atomic> 5 | 6 | typedef signed char int8; 7 | typedef short int16; 8 | typedef int int32; 9 | typedef long long int64; 10 | 11 | typedef unsigned char uint8; 12 | typedef unsigned short uint16; 13 | typedef unsigned int uint32; 14 | typedef unsigned long long uint64; 15 | 16 | class SharedCounter { 17 | public: 18 | int64 get() { 19 | return cnt; 20 | } 21 | int64 next() { 22 | return ++cnt; 23 | } 24 | 25 | private: 26 | std::atomic<int64> cnt{0}; 27 | }; 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /src/hetu_cache/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | aux_source_directory(src HETU_SRC_LIST) 2 | 3 | find_package(pybind11 2.6.0 CONFIG) 4 | 5 | if (NOT 
pybind11_FOUND) 6 | message(FATAL_ERROR "pybind11 not found") 7 | else() 8 | pybind11_add_module(hetu_cache ${HETU_SRC_LIST}) 9 | target_include_directories(hetu_cache PUBLIC include) 10 | target_link_libraries(hetu_cache PUBLIC ps) 11 | endif() 12 | -------------------------------------------------------------------------------- /src/hetu_cache/include/lru_cache.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "cache.h" 4 | 5 | #include <list> 6 | #include <unordered_map> 7 | 8 | namespace hetu { 9 | 10 | /* 11 | LRUCache: 12 | use LRU policy 13 | Implemented with a double-linked list and a hash map 14 | O(1) insert, lookup (a Python sketch of this policy follows Exp.cu below) 15 | */ 16 | 17 | class LRUCache : public CacheBase { 18 | private: 19 | std::unordered_map<cache_key_t, std::list<EmbeddingPT>::iterator> hash_; 20 | std::list<EmbeddingPT> list_; 21 | 22 | public: 23 | using CacheBase::CacheBase; 24 | size_t size() final { 25 | return hash_.size(); 26 | } 27 | int count(cache_key_t k) final; 28 | void insert(EmbeddingPT e) final; 29 | EmbeddingPT lookup(cache_key_t k) final; 30 | 31 | // python debug function 32 | py::array_t<cache_key_t> PyAPI_keys(); 33 | }; // class LRUCache 34 | 35 | } // namespace hetu 36 | -------------------------------------------------------------------------------- /src/hetu_cache/src/embedding.cc: -------------------------------------------------------------------------------- 1 | #include "embedding.h" 2 | namespace hetu { 3 | 4 | EmbeddingPT makeEmbedding(cache_key_t k, version_t version, 5 | py::array_t<float> val) { 6 | assert(val.ndim() == 1); 7 | PYTHON_CHECK_ARRAY(val); 8 | auto res = make_shared<Embedding>(k, val.data(), val.shape(0)); 9 | res->setVersion(version); 10 | return res; 11 | } 12 | 13 | } // namespace hetu 14 | -------------------------------------------------------------------------------- /src/memory_pool/BFC_allocator.cc: -------------------------------------------------------------------------------- 1 | #include "BFC_allocator.h" 2 | -------------------------------------------------------------------------------- /src/ops/Clone.cu: -------------------------------------------------------------------------------- 1 | #include "gpu_runtime.h" 2 | 3 | int DLGpuClone(const DLArrayHandle input, DLArrayHandle output, DLStreamHandle stream_handle=NULL){ 4 | float* input_data=(float*)input->data; 5 | float* output_data=(float*)output->data; 6 | int size = 1; 7 | for(int i=0; i<input->ndim; i++){ 8 | size*=input->shape[i]; 9 | } 10 | cudaMemcpy((void*)output_data, (void*)input_data, size*sizeof(float),cudaMemcpyDeviceToDevice); 11 | return 0; 12 | 13 | } 14 | 15 | -------------------------------------------------------------------------------- /src/ops/Exp.cu: -------------------------------------------------------------------------------- 1 | #include "gpu_runtime.h" 2 | __global__ void exp_kernel(const float *input, float *output, size_t size) { 3 | size_t ind = blockIdx.x * blockDim.x + threadIdx.x; 4 | if (ind >= size) 5 | return; 6 | output[ind] = expf(input[ind]); 7 | } 8 | 9 | int DLGpuExp(const DLArrayHandle input, DLArrayHandle output, 10 | DLStreamHandle stream_handle = NULL) { 11 | size_t size = ArrSize(input); 12 | const float *input_data = (const float *)input->data; 13 | float *output_data = (float *)output->data; 14 | 15 | dim3 blocks; 16 | dim3 threads; 17 | ThreadBlock1D(threads, blocks, size); 18 | if (stream_handle) { 19 | exp_kernel<<<blocks, threads, 0, 20 | *(cudaStream_t *)stream_handle->handle>>>( 21 | input_data, output_data, size); 22 | } else { 23 | exp_kernel<<<blocks, threads>>>(input_data, output_data, size); 24 | } 25 | return 0; 26 | } 27 | 
-------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | ## Tests 2 | ### Need to validate: 3 | * test_dnnl_op.py 4 | * test_transformer_ops.py 5 | * onnx/* 6 | * test_DistGCN/test_model_distGCN15d.py 7 | -------------------------------------------------------------------------------- /tests/get_gpu_memory.py: -------------------------------------------------------------------------------- 1 | from pynvml import smi as nvidia_smi 2 | 3 | nvidia_smi.nvmlInit() 4 | handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0) 5 | # card id 0 hardcoded here, there is also a call to get all available card ids, so we could iterate 6 | 7 | ans = 0 8 | while True: 9 | mem_res = nvidia_smi.nvmlDeviceGetMemoryInfo(handle) 10 | # print(mem_res.used / (1024**2)) # usage in MiB 11 | if (mem_res.used / (1024**2) > ans): 12 | ans = mem_res.used / (1024**2) 13 | print(ans) 14 | # print(f'mem: {100 * (mem_res.used / mem_res.total):.3f}%') # percentage usage 15 | -------------------------------------------------------------------------------- /tests/hetu_cache/hetu_cache_config.yml: -------------------------------------------------------------------------------- 1 | shared: &shared 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13200 4 | DMLC_NUM_WORKER : 1 5 | DMLC_NUM_SERVER : 1 6 | DMLC_PS_VAN_TYPE : p3 7 | launch : 8 | worker : 1 9 | server : 1 10 | scheduler : true 11 | -------------------------------------------------------------------------------- /tests/onnx/README.md: -------------------------------------------------------------------------------- 1 | ## Attention 2 | This part has not been validated in the latest version. The "onnx_tf" package cannot be used with Python 3.7, and there are some bugs in Hetu's onnx part. 
3 | -------------------------------------------------------------------------------- /tests/pstests/local_s2_w1.yml: -------------------------------------------------------------------------------- 1 | shared: &shared 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13200 4 | DMLC_NUM_WORKER : 1 5 | DMLC_NUM_SERVER : 2 6 | DMLC_PS_VAN_TYPE : p3 7 | sched: 8 | <<: *shared 9 | DMLC_ROLE : scheduler 10 | s0: 11 | <<: *shared 12 | DMLC_ROLE : server 13 | SERVER_ID : 0 14 | DMLC_PS_SERVER_URI : 127.0.0.1 15 | DMLC_PS_SERVER_PORT : 13201 16 | s1: 17 | <<: *shared 18 | DMLC_ROLE : server 19 | SERVER_ID : 1 20 | DMLC_PS_SERVER_URI : 127.0.0.1 21 | DMLC_PS_SERVER_PORT : 13203 22 | w0: 23 | <<: *shared 24 | DMLC_ROLE : worker 25 | WORKER_ID : 0 26 | DMLC_PS_WORKER_URI : 127.0.0.1 27 | DMLC_PS_WORKER_PORT : 13210 -------------------------------------------------------------------------------- /tests/pstests/local_s2_w2.yml: -------------------------------------------------------------------------------- 1 | shared: &shared 2 | DMLC_PS_ROOT_URI : 127.0.0.1 3 | DMLC_PS_ROOT_PORT : 13200 4 | DMLC_NUM_WORKER : 2 5 | DMLC_NUM_SERVER : 2 6 | DMLC_PS_VAN_TYPE : p3 7 | sched: 8 | <<: *shared 9 | DMLC_ROLE : scheduler 10 | s0: 11 | <<: *shared 12 | DMLC_ROLE : server 13 | SERVER_ID : 0 14 | DMLC_PS_SERVER_URI : 127.0.0.1 15 | DMLC_PS_SERVER_PORT : 13201 16 | s1: 17 | <<: *shared 18 | DMLC_ROLE : server 19 | SERVER_ID : 1 20 | DMLC_PS_SERVER_URI : 127.0.0.1 21 | DMLC_PS_SERVER_PORT : 13203 22 | w0: 23 | <<: *shared 24 | DMLC_ROLE : worker 25 | WORKER_ID : 0 26 | DMLC_PS_WORKER_URI : 127.0.0.1 27 | DMLC_PS_WORKER_PORT : 13210 28 | w1: 29 | <<: *shared 30 | DMLC_ROLE : worker 31 | WORKER_ID : 1 32 | DMLC_PS_WORKER_URI : 127.0.0.1 33 | DMLC_PS_WORKER_PORT : 13211 -------------------------------------------------------------------------------- /tests/pstests/tf_local_s1_w2.json: -------------------------------------------------------------------------------- 1 | { 2 | "worker": [ 3 | "127.0.0.1:12349", 4 | "127.0.0.1:12348" 5 | ], 6 | "ps": [ 7 | "127.0.0.1:22345" 8 | ] 9 | } -------------------------------------------------------------------------------- /tests/test_encode_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tests/test_encode_decode.py -------------------------------------------------------------------------------- /tests/test_ha2agather.py: -------------------------------------------------------------------------------- 1 | from ctypes import * 2 | from hetu import ndarray 3 | from hetu.stream import * 4 | from hetu.context import DeviceGroup 5 | import numpy as np 6 | import hetu as ht 7 | from hetu.communicator import mpi_nccl_comm 8 | 9 | if __name__ == "__main__": 10 | t = ht.wrapped_mpi_nccl_init() 11 | send_arr = np.ones(16)*t.localRank.value 12 | recv_arr = np.ones(16*8)*t.localRank.value 13 | print("before: send_arr = "+str(send_arr)+" recv_arr = "+str(recv_arr)) 14 | send_arr = ndarray.array(send_arr, ctx=ndarray.gpu(t.device_id.value)) 15 | recv_arr = ndarray.array(recv_arr, ctx=ndarray.gpu(t.device_id.value)) 16 | t.dlarrayHA2AGather(send_arr, recv_arr, mpi_nccl_comm.ncclDataType_t.ncclFloat32, t.localRank.value, 8) 17 | print("after: send_arr = "+str(send_arr.asnumpy())+" recv_arr = "+str(recv_arr.asnumpy())) 18 | 19 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/.gitignore: 
-------------------------------------------------------------------------------- 1 | datasets/ 2 | logs/ 3 | scripts.sh 4 | wandb/ 5 | ckpts/ 6 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import get_trainer 2 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .hash import HashEmbedding 2 | from .compo import CompositionalEmbedding 3 | from .tensortrain import TensorTrainEmbedding 4 | from .dhe import DeepHashEmbedding 5 | from .robe import RobeEmbedding 6 | from .dpq import DPQEmbedding 7 | from .mgqe import MGQEmbedding 8 | from .adapt import AdaptiveEmbedding 9 | from .mde import MDEmbedding 10 | from .autodim import AutoDimEmbedding, AutoDimRetrainEmbedding 11 | from .optembed import OptEmbedding, OptEmbeddingAfterRowPruning 12 | from .sparse import SparseEmbedding 13 | from .deeplight import DeepLightEmbedding 14 | from .pep import PEPEmbedding, PEPRetrainEmbedding 15 | from .autosrh import AutoSrhEmbedding, AutoSrhRetrainEmbedding 16 | from .quantize import QuantizedEmbedding 17 | from .alpt import ALPTEmbedding 18 | from .deduplication import DedupEmbedding 19 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/layers/hash.py: -------------------------------------------------------------------------------- 1 | import hetu as ht 2 | from hetu.layers import Embedding 3 | 4 | 5 | class HashEmbedding(Embedding): 6 | def __call__(self, x): 7 | # ref MLSys20, HierPS 8 | with ht.context(self.ctx): 9 | sparse_input = ht.mod_hash_op(x, self.num_embeddings) 10 | return ht.embedding_lookup_op(self.embedding_table, sparse_input) 11 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/layers/primes.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/EmbeddingMemoryCompression/methods/layers/primes.npy -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/methods/scheduler/robe.py: -------------------------------------------------------------------------------- 1 | from .base import EmbeddingTrainer 2 | from ..layers import RobeEmbedding 3 | from hetu.random import get_np_rand, set_random_seed 4 | import math 5 | 6 | 7 | class ROBETrainer(EmbeddingTrainer): 8 | def assert_use_multi(self): 9 | assert self.use_multi == 0 10 | 11 | def get_embed_layer(self): 12 | assert self.num_embed < 2038074743 13 | set_random_seed(self.seed) 14 | nprs = get_np_rand(1) 15 | emb = RobeEmbedding( 16 | math.floor(self.num_embed * self.embedding_dim * 17 | self.compress_rate), 18 | self.embedding_dim, 19 | self.embedding_args['Z'], 20 | nprs, 21 | use_slot_coef=bool(self.separate_fields), 22 | initializer=self.initializer, 23 | name=f'RobeEmb{self.compress_rate}', 24 | ctx=self.ectx, 25 | ) 26 | return emb 27 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .base import CTRModel_Head 2 | from .dcn import DCN_Head 3 | from .deepfm import DeepFM_Head 4 | from .dlrm import DLRM_Head 5 | from .wdl import WDL_Head 6 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | *.bin 3 | -------------------------------------------------------------------------------- /tools/EmbeddingMemoryCompression/supplements/static_encoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/EmbeddingMemoryCompression/supplements/static_encoding.png -------------------------------------------------------------------------------- /tools/Galvatron/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/.DS_Store -------------------------------------------------------------------------------- /tools/Galvatron/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include galvatron *.json -------------------------------------------------------------------------------- /tools/Galvatron/Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -O3 -Wall -shared -std=c++11 -fPIC 3 | PYTHON_INCLUDES = $(shell python3 -m pybind11 --includes) 4 | PYTHON_EXTENSION_SUFFIX = $(shell python3-config --extension-suffix) 5 | SOURCE_DIR = csrc 6 | SOURCE_FILE = dp_core.cpp 7 | BUILD_DIR = galvatron/build 8 | LIB_DIR = $(BUILD_DIR)/lib 9 | OUTPUT_FILE = $(LIB_DIR)/galvatron_dp_core$(PYTHON_EXTENSION_SUFFIX) 10 | CURRENT_DIR = $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) 11 | 12 | all: $(OUTPUT_FILE) 13 | 14 | $(OUTPUT_FILE): $(SOURCE_DIR)/$(SOURCE_FILE) 15 | @mkdir -p $(LIB_DIR) 16 | $(CXX) $(CXXFLAGS) $(PYTHON_INCLUDES) $< -o $@ 17 | 18 | clean: 19 | rm -rf $(BUILD_DIR) 20 | 21 | .PHONY: clean -------------------------------------------------------------------------------- /tools/Galvatron/figs/api.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/figs/api.jpg -------------------------------------------------------------------------------- /tools/Galvatron/galvatron.exp: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 3 | echo "Galvatron root is" $path 4 | export GalvatronRoot="$path" 5 | export PATH="$path:$PATH" 6 | export PYTHONPATH="$path:$PYTHONPATH" -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/.DS_Store -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include 
galvatron *.json -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | for p in ['site_package', 'build/lib']: 4 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), p)) -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .redistribute import split_to_group, gather_from_group 2 | from .comm_groups import gen_comm_groups 3 | from .initialize import init_empty_weights 4 | from .parallel import * 5 | from .arguments import initialize_galvatron, get_args 6 | from .hybrid_parallel_config import * 7 | from .hybrid_parallel_model import * 8 | from .profiler import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/core/dataloader.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import PipelineParallel, PipeSequential -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer import ParallelMLP, ParallelAttention 2 | from .utils import init_method_normal, scaled_init_method_normal 3 | from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear 4 | from megatron.core.tensor_parallel import get_cuda_rng_tracker, split_tensor_along_last_dim 5 | from megatron.model.enums import AttnMaskType, LayerType, AttnType -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/core/tensor_parallel/utils.py: -------------------------------------------------------------------------------- 1 | """Megatron-LM Utilities for models.""" 2 | 3 | import math 4 | import torch 5 | 6 | def init_method_normal(sigma): 7 | """Init method based on N(0, sigma).""" 8 | def init_(tensor): 9 | return torch.nn.init.normal_(tensor, mean=0.0, std=sigma) 10 | 11 | return init_ 12 | 13 | def scaled_init_method_normal(sigma, num_layers): 14 | """Init method based on N(0, sigma/sqrt(2*num_layers)).""" 15 | std = sigma / math.sqrt(2.0 * num_layers) 16 | 17 | def init_(tensor): 18 | return torch.nn.init.normal_(tensor, mean=0.0, std=std) 19 | 20 | return init_ -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/models/__init__.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/BaiChuanModel_tensor_parallel.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import get_args 2 | from 
flash_attn.models.gpt import create_mixer_cls, create_mlp_cls 3 | 4 | def construct_tensor_parallel_model(model, config, tp_groups_enc): 5 | args=get_args() 6 | factory_kwargs = { 7 | 'device': 'meta' if hasattr(args, 'initialize_on_meta') and args.initialize_on_meta else 'cpu', 8 | 'dtype': None 9 | } 10 | for i in range(config.num_hidden_layers): 11 | layer = model.transformer.layers[i] 12 | setattr(layer, 'mixer', create_mixer_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 13 | setattr(layer, 'mlp', create_mlp_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 14 | return model -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/__init__.py: -------------------------------------------------------------------------------- 1 | from .BaiChuanModel_hybrid_parallel import get_hybrid_parallel_configs, construct_hybrid_parallel_model, baichuan_model_hp -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/configs/computation_profiling_bf16_hidden4096_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 69.02671432495117, 3 | "layernum[12]_bsz2": 123.64290746053062, 4 | "layertype_0": 4.551349427964954 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/configs/galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 256, 7 | "chunks": 32, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/configs/galvatron_config_baichuan-7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 6 | "global_bsz": 48, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/hf_configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/hf_configs/baichuan-7b/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BaiChuanForCausalLM" 4 | ], 
5 | "auto_map": { 6 | "AutoConfig": "configuration_baichuan.BaiChuanConfig", 7 | "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM" 8 | }, 9 | "bos_token_id": 1, 10 | "eos_token_id": 2, 11 | "hidden_act": "silu", 12 | "hidden_size": 4096, 13 | "initializer_range": 0.02, 14 | "intermediate_size": null, 15 | "max_position_embeddings": 2048, 16 | "model_type": "baichuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 32, 19 | "pad_token_id": 0, 20 | "rms_norm_eps": 1e-06, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float32", 23 | "transformers_version": "4.29.1", 24 | "use_cache": true, 25 | "vocab_size": 64000 26 | } 27 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/hf_configs/baichuan-7b/config_ori.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BaiChuanForCausalLM" 4 | ], 5 | "auto_map": { 6 | "AutoConfig": "configuration_baichuan.BaiChuanConfig", 7 | "AutoModelForCausalLM": "modeling_baichuan.BaiChuanForCausalLM" 8 | }, 9 | "bos_token_id": 1, 10 | "eos_token_id": 2, 11 | "hidden_act": "silu", 12 | "hidden_size": 4096, 13 | "initializer_range": 0.02, 14 | "intermediate_size": 11008, 15 | "max_position_embeddings": 4096, 16 | "model_type": "baichuan", 17 | "num_attention_heads": 32, 18 | "num_hidden_layers": 32, 19 | "pad_token_id": 0, 20 | "rms_norm_eps": 1e-06, 21 | "tie_word_embeddings": false, 22 | "torch_dtype": "float32", 23 | "transformers_version": "4.29.1", 24 | "use_cache": true, 25 | "vocab_size": 64000 26 | } 27 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/baichuan/scripts/train.sh: -------------------------------------------------------------------------------- 1 | LAUNCHER="python3" 2 | 3 | TRAINER="train.py" 4 | 5 | ${LAUNCHER} ${TRAINER} \ 6 | --gpu_id 0 \ 7 | --global_train_batch_size 1 \ 8 | --model_size baichuan-7b \ 9 | --set_model_config_manually 1 \ 10 | --set_layernum_manually 0 \ 11 | --vocab_size 32000 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 1024 \ 16 | --epochs 10 \ 17 | --lr 1e-4 \ 18 | --adam_weight_decay 0.01 \ 19 | --dropout_prob 0.1 \ 20 | --check_loss 0 \ 21 | --profile 1 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/GPTModel_tensor_parallel.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import get_args 2 | from flash_attn.models.gpt import create_mixer_cls, create_mlp_cls 3 | 4 | def construct_tensor_parallel_model(model, config, tp_groups_enc): 5 | args=get_args() 6 | factory_kwargs = { 7 | 'device': 'meta' if hasattr(args, 'initialize_on_meta') and args.initialize_on_meta else 'cpu', 8 | 'dtype': None 9 | } 10 | for i in range(config.num_hidden_layers): 11 | layer = model.transformer.layers[i] 12 | setattr(layer, 'mixer', create_mixer_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 13 | setattr(layer, 'mlp', create_mlp_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 14 | return model -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/__init__.py: -------------------------------------------------------------------------------- 1 
| from .GPTModel_hybrid_parallel import get_hybrid_parallel_configs, construct_hybrid_parallel_model, gpt_model_hp -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/computation_profiling_bf16_hidden1600_head32_seqlen1024.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[12]_bsz4": 41.484313583374025, 3 | "layernum[24]_bsz4": 70.19392623901368, 4 | "layertype_0": 0.5981169303258261 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/computation_profiling_bf16_hidden2560_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 43.418639373779286, 3 | "layernum[12]_bsz2": 68.19261474609375, 4 | "layertype_0": 2.0644979476928724 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/computation_profiling_bf16_hidden4096_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 83.93114852905275, 3 | "layernum[12]_bsz2": 139.4233337402344, 4 | "layertype_0": 4.6243487675984705 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 448, 7 | "chunks": 8, 8 | "pp_division": "24,24", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-1.5b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 144, 7 | "chunks": 1, 8 | "pp_division": "48", 9 | "checkpoint": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": 
"1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 320, 7 | "chunks": 16, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-2.7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 64, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 256, 7 | "chunks": 32, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/configs/galvatron_config_gpt-6.7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 6 | "global_bsz": 48, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-0.3b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 24, 3 | "n_embd": 1024, 4 | "n_head": 16, 5 | "head_dim": 64, 6 | "vocab_size": 50257, 7 | "n_positions": 1024 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-1.5b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 48, 3 | "n_embd": 1600, 4 | "n_head": 32, 5 | 
"head_dim": 50, 6 | "vocab_size": 50257, 7 | "n_positions": 1024 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-2.7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 32, 3 | "n_embd": 2560, 4 | "n_head": 32, 5 | "head_dim": 80, 6 | "vocab_size": 50257, 7 | "n_positions": 2048 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/meta_configs/gpt-6.7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_layer": 32, 3 | "n_embd": 4096, 4 | "n_head": 32, 5 | "head_dim": 128, 6 | "vocab_size": 50257, 7 | "n_positions": 2048 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/gpt/scripts/train.sh: -------------------------------------------------------------------------------- 1 | LAUNCHER="python3" 2 | 3 | TRAINER="train.py" 4 | 5 | ${LAUNCHER} ${TRAINER} \ 6 | --gpu_id 0 \ 7 | --global_train_batch_size 1 \ 8 | --model_size gpt-0.3b \ 9 | --set_model_config_manually 0 \ 10 | --set_layernum_manually 0 \ 11 | --vocab_size 50257 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 1024 \ 16 | --epochs 10 \ 17 | --lr 1e-4 \ 18 | --adam_weight_decay 0.01 \ 19 | --dropout_prob 0.1 \ 20 | --check_loss 0 \ 21 | --profile 1 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/LlamaModel_tensor_parallel.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import get_args 2 | from flash_attn.models.gpt import create_mixer_cls, create_mlp_cls 3 | 4 | def construct_tensor_parallel_model(model, config, tp_groups_enc): 5 | args=get_args() 6 | factory_kwargs = { 7 | 'device': 'meta' if hasattr(args, 'initialize_on_meta') and args.initialize_on_meta else 'cpu', 8 | 'dtype': None 9 | } 10 | for i in range(config.num_hidden_layers): 11 | layer = model.transformer.layers[i] 12 | setattr(layer, 'mixer', create_mixer_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 13 | setattr(layer, 'mlp', create_mlp_cls(config, layer_idx=i, process_group=tp_groups_enc[i].group, **factory_kwargs)(config.hidden_size)) 14 | return model -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .LlamaModel_hybrid_parallel import get_hybrid_parallel_configs, construct_hybrid_parallel_model, llama_model_hp -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/configs/computation_profiling_bf16_hidden4096_head32_seqlen2048.json: -------------------------------------------------------------------------------- 1 | { 2 | "layernum[6]_bsz2": 63.300140380859375, 3 | "layernum[12]_bsz2": 119.02136306762694, 4 | "layertype_0": 4.6434352238972965 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/configs/galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_3D_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 
2, 3 | "tp_sizes_enc": "2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 6 | "global_bsz": 256, 7 | "chunks": 32, 8 | "pp_division": "16,16", 9 | "pipeline_type": "pipedream_flush", 10 | "default_dp_type": "zero2" 11 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/configs/galvatron_config_llama-7b_2nodes_8gpus_per_node_40GB_bf16_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_deg": 1, 3 | "tp_sizes_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 4 | "tp_consecutive_flags": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 5 | "dp_types_enc": "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1", 6 | "global_bsz": 48, 7 | "chunks": 1, 8 | "pp_division": "32", 9 | "checkpoint": "1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", 10 | "pipeline_type": "pipedream_flush", 11 | "default_dp_type": "zero2" 12 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/llama-13b.json: -------------------------------------------------------------------------------- 1 | { 2 | "dim": 5120, 3 | "multiple_of": 256, 4 | "n_heads": 40, 5 | "n_layers": 40, 6 | "norm_eps": 1e-06, 7 | "vocab_size": 32000 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/llama-30b.json: -------------------------------------------------------------------------------- 1 | { 2 | "dim": 6656, 3 | "multiple_of": 256, 4 | "n_heads": 52, 5 | "n_layers": 60, 6 | "norm_eps": 1e-06, 7 | "vocab_size": 32000 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/meta_configs/llama-7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "dim": 4096, 3 | "multiple_of": 256, 4 | "n_heads": 32, 5 | "n_layers": 32, 6 | "norm_eps": 1e-06, 7 | "vocab_size": 32000 8 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/models/llama/scripts/train.sh: -------------------------------------------------------------------------------- 1 | LAUNCHER="python3" 2 | 3 | TRAINER="train.py" 4 | 5 | ${LAUNCHER} ${TRAINER} \ 6 | --gpu_id 0 \ 7 | --global_train_batch_size 1 \ 8 | --model_size llama-7b \ 9 | --set_model_config_manually 1 \ 10 | --set_layernum_manually 0 \ 11 | --vocab_size 32000 \ 12 | --hidden_size 1024 \ 13 | --num_hidden_layers 24 \ 14 | --num_attention_heads 16 \ 15 | --seq_length 1024 \ 16 | --epochs 10 \ 17 | --lr 1e-4 \ 18 | --adam_weight_decay 0.01 \ 19 | --dropout_prob 0.1 \ 20 | --check_loss 0 \ 21 | --profile 1 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/allreduce_bandwidth_1nodes_4gpus_per_node.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "allreduce_size_4_consec_1": 158.018, 3 | "allreduce_size_2_consec_1": 149.158, 4 | "allreduce_size_2_consec_0": 149.317 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/allreduce_bandwidth_1nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "allreduce_size_8_consec_1": 154.203, 3 | "allreduce_size_4_consec_1": 159.119, 4 | "allreduce_size_4_consec_0": 155.815, 5 | "allreduce_size_2_consec_1": 138.156, 6 | "allreduce_size_2_consec_0": 151.344 7 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/allreduce_bandwidth_2nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "allreduce_size_16_consec_1": 44.682, 3 | "allreduce_size_8_consec_1": 155.658, 4 | "allreduce_size_8_consec_0": 20.7724, 5 | "allreduce_size_4_consec_1": 157.984, 6 | "allreduce_size_4_consec_0": 16.22, 7 | "allreduce_size_2_consec_1": 149.666, 8 | "allreduce_size_2_consec_0": 8.13007 9 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/overlap_coefficient.json: -------------------------------------------------------------------------------- 1 | { 2 | "overlap_coe": 1.125552573612729 3 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/p2p_bandwidth_1nodes_4gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_size_2": 162.118, 3 | "pp_size_4": 140.185 4 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/p2p_bandwidth_1nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_size_2": 163.671, 3 | "pp_size_4": 138.581, 4 | "pp_size_8": 109.45 5 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hardware_configs/p2p_bandwidth_2nodes_8gpus_per_node.json: -------------------------------------------------------------------------------- 1 | { 2 | "pp_size_2": 7.65998, 3 | "pp_size_4": 8.02132, 4 | "pp_size_8": 8.76278, 5 | "pp_size_16": 8.13177 6 | } -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/hostfile: -------------------------------------------------------------------------------- 1 | job-a23c7db3-67e5-45e4-9419-20270dd89a8f-master-0 2 | job-a23c7db3-67e5-45e4-9419-20270dd89a8f-worker-0 -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/profile_hardware.py: -------------------------------------------------------------------------------- 1 | from galvatron.core import GalvatronProfiler, initialize_galvatron 2 | import os 3 | 4 | if __name__ == '__main__': 5 | args = initialize_galvatron(mode='profile_hardware') 6 | print(args) 7 | profiler = GalvatronProfiler(args) 8 | path = os.path.dirname(os.path.abspath(__file__)) 9 | profiler.set_path(path) 10 | 11 | # profile allreduce & p2p bandwidth 
12 | profiler.profile_bandwidth() 13 | 14 | # profile overlapping slowdown coefficient 15 | profiler.profile_overlap() -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/scripts/build_nccl_test.sh: -------------------------------------------------------------------------------- 1 | if [ "$USE_EXPORT_VARIABLE" = "1" ]; then 2 | echo "USE_EXPORT_VARIABLE is set to 1, using the exported variables." 3 | else 4 | echo "USE_EXPORT_VARIABLE is not set to 1, using the variables defined in script." 5 | MPI_PATH=/usr/local/mpi/ 6 | MAKE_MPI=1 7 | fi 8 | 9 | cd ../site_package/nccl-tests 10 | if [ "$MAKE_MPI" = "1" ]; then 11 | echo 'Building nccl-tests with MPI.' 12 | make MPI=1 MPI_HOME=${MPI_PATH} 13 | else 14 | echo 'Building nccl-tests without MPI.' 15 | make 16 | fi -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/scripts/profile_hardware.sh: -------------------------------------------------------------------------------- 1 | NUM_NODES=2 2 | NUM_GPUS_PER_NODE=8 3 | NCCLTEST_DIR="../site_package/nccl-tests" 4 | MPI_PATH=/usr/local/mpi/ 5 | START_MB=16 6 | END_MB=256 7 | SCALE=2 8 | HOSTFILE="hostfile" 9 | 10 | # These args will be directly added to nccl-test arguments 11 | export NCCLTEST_OTHER_ARGS="-x NCCL_IB_DISABLE=0 -x NCCL_IB_HCA=mlx5_2,mlx5_5" 12 | 13 | PROFILE_ARGS=" 14 | --num_nodes ${NUM_NODES} \ 15 | --num_gpus_per_node ${NUM_GPUS_PER_NODE} \ 16 | --nccl_test_dir ${NCCLTEST_DIR} \ 17 | --mpi_path ${MPI_PATH} \ 18 | --start_mb ${START_MB} \ 19 | --end_mb ${END_MB} \ 20 | --scale ${SCALE} \ 21 | --hostfile ${HOSTFILE} \ 22 | --avg_or_min_or_first first \ 23 | --max_pp_deg 16 \ 24 | --overlap_time_multiply 4" 25 | python3 profile_hardware.py ${PROFILE_ARGS} -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/profile_hardware/scripts/profile_overlap.sh: -------------------------------------------------------------------------------- 1 | if [ "$USE_EXPORT_VARIABLE" = "1" ]; then 2 | echo "USE_EXPORT_VARIABLE is set to 1, using the exported variables." 3 | else 4 | echo "USE_EXPORT_VARIABLE is not set to 1, using the variables defined in script." 
5 | NUM_GPUS_PER_NODE=8 6 | OVERLAP_TIME_MULTIPLY=4 7 | fi 8 | 9 | ARGS="-m torch.distributed.launch \ 10 | --nproc_per_node=${NUM_GPUS_PER_NODE} \ 11 | --master_port 9999 \ 12 | profile_overlap.py \ 13 | --overlap_time_multiply ${OVERLAP_TIME_MULTIPLY}" 14 | 15 | echo "Running: python3 ${ARGS}" 16 | python3 ${ARGS} -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/scripts/flash_attn_ops_install.sh: -------------------------------------------------------------------------------- 1 | git clone --recursive https://github.com/Dao-AILab/flash-attention.git 2 | pip3 install flash-attention/csrc/fused_dense_lib 3 | pip3 install flash-attention/csrc/layer_norm 4 | pip3 install flash-attention/csrc/rotary 5 | pip3 install flash-attention/csrc/xentropy 6 | rm -rf flash-attention -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/scripts/prepare_env.sh: -------------------------------------------------------------------------------- 1 | pip3 install -r ../requirements.txt -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/__init__.py: -------------------------------------------------------------------------------- 1 | # import os 2 | # import sys 3 | # sys.path.insert(0, os.path.dirname(__file__)) -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import torch 4 | 5 | from .global_vars import get_args, get_retro_args 6 | from .global_vars import get_current_global_batch_size 7 | from .global_vars import get_num_microbatches 8 | from .global_vars import get_signal_handler 9 | from .global_vars import update_num_microbatches 10 | from .global_vars import get_tokenizer 11 | from .global_vars import get_tensorboard_writer 12 | from .global_vars import get_adlr_autoresume 13 | from .global_vars import get_timers 14 | from .initialize import initialize_megatron 15 | 16 | from .utils import (print_rank_0, 17 | is_last_rank, 18 | print_rank_last) 19 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/README.md: -------------------------------------------------------------------------------- 1 | Megatron Core is a library for efficient and scalable training of transformer based models. 2 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/__init__.py: -------------------------------------------------------------------------------- 1 | import megatron.core.parallel_state 2 | import megatron.core.tensor_parallel 3 | import megatron.core.utils 4 | 5 | # Alias parallel_state as mpu, its legacy name 6 | mpu = parallel_state 7 | 8 | __all__ = [ 9 | "parallel_state", 10 | "tensor_parallel", 11 | "utils", 12 | ] 13 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | import enum 4 | 5 | class ModelType(enum.Enum): 6 | encoder_or_decoder = 1 7 | encoder_and_decoder = 2 8 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/pipeline_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .schedules import get_forward_backward_func 2 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/core/requirements.txt: -------------------------------------------------------------------------------- 1 | torch -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import indexed_dataset 2 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */ 2 | 3 | /* This code is copied from NVIDIA apex: 4 | * https://github.com/NVIDIA/apex 5 | * with minor changes. */ 6 | 7 | 8 | 9 | #ifndef TORCH_CHECK 10 | #define TORCH_CHECK AT_CHECK 11 | #endif 12 | 13 | #ifdef VERSION_GE_1_3 14 | #define DATA_PTR data_ptr 15 | #else 16 | #define DATA_PTR data 17 | #endif 18 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/site_package/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | 3 | from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 4 | 5 | from .distributed import DistributedDataParallel 6 | from .bert_model import BertModel 7 | from .gpt_model import GPTModel 8 | from .t5_model import T5Model 9 | from .language_model import get_language_model 10 | from .module import Float16Module, MegatronModule 11 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | import enum 4 | 5 | class LayerType(enum.Enum): 6 | encoder = 1 7 | decoder = 2 8 | 9 | class AttnType(enum.Enum): 10 | self_attn = 1 11 | cross_attn = 2 12 | 13 | class AttnMaskType(enum.Enum): 14 | padding = 1 15 | causal = 2 16 | 17 | # For backward compatibility with old model checkpoints 18 | from megatron.core.enums import ModelType 19 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Hsword/Hetu/a1cb1bcaf236c9f1d452733b105997d50c16c9df/tools/Galvatron/galvatron/site_package/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .api import ( 5 | generate, 6 | generate_and_post_process, 7 | beam_search_and_post_process) 8 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | 3 | 4 | from .tokenizer import build_tokenizer 5 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # See LICENCE.txt for license information 4 | /build 5 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
3 | # 4 | # See LICENCE.txt for license information 5 | # 6 | 7 | BUILDDIR ?= build 8 | override BUILDDIR := $(abspath $(BUILDDIR)) 9 | 10 | .PHONY: all clean 11 | 12 | default: src.build 13 | 14 | TARGETS=src 15 | 16 | all: ${TARGETS:%=%.build} 17 | clean: ${TARGETS:%=%.clean} 18 | 19 | %.build: 20 | ${MAKE} -C $* build BUILDDIR=${BUILDDIR} 21 | 22 | %.clean: 23 | ${MAKE} -C $* clean BUILDDIR=${BUILDDIR} 24 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/src/timer.cc: -------------------------------------------------------------------------------- 1 | #include "timer.h" 2 | 3 | // Make sure to compile this translation unit with the host compiler and not 4 | // nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0 5 | #include <chrono> 6 | 7 | namespace { 8 | std::uint64_t now() { 9 | using clock = std::chrono::steady_clock; 10 | return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count(); 11 | } 12 | } 13 | 14 | timer::timer() { 15 | t0 = now(); 16 | } 17 | 18 | double timer::elapsed() const { 19 | std::uint64_t t1 = now(); 20 | return 1.e-9*(t1 - t0); 21 | } 22 | 23 | double timer::reset() { 24 | std::uint64_t t1 = now(); 25 | double ans = 1.e-9*(t1 - t0); 26 | t0 = t1; 27 | return ans; 28 | } 29 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/src/timer.h: -------------------------------------------------------------------------------- 1 | #ifndef _408319ecdd5b47b28bf8f511c4fdf816 2 | #define _408319ecdd5b47b28bf8f511c4fdf816 3 | 4 | #include <cstdint> 5 | 6 | // Can't include <chrono> because of bug with gcc 10.3.0 7 | class timer { 8 | std::uint64_t t0; 9 | public: 10 | timer(); 11 | double elapsed() const; 12 | double reset(); 13 | }; 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/verifiable/Makefile: -------------------------------------------------------------------------------- 1 | include ../../makefiles/common.mk 2 | 3 | .PHONY: all clean 4 | 5 | BUILDDIR := $(abspath ../../build) 6 | NCCLDIR := $(BUILDDIR) 7 | NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include 8 | DST_DIR := $(BUILDDIR)/test/verifiable 9 | 10 | all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o 11 | 12 | clean: 13 | rm -rf $(DST_DIR) 14 | 15 | TEST_VERIFIABLE_SRCDIR := .
16 | TEST_VERIFIABLE_BUILDDIR := $(DST_DIR) 17 | include verifiable.mk 18 | 19 | self_test: $(DST_DIR)/self_test 20 | 21 | $(DST_DIR)/self_test: verifiable.cu verifiable.h 22 | @printf "Linking %s\n" $@ 23 | @mkdir -p $(DST_DIR) 24 | $(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS) 25 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/site_package/nccl-tests/verifiable/verifiable.mk: -------------------------------------------------------------------------------- 1 | # We require both of the following paths to be set upon including this makefile 2 | # TEST_VERIFIABLE_SRCDIR = 3 | # TEST_VERIFIABLE_BUILDDIR = 4 | 5 | TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h 6 | TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o 7 | 8 | $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS) 9 | @printf "Compiling %s\n" $@ 10 | @mkdir -p $(TEST_VERIFIABLE_BUILDDIR) 11 | $(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu 12 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import * 2 | from .strategy_utils import form_strategy, strategy_str2list, print_strategies 3 | from .memory_utils import print_peak_memory, print_param_num 4 | from .training_utils import * 5 | -------------------------------------------------------------------------------- /tools/Galvatron/galvatron/utils/memory_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def print_peak_memory(prefix, device, type='allocated'): 4 | if type == 'allocated': 5 | print(prefix, '[Allocated]') 6 | max_mem = torch.cuda.max_memory_allocated(device)/2**20 7 | cur_mem = torch.cuda.memory_allocated(device)/2**20 8 | print("\tMax memory: %.2f MB\tCurrent memory: %.2f MB"%(max_mem, cur_mem)) 9 | elif type == 'reserved': 10 | print(prefix, '[Reserved]') 11 | max_mem = torch.cuda.max_memory_reserved(device)/2**20 12 | cur_mem = torch.cuda.memory_reserved(device)/2**20 13 | print("\tMax memory: %.2f MB\tCurrent memory: %.2f MB"%(max_mem, cur_mem)) 14 | return max_mem, cur_mem 15 | 16 | def print_param_num(model): 17 | print("Total number of parameters in network is {}".format(sum(x.numel() for x in model.parameters()))) 18 | -------------------------------------------------------------------------------- /tools/Galvatron/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.0.1 2 | torchvision==0.15.2 3 | transformers>=4.31.0 4 | flash_attn>=2.0.8 5 | h5py>=3.6.0 6 | attrs>=21.4.0 7 | yacs>=0.1.8 8 | six>=1.15.0 9 | sentencepiece>=0.1.95 10 | pybind11>=2.9.1 --------------------------------------------------------------------------------