├── deepreduce.nips21.pdf
├── tensorflow
│   ├── logger.cc
│   ├── policies.hpp
│   ├── integer_compression.cc
│   ├── bloom_filter_compression.cc
│   ├── compression_utils.hpp
│   └── deepreduce.py
├── README.md
├── run_deepreduce.sh
└── pytorch
    └── deepreduce.py

/deepreduce.nips21.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hangxu0304/DeepReduce/HEAD/deepreduce.nips21.pdf
--------------------------------------------------------------------------------
/tensorflow/logger.cc:
--------------------------------------------------------------------------------
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "./compression_utils.hpp"

#include <string>
#include <cstdlib>

using namespace tensorflow;

REGISTER_OP("Logger")
.Attr("bloom_logs_path: string")
.Attr("gradient_id: int")
.Attr("rank: int")
.Attr("verbosity_frequency: int")
.Attr("verbosity: int")
.Input("initial_tensor: float32")
.Input("coefficients: double")
.Input("step: int64")
.Doc(R"doc()doc");

class LoggerOp : public OpKernel {

public:

    explicit LoggerOp(OpKernelConstruction *context) : OpKernel(context) {
        OP_REQUIRES_OK(context, context->GetAttr("bloom_logs_path", &bloom_logs_path));
        OP_REQUIRES_OK(context, context->GetAttr("gradient_id", &gradient_id));
        OP_REQUIRES_OK(context, context->GetAttr("rank", &rank));
        OP_REQUIRES_OK(context, context->GetAttr("verbosity", &verbosity));
        OP_REQUIRES_OK(context, context->GetAttr("verbosity_frequency", &verbosity_frequency));
    }

    void Compute(OpKernelContext *context) override {

        // Retrieving Inputs
        const Tensor &initial_tensor = context->input(0);
        int N = initial_tensor.flat<float>().size();
        const Tensor &coefficients_tensor = context->input(1);
        // auto coefficients_tensor_flat = coefficients_tensor.flat<double>();
        int64 step = context->input(2).flat<int64>()(0);

        // *********************** Logging ********************** //
        if (verbosity_frequency != 0 && step % verbosity_frequency == 0 ) {
            CompressionUtilities::logging(N, initial_tensor, coefficients_tensor, bloom_logs_path, gradient_id,
                                          step, rank, verbosity);
        }
        // *********************** Logging ********************** //
    }

private:
    // Logging
    string bloom_logs_path;
    int gradient_id;
    int rank;
    int verbosity_frequency;
    int verbosity;
};
REGISTER_KERNEL_BUILDER(Name("Logger").Device(DEVICE_CPU), LoggerOp);
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DeepReduce

A Sparse-tensor Communication Framework for Distributed Deep Learning

**Abstract**: Sparse tensors appear frequently in distributed deep learning, either as a direct artifact of the deep neural network’s gradients, or as a result of an explicit sparsification process. Existing communication primitives are agnostic to the peculiarities of deep learning; consequently, they impose unnecessary communication overhead.
We introduce DeepReduce, a versatile framework for the compressed communication of sparse tensors, tailored for distributed deep learning. DeepReduce decomposes sparse tensors into two sets, values and indices, and allows both independent and combined compression of these sets. We support a variety of common compressors, such as Deflate for values or run-length encoding for indices. We also propose two novel compression schemes that achieve superior results: a curve-fitting-based scheme for values and a Bloom-filter-based scheme for indices. DeepReduce is orthogonal to existing gradient sparsifiers and can be applied in conjunction with them, transparently to the end-user, to significantly lower the communication overhead. As proof of concept, we implement our approach on TensorFlow and PyTorch. Our experiments with large real models demonstrate that DeepReduce transmits less data and imposes lower computational overhead than existing methods, without affecting the training accuracy.

## Prerequisites

The code is built with the following libraries:

- Python >= 3.7
- [PyTorch](https://github.com/pytorch/pytorch) >= 1.4
- [TensorFlow](https://www.tensorflow.org/) >= 1.14
- [GRACE](https://github.com/sands-lab/grace) >= 1.0

## Benchmarks

We use the following benchmarks to run our experiments:

- [Image Classification/tf_cnn_benchmarks](https://github.com/sands-lab/grace-benchmarks/tree/master/tensorflow/Classification/tf_cnn_benchmarks) [TensorFlow] ResNet-20, ResNet-50
- [Image_Classification/Cifar10](https://github.com/sands-lab/grace-benchmarks/tree/master/torch/cifar10) [PyTorch] ResNet-20
- [Recommendation/NCF](https://github.com/sands-lab/grace-benchmarks/tree/master/torch/Recommendation/NCF) [PyTorch] NCF

## Usage

For the usage of GRACE, environment setup, etc., please check the guides [here](https://github.com/sands-lab/grace).

First, create a GRACE instance from `params`. `params` should include parameters for both GRACE and DeepReduce. The valid parameter options for DeepReduce are listed below:

```python
'''
'deepreduce': None, 'value', 'index', 'both'
'value': None, 'polyfit', ...(other custom methods)
'index': None, 'bloom', ...(other custom methods)
'''
from grace_dl.dist.helper import grace_from_params
params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather', 'compress_ratio': 0.01, 'deepreduce':'index', 'index':'bloom'}
grc = grace_from_params(params)
```

Once you have the desired GRACE instance, wrap its compressor with DeepReduce. After that, you can use DeepReduce in the same way as GRACE.

```python
deepreduce_wrapper = {'value': ValueCompressor,
                      'index': IndexCompressor,
                      'both': DeepReduce}
DReduce = deepreduce_wrapper[deepreduce](grc.compressor, params)  # 'deepreduce' holds the value of params['deepreduce']
grc.compressor = DReduce
```

## Scripts & Data

We provide bash scripts to reproduce the experiments in our paper. All experiment results are also available in the [WANDB](https://wandb.ai/sands-lab/deepreduce/reports/DeepReduce--VmlldzoxODM5NTU) database.

## Citation

If you find this useful, please cite our work as:

Kostopoulou K, Xu H, Dutta A, et al. DeepReduce: A Sparse-tensor Communication Framework for Distributed Deep Learning. arXiv preprint arXiv:2102.03112, 2021.

Here's the BibTeX entry:

```
@misc{kostopoulou2021deepreduce,
      title={DeepReduce: A Sparse-tensor Communication Framework for Distributed Deep Learning},
      author={Kelly Kostopoulou and Hang Xu and Aritra Dutta and Xin Li and Alexandros Ntoulas and Panos Kalnis},
      year={2021},
      eprint={2102.03112},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}
```

## Publications

* DeepReduce: A Sparse-tensor Communication Framework for Distributed Deep Learning [[arXiv]](https://arxiv.org/abs/2102.03112)
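
For a quick start, the two snippets from the Usage section combine into the minimal sketch below. The import path of the wrapper classes (`ValueCompressor`, `IndexCompressor`, `DeepReduce`) is an assumption here; point it at wherever those classes live in your checkout.

```python
from grace_dl.dist.helper import grace_from_params
from deepreduce import ValueCompressor, IndexCompressor, DeepReduce  # assumed import path

# Parameters for GRACE (compressor/memory/communicator) and DeepReduce (Bloom-filter index compression).
params = {'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather',
          'compress_ratio': 0.01, 'deepreduce': 'index', 'index': 'bloom'}
grc = grace_from_params(params)

# Wrap the GRACE compressor according to params['deepreduce'].
deepreduce_wrapper = {'value': ValueCompressor, 'index': IndexCompressor, 'both': DeepReduce}
grc.compressor = deepreduce_wrapper[params['deepreduce']](grc.compressor, params)

# From here on, grc is used exactly like a plain GRACE instance.
```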

--------------------------------------------------------------------------------
/run_deepreduce.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

### tf_cnn_benchmarks ResNet20 ResNet50
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
python \
./grace-benchmarks/tensorflow/Classification/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet20_v2 --data_name=cifar10 --batch_size=256 --weight_decay=0.0001 --optimizer=momentum --piecewise_learning_rate_schedule='0.1;163;0.01;245;0.001' --variable_update=horovod --train_dir={log_dir}/ckpts --summary_verbosity=1 --save_summaries_steps=10 --num_epochs=328 --eval_during_training_every_n_steps=200 --num_eval_epochs=8 --data_dir=/mnt/scratch/cifar-10-batches-py/

./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
./grace/env-tf1.14/bin/python \
./grace-benchmarks/tensorflow/Classification/tf_cnn_benchmarks/tf_cnn_benchmarks.py --model=resnet50 --data_name=imagenet --batch_size=256 --weight_decay=1e-4 --optimizer=momentum --nodistortions --variable_update=horovod --train_dir={log_dir}/ckpts --summary_verbosity=1 --save_summaries_steps=100 --num_epochs=90 --eval_during_training_every_n_steps=625 --num_eval_epochs=8 --data_dir=/mnt/scratch/imagenet18/data/imagenet



### pytorch resnet20 cifar10 data volume and micro-benchmark

./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
./grace/env-tf1.14/bin/python \
./grace-benchmarks/torch/cifar10/trainer_grace.py \
-a resnet20 --data=/mnt/scratch/cifar-10-batches-py/ --log_volume \
--grace_config="{'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather', 'compress_ratio': 0.01, 'deepreduce':'index', 'index':'bloom', 'micro-benchmark':True}"


### NCF sparse/dense/deepreduce accuracy

./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=0 \
-np 2 -H 11.0.0.203:1,11.0.0.204:1 \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include ens1f0 -x NCCL_SOCKET_IFNAME=ens1f0 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=accuracy,10G --log_volume \
--grace_config="{'compressor': 'none', 'memory': 'none', 'communicator': 'allreduce'}"



./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=1 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp136s0f0 -x NCCL_SOCKET_IFNAME=enp136s0f0 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace/src/grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./grace/data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=accuracy,10G --log_volume \
--grace_config="{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0}"

# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'index', 'index':'bloom'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'value', 'value':'polyfit'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'value', 'value':'qsgd', 'buckets_num':128, 'quantum_num': 32}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'both'}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'index', 'index':'bloom', 'policy':'p0', 'fpr':0.01}"
# "{'compressor': 'threshold', 'memory': 'none', 'communicator': 'allgather', 'threshold': 0.0, 'deepreduce':'both', 'index':'bloom', 'policy':'random', 'fpr':0.01, 'value':'qsgd'}"


# SKCompress
./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=1 \
-x NCCL_IB_DISABLE=0 \
-np 8 -H 11.0.0.233:1,11.0.0.234:1,11.0.0.235:1,11.0.0.236:1,11.0.0.237:1,11.0.0.238:1,11.0.0.239:1,11.0.0.240:1, \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp136s0f0 -x NCCL_SOCKET_IFNAME=enp136s0f0 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace/src/grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./grace/data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=accuracy,10G \
--grace_config="{'compressor': 'SKCompressCPU','num_quantiles':128, 'sparsifier': 'threshold', 'threshold': 0.0, \
'memory': 'none', 'communicator': 'allgather'}" --micro_benchmark --log_volume



### NCF time breakdown

./grace/env-tf1.14/bin/mpirun \
-x CUDA_VISIBLE_DEVICES=0 \
-x NCCL_IB_DISABLE=1 \
-np 4 -H 10.0.0.133:1,10.0.0.134:1,10.0.0.135:1,10.0.0.136:1 \
--display-allocation -map-by slot -bind-to none -nooversubscribe \
-mca pml ob1 -mca btl ^openib --tag-output --mca btl_tcp_if_include enp1s0f1 -x NCCL_SOCKET_IFNAME=enp1s0f1 \
./grace/env-tf1.14/bin/python -W ignore::UserWarning \
./grace/src/grace-benchmarks/torch/Recommendation/NCF/ncf_grace.py \
--data ./grace/data/ml-20m/torch/cache/ml-20m \
--load_checkpoint_path ./grace/results/NCF/checkpoints/model_init.pth --seed 44 \
--weak_scaling --extra_wandb_tags=timer,10G -e=3 --grads_accumulated=10 --log_time \
--grace_config="{'compressor': 'topk', 'memory': 'residual', 'communicator': 'allgather', 'compress_ratio': 0.1, 'deepreduce':'index', 'index':'bloom'}"
--------------------------------------------------------------------------------
/tensorflow/policies.hpp:
--------------------------------------------------------------------------------
#ifndef policies_hpp
#define policies_hpp

#include <algorithm>
#include <cstdint>
#include <functional>
#include <map>
#include <random>
#include <string>
#include <unordered_set>
#include <vector>

#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/op.h"
#include "../../third_party/bloomfilter/inc/OrdinaryBloomFilter.hpp"

using namespace tensorflow;


class Policies {

public:

    explicit
    Policies(){}

    static int find(const Tensor& indices, int x) {
        auto indices_flat = indices.flat<int>();
        for (int i=0; i<indices_flat.size(); i++) {
            if (indices_flat(i) == x) {
                return i;
            }
        }
        return -1;
    }

    static int get_policy_errors(int K, const Tensor& indices, std::vector<int>& selected_indices) {
        int policy_errors = 0;
        for (int i=0; i<K; i++) {
            // A policy error is a selected index that is not among the true indices.
            if (find(indices, selected_indices[i]) == -1) {
                policy_errors++;
            }
        }
        return policy_errors;
    }

    // The bloom filter's hash-value type is assumed to be uint16_t throughout this file.
    static void build_conflict_sets(int N, bloom::OrdinaryBloomFilter<uint16_t>& bloom, std::map<uint16_t, std::unordered_set<int>>& conflict_sets) {
        // Iterating over the universe and collecting the conflict sets
        uint8_t hash_num = bloom.Get_numHashes();
        for (size_t i=0; i<N; i++) {
            if (bloom.Query(i)) {
                for (uint8_t j=0; j<hash_num; j++) {
                    uint16_t hash = bloom.Get_Hash(i, j);  // j-th hash of element i (accessor name assumed)
                    std::unordered_set<int>& cs = conflict_sets[hash];
                    // if (std::find(cs.begin(), cs.end(), i) == cs.end()) {
                    conflict_sets[hash].insert(i);
                    // }
                }
            }
        }
    }

    static void transform_and_sort(std::map<uint16_t, std::unordered_set<int>>& conflict_sets, std::vector<std::unordered_set<int>>& conflict_sets_ordered) {

        typedef std::function<std::unordered_set<int>(std::pair<uint16_t, std::unordered_set<int>>)> Transformator;
        Transformator transformator = [](std::pair<uint16_t, std::unordered_set<int>> i) {
            return i.second;
        };
        std::transform(conflict_sets.begin(), conflict_sets.end(), conflict_sets_ordered.begin(), transformator);
        std::sort(conflict_sets_ordered.begin(), conflict_sets_ordered.end(), [](const std::unordered_set<int>& l, const std::unordered_set<int>& r) {
            return l.size() < r.size();
        });
    }

    // static void print_map(std::map<uint16_t, std::unordered_set<int>>& map) {
    //     for (auto& it: map) {
    //         printf("Key: %d, Values: ", it.first);
    //         for (auto& itt : it.second)
    //             printf("%d, ", itt);
    //         printf("\n");
    //     }
    // }
    // static void print_2d_vector(std::vector<std::unordered_set<int>>& vec) {
    //     printf("Conflict Sets:\n");
    //     for (auto& it: vec) {
    //         printf(" {");
    //         for (auto& itt : it) {
    //             printf("%d, ", itt);
    //         }
    //         printf("}\n");
    //     }
    // }
    // static void print_vector(std::vector<int>& vec) {
    //     printf("\n[");
    //     int i=0;
    //     for (i = 0; i < vec.size()-1; i++) {
    //         printf("%d, ", (int) vec[i]);
    //     }
    //     printf("%d]\n\n", (int) vec[i]);
    // }

    static int erase_intersection(std::unordered_set<int>& a, std::unordered_set<int>& b) {
        bool compromised=false;
        std::unordered_set<int>::iterator it;
        for (it = a.begin(); it != a.end();) {
            if (b.find(*it) != b.end()) {
                it = a.erase(it);
                compromised = true;
            } else {
                it++;
            }
        }
        return compromised;
    }

    static void choose_indices_from_conflict_sets(int K, int seed, std::vector<std::unordered_set<int>>& conflict_sets_ordered, std::vector<int>& selected_indices) {
        std::default_random_engine generator;
        generator.seed(seed);
        bool compromised;
        int random, left = K;
        std::unordered_set<int> selected_indices_set;
        while (left > 0) {  // Don't stop until you have selected K positives
            for (int i=0; i<conflict_sets_ordered.size() && left>0; i++) {
                std::unordered_set<int>& cset = conflict_sets_ordered[i];
                compromised = erase_intersection(cset, selected_indices_set);
                if (!compromised && cset.size()>0) {
                    std::uniform_int_distribution<int> distribution(0, cset.size()-1);
                    random = distribution(generator);
                    auto it = std::begin(cset); std::advance(it, random);
                    selected_indices_set.insert(*it);
                    cset.erase(it);
                    left--;
                }
            }
        }
        std::copy(selected_indices_set.begin(), selected_indices_set.end(), std::back_inserter(selected_indices));
        std::sort(selected_indices.begin(), selected_indices.end());
    }
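
    // Overview of the selection policies below (summary comment): the
    // compressor ships only a bloom filter of the sparse indices, so the
    // receiver observes a superset of the true indices (the positives include
    // false positives). A policy is a deterministic rule that picks exactly K
    // indices out of the positives; sender and receiver run the same policy
    // with the same seed, so both sides agree on the final index set without
    // ever exchanging it. "conflict_sets" groups positives by the filter
    // cells they share and draws from the smallest sets first, "leftmostK"
    // keeps the first K positives, "randomK" samples K positives with a
    // step-seeded RNG, and "policy_zero" keeps every positive.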

    static void conflict_sets_policy(int N, int K, int seed, bloom::OrdinaryBloomFilter<uint16_t>& bloom, std::vector<int>& selected_indices) {
        std::map<uint16_t, std::unordered_set<int>> conflict_sets;
        build_conflict_sets(N, bloom, conflict_sets);
        //print_map(conflict_sets);
        // Sort the conflict sets by their size
        std::vector<std::unordered_set<int>> conflict_sets_ordered;
        conflict_sets_ordered.resize(conflict_sets.size());
        transform_and_sort(conflict_sets, conflict_sets_ordered);
        // Collect selected indices
        choose_indices_from_conflict_sets(K, seed, conflict_sets_ordered, selected_indices);
    }

    static void leftmostK(int N, int K, bloom::OrdinaryBloomFilter<uint16_t>& bloom,
                          std::vector<int>& selected_indices) {
        // Iterating over the universe and collecting the first K positives
        for (size_t i=0, left=K; i<N && left>0; i++) {
            if (bloom.Query(i)) {  // If it is positive
                selected_indices.push_back(i);
                left--;
            }
        }
    }


    static void randomK(int N, int K, int64 step, bloom::OrdinaryBloomFilter<uint16_t>& bloom,
                        std::vector<int>& selected_indices) {
        // Iterating over the universe and creating P, the set of positives
        std::vector<int> P;
        for (size_t i=0; i<N; i++) {
            if (bloom.Query(i)) {
                P.push_back(i);
            }
        }
        // Draw K indices from P with a step-seeded generator so that all workers make the same choices
        std::default_random_engine generator;
        generator.seed(step);
        int random;
        for (int k=0; k<K; k++) {
            std::uniform_int_distribution<int> distribution(0, P.size()-1);
            random = distribution(generator);
            auto it = std::begin(P); std::advance(it, random);
            selected_indices.push_back(*it);
            P.erase(it);
        }
    }

    static void select_indices(std::string policy, int N, int K, int64 step,
                               bloom::OrdinaryBloomFilter<uint16_t>& bloom,
                               std::vector<int>& selected_indices) {
        if (policy == "conflict_sets") {
            conflict_sets_policy(N, K, step, bloom, selected_indices);
        } else if (policy == "leftmostK") {
            leftmostK(N, K, bloom, selected_indices);
        } else if (policy == "randomK") {
            randomK(N, K, step, bloom, selected_indices);
        } else if (policy == "policy_zero") {
            leftmostK(N, N, bloom, selected_indices);
        }
    }

};

#endif
--------------------------------------------------------------------------------
/tensorflow/integer_compression.cc:
--------------------------------------------------------------------------------

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"

#include "../../third_party/FastPFor/headers/codecfactory.h"
#include "../../third_party/FastPFor/headers/deltautil.h"

#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace tensorflow;

using namespace FastPForLib;

REGISTER_OP("IntegerCompressor")
//.Attr("T: {int32, int64, float16, float32, float64}")
.Attr("logfile_suffix: int")      // For debugging
.Attr("logs_path_suffix: int")    // For debugging
.Attr("verbosity: int")           // For debugging
.Attr("code: string")
.Input("input: uint32")
.Input("step: int64")             // For debugging
.Output("intcompressed_tensor: uint32")
.Doc(R"doc( Integer compression )doc");

REGISTER_OP("IntegerDecompressor")
//.Attr("T: {int32, int64, float16, float32, float64}")
.Attr("logfile_suffix: int")      // For debugging
.Attr("logs_path_suffix: int")    // For debugging
.Attr("suffix: int")              // For debugging
.Attr("verbosity: int")           // For debugging
.Attr("code: string")
.Input("input: uint32")
.Input("decompressed_size: int32")
.Input("step: int64")             // For debugging
.Output("decompressed_tensor: uint32")
.Doc(R"doc( Integer decompression )doc");


class IntegerCompressorOp : public OpKernel {

public:

    explicit IntegerCompressorOp(OpKernelConstruction *context) : OpKernel(context) {
        OP_REQUIRES_OK(context, context->GetAttr("logfile_suffix", &logfile_suffix));      // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("logs_path_suffix", &logs_path_suffix));  // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("verbosity", &verbosity));                // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("code", &code));
    }

    void Compute(OpKernelContext *context) override {

        const Tensor &input_tensor = context->input(0);
        auto input_tensor_flat = input_tensor.flat<uint32>();
        size_t input_tensor_size = input_tensor_flat.size();

        IntegerCODEC &codec = *CODECFactory::getFromName(code);  // Pick a CODEC
        std::vector<uint32_t> intcompressed_output(input_tensor_size + 262144);
        size_t intcompressed_size = intcompressed_output.size();
        codec.encodeArray(input_tensor_flat.data(), input_tensor_size, intcompressed_output.data(), intcompressed_size);
        // Shrink back the array:
        intcompressed_output.resize(intcompressed_size);
        intcompressed_output.shrink_to_fit();

        // Display compression rate:
        std::cout << std::setprecision(3);
        std::cout << "You are using " << 32.0 * static_cast<double>(intcompressed_output.size()) /
                     static_cast<double>(input_tensor_flat.size()) << " bits per integer. " << std::endl;

        // Create an output tensor
        int output_concat_dim = intcompressed_output.size();
        printf("output_concat_dim %d\n", output_concat_dim);
        TensorShape output_shape;
        output_shape.AddDim(output_concat_dim);
        Tensor *output = NULL;
        OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

        auto output_flat = output->template flat<uint32>();
        uint32* out_ptr = output_flat.data();

        std::copy(intcompressed_output.begin(), intcompressed_output.end(), out_ptr);


        // ////// Round-trip verification (disabled)
        // std::vector<uint32_t> init(input_tensor_size);
        // const uint32_t *ptr = input_tensor_flat.data();
        // memcpy(init.data(), ptr, input_tensor_size*sizeof(int));
        // std::vector<uint32_t> decompressed_output(input_tensor_size);
        // codec.decodeArray(intcompressed_output.data(), output_concat_dim, decompressed_output.data(), input_tensor_size);
        // decompressed_output.resize(input_tensor_size);
        //
        // assert(std::equal(init.begin(), init.end(), decompressed_output.begin()) == 1);
        // /////

        // *********************** For Debugging ********************** //
        const Tensor &step_tensor = context->input(1);
        auto step = step_tensor.flat<int64>();

        if (verbosity != 0 && step(0) % verbosity == 0 ) {
            std::string suffix = std::to_string(logfile_suffix);
            std::string logs_suffix = std::to_string(logs_path_suffix);
            std::string str_step = std::to_string(step(0));

            std::string cmd = "mkdir -p logs" + logs_suffix + "/step_" + str_step + "/" + suffix + "/";
            int systemRet = system(cmd.c_str());
            if(systemRet == -1){
                perror("mkdir failed");
            }
            std::string str = "logs" + logs_suffix + "/step_" + str_step + "/" + suffix + "/intcompressor_logs_" + suffix + ".txt";
            FILE* f = fopen(str.c_str(),"w");
            fprintf(f, "input_tensor: %s\n", input_tensor.DebugString(input_tensor_flat.size()).c_str());
            fprintf(f, "Output_concat_size: = %d\n\n", output_concat_dim);
            fprintf(f, "intcompressed_tensor: %s\n", output->DebugString(output_flat.size()).c_str());
            fprintf(f, "\n\n########################################################################################\n\n");
            fclose(f);

            std::string str1 = "logs" + logs_suffix + "/step_" + str_step + "/" + suffix + "/stats" + suffix + ".txt";
            f = fopen(str1.c_str(),"w");
            fprintf(f, "Initial_Size: %d Final_Size: %d\n", input_tensor_flat.size()*32, output_concat_dim*32 + 32);
            fclose(f);
        }
        // *********************** For Debugging ********************** //

    }

private:
    int logfile_suffix;    // For debugging
    int logs_path_suffix;  // For debugging
    int verbosity;         // For debugging
    string code;
};

// The decompressor takes the uncompressed length as an explicit input:
// FastPFor's decodeArray() needs a pre-sized output buffer, so the worker
// ships the original size alongside the compressed payload.
class IntegerDecompressorOp : public OpKernel {

public:

    explicit IntegerDecompressorOp(OpKernelConstruction *context) : OpKernel(context) {
        OP_REQUIRES_OK(context, context->GetAttr("logfile_suffix", &logfile_suffix));      // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("logs_path_suffix", &logs_path_suffix));  // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("suffix", &suffix));                      // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("verbosity", &verbosity));                // For debugging
        OP_REQUIRES_OK(context, context->GetAttr("code", &code));

    }

    void Compute(OpKernelContext *context) override {

        const Tensor &input_tensor = context->input(0);
        auto input_tensor_flat = input_tensor.flat<uint32>();
        const size_t input_tensor_size = input_tensor_flat.size();

        const Tensor &decompressed_size_tensor = context->input(1);
        auto decompressed_size_flat = decompressed_size_tensor.flat<int>();
        int decompressed_size = *decompressed_size_flat.data();

        IntegerCODEC &codec = *CODECFactory::getFromName(code);  // Pick a CODEC
        std::vector<uint32_t> decompressed_output(decompressed_size);
        size_t recoveredsize = decompressed_output.size();

        codec.decodeArray(input_tensor_flat.data(), input_tensor_size, decompressed_output.data(), recoveredsize);
        decompressed_output.resize(recoveredsize);

        // Create an output tensor
        int output_concat_dim = recoveredsize;
        printf("output_concat_dim %d\n", output_concat_dim);
        TensorShape output_shape;
        output_shape.AddDim(output_concat_dim);
        Tensor *output = NULL;
        OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

        auto output_flat = output->template flat<uint32>();
        uint32* out_ptr = output_flat.data();

        std::copy(decompressed_output.begin(), decompressed_output.end(), out_ptr);

        // *********************** For Debugging ********************** //
        const Tensor &step_tensor = context->input(2);
        auto step = step_tensor.flat<int64>();
        if (verbosity != 0 && step(0) % verbosity == 0 ) {
            std::string str_suffix = std::to_string(logfile_suffix);
            std::string logs_suffix = std::to_string(logs_path_suffix);
            std::string str_step = std::to_string(step(0));
            std::string str = "logs" + logs_suffix + "/step_" + str_step + "/" + str_suffix + "/intdecompressor_logs_" + str_suffix + "_" + std::to_string(suffix) + ".txt";
            FILE* f = fopen(str.c_str(),"w");
            fprintf(f, "input_tensor: %s\n", input_tensor.DebugString(input_tensor_flat.size()).c_str());
            fprintf(f, "Output_concat_size: = %d\n\n", output_concat_dim);
            fprintf(f, "recoveredsize: = %d\n\n", recoveredsize);
            fprintf(f, "intdecompressed_tensor: %s\n", output->DebugString(output_flat.size()).c_str());
            fprintf(f, "\n\n########################################################################################\n\n");
            fclose(f);
        }
        // *********************** For Debugging ********************** //

    }

private:
    int logfile_suffix;    // For debugging
    int logs_path_suffix;  // For debugging
    int suffix;            // For debugging
    int verbosity;         // For debugging
    string code;
};

REGISTER_KERNEL_BUILDER(Name("IntegerCompressor").Device(DEVICE_CPU), IntegerCompressorOp);

REGISTER_KERNEL_BUILDER(Name("IntegerDecompressor").Device(DEVICE_CPU), IntegerDecompressorOp);

--------------------------------------------------------------------------------
/tensorflow/bloom_filter_compression.cc:
--------------------------------------------------------------------------------
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "../../third_party/bloomfilter/inc/OrdinaryBloomFilter.hpp"
#include "./policies.hpp"
#include "./compression_utils.hpp"

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>

using namespace tensorflow;

REGISTER_OP("BloomCompressor")
.Attr("T: {int32, int64, float16, float32, float64}")
.Attr("false_positives_aware: bool")
.Attr("policy: string")
.Attr("fpr: float")
.Attr("hash_num: int")
.Attr("bloom_size: int")
.Attr("bloom_logs_path: string")
.Attr("gradient_id: int")
.Attr("rank: int")
.Attr("verbosity_frequency: int")
.Attr("verbosity: int")
.Input("values: T")
.Input("indices: int32")
.Input("initial_tensor: int32")
.Input("step: int64")
.Output("compressed_tensor: int8")
.Doc(R"doc()doc");

REGISTER_OP("BloomDecompressor")
.Attr("policy: string")
.Attr("mem_mode: int")
.Attr("hash_num: int")
.Attr("bloom_size: int")
.Attr("bloom_logs_path: string")
.Attr("gradient_id: int")
.Attr("rank: int")
.Attr("suffix: int")
.Attr("verbosity_frequency: int")
.Attr("verbosity: int")
.Input("compressed_tensor: int8")
.Input("decompressed_size: int32")
.Input("step: int64")
.Output("decompressed_tensor: int32")
.Doc(R"doc()doc");

class BloomCompressorOp : public OpKernel {

public:

    explicit BloomCompressorOp(OpKernelConstruction *context) : OpKernel(context) {
        OP_REQUIRES_OK(context, context->GetAttr("false_positives_aware", &false_positives_aware));
        OP_REQUIRES_OK(context, context->GetAttr("policy", &policy));
        OP_REQUIRES_OK(context, context->GetAttr("fpr", &fpr));
        OP_REQUIRES_OK(context, context->GetAttr("hash_num", &hash_num));
        OP_REQUIRES_OK(context, context->GetAttr("bloom_size", &bloom_size));
        OP_REQUIRES_OK(context, context->GetAttr("bloom_logs_path", &bloom_logs_path));
        OP_REQUIRES_OK(context, context->GetAttr("gradient_id", &gradient_id));
        OP_REQUIRES_OK(context, context->GetAttr("rank", &rank));
        OP_REQUIRES_OK(context, context->GetAttr("verbosity", &verbosity));
        OP_REQUIRES_OK(context, context->GetAttr("verbosity_frequency", &verbosity_frequency));
    }

    void Compute(OpKernelContext *context) override {

        // Retrieving Inputs (values arrive bitcast to int32 from the Python side)
        const Tensor &values = context->input(0); auto values_flat = values.flat<int>();
        const Tensor &indices = context->input(1); auto indices_flat = indices.flat<int>();
        const Tensor &initial_tensor = context->input(2); auto initial_flat = initial_tensor.flat<int>();
        int64 step = context->input(3).flat<int64>()(0);

        int N = initial_flat.size();
        int K = values_flat.size();

        // Given FPR compute M and H:
        //   m_bits = K * |ln(fpr)| / ln(2)^2   (optimal filter size in bits)
        //   h      = (m_bits / K) * ln(2)      (optimal number of hash functions)
        int m,rem,h;
        float temp_m,temp_h;
        temp_m = (K*fabs(log(fpr))) / (pow(log(2), 2));
        // Give bloom size in number of bytes ; bloom size must be a multiple of 8
        m = int(temp_m/8);
        rem = m % 8;
        if (rem != 0 || m == 0){
            m += 1;
        }
        temp_h = (m*8 / K)*log(2);
        h = int(ceil(temp_h));

        bloom_size=m;
        hash_num=h;

        // Building Bloom Filter (hash-value type assumed to be uint16_t)
        bloom::OrdinaryBloomFilter<uint16_t> bloom(hash_num, bloom_size);
        for (int i=0; i<K; i++) {
            bloom.Insert(indices_flat(i));
        }

        // Select Indices using a Policy
        std::vector<int> selected_indices;
        Policies::select_indices(policy, N, K, step, bloom, selected_indices);
        K = selected_indices.size();

        int output_concat_dim = 2*sizeof(int) + K*sizeof(int) + bloom_size;
        // Create an output tensor
        TensorShape output_shape;
        output_shape.AddDim(output_concat_dim);
        Tensor *output = NULL;
        OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

        auto output_flat = output->template flat<int8>();
        int8* out_ptr = output_flat.data();

        // copy the bloom_size and hash_num in the output tensor
        memcpy(out_ptr, &bloom_size, sizeof(int));
        memcpy(out_ptr+sizeof(int), &hash_num, sizeof(int));

        // Copy the values in the output tensor
        std::vector<int> new_values;
        if (false_positives_aware) {
            // Pack values in the order of the policy-selected indices; a selected
            // index that is not a true index (a policy false positive) takes the
            // initial tensor's value at that position.
            for (int i=0; i<K; i++) {
                int idx = Policies::find(indices, selected_indices[i]);
                if (idx >= 0) {
                    new_values.push_back(values_flat(idx));
                } else {
                    new_values.push_back(initial_flat(selected_indices[i]));
                }
                memcpy(out_ptr+(i+2)*sizeof(int), &new_values[i], sizeof(int));
            }
        } else {
            for (int i=0; i<K; i++) {
                memcpy(out_ptr+(i+2)*sizeof(int), &values_flat(i), sizeof(int));
            }
        }

        // Copy the bloom filter in the output tensor (element type assumed)
        const std::vector<uint8_t> &bloom_vec = bloom.Get_bloom();
        std::copy(bloom_vec.begin(), bloom_vec.end(), out_ptr+(K+2)*sizeof(int));

        // *********************** For Debugging ********************** //
        if (verbosity_frequency != 0 && step % verbosity_frequency == 0 ) {
            if (!false_positives_aware) {
                // Select Indices using a Policy
                Policies::select_indices(policy, N, K, step, bloom, selected_indices);
            }
            CompressionUtilities::logging_compressor(bloom, N, K, output_concat_dim, initial_tensor, indices, values,
                                                     new_values, selected_indices, bloom_logs_path, gradient_id, step, policy, rank, verbosity);
        }
        // *********************** For Debugging ********************** //
    }

private:
    float fpr;
    int hash_num;
    int bloom_size;
    string policy;
    bool false_positives_aware;
    // Logging
    string bloom_logs_path;
    int gradient_id;
    int rank;
    int verbosity_frequency;
    int verbosity;
};
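
// Wire format produced by BloomCompressorOp and consumed by BloomDecompressorOp
// (summarized from the memcpy layout above, with sizeof(int) == 4):
//
//   byte offset 0       : int  bloom_size  (filter size in bytes)
//   byte offset 4       : int  hash_num    (number of hash functions)
//   byte offset 8       : K ints of values (one per selected index)
//   byte offset (K+2)*4 : bloom_size bytes (the bloom filter bitmap)
//
// K itself is not transmitted; the decompressor recovers it from the total
// payload size as K = (size - bloom_size)/sizeof(int) - 2.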

class BloomDecompressorOp : public OpKernel {

public:

    explicit BloomDecompressorOp(OpKernelConstruction *context) : OpKernel(context) {
        OP_REQUIRES_OK(context, context->GetAttr("policy", &policy));
        OP_REQUIRES_OK(context, context->GetAttr("mem_mode", &mem_mode));
        OP_REQUIRES_OK(context, context->GetAttr("hash_num", &hash_num));
        OP_REQUIRES_OK(context, context->GetAttr("bloom_size", &bloom_size));
        OP_REQUIRES_OK(context, context->GetAttr("bloom_logs_path", &bloom_logs_path));
        OP_REQUIRES_OK(context, context->GetAttr("gradient_id", &gradient_id));
        OP_REQUIRES_OK(context, context->GetAttr("rank", &rank));
        OP_REQUIRES_OK(context, context->GetAttr("suffix", &suffix));
        OP_REQUIRES_OK(context, context->GetAttr("verbosity", &verbosity));
        OP_REQUIRES_OK(context, context->GetAttr("verbosity_frequency", &verbosity_frequency));
    }

    void Compute(OpKernelContext *context) override {

        // Retrieving Inputs
        const Tensor &compressed_tensor = context->input(0);
        auto compressed_tensor_flat = compressed_tensor.flat<int8>();
        int N = *context->input(1).flat<int>().data();
        int64 step = context->input(2).flat<int64>()(0);

        // Reconstruct the bloom filter
        const int8 *ptr = compressed_tensor_flat.data();  // Note: int8 is 1 byte

        memcpy(&bloom_size, ptr, sizeof(int));
        memcpy(&hash_num, ptr+sizeof(int), sizeof(int));

        int K = (compressed_tensor_flat.size()-bloom_size)/sizeof(int) - 2;
        int values_bytes = K*sizeof(int);
        int *values_vec = (int*) malloc(values_bytes);
        memcpy(values_vec, ptr+2*sizeof(int), values_bytes);

        ptr += (values_bytes + 2*sizeof(int));
        bloom::OrdinaryBloomFilter<uint16_t> bloom(hash_num, bloom_size, ptr);

        // Create an output tensor
        TensorShape decompressed_tensor_shape;
        decompressed_tensor_shape.AddDim(N);
        Tensor *decompressed_tensor = NULL;
        OP_REQUIRES_OK(context, context->allocate_output(0, decompressed_tensor_shape, &decompressed_tensor));
        auto decompressed_tensor_flat = decompressed_tensor->template flat<int>();
        memset(decompressed_tensor_flat.data(), 0, N*sizeof(int));

        // Select Indices using a Policy
        std::vector<int> selected_indices;
        Policies::select_indices(policy, N, K, step, bloom, selected_indices);

        // Map values to the selected indices
        for (int i=0; i<K; i++) {
            decompressed_tensor_flat(selected_indices[i]) = values_vec[i];
        }

        // *********************** For Debugging ********************** //
        if (verbosity_frequency != 0 && step % verbosity_frequency == 0 ) {
            CompressionUtilities::logging_decompressor(bloom, N, K, values_vec, selected_indices, bloom_logs_path,
                                                       gradient_id, suffix, step, decompressed_tensor, policy, rank, verbosity);
        }
        // *********************** For Debugging ********************** //

        free(values_vec);
    }

private:
    int mem_mode;
    int hash_num;
    int bloom_size;
    string policy;
    int suffix;
    // Logging
    string bloom_logs_path;
    int gradient_id;
    int rank;
    int verbosity_frequency;
    int verbosity;
};

REGISTER_KERNEL_BUILDER(Name("BloomCompressor").Device(DEVICE_CPU), BloomCompressorOp);

REGISTER_KERNEL_BUILDER(Name("BloomDecompressor").Device(DEVICE_CPU), BloomDecompressorOp);

--------------------------------------------------------------------------------
/tensorflow/compression_utils.hpp:
--------------------------------------------------------------------------------
#ifndef compression_utils_hpp
#define compression_utils_hpp

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "../../third_party/bloomfilter/inc/OrdinaryBloomFilter.hpp"
#include "./policies.hpp"

#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>
#include <unordered_set>
#include <vector>

using namespace tensorflow;

class CompressionUtilities {

public:

    explicit
    CompressionUtilities(){}

    static void fprint(FILE* f, int size, std::vector<int8> ptr) {
        unsigned int bit_pos, byte_pos, value, byte;
        fprintf(f, "Bitstream Array: \n [ ");
        for (byte_pos=0; byte_pos<size; byte_pos++) {
            byte = ptr[byte_pos];
            for (bit_pos=0; bit_pos<8; bit_pos++) {
                value = (byte >> bit_pos) & 1;
                fprintf(f, "%u ", value);
            }
        }
        fprintf(f, "]\n\n");
    }

    static void fprint(FILE* f, int size, std::vector<uint8_t> ptr) {
        unsigned int bit_pos, byte_pos, value, byte;
        fprintf(f, "Bitstream Array: \n [ ");
        for (byte_pos=0; byte_pos<size; byte_pos++) {
            byte = ptr[byte_pos];
            for (bit_pos=0; bit_pos<8; bit_pos++) {
                value = (byte >> bit_pos) & 1;
                fprintf(f, "%u ", value);
            }
        }
        fprintf(f, "]\n\n");
    }

    static void print_vector(int* vec, int size, FILE* f) {
        fprintf(f, "\n[");
        int i=0;
        for (i = 0; i < size-1; i++) {
            fprintf(f, "%d, ", vec[i]);
        }
        fprintf(f, "%d]\n\n", vec[i]);
    }

    static void print_2d_vector(std::vector<std::unordered_set<int>>& vec, FILE* f) {
        for (auto& it: vec) {
            fprintf(f, "{");
            for (auto& itt : it)
                fprintf(f, "%d, ", itt);
            fprintf(f, "}\n");
        }
    }

    static void print_map(std::map<uint16_t, std::unordered_set<int>>& map, FILE* f) {
        for (auto& it: map) {
            fprintf(f, "Key: %d, Values: ", it.first);
            for (auto& itt : it.second)
                fprintf(f, "%d, ", itt);
            fprintf(f, "\n");
        }
    }

    // Logging for the bloom filter compressor
    static void logging_compressor(bloom::OrdinaryBloomFilter<uint16_t>& bloom, int N, int K, int output_concat_dim,
            const Tensor& initial_tensor, const Tensor& indices, const Tensor& values, std::vector<int>& new_values, std::vector<int>& selected_indices,
            std::string bloom_logs_path, int gradient_id, int64 step, std::string policy, int rank, int verbosity) {

        FILE* f;
        std::string str;
        int false_positives = bloom.Compute_False_Positives(N, indices);
        int policy_errors = Policies::get_policy_errors(K, indices, selected_indices);
        std::string str_gradient_id = std::to_string(gradient_id);
        std::string str_step = std::to_string(step);
        std::string str_rank = std::to_string(rank);
        int bloom_size = bloom.Get_numBytes();

        std::string path = bloom_logs_path + "/" + str_rank + "/step_" + str_step + "/" + str_gradient_id + "/";
        std::string cmd = "mkdir -p " + path;
        int systemRet = system(cmd.c_str());
        if(systemRet == -1){
            perror("mkdir failed");
        }
        if (verbosity > 1) {
            str = path + "compressor_logs_" + policy + ".txt";
            f = fopen(str.c_str(),"w");
            fprintf(f, "\nInitial Tensor: %s\n\n", initial_tensor.DebugString(N).c_str());
            fprintf(f, "Values: %s\n", values.DebugString(K).c_str());
            fprintf(f, "\nIndices: %s\n\n", indices.DebugString(K).c_str());
            // fprintf(f, "Step: = %d\n\n", step);
            fprintf(f, "Bloom size: = %d\n", bloom_size);
            bloom.fprint(f);
            fprintf(f, "\nIndices Chosen:");
            print_vector(selected_indices.data(), K, f);
            if (new_values.size() > 0) {
                fprintf(f, "\nValues-Sent:");
                print_vector(new_values.data(), K, f);
            }
            fprintf(f, "Output_concat_size: = %d\n\n", output_concat_dim);
            fprintf(f, "FalsePositives: %d\n", false_positives);
            fprintf(f, "Total: %d\n", N);
            fprintf(f, "\n\n########################################################################################\n\n");
            fclose(f);
        }
        str = path + "fpr.txt";
        f = fopen(str.c_str(),"w");
        fprintf(f, "FalsePositives: %d Total: %d\n", false_positives, N);
        fclose(f);
        str = path + "policy_errors.txt";
        f = fopen(str.c_str(),"w");
        fprintf(f, "PolicyErrors: %d Total: %d\n", policy_errors, K);
        fclose(f);
        str = path + "stats.txt";
        f = fopen(str.c_str(),"w");
        fprintf(f, "Initial_Size: %d Final_Size: %d\n", N /*in bits*/, bloom_size*8 /*in bits*/);
        fclose(f);
    }

    static void logging_decompressor(bloom::OrdinaryBloomFilter<uint16_t>& bloom, int N, int K,
            int* values_vec, std::vector<int>& selected_indices, std::string bloom_logs_path, int gradient_id,
            int suffix, int64 step, Tensor* decompressed_tensor, std::string policy, int rank, int verbosity) {
        if (verbosity > 1) {
            FILE* f;
            std::string str_gradient_id = std::to_string(gradient_id);
            std::string str_step = std::to_string(step);
            std::string str_rank = std::to_string(rank);

            std::string path = bloom_logs_path + "/" + str_rank + "/step_" + str_step + "/" + str_gradient_id;
            std::string str = path + "/decompressor_logs_" + policy + "_" + std::to_string(suffix) + ".txt";
            f = fopen(str.c_str(),"w");
            fprintf(f, "decompressed size: %d\n\n", N);
            // fprintf(f, "Step: = %d\n\n", step);

            int bloom_size = bloom.Get_numBytes();
            fprintf(f, "Bloom size: = %d\n", bloom_size);
            bloom.fprint(f);
            fprintf(f, "\nIndices Chosen:");
            print_vector(selected_indices.data(), K, f);
            fprintf(f, "Values Received:"); print_vector(values_vec, K, f);
            fprintf(f, "Decompressed_tensor: %s\n", decompressed_tensor->DebugString(N).c_str());
            fprintf(f, "########################################################################################\n\n");
            fclose (f);
        }
    }

    // Logging for values approximation
    static void logging(int N, const Tensor& initial_tensor, const Tensor& coefficients,
            std::string bloom_logs_path, int gradient_id, int64 step, int rank, int verbosity) {

        FILE* f;
        std::string str;
        std::string str_gradient_id = std::to_string(gradient_id);
        std::string str_step = std::to_string(step);
        std::string str_rank = std::to_string(rank);

        std::string path = bloom_logs_path + "/" + str_rank + "/step_" + str_step + "/" + str_gradient_id + "/";
        std::string cmd = "mkdir -p " + path;
        int systemRet = system(cmd.c_str());
        if(systemRet == -1){
            perror("mkdir failed");
        }
        // if (verbosity > 1) {
        //     str = path + "log.txt";
        //     f = fopen(str.c_str(),"w");
        //     fprintf(f, "\nInitial Tensor: %s\n\n", initial_tensor.DebugString(N).c_str());
        //     fprintf(f, "\nEstimated Tensor: %s\n\n", coefficients.DebugString(N).c_str());
        //     fprintf(f, "\nCoefficients: %s\n\n", coefficients.DebugString(num_of_coefficients).c_str());
        //     fprintf(f, "Step: = %d\n\n", step);
        // }
        auto initial_flat = initial_tensor.flat<float>();
        auto coefficients_flat = coefficients.flat<double>();
        str = path + "values.csv";
        f = fopen(str.c_str(),"w");
        for(int i=0; i<N; i++) {
            fprintf(f, "%f,%f\n", initial_flat(i), coefficients_flat(i));
        }
        fclose(f);
    }

    static void logging_bitstream_compressor(const Tensor& indices_tensor, int output_concat_dim, int tensor_size_bytes,
            std::vector<int8>& bitstream, std::vector<int>* lengths, const Tensor* output, int initial_tensor_size,
            std::string logs_path, int gradient_id, int64 step, int rank, int verbosity) {

        FILE* f;
        std::string str;
        std::string str_gradient_id = std::to_string(gradient_id);
        std::string str_step = std::to_string(step);
        std::string str_rank = std::to_string(rank);

        std::string path = logs_path + "/" + str_rank + "/step_" + str_step + "/" + str_gradient_id + "/";
        std::string cmd = "mkdir -p " + path;
        int systemRet = system(cmd.c_str());
        if(systemRet == -1){
            perror("mkdir failed");
        }
        if (verbosity > 1) {
            str = path + "RleCompressor_logs.txt";
            f = fopen(str.c_str(),"w");
            auto indices_tensor_flat = indices_tensor.flat<int>();
            fprintf(f, "indices_tensor: %s\n", indices_tensor.DebugString(indices_tensor_flat.size()).c_str());
            fprintf(f, "Output_concat_size: = %d\n\n", output_concat_dim);
            CompressionUtilities::fprint(f, tensor_size_bytes, bitstream);
            if (lengths != NULL) {
                fprintf(f, "Lengths:\n");
                CompressionUtilities::print_vector(lengths->data(), lengths->size(), f);
            }
            auto output_flat = output->template flat<int8>();
            fprintf(f, "Encoded lengths: %s\n", output->DebugString(output_flat.size()).c_str());
            fprintf(f, "\n\n########################################################################################\n\n");
            fclose(f);
        }

        str = path + "stats.txt";
        f = fopen(str.c_str(),"w");
        fprintf(f, "Initial_Size: %d Final_Size: %d\n", initial_tensor_size, output_concat_dim*8 + 32);  // in bits; the worker also sends the size of the encoded tensor
        fclose(f);
    }

    static void logging_bitstream_decompressor(const Tensor& encoding, int output_concat_dim, std::vector<int>* lengths,
            const Tensor* output, std::string logs_path, int gradient_id, int64 step, int rank, int suffix, int verbosity) {

        FILE* f;
        std::string str;
        std::string str_gradient_id = std::to_string(gradient_id);
        std::string str_step = std::to_string(step);
        std::string str_rank = std::to_string(rank);

        std::string path = logs_path + "/" + str_rank + "/step_" + str_step + "/" + str_gradient_id + "/";
        std::string cmd = "mkdir -p " + path;
        int systemRet = system(cmd.c_str());
        if(systemRet == -1){
            perror("mkdir failed");
        }
        if (verbosity > 1) {
            str = path + "RleDecompressor_logs" + "_" + std::to_string(suffix) + ".txt";
            f = fopen(str.c_str(),"w");

            auto encoding_flat = encoding.flat<int8>();
            fprintf(f, "encoding_flat: %s\n", encoding.DebugString(encoding_flat.size()).c_str());
            fprintf(f, "Output_concat_size: = %d\n\n", output_concat_dim);
            if (lengths != NULL) {
                fprintf(f, "Lengths:\n");
                CompressionUtilities::print_vector(lengths->data(), lengths->size(), f);
            }
            auto output_flat = output->template flat<int>();
            fprintf(f, "Indices: %s\n", output->DebugString(output_flat.size()).c_str());
            fprintf(f, "\n\n########################################################################################\n\n");
            fclose(f);
        }
    }

};

#endif
--------------------------------------------------------------------------------
/tensorflow/deepreduce.py:
--------------------------------------------------------------------------------
"""
DeepReduce compression algorithms.
Author: Kelly Kostopoulou
"""

from __future__ import division
import tensorflow as tf
from tensorflow.python.framework import load_library
from tensorflow.python.platform import resource_loader
import math
from horovod.tensorflow.mpi_ops import rank

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)


class Compressor(object):
    """Interface for compressing and decompressing a given tensor."""

    residuals = {}

    @staticmethod
    def compress(tensor, params):
        """Compresses a tensor and returns a list of compressed tensors with the context needed to decompress it."""
        pass

    @staticmethod
    def decompress(tensors, ctx, params):
        """Decompress a list of compressed tensors with the given context."""
        pass

    @classmethod
    def memory_compensate(cls, tensor, params):
        """Update the tensor with the residuals."""
        use_memory = params['use_memory']
        beta = params['beta']
        gamma = params['gamma']
        if use_memory:
            name = tensor.name
            cls.residuals[tensor.name] = tf.Variable(tf.zeros_like(tensor), trainable=False)
            tensor = beta * cls.residuals[name] + gamma * tensor
        return tensor

    @classmethod
    def memory_update(cls, tensor, tensor_compensate, tensor_compressed, ctx, params):
        """Update the residuals."""
        use_memory = params['use_memory']
        if use_memory:
            name = tensor.name
            tensor_decompressed = cls.decompress(tensor_compressed, ctx, params)
            delta = tensor_compensate - tensor_decompressed
            memory_update_op = cls.residuals[name].assign(delta)
        return [memory_update_op] if use_memory else []

    @staticmethod
    def aggregate(tensors, params):
        """Aggregate a list of tensors."""
        average = params['average']
        agged_tensor = tf.math.add_n(tensors)
        horovod_size = tf.cast(params["horovod_size"], dtype=agged_tensor.dtype)
        agged_tensor = (agged_tensor / horovod_size) if average else agged_tensor
        return agged_tensor


class Values_Approximation_Helper(Compressor):

    @staticmethod
    def double_exponential_fit(X_, Y_, K):

        # S, SS initialization
        Ysum = Y_ + tf.roll(Y_, shift=-1, axis=0)
        Xsum = tf.roll(X_, shift=-1, axis=0) - X_
        S = tf.tensor_scatter_nd_update(tf.roll(0.5 * Ysum * Xsum, shift=1, axis=0), [[0]], tf.zeros(1, tf.float64))
        S = tf.math.cumsum(S)
        Ssum = S + tf.roll(S, shift=-1, axis=0)
        SS = tf.tensor_scatter_nd_update(tf.roll(0.5 * Ssum * Xsum, shift=1, axis=0), [[0]], tf.zeros(1, tf.float64))
        SS = tf.math.cumsum(SS)

        sum_SSk_squared = tf.math.reduce_sum(tf.math.pow(SS, 2))
        sum_SSk_Sk = tf.math.reduce_sum(S * SS)
        sum_SSk_xk = tf.math.reduce_sum(SS * X_)
        sum_SSk = tf.math.reduce_sum(SS)
        sum_Sk_squared = tf.math.reduce_sum(tf.math.pow(S, 2))
        sum_Sk_xk = tf.math.reduce_sum(S * X_)
        sum_Sk = tf.math.reduce_sum(S)
        sum_data_x = tf.cast(K * (K + 1) / 2, tf.float64)
        sum_data_x_squared = tf.cast(K * (K + 1) * (2 * K + 1) / 6, tf.float64)
        K = tf.cast(K, tf.float64)

        # Form the first system
        values = tf.stack([sum_SSk_squared, sum_Sk_squared, sum_data_x_squared, K,
                           sum_SSk_Sk, sum_SSk_xk, sum_SSk, sum_Sk_xk, sum_Sk, sum_data_x], axis=0)

        A_LS_1 = tf.scatter_nd([[0, 0], [1, 1], [2, 2], [3, 3],
                                [0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3],
                                [2, 3]],
                               values, [4, 4])
        A_LS_1 = tf.tensor_scatter_nd_update(A_LS_1,
                                             [[0, 0], [1, 1], [2, 2], [3, 3],
                                              [1, 0], [2, 0], [3, 0],
                                              [2, 1], [3, 1],
                                              [3, 2]],
                                             values)

        a = tf.math.reduce_sum(tf.transpose(SS) * Y_)
        b = tf.math.reduce_sum(tf.transpose(S) * Y_)
        c = tf.math.reduce_sum(tf.transpose(X_) * Y_)
        d = tf.math.reduce_sum(Y_)

        b_vector_1 = tf.stack([a, b, c, d], axis=0)
        b_vector_1 = tf.reshape(b_vector_1, [4, 1])

        # Solve the first system
        Coefficient_vector_1 = tf.linalg.solve(A_LS_1, b_vector_1)

        # Calculate p1 and q1
        p1 = 0.5 * (Coefficient_vector_1[1] + tf.math.sqrt(
            tf.math.pow(Coefficient_vector_1[1], 2) + 4 * Coefficient_vector_1[0]))
        q1 = 0.5 * (Coefficient_vector_1[1] - tf.math.sqrt(
            tf.math.pow(Coefficient_vector_1[1], 2) + 4 * Coefficient_vector_1[0]))

        beta_k = tf.math.exp(p1 * X_)
        eta_k = tf.math.exp(q1 * X_)

        sum_betak_square = tf.math.reduce_sum(tf.math.pow(beta_k, 2))
        sum_etak_square = tf.math.reduce_sum(tf.math.pow(eta_k, 2))
        sum_betak_etak = tf.math.reduce_sum(beta_k * eta_k)

        # Form the second system
        A_LS_2 = tf.stack([sum_betak_square, sum_betak_etak, sum_betak_etak, sum_etak_square], axis=0)
        A_LS_2 = tf.reshape(A_LS_2, [2, 2])
        a = tf.reshape(tf.math.reduce_sum(tf.transpose(beta_k) * Y_), [1, ])
        b = tf.reshape(tf.math.reduce_sum(tf.transpose(eta_k) * Y_), [1, ])
        b_vector_2 = tf.stack([a, b], axis=0)
        b_vector_2 = tf.reshape(b_vector_2, [2, 1])

        # Solve the second system
        Coefficient_vector_2 = tf.linalg.solve(A_LS_2, b_vector_2)

        # print("Coefficient_vector_1: \n", Coefficient_vector_1)
        # print("p1:\n", p1)
        # print("Coefficient_vector_2:\n", Coefficient_vector_2)
        # print("q1:\n", q1)
        return Coefficient_vector_2[0], Coefficient_vector_2[1], p1, q1
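
    # Summary of the fit above (note added for clarity, matching
    # DoubleExpCompressor.decompress below): the sorted absolute values y_k
    # are approximated by a double-exponential model
    #
    #     y(x) ~= a * exp(p1 * x) + b * exp(q1 * x),   x = 1..K
    #
    # where p1, q1 come from the first linear system (integral method) and
    # a = Coefficient_vector_2[0], b = Coefficient_vector_2[1] from the second
    # least-squares system. Only these four scalars are transmitted in place
    # of the K top values.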
    @staticmethod
    def logit_basis(X, a, N):  # log(p/(1-p))
        return tf.cast(a * tf.math.log(X / ((N + 1) - X)), dtype=tf.float64)

    @staticmethod
    def exp_basis(X, b, c):
        return tf.cast(b * tf.math.exp(c * X), dtype=tf.float64)

    @staticmethod
    def polynomial_basis(X, a):
        return tf.cast(tf.pow(X, a), dtype=tf.float64)

    @staticmethod
    def GetInputMatrix_Polynomial(xcol, x):
        N = tf.size(x)
        Xtrans = tf.ones([1, N], tf.float64)
        for i in range(1, xcol):
            basis = Values_Approximation_Helper.polynomial_basis(x, i)
            Xtrans = tf.concat([Xtrans, basis], axis=0)
        return tf.transpose(Xtrans)

    @staticmethod
    def find_breaks(y_train, num_of_segments, N):
        b = tf.constant(0, shape=(1,), dtype=tf.int32)
        N = tf.constant(N, shape=(1,), dtype=tf.int32)
        y = y_train
        break_points = tf.zeros(num_of_segments + 1, tf.int32)
        for i in range(num_of_segments - 1):
            a = tf.math.argmax(tf.abs(tf.linspace(y[0], y[-1], tf.size(y)) - y))
            b = b + tf.cast(a, tf.int32)
            break_points = tf.tensor_scatter_nd_update(break_points, [[i + 1]], b)
            y = tf.gather(y_train, tf.range(b[0], N[0]))
        break_points = tf.tensor_scatter_nd_update(break_points, [[num_of_segments]], N)
        sizes = [break_points[i + 1] - break_points[i] for i in range(num_of_segments)]
        return break_points, sizes

    @staticmethod
    def get_breaks(model, N):
        if model == "resnet20_v2":
            breaks = {
                432     : [0, 353, 432],
                2304    : [0, 1847, 2229, 2304],
                4608    : [0, 4073, 4544, 4608],
                9216    : [0, 8164, 9012, 9216],
                18432   : [0, 16094, 18060, 18432],
                36864   : [0, 33742, 36595, 36864]}
        elif model == "vgg16":
            breaks = {
                1728    : [0, 1443, 1663, 1728],
                36864   : [0, 34097, 36467, 36815, 36864],
                73728   : [0, 67595, 73032, 73630, 73728],
                147456  : [0, 132193, 145286, 147125, 147456],
                294912  : [0, 272485, 292623, 294580, 294844, 294912],
                589824  : [0, 553577, 586620, 589431, 589764, 589824],
                1179648 : [0, 1099105, 1172811, 1179005, 1179543, 1179648],
                2359296 : [0, 2195844, 2343594, 2357633, 2359102, 2359296]}
        elif model == "resnet50":
            breaks = {
                4096    : [0, 3656, 4018, 4096],
                9408    : [0, 8476, 9165, 9408],
                16384   : [0, 14406, 16145, 16327, 16384],
                36864   : [0, 32238, 36292, 36726, 36864],
                131072  : [0, 121069, 130381, 130989, 131072],
                32768   : [0, 29429, 32320, 32692, 32768],
                147456  : [0, 133258, 145944, 147255, 147456],
                65536   : [0, 58690, 64507, 65371, 65536],
                524288  : [0, 494762, 522078, 524067, 524238, 524288],
                589824  : [0, 539407, 584654, 589214, 589738, 589824],
                262144  : [0, 237433, 259437, 261782, 262062, 262144],
                2097152 : [0, 1990620, 2088919, 2096322, 2097036, 2097152],
                2359296 : [0, 2188168, 2341896, 2356580, 2358793, 2359296],
                1048576 : [0, 981145, 1041707, 1047784, 1048461, 1048576],
                2050048 : [0, 1980923, 2044274, 2049225, 2049929, 2050048]}
        return breaks[N]


    @staticmethod
    def LeastSquares(X, y):  # returns (X'X)^-1 X'y
        Xtrans = tf.transpose(X)
        tmp = tf.matmul(Xtrans, X)
        inverse = tf.linalg.inv(tmp)
        theta_estimates = tf.matmul(tf.matmul(inverse, Xtrans), y)
        return theta_estimates

    @staticmethod
    def is_convolutional(model, N):
        # print(model)
        if model == "resnet20_v2":
            conv_sizes = [432, 2304, 4608, 9216, 18432, 36864]
        elif model == "vgg16":
            conv_sizes = [1728, 36864, 73728, 147456, 294912, 589824, 1179648, 2359296]
        elif model == "resnet50":
            conv_sizes = [4096, 9408, 16384, 36864, 131072, 32768, 147456, 65536, 524288,
                          589824, 262144, 2097152, 2359296, 1048576, 2050048]
        if N in conv_sizes:
            return True
        return False

    @staticmethod
    def get_num_of_segments(model, N):
        if model == "resnet20_v2":
            segments = {432:2, 2304:3, 4608:3, 9216:3, 18432:3, 36864:3}
        elif model == "vgg16":
            segments = {1728:3, 36864:4, 73728:4, 147456:4, 294912:5, 589824:5, 1179648:5, 2359296:5}
        elif model == "resnet50":
            segments = {4096:3, 9408:3, 16384:4, 36864:4, 131072:4, 32768:4, 147456:4, 65536:4, 524288:5,
                        589824:5, 262144:5, 2097152:5, 2359296:5, 1048576:5, 2050048:5}
        return segments[N]


# Bloom on CPU
class BloomFilterCompressor(Compressor):
    global_step = 0
    """"""
    @staticmethod
    def bloom_configuration(k, fpr):
        # Given FPR compute M and H
        m = (k*abs(math.log(fpr))) / (math.pow(math.log(2), 2))
        # Give bloom size in number of bytes ; bloom size must be a multiple of 8
        m = int(m/8)
        rem = m % 8
        if rem != 0 or m == 0:
            m += 1
        h = (m*8 / k)*math.log(2)
        h = int(math.ceil(h))
        return m, h
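
    # Worked example for bloom_configuration (illustrative numbers): with
    # k = 1000 true indices and fpr = 0.01,
    #   m = 1000 * |ln(0.01)| / ln(2)^2 ~= 9585 bits -> int(9585/8) = 1198 bytes,
    #   bumped to 1199 since 1198 % 8 != 0, and
    #   h = (1199*8 / 1000) * ln(2) ~= 6.65 -> 7 hash functions.
    # The 1199-byte filter then stands in for the 4000-byte explicit list of
    # 1000 int32 indices.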
@staticmethod 280 | def threshold_indices(tensor, params): 281 | tensor_flatten = tf.reshape(tensor, [-1]) 282 | threshold_val = tf.constant(params["threshold_val"], dtype=tf.float32) 283 | threshold_val = tf.math.minimum(threshold_val, tf.reduce_max(tf.abs(tensor_flatten))) 284 | thr_mask = tf.math.greater_equal(tf.math.abs(tensor_flatten), threshold_val) 285 | indices = tf.reshape(tf.where(thr_mask), [-1]) 286 | indices = tf.sort(indices, axis=0, direction='ASCENDING') 287 | indices = tf.cast(indices, dtype=tf.int32) 288 | return indices 289 | 290 | @staticmethod 291 | def randomk_indices(tensor_name, N, K): 292 | all_indices = tf.range(N, dtype=tf.int32) 293 | h = hash(tensor_name) + BloomFilterCompressor.global_step 294 | tf.compat.v1.set_random_seed(1) 295 | indices = tf.random.shuffle(all_indices, seed=h)[:K] 296 | indices = tf.sort(indices, axis=0, direction='ASCENDING') 297 | BloomFilterCompressor.global_step += 1 298 | return indices 299 | 300 | @staticmethod 301 | def compress(tensor, params): 302 | 303 | tensor_shape = tf.shape(tensor) 304 | tensor_flatten = tf.reshape(tensor, [-1]) 305 | elemnum = tensor_flatten.get_shape().as_list()[0] 306 | 307 | compress_ratio = params["compress_ratio"] 308 | k = max(1, int(elemnum * compress_ratio)) 309 | 310 | # Configure bloom filter's m, k values 311 | assert params["bloom_fpr"] is not None, "False Positive Rate is None" 312 | params['m'], params['k'] = BloomFilterCompressor.bloom_configuration(k, params["bloom_fpr"]) 313 | assert params['k'] < 256, "Number of hash functions too big" 314 | 315 | # params["bloom_config"].add_data(k, params['m']*8, params['k'], params["bloom_fpr"]) 316 | # params["throughput_info"].add_data(elemnum, elemnum/8, params['m']*8, (params['m']*8)/8, elemnum-params['m']*8, (elemnum-params['m']*8)/8) 317 | 318 | if params['bloom_on'] == "topk": # Topk Sparsification Method 319 | indices = BloomFilterCompressor.topk_indices(tensor_flatten, k) 320 | elif params['bloom_on'] == "randomk": # Randomk Sparsification Method 321 | indices = BloomFilterCompressor.randomk_indices(tensor.name, elemnum, k) 322 | elif params['bloom_on'] == "threshold": # threshold Sparsification Method 323 | indices = BloomFilterCompressor.threshold_indices(tensor, params) 324 | 325 | values = tf.gather(tensor_flatten, indices) 326 | values = tf.bitcast(values, tf.int32) 327 | 328 | filename = resource_loader.get_path_to_datafile('bloom_filter_compression.so') 329 | library = load_library.load_op_library(filename) 330 | bloom_compressor = library.bloom_compressor 331 | 332 | log_initial_tensor = tf.bitcast(tensor_flatten, tf.int32) 333 | compressed_tensor = bloom_compressor(values, indices, log_initial_tensor, tf.train.get_or_create_global_step(), 334 | false_positives_aware=params['bloom_false_positives_aware'], 335 | policy=params['bloom_policy'], 336 | fpr=params["bloom_fpr"], 337 | hash_num=params['k'], 338 | bloom_size=params['m'], 339 | bloom_logs_path=params['bloom_logs_path'], 340 | gradient_id=params['gradient_id'], 341 | verbosity_frequency=params['bloom_verbosity_frequency'], 342 | verbosity=params['bloom_verbosity'], 343 | rank=rank()) 344 | ctx = tensor_shape 345 | params['tensors_size_are_same'] = False 346 | return compressed_tensor, ctx 347 | 348 | @staticmethod 349 | def decompress(compressed_tensor, ctx, params): 350 | 351 | tensor_shape = ctx 352 | tensor_size = tf.math.reduce_prod(tensor_shape) 353 | 354 | filename = resource_loader.get_path_to_datafile('bloom_filter_compression.so') 355 | library = 
load_library.load_op_library(filename) 356 | bloom_decompressor = library.bloom_decompressor 357 | 358 | decompressed_tensor = bloom_decompressor(compressed_tensor, tensor_size, 359 | tf.train.get_or_create_global_step(), 360 | policy=params['bloom_policy'], 361 | mem_mode=params['mem_mode'], 362 | hash_num=params['k'], 363 | bloom_size=params['m'], 364 | bloom_logs_path=params['bloom_logs_path'], 365 | gradient_id=params['gradient_id'], 366 | verbosity_frequency=params['bloom_verbosity_frequency'], 367 | verbosity=params['bloom_verbosity'], 368 | suffix=params['suffix'], 369 | rank=rank()) 370 | 371 | decompressed_tensor = tf.bitcast(decompressed_tensor, tf.float32) 372 | decompressed_tensor = tf.reshape(decompressed_tensor, tensor_shape) 373 | return decompressed_tensor 374 | 375 | 376 | # Double exponential on GPU 377 | class DoubleExpCompressor(Compressor): 378 | # Values_Approximation_Compressor + TopK 379 | 380 | @staticmethod 381 | def compress(tensor, params): 382 | tensor_shape = tf.shape(tensor) 383 | tensor_flatten = tf.reshape(tensor, [-1]) 384 | N = tensor_flatten.get_shape().as_list()[0] 385 | params['N'] = int(N) 386 | # print("Tensor", tensor, "size:", params['N']) 387 | 388 | elemnum = tensor_flatten.get_shape().as_list()[0] 389 | compress_ratio = params["compress_ratio"] 390 | 391 | k = max(1, int(elemnum * compress_ratio)) 392 | params['K'] = k 393 | abs_vals, indices = tf.math.top_k(tf.math.abs(tensor_flatten), k) # sorted by default, descending order 394 | values = tf.gather(tensor_flatten, indices) 395 | # Values Approximation 396 | if elemnum > 9000: 397 | # Indices have a negative sign if they correspond to negative values and positive otherwise 398 | indices = tf.cast(indices, dtype=tf.int32) 399 | indices = (indices + 1) * (tf.cast(values > 0, dtype=tf.int32) * 2 - 1) 400 | 401 | mapping = tf.argsort(abs_vals, axis=0, direction='ASCENDING') 402 | abs_vals = tf.gather(abs_vals, mapping) 403 | indices = tf.gather(indices, mapping) 404 | 405 | # Fitting the curve 406 | X = tf.cast(tf.range(1, k + 1), tf.float64) 407 | coefficients = Values_Approximation_Helper.double_exponential_fit(X, tf.cast(abs_vals, tf.float64), k) 408 | 409 | coefficients = tf.reshape(coefficients, [-1]) 410 | 411 | tensor_compressed = indices, coefficients 412 | params['X_train'] = X 413 | 414 | else: # No approximation 415 | tensor_compressed = indices, values 416 | 417 | ctx = tensor_shape, elemnum 418 | params['tensors_size_are_same'] = True 419 | return tensor_compressed, ctx 420 | 421 | @staticmethod 422 | def decompress(tensor_compressed, ctx, params): 423 | tensor_shape, elemnum = ctx 424 | tensor_size = tf.math.reduce_prod(tensor_shape) 425 | 426 | if elemnum > 9000: 427 | indices, message = tensor_compressed 428 | 429 | y_estimates = message[0] * tf.math.exp(message[2] * params['X_train']) + \ 430 | message[1] * tf.math.exp(message[3] * params['X_train']) 431 | 432 | values = tf.reshape(y_estimates, [-1]) 433 | 434 | values = tf.cast(values, dtype=tf.float32) * tf.cast(tf.math.sign(indices), dtype=tf.float32) 435 | indices = tf.math.abs(indices) - 1 436 | 437 | else: 438 | indices, values = tensor_compressed 439 | 440 | tensor_decompressed = tf.scatter_nd(tf.expand_dims(indices, 1), values, [tensor_size]) 441 | tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) 442 | return tensor_decompressed 443 | 444 | 445 | # PolyFit on GPU 446 | class PolySegCompressor(Compressor): 447 | 448 | @staticmethod 449 | def compress(tensor, params): 450 | tensor_shape = tf.shape(tensor) 451
| tensor_flatten = tf.reshape(tensor, [-1]) 452 | N = tensor_flatten.get_shape().as_list()[0] 453 | compress_ratio = params["compress_ratio"] 454 | params['N'] = params['K'] = int(N) 455 | 456 | # print("Tensor", tensor, "size:", params['N']) 457 | 458 | if Values_Approximation_Helper.is_convolutional(params['model_name'], params['N']): 459 | 460 | abs_values = tf.math.abs(tensor_flatten) 461 | 462 | if params['approximation_mode'] == "topk": 463 | K = max(1, int(N * compress_ratio)) 464 | params['K'] = K 465 | top_values, mapping = tf.math.top_k(abs_values, K, sorted=False) 466 | sorted_mapping = tf.argsort(top_values, axis=0, direction='ASCENDING') 467 | values = tf.gather(top_values, sorted_mapping) 468 | mapping = tf.gather(mapping, sorted_mapping) 469 | else: 470 | mapping = tf.argsort(abs_values, axis=0, direction='ASCENDING') 471 | values = tf.gather(abs_values, mapping) 472 | 473 | # Indices have a negative sign if they correspond to negative values and positive otherwise 474 | negative_indices = tf.where(tf.less(tf.gather(tensor_flatten, mapping), 0)) 475 | Nneg = tf.size(negative_indices) 476 | mask = tf.tensor_scatter_nd_update(tf.ones([params['K']], dtype=tf.int32), negative_indices, 477 | -tf.ones(Nneg, dtype=tf.int32)) 478 | mapping = (mapping + 1) * mask 479 | 480 | # Fitting the curve segments 481 | params['num_of_segments'] = Values_Approximation_Helper.get_num_of_segments(params['model_name'], params['N']) 482 | # break_points, sizes = Values_Approximation_Helper.find_breaks(values, params['num_of_segments'], params['K']) 483 | break_points = Values_Approximation_Helper.get_breaks(params['model_name'], N) 484 | sizes = [break_points[i+1]-break_points[i] for i in range(params['num_of_segments'])] 485 | 486 | # params['X'] = {} 487 | coefficients = [] 488 | for i in range(params['num_of_segments']): 489 | x = tf.reshape(tf.cast(tf.range(0, sizes[i]), tf.float64), [1, sizes[i]]) 490 | X = Values_Approximation_Helper.GetInputMatrix_Polynomial(params['polynomial_degree'], x) 491 | y = tf.reshape(values[break_points[i]: break_points[i+1]], [sizes[i], 1]) 492 | coefficients += [Values_Approximation_Helper.LeastSquares(X, tf.cast(y, tf.float64))] 493 | # params['X'][i] = X 494 | coefficients = tf.convert_to_tensor(coefficients) 495 | coefficients = tf.reshape(coefficients, [-1]) 496 | 497 | ##################### Logging ##################### 498 | # filename = resource_loader.get_path_to_datafile('mpi_lib.cpython-36m-x86_64-linux-gnu.so') 499 | # library = load_library.load_op_library(filename) 500 | # logger = library.logger 501 | # logger = logger(tensor_flatten, coefficients, tf.train.get_or_create_global_step(), 502 | # bloom_logs_path=params['bloom_logs_path'], 503 | # gradient_id=params['gradient_id'], 504 | # verbosity_frequency=params['bloom_verbosity_frequency'], 505 | # verbosity=params['bloom_verbosity'], 506 | # rank=rank()) 507 | ##################### / Logging ##################### 508 | 509 | compressed_indices = mapping # Possible indices compression 510 | # with tf.control_dependencies([logger]): 511 | sizes = tf.cast(sizes, tf.float64) 512 | compressed_indices = tf.cast(compressed_indices, tf.float64) 513 | tensor_compressed = tf.concat([sizes, coefficients, compressed_indices], 0) 514 | # tensor_compressed = tf.concat([coefficients, compressed_indices], 0) 515 | else: 516 | tensor_compressed = tensor 517 | 518 | ctx = tensor_shape 519 | params['tensors_size_are_same'] = True 520 | return tensor_compressed, ctx 521 | 522 | @staticmethod 523 | def 
decompress(tensor_compressed, ctx, params): 524 | tensor_shape = ctx 525 | 526 | if Values_Approximation_Helper.is_convolutional(params['model_name'], params['N']): 527 | sizes, coefficients, indices = tf.split(tensor_compressed, [params['num_of_segments'], 528 | params['polynomial_degree'] * params[ 529 | 'num_of_segments'], 530 | params['K']]) 531 | # coefficients, indices = tf.split(tensor_compressed, [params['polynomial_degree'] * 532 | # params['num_of_segments'], 533 | # params['N']]) 534 | coefficients = tf.reshape(coefficients, [params['num_of_segments'], params['polynomial_degree']]) 535 | decompressed_indices = tf.cast(indices, tf.int32) 536 | sizes = tf.cast(sizes, tf.int32) 537 | negative_indices = tf.where(tf.less(decompressed_indices, 0)) 538 | decompressed_indices = tf.math.abs(decompressed_indices) 539 | decompressed_indices = decompressed_indices - 1 540 | Nneg = tf.size(negative_indices) 541 | mask = tf.tensor_scatter_nd_update(tf.ones(tf.shape(decompressed_indices), dtype=tf.int32), negative_indices, 542 | -tf.ones(Nneg, dtype=tf.int32)) 543 | y_segments = [] 544 | for i in range(params['num_of_segments']): 545 | x = tf.reshape(tf.cast(tf.range(0, sizes[i]), tf.float64), [1, sizes[i]]) 546 | X = Values_Approximation_Helper.GetInputMatrix_Polynomial(params['polynomial_degree'], x) 547 | # X = params['X'][i] 548 | y_segments += [tf.matmul(X, tf.reshape(coefficients[i], [params['polynomial_degree'], 1]))] 549 | values = tf.reshape(tf.concat(y_segments, axis=0), [params['K']]) 550 | values = values * tf.cast(mask, tf.float64) 551 | 552 | decompressed_indices = tf.expand_dims(decompressed_indices, 1) 553 | tensor_decompressed = tf.scatter_nd(decompressed_indices, tf.cast(values, tf.float32), [params['N']]) 554 | tensor_decompressed = tf.reshape(tensor_decompressed, tensor_shape) 555 | else: 556 | tensor_decompressed = tensor_compressed 557 | 558 | return tensor_decompressed -------------------------------------------------------------------------------- /pytorch/deepreduce.py: -------------------------------------------------------------------------------- 1 | """ 2 | DeepReduce compression algorithms. 
3 | Author: Hang Xu 4 | """ 5 | 6 | import torch 7 | from grace_dl.dist import Compressor 8 | from grace_dl.dist.helper import tensor_bits 9 | 10 | ######################################################################## 11 | # DeepReduce Framework 12 | 13 | 14 | class SparseCompressor(object): 15 | """Interface for compressing and decompressing a given sparse tensor.""" 16 | 17 | @staticmethod 18 | def compress(sparse_tensor, ctx): 19 | """Compresses a sparse tensor and returns it with the context needed to decompress it.""" 20 | raise NotImplementedError("compress was not implemented.") 21 | 22 | @staticmethod 23 | def decompress(sparse_tensor, ctx): 24 | """Decompresses a sparse tensor with the given context.""" 25 | raise NotImplementedError("decompress was not implemented.") 26 | 27 | 28 | def deepreduce_from_params(params): 29 | from grace_dl.dist.helper import grace_from_params 30 | # only sparsification compressors are valid if params['deepreduce'] is not None 31 | if params['compressor'] in ['SKCompressCPU', 'sketch', 'SKCompressGPU']: 32 | hash_30m = torch.load("./deepreduce/hash_table_18m_int.pt").cuda() 33 | params['hash_table'] = hash_30m 34 | grc = grace_from_params(params) 35 | 36 | deepreduce = params.get('deepreduce', None) # value, index, both 37 | deepreduce_wrapper = {'value': ValueCompressor, 38 | 'index': IndexCompressor, 39 | 'both': DeepReduce} 40 | if deepreduce: 41 | # the hash table has to be passed in by dictionary to avoid an excessive memory footprint on the GPU 42 | if params.get('index', 'bloom') in ['bloom']: 43 | hash_30m = torch.load("/data/scratch/hang/deepreduce/hash_table_18m_int.pt").cuda() 44 | params['hash_table'] = hash_30m 45 | compressor = deepreduce_wrapper[deepreduce](grc.compressor, params) 46 | grc.compressor = compressor 47 | 48 | return grc 49 | 50 | 51 | class ValueCompressor(Compressor): 52 | 53 | def __init__(self, sparsifier, params=None): 54 | super().__init__(tensors_size_are_same=sparsifier.tensors_size_are_same) 55 | self.sparsifier = sparsifier 56 | self.params = params 57 | self.val_compressor = compressor[self.params.get('value', 'polyfit')] 58 | if self.params.get('value', 'polyfit') not in ['polyfit']: 59 | self.tensors_size_are_same = False 60 | 61 | def compress(self, tensor, name): 62 | tensors, ctx = self.sparsifier.compress(tensor, name) 63 | vals, idxs = tensors 64 | shape = ctx 65 | 66 | # for performance reasons, we don't apply compression to small sparse tensors; 67 | # in our experiments, when the top-r ratio is set to 0.01, we use 1000 to filter tensors.
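        # (A hedged reading of the cutoff below: shape.numel() is the size of the
        #  dense tensor, so with a top-r ratio of 0.01 a tensor at the 1000-element
        #  cutoff carries only ~10 selected values, too few for curve fitting or a
        #  bloom filter to pay for their fixed overhead.)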
68 | if shape.numel() > 1000: 69 | sparse_tensor = vals, idxs, tensor.size() 70 | import time 71 | torch.cuda.synchronize() 72 | start = time.time() 73 | vals, idxs, shape = self.val_compressor.compress(sparse_tensor, self.params) 74 | if self.params.get('micro-benchmark', False): 75 | torch.cuda.synchronize() 76 | print(f'val_compression time:{time.time() - start}') 77 | ctx = shape 78 | return (vals, idxs), ctx 79 | 80 | def decompress(self, tensors, ctx): 81 | shape = ctx 82 | vals, idxs = tensors 83 | 84 | if shape.numel() > 1000: 85 | import time 86 | torch.cuda.synchronize() 87 | start = time.time() 88 | fitted_sparse_tensor = vals, idxs, shape 89 | vals, idxs, shape = self.val_compressor.decompress(fitted_sparse_tensor, self.params) 90 | if self.params.get('micro-benchmark', False): 91 | torch.cuda.synchronize() 92 | print(f'val_decompression time:{time.time() - start}') 93 | dense_tensor_bits = shape.numel() * 32 94 | print(f'idx_relative_volume: {(tensor_bits([tensors[1]]) / dense_tensor_bits):.4f}') 95 | print(f'val_relative_volume: {(tensor_bits([tensors[0]]) / dense_tensor_bits):.4f}') 96 | tensor_decompressed = self.sparsifier.decompress((vals, idxs), shape) 97 | return tensor_decompressed 98 | 99 | 100 | class IndexCompressor(Compressor): 101 | 102 | def __init__(self, sparsifier, params=None): 103 | super().__init__(tensors_size_are_same=sparsifier.tensors_size_are_same) 104 | self.sparsifier = sparsifier 105 | self.params = params 106 | self.idx_compressor = compressor[self.params.get('index', 'bloom')] 107 | if self.params.get('index', 'bloom') not in ['bloom', ]: 108 | self.tensors_size_are_same = False 109 | 110 | def compress(self, tensor, name): 111 | tensors, ctx = self.sparsifier.compress(tensor, name) 112 | vals, idxs = tensors 113 | shape = ctx 114 | 115 | if shape.numel() > 1000: 116 | sparse_tensor = vals, idxs, tensor.size() 117 | self.params['dense_tensor'] = tensor 118 | 119 | import time 120 | torch.cuda.synchronize() 121 | start = time.time() 122 | 123 | vals, idxs, shape = self.idx_compressor.compress(sparse_tensor, self.params) 124 | 125 | if self.params.get('micro-benchmark', False): 126 | torch.cuda.synchronize() 127 | print(f'idx_compression time:{time.time() - start}') 128 | 129 | ctx = shape 130 | return (vals, idxs), ctx 131 | 132 | def decompress(self, tensors, ctx): 133 | shape = ctx 134 | vals, idxs = tensors 135 | 136 | if shape.numel() > 1000: 137 | bf_sparse_tensor = vals, idxs, shape 138 | 139 | import time 140 | torch.cuda.synchronize() 141 | start = time.time() 142 | 143 | vals, idxs, shape = self.idx_compressor.decompress(bf_sparse_tensor, self.params) 144 | 145 | if self.params.get('micro-benchmark', False): 146 | torch.cuda.synchronize() 147 | print(f'idx_decompression time:{time.time() - start}') 148 | dense_tensor_bits = shape.numel() * 32 149 | print(f'idx_relative_volume: {(tensor_bits([tensors[1]]) / dense_tensor_bits):.4f}') 150 | print(f'val_relative_volume: {(tensor_bits([tensors[0]]) / dense_tensor_bits):.4f}') 151 | 152 | tensor_decompressed = self.sparsifier.decompress((vals, idxs), shape) 153 | return tensor_decompressed 154 | 155 | 156 | class DeepReduce(Compressor): 157 | 158 | def __init__(self, sparsifier, params=None): 159 | super().__init__(tensors_size_are_same=sparsifier.tensors_size_are_same) 160 | self.sparsifier = sparsifier 161 | self.params = params 162 | self.val_compressor = compressor[self.params.get('value', 'polyfit')] 163 | self.idx_compressor = compressor[self.params.get('index', 'bloom')] 164 | 165 | 
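    # Hedged round-trip sketch for the fixed-width packers below: pack_() packs
    # three 21-bit non-negative integers per signed 64-bit word and appends the
    # element count; unpack_() reverses it. The tensor values are hypothetical,
    # and a CUDA device is assumed because pack_() allocates the count on .cuda():
    #
    #   m = torch.tensor([1, 2, 3, 4, 5], device='cuda')
    #   enc = DeepReduce.pack_(m)   # two packed 64-bit words plus the count 5
    #   assert torch.equal(DeepReduce.unpack_(enc).long(), m)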
@staticmethod 166 | def pack_(mapping, max_val=None): 167 | ''' 168 | :param mapping: values to be packed 169 | :param max_val: unused here; bits (the number of bits per value) is fixed to 21 below 170 | :return: 64-bit signed integer tensor 171 | currently it only supports packing 3 values into one signed 64-bit integer 172 | ''' 173 | bits = 21 174 | mapping = mapping.long() 175 | N = mapping.numel() 176 | chunk_size = 63 // bits 177 | padding = torch.zeros([chunk_size - N % chunk_size], device=mapping.device).long() 178 | padded = torch.cat([mapping, padding], dim=0).view(chunk_size, -1) 179 | encode = padded[0] * pow(2, 2 * bits) + padded[1] * pow(2, bits) + padded[2] 180 | return torch.cat([encode, torch.tensor([N]).cuda()], dim=0) 181 | 182 | @staticmethod 183 | def unpack_(encode): 184 | bits = 21 185 | N = encode[-1] 186 | encode = encode[:-1] 187 | a1 = encode // pow(2, 2 * bits) 188 | a3 = encode % pow(2, bits) 189 | a2 = (encode // pow(2, bits)) % pow(2, bits) 190 | unpack = torch.cat([a1, a2, a3], dim=0)[:N] 191 | return unpack.int() 192 | 193 | @staticmethod 194 | def pack(mapping, max_val=None): 195 | from math import log, ceil 196 | import cupy 197 | max_val = mapping.max().item() if max_val is None else max_val 198 | N = mapping.numel() 199 | bits = log(max_val, 2) 200 | 201 | t = mapping.int() 202 | a = [(t % 256).byte(), (t // 256 % 256).byte(), (t // pow(2, 16) % 256).byte(), (t // pow(2, 24)).byte(), ] 203 | body_len = int(ceil(bits) // 8) # 0,1,2,3 204 | body = a[:body_len] 205 | head = a[body_len] 206 | 207 | head_len = ceil(bits) % 8 208 | exp = [] 209 | if head_len > 0: 210 | head = cupy.asarray(head) 211 | for i in range(head_len): 212 | e = torch.as_tensor(cupy.packbits(head % 2), device="cuda") 213 | exp.append(e) 214 | head = cupy.right_shift(head, 1) 215 | 216 | N = torch.cuda.ByteTensor([N % 256, N // 256 % 256, N // pow(2, 16) % 256, N // pow(2, 24)]) 217 | bits = torch.cuda.ByteTensor([ceil(bits)]) 218 | encode = torch.cat([N, bits] + body + exp, dim=0) 219 | 220 | return encode 221 | 222 | @staticmethod 223 | def unpack(encode): 224 | from math import log, ceil 225 | import cupy 226 | N, bits, encode = encode.split([4, 1, encode.numel() - 5]) 227 | N = torch.sum(N.int() * torch.cuda.IntTensor([1, 256, 256 ** 2, 256 ** 3])).item() 228 | bits = bits.item() 229 | body_len = int(bits // 8) # 0,1,2,3 230 | head_len = ceil(bits) % 8 231 | decode = torch.zeros([N], device='cuda').int() 232 | if body_len > 0: 233 | body, exp = encode.split([body_len * N, encode.numel() - body_len * N]) 234 | body = body.view(body_len, -1).int() 235 | for i in range(body_len): 236 | decode += body[i] * pow(2, i * 8) 237 | else: 238 | exp = encode 239 | 240 | if head_len > 0: 241 | head = torch.zeros([N], device='cuda').int() 242 | exp = exp.view(head_len, -1) 243 | for i in range(head_len): 244 | t = torch.as_tensor(cupy.unpackbits(cupy.asarray(exp[i])), device="cuda")[:N] 245 | head += t.int() * pow(2, i) 246 | decode += head * pow(2, body_len * 8) 247 | 248 | return decode.long() 249 | 250 | def compress(self, tensor, name): 251 | tensors, ctx = self.sparsifier.compress(tensor, name) 252 | vals, idxs = tensors 253 | shape = ctx 254 | 255 | import time 256 | torch.cuda.synchronize() 257 | start = time.time() 258 | 259 | if shape.numel() > 1000: 260 | sparse_tensor = vals, idxs, tensor.size() 261 | vals, idxs, _ = self.idx_compressor.compress(sparse_tensor, self.params) 262 | new_idxs = torch.arange(vals.numel(), device=vals.device) 263 | vals, mapping, shape = self.val_compressor.compress((vals, new_idxs,
shape), self.params) 264 | # mapping = mapping.int() 265 | # mapping = self.pack(mapping, max_val=mapping.numel() - 1) 266 | ctx = shape 267 | tensors = (vals, idxs, mapping) 268 | 269 | if self.params.get('micro-benchmark', False): 270 | torch.cuda.synchronize() 271 | print(f'_compression time:{time.time() - start}') 272 | return tensors, ctx 273 | 274 | def decompress(self, tensors, ctx): 275 | shape = ctx 276 | 277 | import time 278 | torch.cuda.synchronize() 279 | start = time.time() 280 | 281 | if shape.numel() > 1000: 282 | vals, idxs, mapping = tensors 283 | # mapping = self.unpack(mapping) 284 | fitted_sparse_tensor = vals, mapping, shape 285 | vals, _, _ = self.val_compressor.decompress(fitted_sparse_tensor, self.params) 286 | 287 | bf_sparse_tensor = mapping, idxs, shape 288 | _, idxs, _ = self.idx_compressor.decompress(bf_sparse_tensor, self.params) 289 | 290 | idxs = idxs[mapping.long()] 291 | else: 292 | vals, idxs = tensors 293 | 294 | if self.params.get('micro-benchmark', False): 295 | torch.cuda.synchronize() 296 | print(f'_decompression time:{time.time() - start}') 297 | dense_tensor_bits = shape.numel() * 32 298 | print(f'idx_relative_volume: {(tensor_bits(tensors[1:]) / dense_tensor_bits):.4f}') 299 | print(f'val_relative_volume: {(tensor_bits([tensors[0]]) / dense_tensor_bits):.4f}') 300 | 301 | tensor_decompressed = self.sparsifier.decompress((vals, idxs), shape) 302 | return tensor_decompressed 303 | 304 | 305 | ######################################################################## 306 | # PolyFit on GPU, not order-preserving 307 | 308 | def GetInputMatrix_Polynomial(N, degree, device): 309 | ''' 310 | degree: polynomial degree 311 | N: the number of elements in fitting values 312 | ''' 313 | x = torch.arange(1, N + 1, device=device).view(-1, 1).float() 314 | t0 = torch.ones(degree + 1, device=device).view(1, -1) 315 | basis = torch.matmul(x, t0) 316 | 317 | t1 = torch.ones(N, device=device).view(-1, 1) 318 | t2 = torch.arange(0, degree + 1, device=device).view(1, -1).float() 319 | exp = torch.matmul(t1, t2) 320 | 321 | X_mat = torch.pow(basis, exp) 322 | del x, t0, t1, t2, basis, exp 323 | return X_mat 324 | 325 | 326 | def LeastSquares(X, y): # returns (X'X)^-1 X'y 327 | X = X.double() 328 | y = y.double() 329 | Xtrans = torch.transpose(X, 0, 1) 330 | tmp = torch.matmul(Xtrans, X) 331 | # torch.inverse on small matrix is much faster on cpu than GPU, see here: 332 | # the size of tmp is [degree x degree] 333 | # https://github.com/pytorch/pytorch/issues/2219 334 | inverse = torch.inverse(tmp.cpu()).cuda() 335 | 336 | theta_estimates = torch.matmul(torch.matmul(inverse, Xtrans), y) 337 | del Xtrans, tmp, inverse, X, y 338 | return theta_estimates 339 | 340 | 341 | def RestoreValues(N, coefficients): 342 | degree = coefficients.numel() - 1 343 | X = GetInputMatrix_Polynomial(N, degree, device=coefficients.device) 344 | X = X.double() 345 | y = torch.matmul(X, coefficients.view(-1, 1)) 346 | del X 347 | return y.view(-1) 348 | 349 | 350 | # def get_segments(N, num_pos=0): 351 | # ''' 352 | # for same N, it returns same segments 353 | # ''' 354 | # segments = [] 355 | # for r in [1/5, 1/10, 1 / 30, 1 / 100, 1/300, 1/1000, 1/3000, 1 / 10000, 1 / 30000, 1/100000]: 356 | # if int(N*r) > 30: 357 | # segments.append(int(N*r)) 358 | # segments = segments[::-1] + [N-2*sum(segments)] + segments 359 | # return segments 360 | 361 | 362 | def get_segments(N, num_pos=0): 363 | ''' 364 | for different nodes,num_pos in the grad are different, thus segments are different 365 | need 
to set tensors_size_are_same=False for allgather communication 366 | this is useful for TopK due to the discontinuity between positive and negative values 367 | ''' 368 | segments, pos, neg = [], [], [] 369 | num_neg = N - num_pos 370 | for r in [1 / 5, 1 / 10, 1 / 30, 1 / 100, 1 / 300, 1 / 1000, 1 / 3000, 1 / 10000, 1 / 30000, 1 / 100000]: 371 | if int(num_pos * r) > 30: 372 | pos.append(int(num_pos * r)) 373 | if int(num_neg * r) > 30: 374 | neg.append(int(num_neg * r)) 375 | segments = pos[::-1] + [num_pos - sum(pos)] + [num_neg - sum(neg)] + neg 376 | 377 | return segments 378 | 379 | 380 | class PolyFit(SparseCompressor): 381 | order_preserving = False 382 | @staticmethod 383 | def compress(sparse_tensor, params): 384 | sort = params.get('sort', False) 385 | degree = params.get('poly_degree', 5) 386 | vals, idxs, shape = sparse_tensor 387 | N = idxs.numel() 388 | y_all = vals 389 | num_pos = torch.sum(y_all > 0.0) 390 | 391 | if not sort: 392 | y_all, mapping = y_all.sort(descending=True) 393 | idxs = idxs[mapping] 394 | 395 | coefficients = [] 396 | segments = get_segments(N, num_pos.item()) 397 | 398 | for y in y_all.split(segments): 399 | n = y.numel() 400 | X = GetInputMatrix_Polynomial(n, degree, device=y_all.device) 401 | a = LeastSquares(X, y) 402 | coefficients.append(a) 403 | del X 404 | 405 | coefficients.append(num_pos.double().view(-1)) 406 | coefficients_tensor = torch.cat(coefficients, dim=0) 407 | fitted_sparse_tensor = coefficients_tensor, idxs, shape 408 | del vals, y_all, idxs 409 | return fitted_sparse_tensor 410 | 411 | @staticmethod 412 | def decompress(fitted_sparse_tensor, params): 413 | coefficients_tensor, idxs, shape = fitted_sparse_tensor 414 | N = idxs.numel() 415 | coefficients_tensor, num_pos = coefficients_tensor.split([coefficients_tensor.numel() - 1, 1]) 416 | segments = get_segments(N, num_pos.int().item()) 417 | chunk_size = int(coefficients_tensor.numel()/len(segments)) 418 | coefficients = coefficients_tensor.split(chunk_size) 419 | y_fit = [] 420 | for a, n in zip(coefficients, segments): 421 | y_fit.append(RestoreValues(n, a)) 422 | vals = torch.cat(y_fit, dim=0).float() 423 | sparse_tensor = vals, idxs, shape 424 | del y_fit, idxs 425 | return sparse_tensor 426 | 427 | 428 | ######################################################################## 429 | # Bloom on GPU, not order-preserving 430 | 431 | class Bloomfilter(set): 432 | 433 | def __init__(self, size, num_hash, params, bit_array=None): 434 | super(Bloomfilter, self).__init__() 435 | if bit_array is not None: 436 | self.bit_array = bit_array 437 | else: 438 | self.bit_array = torch.zeros(size, device='cuda', dtype=torch.bool) 439 | self.size = size 440 | self.num_hash = min(num_hash, params['hash_table'].size()[1]) 441 | self.params = params 442 | 443 | def __len__(self): 444 | return self.size 445 | 446 | def pack_bitarray(self): 447 | import cupy 448 | byte_tensor = self.bit_array.byte() 449 | pack = torch.as_tensor(cupy.packbits(cupy.asarray(byte_tensor)), device="cuda") 450 | self.bit_array = pack 451 | 452 | def unpack_bitarray(self): 453 | import cupy 454 | unpack = torch.as_tensor(cupy.unpackbits(cupy.asarray(self.bit_array)), device="cuda") 455 | self.bit_array = unpack[:self.size].bool() 456 | 457 | def add(self, items): 458 | ''' 459 | items: list of integers or torch int tensor 460 | ''' 461 | hash_table = self.params['hash_table'][:, :self.num_hash] % self.size 462 | index = hash_table[items.long()].flatten() 463 | self.bit_array[index.long()] = 1 464 | del index, hash_table 
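    # Hedged usage sketch for this GPU filter: membership tests index into the
    # caller-supplied params['hash_table'], an int tensor with one row of
    # precomputed hashes per possible index (see deepreduce_from_params). The
    # sizes below are hypothetical:
    #
    #   bloom = Bloomfilter(size=8192, num_hash=4, params=params)
    #   bloom.add(torch.tensor([3, 17, 42], device='cuda'))
    #   hits = bloom.query(1024)  # all i in [0, 1024) whose hash bits are set;
    #                             # may include false positives, never negatives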
465 | 466 | def query(self, query_range): 467 | ''' 468 | query_range will be range[0,query_range) 469 | returns all query results in this range 470 | ''' 471 | hash_table = self.params['hash_table'][:, :self.num_hash] % self.size 472 | index = hash_table[:query_range] 473 | bits = self.bit_array[index.long()] 474 | mask = torch.sum(bits, dim=1) == bits.size()[1] 475 | positives = torch.arange(query_range, device='cuda')[mask] 476 | del index, bits, mask, hash_table 477 | return positives 478 | 479 | def policy(self, positives, k, policy): 480 | if policy == 'leftmost': 481 | res = positives[:k] 482 | elif policy == 'random': 483 | # if positives.numel() is always same, then with same seed, you will get same permutation 484 | # in practice, when we get all non-zero elements from gradient, positives.numel() won't be the 485 | # same across different iterations. However, if use TopK with fixed ratio here, we need to set 486 | # different seeds at different iterations. 487 | torch.manual_seed(42) 488 | keys = torch.randperm(positives.numel(), device=positives.device)[:k] 489 | res = positives[keys] 490 | elif policy == 'p0': 491 | res = positives 492 | return res 493 | 494 | 495 | def get_BFconfig(capacity, fpr): 496 | import math 497 | # num_hash only depends on fpr 498 | num_hash = math.log(1 / fpr, 2) 499 | num_bits = num_hash * capacity / 0.693147180 500 | return math.ceil(num_hash), math.ceil(num_bits) 501 | 502 | 503 | class Bloom(SparseCompressor): 504 | order_preserving = False 505 | @staticmethod 506 | def compress(sparse_tensor, params): 507 | vals, idxs, shape = sparse_tensor 508 | grad_size = shape.numel() 509 | num_indices = vals.numel() 510 | 511 | fpr = 0.1 * num_indices / grad_size 512 | fpr = params.get('fpr', fpr) 513 | policy = params.get('policy', 'leftmost') 514 | num_hash, bf_size = get_BFconfig(num_indices, fpr) 515 | bloom = Bloomfilter(bf_size, num_hash, params) 516 | bloom.add(idxs) 517 | 518 | # apply fpaware 519 | dense_tensor = params.get('dense_tensor', None) 520 | if dense_tensor is not None: 521 | query_res = bloom.query(grad_size) 522 | new_idxs = bloom.policy(query_res, idxs.numel(), policy) 523 | vals = dense_tensor.flatten()[new_idxs] 524 | 525 | if policy == 'p0': 526 | num_indices = torch.as_tensor([num_indices], dtype=vals.dtype, device=vals.device) 527 | vals = torch.cat([num_indices, vals], dim=0) 528 | 529 | bloom.pack_bitarray() 530 | bf_sparse_tensor = vals, bloom.bit_array, shape 531 | 532 | del vals, bloom 533 | return bf_sparse_tensor 534 | 535 | @staticmethod 536 | def decompress(bf_sparse_tensor, params): 537 | vals, bit_array, shape = bf_sparse_tensor 538 | policy = params.get('policy', 'leftmost') 539 | if policy == 'p0': 540 | num_indices, vals = vals.split([1, vals.numel()-1]) 541 | num_indices = int(num_indices.item()) 542 | else: 543 | num_indices = vals.numel() 544 | grad_size = shape.numel() 545 | fpr = 0.1 * num_indices / grad_size 546 | fpr = params.get('fpr', fpr) 547 | 548 | num_hash, bf_size = get_BFconfig(num_indices, fpr) 549 | bloom = Bloomfilter(bf_size, num_hash, params, bit_array=bit_array) 550 | bloom.unpack_bitarray() 551 | query_res = bloom.query(grad_size) 552 | idxs = bloom.policy(query_res, num_indices, policy) 553 | sparse_tensor = vals, idxs, shape 554 | del bloom, query_res 555 | return sparse_tensor 556 | 557 | 558 | ######################################################################## 559 | # PolyFit on CPU, not order-preserving 560 | import numpy as np 561 | import numpy.polynomial.polynomial as poly 562 | 
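# Hedged sketch of the CPU fitting pipeline defined below: find_breaks() places
# breakpoints where the sorted curve deviates most from a straight line,
# fit_curve() fits one degree-5 polynomial per segment with numpy.polynomial,
# and restore_curve() re-evaluates the segments. The input curve is hypothetical:
#
#   y = np.sort(np.random.randn(10000))[::-1]      # descending, like sorted vals
#   coefficients, breaks = fit_curve(y, find_breaks(y))
#   y_fit = restore_curve(coefficients, breaks)    # len(y_fit) == len(y)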
import warnings 563 | warnings.simplefilter('ignore', np.RankWarning) 564 | 565 | 566 | def find_breaks(curve, num_of_breaks=10): 567 | # find breaks recursively, in ascending order 568 | y = curve 569 | breaks = [] 570 | break_index = 0 571 | 572 | for i in range(num_of_breaks): 573 | if len(y) < 20 * num_of_breaks: 574 | break 575 | line = np.linspace(y[0], y[-1], len(y)) 576 | distance = np.abs(line - y) 577 | break_index += np.argmax(distance) 578 | if (len(curve) - break_index) < 20 * num_of_breaks: 579 | break 580 | breaks.append(break_index) 581 | y = curve[break_index:] 582 | return breaks 583 | 584 | 585 | def fit_curve(curve, breaks, poly_degree=5): 586 | breaks = [0] + breaks + [len(curve)] 587 | size = breaks[-1] 588 | x = list(range(size)) 589 | 590 | yy = [curve[breaks[i - 1]:breaks[i]] for i in range(1, len(breaks))] 591 | xx = [x[breaks[i - 1]:breaks[i]] for i in range(1, len(breaks))] 592 | coefficients = [] 593 | # plt.plot(x, curve) 594 | # print("==debug==", breaks, ) 595 | for i in range(len(xx)): 596 | x = xx[i] 597 | y = yy[i] 598 | # print("==Debug==") 599 | # print(x, "\n", y) 600 | z, _ = poly.polyfit(x, y, poly_degree, full=True) 601 | # set full=True to turn off the rank warning 602 | coefficients.append(z) 603 | return coefficients, breaks 604 | 605 | 606 | def restore_curve(coefficients, breaks): 607 | size = breaks[-1] 608 | x = list(range(size)) 609 | xx = [x[breaks[i - 1]:breaks[i]] for i in range(1, len(breaks))] 610 | curve_fit = [] 611 | for i in range(len(xx)): 612 | x = xx[i] 613 | z = coefficients[i] 614 | # y_fit = [np.poly1d(z)(i) for i in x] 615 | y_fit = list(poly.polyval(x, z)) 616 | # plt.plot(x, y_fit) 617 | curve_fit += y_fit 618 | # print(coefficients) 619 | return curve_fit 620 | 621 | 622 | class PolyFitCPU(SparseCompressor): 623 | order_preserving = False 624 | @staticmethod 625 | def compress(sparse_tensor, params): 626 | vals, idxs, shape = sparse_tensor 627 | vals_sorted, mask = torch.sort(vals, descending=True) # sorted in descending order 628 | indices_sorted = idxs[mask] 629 | vals_sorted = vals_sorted.cpu() 630 | 631 | # fit values 632 | num_of_breaks = 5 633 | poly_degree = 5 634 | 635 | vals_sorted = np.asarray(vals_sorted) 636 | # vals_sorted = np.array([float(x) for x in vals_sorted]) 637 | num_pos = np.sum(vals_sorted > 0) 638 | if num_pos == 0: 639 | # all negative values 640 | y = vals_sorted 641 | breaks = find_breaks(y, num_of_breaks) 642 | coefficients, breaks = fit_curve(y, breaks, poly_degree) 643 | 644 | elif num_pos == len(vals_sorted): 645 | # all positive values 646 | y = vals_sorted[::-1] # reverse positive vals order to be ascending 647 | breaks = find_breaks(y, num_of_breaks) 648 | breaks = [len(y) - x for x in breaks[::-1]] 649 | 650 | y = vals_sorted # fit positive vals in original order 651 | coefficients, breaks = fit_curve(y, breaks, poly_degree) 652 | 653 | else: 654 | vals_pos = vals_sorted[vals_sorted > 0] 655 | vals_neg = vals_sorted[vals_sorted < 0] 656 | 657 | y = vals_pos[::-1] # reverse positive vals order to be ascending 658 | breaks = find_breaks(y, num_of_breaks) 659 | breaks_pos = [len(y) - x for x in breaks[::-1]] 660 | 661 | y = vals_neg 662 | breaks_neg = find_breaks(y, num_of_breaks) 663 | breaks_neg = [num_pos + x for x in breaks_neg] 664 | 665 | breaks = breaks_pos + [num_pos] + breaks_neg 666 | y = vals_sorted 667 | coefficients, breaks = fit_curve(y, breaks, poly_degree) 668 | 669 | coeff_tensor = torch.tensor(np.asarray(coefficients), dtype=torch.float64, device=idxs.device).flatten() 670 |
breaks_tensor = torch.tensor(np.asarray(breaks), dtype=torch.int32, device=idxs.device) 671 | 672 | # todo: encode coeff_tensor and breaks_tensor into one tensor 673 | compressed_vals = coeff_tensor, breaks_tensor 674 | 675 | return compressed_vals, indices_sorted, shape 676 | 677 | @staticmethod 678 | def decompress(sparse_tensor, params): 679 | compressed_vals, idxs, shape = sparse_tensor 680 | coeff_tensor, breaks_tensor = compressed_vals 681 | 682 | coefficients = np.asarray(coeff_tensor.cpu()) 683 | breaks = np.asarray(breaks_tensor.cpu()) 684 | coefficients = np.reshape(coefficients, [len(breaks) - 1, -1]) 685 | vals = restore_curve(coefficients, breaks) 686 | vals = torch.tensor(np.asarray(vals), dtype=torch.float32, device=idxs.device) 687 | 688 | return vals, idxs, shape 689 | 690 | 691 | ######################################################################## 692 | # Bloom on CPU, not order-preserving 693 | from pybloomfilter import BloomFilter 694 | 695 | 696 | class BloomCPU(SparseCompressor): 697 | order_preserving = False 698 | @staticmethod 699 | def compress(sparse_tensor, params): 700 | vals, idxs, shape = sparse_tensor 701 | grad_size = shape.numel() 702 | err_rate = len(idxs) * 0.1 / grad_size 703 | bf = BloomFilter(len(idxs), err_rate, '/tmp/test.bf') 704 | 705 | for i, index in enumerate(idxs): 706 | bf.add(int(index)) 707 | 708 | # apply fpaware 709 | dense_tensor = params.get('dense_tensor', None) 710 | if dense_tensor is not None: 711 | idx_pos = [] 712 | for idx in range(grad_size): 713 | if idx in bf: 714 | idx_pos.append(idx) 715 | new_idxs = idx_pos[:len(idxs)] 716 | vals = dense_tensor.flatten()[new_idxs] 717 | 718 | bf64 = bf.to_base64() 719 | bf_tensor = torch.tensor(np.asarray([int(x) for x in bf64]), dtype=torch.uint8, device=vals.device) 720 | 721 | return vals, bf_tensor, shape 722 | 723 | @staticmethod 724 | def decompress(sparse_tensor, params): 725 | vals, bf_tensor, shape = sparse_tensor 726 | grad_size = shape.numel() 727 | b64_repr = bytes(list(np.asarray(bf_tensor.cpu()))) 728 | bf = BloomFilter.from_base64('/tmp/test.bf', b64_repr) 729 | idx_pos = [] 730 | 731 | for idx in range(grad_size): 732 | if idx in bf: 733 | idx_pos.append(idx) 734 | new_indices = idx_pos[:len(vals)] 735 | idxs = torch.as_tensor(new_indices, device=vals.device) 736 | return vals, idxs, shape 737 | 738 | 739 | ######################################################################## 740 | # Gzip on CPU, order-preserving 741 | 742 | class Gzip(SparseCompressor): 743 | order_preserving = True 744 | @staticmethod 745 | def compress(sparse_tensor, params): 746 | import struct 747 | import zlib 748 | vals, idxs, shape = sparse_tensor 749 | data = vals.cpu() 750 | packed = struct.pack(f'{data.numel()}f', *data) 751 | zlib_packed = zlib.compress(packed) 752 | vals = torch.as_tensor(list(zlib_packed), dtype=torch.uint8, device=idxs.device) 753 | return vals, idxs, shape 754 | 755 | @staticmethod 756 | def decompress(gzip_sparse_tensor, params): 757 | import struct 758 | import zlib 759 | gzip_vals, idxs, shape = gzip_sparse_tensor 760 | gzip_vals = gzip_vals.cpu() 761 | packed = zlib.decompress(bytes(gzip_vals)) 762 | vals = struct.unpack(f'{int(len(packed)//4)}f', packed) 763 | vals = torch.as_tensor(vals, dtype=torch.float, device=idxs.device) 764 | return vals, idxs, shape 765 | 766 | 767 | ######################################################################## 768 | # Huffman on CPU, order-preserving 769 | 770 | class Huffman(SparseCompressor): 771 | order_preserving = True 772 
| @staticmethod 773 | def compress(sparse_tensor, params): 774 | vals, idxs, shape = sparse_tensor 775 | 776 | import struct 777 | from dahuffman import HuffmanCodec 778 | data = torch.arange(shape.numel()).int() 779 | packed = struct.pack(f'{data.numel()}i', *data) 780 | codec = HuffmanCodec.from_data(packed) 781 | 782 | data = idxs.cpu().int() 783 | packed = struct.pack(f'{data.numel()}i', *data) 784 | huff_encoded = codec.encode(packed) 785 | 786 | idxs = torch.as_tensor(list(huff_encoded), dtype=torch.uint8, device=vals.device) 787 | return vals, idxs, shape 788 | 789 | @staticmethod 790 | def decompress(sparse_tensor, params): 791 | vals, idxs, shape = sparse_tensor 792 | import struct 793 | from dahuffman import HuffmanCodec 794 | data = torch.arange(shape.numel()).int() 795 | packed = struct.pack(f'{data.numel()}i', *data) 796 | codec = HuffmanCodec.from_data(packed) 797 | 798 | encoded = idxs.cpu() 799 | packed = codec.decode(bytes(encoded)) 800 | idxs = struct.unpack(f'{int(len(packed) // 4)}i', packed) 801 | idxs = torch.as_tensor(idxs, dtype=torch.long, device=vals.device) 802 | return vals, idxs, shape 803 | 804 | 805 | ######################################################################## 806 | # RLE on CPU, not order-preserving 807 | 808 | class RunLength(SparseCompressor): 809 | order_preserving = False 810 | @staticmethod 811 | def compress(sparse_tensor, params): 812 | vals, idxs, shape = sparse_tensor 813 | bitmap = torch.zeros([shape.numel()]) 814 | bitmap[idxs.cpu().long()] = 1 815 | 816 | idxs, mapping = idxs.sort(descending=False) 817 | vals = vals[mapping] 818 | 819 | count = 0 820 | encode = [] 821 | last = 0 822 | for val in bitmap: 823 | if val == last: 824 | count += 1 825 | else: 826 | encode.append(count) 827 | count = 1 828 | last = val 829 | encode.append(count) 830 | 831 | idxs = DeepReduce.pack(torch.as_tensor(encode, device=vals.device)) 832 | return vals, idxs, shape 833 | 834 | @staticmethod 835 | def decompress(rle_sparse_tensor, params): 836 | vals, idxs, shape = rle_sparse_tensor 837 | encode = DeepReduce.unpack(idxs).cpu() 838 | 839 | decode = [] 840 | for i, freq in enumerate(encode): 841 | val = i % 2 842 | decode += [val for _ in range(freq)] 843 | 844 | bitmap = torch.as_tensor(decode, device=vals.device) 845 | idxs = torch.where(bitmap.bool())[0] 846 | return vals, idxs, shape 847 | 848 | 849 | ######################################################################## 850 | # QSGD on GPU, order-preserving 851 | 852 | class QSGD(SparseCompressor): 853 | order_preserving = True 854 | @staticmethod 855 | def compress(sparse_tensor, params): 856 | vals_all, idxs, shape = sparse_tensor 857 | quantum_num = params.get('quantum_num', 127) 858 | bucket_size = params.get('bucket_size', 512) 859 | 860 | vals_qsgd = [] 861 | for vals in vals_all.split(bucket_size): 862 | norm = vals.norm() 863 | abs_gradient = vals.abs() 864 | 865 | level_float = quantum_num / norm * abs_gradient 866 | previous_level = level_float.floor() 867 | prob = torch.empty_like(vals).uniform_() 868 | is_next_level = (prob < (level_float - previous_level)).type(torch.float32) 869 | new_level = (previous_level + is_next_level) 870 | 871 | sign = vals.sign() 872 | tensor_compressed = (new_level * sign) 873 | tensor_compressed = tensor_compressed.type(torch.int8 if quantum_num < 128 else torch.int16) 874 | 875 | #encode norm 876 | import struct 877 | packed = struct.pack('f', norm.cpu().item()) 878 | norm = torch.as_tensor([b-128 for b in packed], dtype=torch.int8, device=idxs.device) 
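            # (How the norm travels: struct.pack('f', ...) yields the 4 raw bytes
            #  of the float32 norm, and subtracting 128 shifts each byte from
            #  [0, 255] into the int8 range [-128, 127] so it can be concatenated
            #  onto the int8 quantized levels; decompress() adds 128 back and
            #  re-packs the bytes with struct.unpack('f', ...).)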
879 | 880 | vals = torch.cat([tensor_compressed, norm], dim=0) 881 | vals_qsgd.append(vals) 882 | vals = torch.cat(vals_qsgd, dim=0) 883 | 884 | return vals, idxs, shape 885 | 886 | @staticmethod 887 | def decompress(sparse_tensor, params): 888 | vals_qsgd, idxs, shape = sparse_tensor 889 | quantum_num = params.get('quantum_num', 127) 890 | bucket_size = params.get('bucket_size', 512) 891 | 892 | vals_all = [] 893 | for vals in vals_qsgd.split(bucket_size+4): 894 | vals, norm = vals.split([vals.numel()-4, 4]) 895 | 896 | # decode norm 897 | import struct 898 | norm = norm.int() + 128 899 | norm = bytes(norm.cpu().type(torch.ByteTensor)) 900 | norm = struct.unpack('f', norm)[0] 901 | 902 | decode_output = vals.type(torch.float32) 903 | vals = norm / quantum_num * decode_output 904 | vals_all.append(vals) 905 | 906 | vals = torch.cat(vals_all, dim=0) 907 | return vals, idxs, shape 908 | 909 | 910 | ################################################################ 911 | 912 | 913 | compressor = { 914 | "bloom": Bloom, 915 | "polyfit": PolyFit, 916 | "bloom_cpu": BloomCPU, 917 | "polyfit_cpu": PolyFitCPU, 918 | "gzip": Gzip, 919 | "huffman": Huffman, 920 | "rle": RunLength, 921 | "qsgd": QSGD, 922 | } --------------------------------------------------------------------------------