├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
│   ├── hist
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── hist_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   └── host.c
│   ├── kmeans
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   ├── kmeans.py
│   │   ├── kmeans_dpu.py
│   │   ├── kmeans_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   └── plot.py
│   ├── linear_reg
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   ├── lin_reg_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   ├── linear_reg.py
│   │   └── plot.py
│   ├── log_reg
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   ├── log_reg_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   ├── logistic_reg.py
│   │   └── plot.py
│   ├── red
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   └── red_funcs
│   │       ├── init_combine_func.h
│   │       └── map_to_val_func.h
│   └── va
│       ├── Makefile
│       ├── Param.h
│       ├── bin
│       │   ├── dpu_init_binary
│       │   ├── dpu_map_va_funcs
│       │   ├── dpu_zip
│       │   └── host
│       ├── host.c
│       └── va_funcs
│           └── map.h
└── lib
    ├── Common.c
    ├── Common.h
    ├── Parallel.c
    ├── Parallel.h
    ├── Structs.h
    ├── StructsPIM.h
    ├── Table.c
    ├── Table.h
    ├── TableHost.c
    ├── TableHost.h
    ├── TableShared.c
    ├── TableShared.h
    ├── UpmemCustom.c
    ├── UpmemCustom.h
    ├── communication
    │   ├── CommHelper.c
    │   ├── CommHelper.h
    │   ├── CommOps.c
    │   └── CommOps.h
    ├── management
    │   ├── Management.c
    │   ├── Management.h
    │   ├── SmallTableInit.c
    │   ├── SmallTableInit.h
    │   └── SmallTableInit_dpu.c
    ├── processing
    │   ├── ProcessingHelper.c
    │   ├── ProcessingHelper.h
    │   ├── ProcessingHelperHost.c
    │   ├── ProcessingHelperHost.h
    │   ├── gen_red
    │   │   ├── GenRed.c
    │   │   ├── GenRed.h
    │   │   ├── GenRedArgs.h
    │   │   ├── GenRedProcessing.h
    │   │   └── gen_red_dpu.c
    │   ├── map
    │   │   ├── Map.c
    │   │   ├── Map.h
    │   │   ├── MapArgs.h
    │   │   ├── MapProcessing.h
    │   │   └── map_dpu.c
    │   └── zip
    │       ├── Zip.c
    │       ├── Zip.h
    │       ├── ZipArgs.h
    │       ├── ZipProcessing.c
    │       ├── ZipProcessing.h
    │       └── zip_dpu.c
    └── timer.h
/.gitignore: --------------------------------------------------------------------------------
1 | .DS_Store
2 | *.csv
3 | *.so
4 | *.json
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 SAFARI Research Group at ETH Zürich
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # SimplePIM: A Software Framework for Productive and Efficient In-Memory Processing
2 | This project implements SimplePIM, a software framework for easy and efficient programming of in-memory hardware. The code targets UPMEM, a real, commercially available PIM architecture that combines traditional DRAM memory with general-purpose in-order cores inside the same chip. SimplePIM processes arrays of arbitrary elements on a PIM device by calling iterator functions from the host, and provides primitives for communication among PIM cores and between PIM and the host system.
3 |
4 | We implement six applications with SimplePIM on UPMEM:
5 | - Vector Addition
6 | - Reduction
7 | - K-Means Clustering
8 | - Histogram
9 | - Linear Regression
10 | - Logistic Regression
11 |
12 | Previous manual UPMEM implementations of the same applications can be found in [PrIM Benchmark](https://github.com/CMU-SAFARI/prim-benchmarks), [dpu_kmeans](https://github.com/upmem/dpu_kmeans) and [pim-ml](https://github.com/CMU-SAFARI/pim-ml). These previous implementations can serve as baselines for measuring SimplePIM's performance as well as its productivity improvements.
13 |
14 | ## Citation
15 | Please cite the following paper if you find this repository useful.
16 | Jinfan Chen, Juan Gómez-Luna, Izzat El Hajj, Yuxin Guo and Onur Mutlu, "[SimplePIM: A Software Framework for Productive and Efficient In-Memory Processing](https://arxiv.org/abs/2310.01893)", International Conference on Parallel Architectures and Compilation Techniques (PACT), 2023.
17 |
18 | Bibtex entry for citation:
19 | ```
20 | @inproceedings{Chen2023SimplePIMPACT,
21 |   title={SimplePIM: A Software Framework for Productive and Efficient In-Memory Processing},
22 |   author={Jinfan Chen and Juan G{\'o}mez-Luna and Izzat El Hajj and Yuxin Guo and Onur Mutlu},
23 |   year={2023},
24 |   booktitle = {PACT}
25 | }
26 | ```
27 |
28 | ## Installation
29 |
30 | ### Prerequisites
31 | Running SimplePIM requires installing the [UPMEM SDK](https://sdk.upmem.com). The benchmarks are designed to run on a server with real UPMEM modules, but they can also be run with the functional simulator included in the UPMEM SDK.
32 |
33 | ### Getting Started
34 | Clone the repository:
35 | ```
36 | $ git clone https://github.com/CMU-SAFARI/SimplePIM.git
37 | $ cd SimplePIM
38 | ```
39 |
40 | ## Repository Structure
41 | ```
42 | .
43 | +-- LICENSE
44 | +-- README.md
45 | +-- .gitignore
46 | +-- benchmarks/
47 | |   +-- hist/
48 | |   +-- kmeans/
49 | |   +-- linear_reg/
50 | |   +-- log_reg/
51 | |   +-- red/
52 | |   +-- va/
53 | +-- lib/
54 | |   +-- communication/
55 | |       +-- CommOps.c
56 | |       +-- CommOps.h
57 | |   +-- management/
58 | |       +-- Management.c
59 | |       +-- Management.h
60 | |   +-- processing/
61 | |       +-- gen_red/
62 | |       +-- map/
63 | |       +-- zip/
64 | ```
65 |
66 | ## APIs
67 | SimplePIM provides three APIs to users. The management interface, under SimplePIM/lib/management/, sets up the UPMEM hardware and records and manages information about the PIM-resident arrays. The communication interface, under SimplePIM/lib/communication/, contains the PIM-to-PIM and host-PIM communication operators (gather, scatter, broadcast, allreduce, and allgather). Finally, the processing interface, under SimplePIM/lib/processing/, contains the UPMEM implementations of array map, array zip, and array reduction.
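As a concrete illustration of how the three interfaces compose, the following condensed sketch follows benchmarks/va/host.c (listed in full further below); timing, result verification, and cleanup are omitted, and `T`, `dpu_number` and `nr_elements` come from the benchmark's Param.h:
```
#include "../../lib/processing/map/Map.h"
#include "../../lib/processing/zip/Zip.h"
#include "../../lib/communication/CommOps.h"
#include "../../lib/management/Management.h"
#include "Param.h" // defines T, dpu_number, nr_elements

void run(){
    // management interface: allocate the DPUs and the bookkeeping structures
    simplepim_management_t* table_management = table_management_init(dpu_number);
    T* A = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management);
    T* B = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management);

    // communication interface: scatter both input arrays across the DPUs
    simplepim_scatter("t1", A, nr_elements, sizeof(T), table_management);
    simplepim_scatter("t2", B, nr_elements, sizeof(T), table_management);

    // processing interface: zip the two PIM-resident arrays, then map the
    // user-defined add function (compiled from the va_funcs directory) over them
    handle_t* add_handle = create_handle("va_funcs", MAP);
    handle_t* zip_handle = create_handle("", ZIP);
    table_zip("t1", "t2", "t3", zip_handle, table_management);
    table_map("t3", "t4", sizeof(T), add_handle, table_management, 0);

    // communication interface: gather the element-wise sums back to the host
    T* res = simplepim_gather("t4", table_management);
    (void)res;
}
```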
Many workloads, such as histogram, k-means, and vector addition, can be expressed as a combination of these communication and processing operators.
68 |
69 | SimplePIM/lib/ also contains helper files that ease framework development.
70 |
71 | ## Running SimplePIM
72 | Each benchmark folder includes a Makefile for building and running the experiments.
73 | To run vector addition, reduction, and histogram, simply go to the corresponding benchmark folder and run make. For example, for vector addition:
74 | ```
75 | $ cd benchmarks/va
76 | $ make
77 | $ ./bin/host
78 | ```
79 | One can observe that SimplePIM produces exactly the same result as the CPU code. The parameters (number of elements, number of DPUs used) can be changed in the Param.h file.
80 |
81 | To run Linear Regression, Logistic Regression, and K-Means, one first needs to generate the input data with a Python script located in each benchmark folder. For example, to run linear regression, first run
82 | ```
83 | $ cd benchmarks/linear_reg
84 | $ python linear_reg.py
85 | ```
86 | Then build and run the actual SimplePIM code as before with
87 | ```
88 | $ make
89 | $ ./bin/host
90 | ```
91 |
92 | ## Getting Help
93 | If you have any suggestions for improvement, please contact georgcjf at gmail dot com. If you find any bugs or have further questions or requests, please post an issue on the [issue page](https://github.com/CMU-SAFARI/SimplePIM/issues).
94 |
95 | ## Acknowledgement
96 | We acknowledge support from the SAFARI Research Group’s industrial partners, especially Google, Huawei, Intel, Microsoft, VMware, and the Semiconductor Research Corporation. This research was partially supported by the ETH Future Computing Laboratory and the European Union’s Horizon programme for research and innovation under grant agreement No. 101047160, project BioPIM (Processing-in-memory architectures and programming libraries for bioinformatics algorithms). This research was also partially supported by ACCESS – AI Chip Center for Emerging Smart Systems, sponsored by InnoHK funding, Hong Kong SAR.
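The per-file listing of the benchmarks and the library follows. As a quick orientation before the full sources: the iterator functions that parameterize the processing operators are plain C functions, placed in a per-benchmark functions directory and referenced by name in create_handle (T is the element type defined in each benchmark's Param.h). For example, the map function for vector addition and the combine function for reduction, reproduced from benchmarks/va/va_funcs/map.h and benchmarks/red/red_funcs/init_combine_func.h below, are simply:
```
// vector addition: each zipped input element holds two T values; emit their sum
void map_func(void* input, void* res){
    *(T*)res = ((T*)input)[0] + ((T*)input)[1];
}

// reduction: fold a source value into the destination table entry
void combine_func(void* dest, void* src){
    *(uint32_t*)dest += *(uint32_t*)src;
}
```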
97 | -------------------------------------------------------------------------------- /benchmarks/hist/Makefile: -------------------------------------------------------------------------------- 1 | va: host.c 2 | @mkdir -p bin 3 | gcc --std=c99 -ldl -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c ../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/gen_red/GenRed.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/hist/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | #include 4 | uint32_t print_info = 0; 5 | typedef uint32_t T; 6 | const uint32_t dpu_number = 3; //2432 7 | 8 | #define DEPTH 12 // 2^12 = 4096 9 | #define bins 256 10 | 11 | uint64_t nr_elements = dpu_number*128; //64*1536*1024 12 | #endif 13 | -------------------------------------------------------------------------------- /benchmarks/hist/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/hist/bin/host -------------------------------------------------------------------------------- /benchmarks/hist/hist_funcs/init_combine_func.h: -------------------------------------------------------------------------------- 1 | #ifndef INIT_COMBINE_FUNC_H 2 | #define INIT_COMBINE_FUNC_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | 17 | void start_func(gen_red_arguments_t* args){} 18 | 19 | void map_to_val_func(void* input, void* output, uint32_t* key){ 20 | uint32_t d = *((uint32_t*)input); 21 | *(uint32_t*)output = 1; 22 | *key = d*bins >> 12; 23 | } 24 | 25 | #endif -------------------------------------------------------------------------------- /benchmarks/hist/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../lib/processing/gen_red/GenRed.h" 8 | #include "../../lib/processing/ProcessingHelperHost.h" 9 | #include "../../lib/timer.h" 10 | #include "Param.h" 11 | 12 | 13 | 14 | void init_data(T* A){ 15 | for(unsigned long i=0; i> DEPTH] += 1; 30 | } 31 | } 32 | 33 | void printf_hist(uint32_t* histo){ 34 | printf("the bins :\n"); 35 | for(int i=0; i 5 | #include 6 | uint32_t print_info = 0; 7 | typedef int32_t T; 8 | const uint32_t dpu_number = 5; // 2432 9 | const uint32_t k = 10; 10 | const uint32_t dim = 10; 11 | const uint64_t num_elements = 1000*dpu_number; 12 | const uint32_t iter = 1; 13 | 14 | #endif -------------------------------------------------------------------------------- /benchmarks/kmeans/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/kmeans/bin/host -------------------------------------------------------------------------------- /benchmarks/kmeans/host.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../lib/processing/gen_red/GenRed.h" 8 | #include "../../lib/processing/ProcessingHelperHost.h" 9 | #include "../../lib/communication/CommOps.h" 10 | #include "../../lib/management/Management.h" 11 | #include "../../lib/timer.h" 12 | #include "Param.h" 13 | 14 | 15 | 16 | FILE* fp; 17 | 18 | void add(void* p1, void* p2){ 19 | uint32_t* times1 = (uint32_t*)p1; 20 | uint32_t* times2 = (uint32_t*)p2; 21 | *times1 += *times2; 22 | T* ptr1 = (T*)(p1+sizeof(uint32_t)); 23 | T* ptr2 = (T*)(p2+sizeof(uint32_t)); 24 | 25 | for(int i=0; iend; 139 | // main loop 140 | for(int m=0; m 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | __dma_aligned void* centroids_data; 17 | 18 | 19 | BARRIER_INIT(barrier_maptoval, NR_TASKLETS); 20 | void start_func(gen_red_arguments_t* args){ 21 | uint32_t total_len = args->table_len * args->output_type_size; 22 | uint32_t aligned_weights_size = total_len + 8-(total_len%8); 23 | if(me()==0){ 24 | // initialise weights 25 | fsb_allocator_t weights_allocator = fsb_alloc(aligned_weights_size, 1); 26 | centroids_data = (void*)fsb_get(weights_allocator); 27 | mram_read(DPU_MRAM_HEAP_POINTER+args->info, centroids_data, aligned_weights_size); 28 | } 29 | barrier_wait(&barrier_maptoval); 30 | } 31 | 32 | void map_to_val_func(void* input_point, void* intermediate_input, uint32_t* centroid){ 33 | // the data is preserved and later added to corresponding centroid 34 | int32_t* times = (int32_t*)intermediate_input; 35 | *times = 1; 36 | intermediate_input+=sizeof(uint32_t); 37 | 38 | T* intermediate_ptr = (T*)intermediate_input; 39 | T* input_point_ptr = (T*)input_point; 40 | T* centroids_data_ptr = (T*)centroids_data; 41 | 42 | for(int i=0; i 5 | #include 6 | uint32_t print_info = 0; 7 | typedef int T; 8 | const uint32_t dpu_number = 5; // 2432 9 | const uint32_t dim = 10; 10 | const uint64_t num_elements = 1000*dpu_number;//10000*dpu_number; 11 | const uint32_t iter = 1; 12 | const float lr = 1e-4; 13 | const uint32_t shift_amount = 5;//10; 14 | const uint32_t prevent_overflow_shift_amount = 8;//15; 15 | #endif -------------------------------------------------------------------------------- /benchmarks/linear_reg/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/linear_reg/bin/host -------------------------------------------------------------------------------- /benchmarks/linear_reg/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../../lib/processing/gen_red/GenRed.h" 9 | #include "../../lib/processing/ProcessingHelperHost.h" 10 | #include "../../lib/communication/CommOps.h" 11 | #include "../../lib/management/Management.h" 12 | #include "../../lib/timer.h" 13 | #include "Param.h" 14 | 15 | FILE* fp; 16 | 17 | 18 | void read_csv_to_arr(FILE* fp, T* arr, int32_t len, int32_t d){ 19 | if (fp == NULL) { 20 | fprintf(stderr, "Error 
reading file\n"); 21 | return; 22 | } 23 | 24 | float tmp; 25 | for (size_t i = 0; i < len; i++){ 26 | for(size_t j = 0; j < d-1; j++){ 27 | fscanf(fp, "%f,", &tmp); 28 | arr[i*d+j] = (T)tmp; 29 | } 30 | fscanf(fp, "%f\n", &tmp); 31 | arr[i*d+d-1] = (T)tmp; 32 | } 33 | 34 | fclose(fp); 35 | } 36 | 37 | void write_time_to_csv(double* arr, int32_t len){ 38 | if (fp == NULL) { 39 | fprintf(stderr, "Error reading file\n"); 40 | return; 41 | } 42 | 43 | 44 | for (size_t i = 0; i < len; i++){ 45 | fprintf(fp,"%f\n", arr[i]/1000.0); 46 | } 47 | 48 | fclose(fp); 49 | } 50 | 51 | void compute_gradients(const T*arr){ 52 | 53 | // [X|Y] -> [X], [Y] 54 | T* X = malloc(num_elements*dim*sizeof(T)); 55 | T* Y = malloc(num_elements*sizeof(T)); 56 | for(uint32_t i=0; i> prevent_overflow_shift_amount; 87 | } 88 | } 89 | } 90 | 91 | printf("\nthe gradients on host: \n"); 92 | for(int i=0; iend; 171 | simplepim_broadcast("t2", weights, 1, dim*sizeof(T), table_management); 172 | uint32_t weights_offset = lookup_table("t2", table_management)->end; 173 | 174 | handle_t* va_handle = create_handle("lin_reg_funcs", REDUCE); 175 | 176 | for(int l=0; l 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | 17 | __dma_aligned T* weights_data; 18 | BARRIER_INIT(barrier_maptoval, NR_TASKLETS); 19 | 20 | void start_func(gen_red_arguments_t* args){ 21 | uint32_t total_len = args->table_len * args->output_type_size; 22 | uint32_t aligned_weights_size = total_len + 8-(total_len%8); 23 | if(me()==0){ 24 | // initialise weights 25 | fsb_allocator_t weights_allocator = fsb_alloc(aligned_weights_size, 1); 26 | weights_data = (void*)fsb_get(weights_allocator); 27 | mram_read(DPU_MRAM_HEAP_POINTER+args->info, weights_data, aligned_weights_size); 28 | } 29 | barrier_wait(&barrier_maptoval); 30 | } 31 | 32 | 33 | 34 | void map_to_val_func(void* input, void* grads, uint32_t* dummy){ 35 | // the data is preserved and later added to corresponding weights 36 | int64_t* grads_ptr = (int64_t*)grads; 37 | T* input_ptr = (T*)input; 38 | T* weights_data_ptr = (T*)weights_data; 39 | 40 | // calculate gradients w.r.t. 
linear weights 41 | int64_t dot_prod = 0; 42 | for(int i=0; i>prevent_overflow_shift_amount; 50 | //printf("%f ", grads_ptr[i]); 51 | grads_ptr[i] = input_ptr[i] * e >> prevent_overflow_shift_amount; 52 | } 53 | //printf("\n"); 54 | 55 | // put weight gradients to the 0th entry 56 | *dummy = 0; 57 | 58 | } 59 | 60 | #endif -------------------------------------------------------------------------------- /benchmarks/linear_reg/linear_reg.py: -------------------------------------------------------------------------------- 1 | num_threads=32 2 | import os 3 | from joblib import parallel_backend 4 | os.environ["OMP_NUM_THREADS"] = str(num_threads) 5 | os.environ["OPENBLAS_NUM_THREADS"] = str(num_threads) 6 | os.environ["MKL_NUM_THREADS"] = str(num_threads) 7 | os.environ["BLIS_NUM_THREADS"] = str(num_threads) 8 | 9 | import time 10 | import random 11 | import numpy as np 12 | import pandas as pd 13 | import torch 14 | from torch import float32 15 | from torch.autograd import Variable 16 | from torch.nn.functional import linear 17 | random.seed(10) 18 | np.set_printoptions(precision=4) 19 | torch.set_printoptions(precision=4) 20 | torch.set_default_dtype(float32) 21 | torch.set_num_threads(num_threads) 22 | 23 | class linearRegression(torch.nn.Module): 24 | def __init__(self, inputSize, init_weight): 25 | super(linearRegression, self).__init__() 26 | self.inputSize = inputSize 27 | self.weights = init_weight 28 | self.criterion = torch.nn.MSELoss(reduction='mean') 29 | 30 | def forward(self, x, y): 31 | out = torch.squeeze(linear(x, self.weights)) 32 | loss = self.criterion(out, y) 33 | return loss 34 | 35 | def main(): 36 | num_dpus = 5 37 | dim, num_elements, iter, lr = 10, 1000*num_dpus, 1, 1e-4 38 | 39 | df = pd.DataFrame([dim, num_elements, iter, lr]) 40 | init_vector = np.zeros((dim), dtype=np.float32) 41 | input = np.zeros((num_elements, dim+1), dtype=np.float32) 42 | 43 | groud_truth = np.zeros((dim), dtype=np.float32) 44 | for i in range(dim): 45 | groud_truth[i] = random.randint(-2, 2) 46 | 47 | for i in range(num_elements): 48 | for j in range(dim): 49 | r1, r2 = random.uniform(0, 1), random.uniform(0, 1)/dim 50 | input[i][j] = (int)((i-num_elements/2)*r1 + j*r2)%10 if j%2 == 0 else (int)(-1*((i-num_elements/2)*r1 + j*r2))%10 51 | input[i][dim] = groud_truth.dot(input[i][:-1]) 52 | 53 | 54 | 55 | #np.savetxt("data/args.csv", np.array([dim, num_elements, iter, lr]), delimiter=",", fmt='%s') 56 | np.savetxt("data/input.csv", input, delimiter=",", fmt='%f') 57 | 58 | x_train, y_train = (input.transpose(1, 0)[0:-1]).transpose(1, 0), (input.transpose(1, 0)[-1]) 59 | 60 | if torch.cuda.is_available(): 61 | inputs = Variable(torch.from_numpy(x_train).cuda(), requires_grad=True) 62 | labels = Variable(torch.from_numpy(y_train).cuda(), requires_grad=True) 63 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector).cuda(), requires_grad=True)) 64 | else: 65 | inputs = Variable(torch.from_numpy(x_train), requires_grad=True) 66 | labels = Variable(torch.from_numpy(y_train), requires_grad=True) 67 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector), requires_grad=True)) 68 | 69 | 70 | model = linearRegression(dim, init_weights) 71 | optimizer = torch.optim.SGD(model.parameters(), lr=lr) 72 | 73 | start = time.time() 74 | for epoch in range(iter): 75 | # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients 76 | optimizer.zero_grad() 77 | 78 | # get output from the model, given the 
inputs 79 | loss = model(inputs, labels) 80 | loss.backward() 81 | 82 | # update parameters 83 | optimizer.step() 84 | end = time.time() 85 | 86 | t = (end-start)*1000 87 | print("the time consumed is "+str(t)+"ms") 88 | print("linear model weights: ") 89 | print(model.weights.detach().numpy()) 90 | 91 | print("groud truth: "+str(groud_truth)) 92 | 93 | ''' 94 | print("$$$$$") 95 | print((x_train[0].dot(init_vector)-y_train[0])*x_train[0]) 96 | print((x_train[1].dot(init_vector)-y_train[1])*x_train[1]) 97 | print((x_train[0].dot(init_vector)-y_train[0])*x_train[0] + (x_train[1].dot(init_vector)*init_vector-y_train[1])*x_train[1]) 98 | print((x_train@init_vector-y_train)@x_train) 99 | ''' 100 | if not os.path.exists("results/"): 101 | os.makedirs("results/") 102 | 103 | path = 'results/cpu_'+str(dim)+"_"+str(num_elements)+".csv" 104 | np.savetxt(path, np.array([t])) 105 | 106 | if __name__ == "__main__": 107 | if not os.path.exists("data/"): 108 | os.mkdir("data/") 109 | with parallel_backend('threading', n_jobs=num_threads): 110 | main() 111 | 112 | -------------------------------------------------------------------------------- /benchmarks/linear_reg/plot.py: -------------------------------------------------------------------------------- 1 | from cProfile import label 2 | import matplotlib.pyplot as plt 3 | import json 4 | import numpy as np 5 | import scipy.stats 6 | from math import log2, exp 7 | 8 | colors=['#23ef68','#32efff','#2eaf9f','#22222f','#eeeff1','#eee112','#00ef00','#aa0000','#0000aa','#000999','#2e3f56','#7eef1f','#eeef11'] 9 | 10 | font = {'weight' : 'bold', 11 | 'size' : 16} 12 | plt.rcParams["figure.figsize"] = (16, 7) 13 | plt.rc('font', **font) 14 | 15 | 16 | def load_data(data, cpu_files, DPU_files, DPU_MASTER_files): 17 | for i in range(len(cpu_files)): 18 | data["CPU"][i] = np.loadtxt(cpu_files[i]) /1000 19 | 20 | for i in range(len(DPU_files)): 21 | tmp = np.loadtxt(DPU_files[i])/1000 22 | data["DPU_initial_transfer"][i] = tmp[0] 23 | data["DPU_Kernel"][i] = tmp[1] 24 | data["DPU_D2C"][i] = tmp[2] 25 | data["DPU_C2D"][i] = tmp[4] 26 | data["DPU"][i] = tmp[5] 27 | 28 | for i in range(len(DPU_MASTER_files)): 29 | tmp = np.loadtxt(DPU_MASTER_files[i])/1000 30 | data["DPU_MASTER_initial_transfer"][i] = tmp[0] 31 | data["DPU_MASTER_Kernel"][i] = tmp[1] 32 | data["DPU_MASTER_D2C"][i] = tmp[2] 33 | data["DPU_MASTER_C2D"][i] = tmp[4] 34 | data["DPU_MASTER"][i] = tmp[5] 35 | 36 | 37 | 38 | 39 | 40 | dim_data={ 41 | "title":"varing_input_dimension", 42 | "x_name":"input dimension", 43 | "x_axis":["5", "10", "20"], 44 | "CPU":np.zeros(3), 45 | 46 | "DPU":np.zeros(3), 47 | "DPU_Kernel":np.zeros(3), 48 | "DPU_initial_transfer":np.zeros(3), 49 | "DPU_C2D":np.zeros(3), 50 | "DPU_D2C":np.zeros(3), 51 | 52 | "DPU_MASTER":np.zeros(3), 53 | "DPU_MASTER_Kernel": np.zeros(3), 54 | "DPU_MASTER_initial_transfer": np.zeros(3), 55 | "DPU_MASTER_C2D": np.zeros(3), 56 | "DPU_MASTER_D2C": np.zeros(3), 57 | 58 | } 59 | 60 | 61 | num_data={ 62 | "title":"varing_#data", 63 | "x_name":"number of input data points", 64 | "x_axis":["100000", "1000000", "10000000"], 65 | "CPU":np.zeros(3), 66 | 67 | "DPU":np.zeros(3), 68 | "DPU_Kernel":np.zeros(3), 69 | "DPU_initial_transfer":np.zeros(3), 70 | "DPU_C2D":np.zeros(3), 71 | "DPU_D2C":np.zeros(3), 72 | 73 | "DPU_MASTER":np.zeros(3), 74 | "DPU_MASTER_Kernel": np.zeros(3), 75 | "DPU_MASTER_initial_transfer": np.zeros(3), 76 | "DPU_MASTER_C2D": np.zeros(3), 77 | "DPU_MASTER_D2C": np.zeros(3), 78 | } 79 | 80 | num_dpus_data={ 81 | "title":"varing_#dpus", 
82 | "x_name":"number of dpus", 83 | "x_axis":["128", "512", "2048"], 84 | "CPU":np.zeros(3), 85 | 86 | "DPU":np.zeros(3), 87 | "DPU_Kernel":np.zeros(3), 88 | "DPU_initial_transfer":np.zeros(3), 89 | "DPU_C2D":np.zeros(3), 90 | "DPU_D2C":np.zeros(3), 91 | 92 | "DPU_MASTER":np.zeros(3), 93 | "DPU_MASTER_Kernel": np.zeros(3), 94 | "DPU_MASTER_initial_transfer": np.zeros(3), 95 | "DPU_MASTER_C2D": np.zeros(3), 96 | "DPU_MASTER_D2C": np.zeros(3), 97 | } 98 | 99 | 100 | def mean_confidence_interval(data, confidence=0.99): 101 | a = 1.0 * np.array(data) 102 | n = len(a) 103 | m, se = np.mean(a), scipy.stats.sem(a) 104 | h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) 105 | return h 106 | 107 | def concate(arr_list): 108 | arr_list = [x.reshape(len(x), 1) for x in arr_list] 109 | return np.concatenate(arr_list, axis = 1) 110 | 111 | def plot_res(data): 112 | x = data["x_axis"] 113 | title = data["title"] 114 | _, ax = plt.subplots() 115 | 116 | line_width = 0.25 117 | x_pos = np.arange(len(x)) 118 | 119 | 120 | bar1 = ax.bar(x_pos - line_width + line_width*0, data["CPU"], width=line_width, edgecolor='k', color=colors[0], label ="CPU version") 121 | 122 | bar2 = ax.bar(x_pos - line_width + line_width*1, data["DPU"], width=line_width, edgecolor='k', color=colors[1], label ="CPU-reduce") 123 | bar4 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"]+data["DPU_C2D"], width=line_width, edgecolor='k', color=colors[2], label="C2D transfer") 124 | bar5 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"], width=line_width, edgecolor='k', color=colors[3], label="D2C transfer") 125 | bar6 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"], width=line_width, edgecolor='k', color=colors[4], label="initial transfer") 126 | bar7 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"], width=line_width, edgecolor='k', color=colors[5], label="DPU kernel") 127 | 128 | bar3 = ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER"], width=line_width, edgecolor='k', color=colors[1]) 129 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"]+data["DPU_MASTER_C2D"], width=line_width, edgecolor='k', color=colors[2]) 130 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"], width=line_width, edgecolor='k', color=colors[3]) 131 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"], width=line_width, edgecolor='k', color=colors[4]) 132 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"], width=line_width, edgecolor='k', color=colors[5]) 133 | 134 | 135 | 136 | for i in range(len(bar1 + bar2 +bar3)): 137 | rect = (bar1 + bar2 +bar3)[i] 138 | height = rect.get_height() 139 | if(not height == 0): 140 | if i//3 == 0: 141 | text = "cpu" 142 | elif i//3 == 1: 143 | text = "pim-f" 144 | else: 145 | text = "pim-h" 146 | plt.text(rect.get_x()+rect.get_width() / 2, height, text, ha = 'center', va = 'bottom', fontdict={'size': 16}) 147 | 148 | #plt.yscale('log',base=2) 149 | ax.set_xticks(x_pos-line_width*0) 150 | ax.set_xticklabels(x) 151 | ax.set_title(title) 152 | plt.xlabel(data["x_name"], fontdict=font) 153 | plt.ylabel("time in s", fontdict=font) 154 | 155 | legend1 = plt.legend(handles=[bar1, bar2], loc='upper left', shadow=True, 
bbox_to_anchor=(0, -0.12, 0, 0)) 156 | ax.add_artist(legend1) 157 | legend2 = plt.legend(handles=[bar3, bar4, bar7], loc='upper left', shadow=True, bbox_to_anchor=(0.425, -0.12, 0, 0)) 158 | ax.add_artist(legend2) 159 | plt.legend(handles=[bar5, bar6], loc='upper left', shadow=True, bbox_to_anchor=(0.2, -0.12, 0, 0)) 160 | plt.savefig("images/"+title, bbox_inches='tight') 161 | plt.clf() 162 | plt.close() 163 | 164 | 165 | 166 | 167 | if __name__=="__main__": 168 | dir = "results/" 169 | load_data(dim_data, [dir+ i for i in ["cpu_5_1000000.csv", "cpu_10_1000000.csv", "cpu_20_1000000.csv"]], [dir+i for i in ["framework_2523_5_1000000", "framework_2523_10_1000000", "framework_2523_20_1000000"]], [dir+i for i in ["human_2523_5_1000000", "human_2523_10_1000000"]]) 170 | plot_res(dim_data) 171 | load_data(num_data, [dir+ i for i in ["cpu_10_100000.csv", "cpu_10_1000000.csv", "cpu_10_10000000.csv"]], [dir+i for i in ["framework_2523_10_100000", "framework_2523_10_1000000", "framework_2523_10_10000000"]], [dir+i for i in ["human_2523_10_100000", "human_2523_10_1000000", "human_2523_10_10000000"]]) 172 | plot_res(num_data) 173 | load_data(num_dpus_data, [], [dir+i for i in ["framework_128_10_1000000", "framework_512_10_1000000", "framework_2048_10_1000000"]], [dir+i for i in ["human_128_10_1000000", "human_512_10_1000000", "human_2048_10_1000000"]]) 174 | plot_res(num_dpus_data) 175 | -------------------------------------------------------------------------------- /benchmarks/log_reg/Makefile: -------------------------------------------------------------------------------- 1 | log: host.c 2 | @mkdir -p bin 3 | gcc --std=c99 -ldl -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c ../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/gen_red/GenRed.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/log_reg/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | 4 | #include 5 | #include 6 | uint32_t print_info = 0; 7 | typedef int T; 8 | const uint32_t dpu_number = 5; // 2432 9 | const uint32_t dim = 10; 10 | const uint64_t num_elements = 1000*dpu_number; 11 | const uint32_t iter = 1; 12 | const float lr = 1e-4; 13 | const uint32_t prevent_overflow_shift_amount = 3; 14 | const uint32_t shift_amount = 5; 15 | #endif 16 | -------------------------------------------------------------------------------- /benchmarks/log_reg/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/log_reg/bin/host -------------------------------------------------------------------------------- /benchmarks/log_reg/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include "../../lib/processing/gen_red/GenRed.h" 10 | #include "../../lib/processing/ProcessingHelperHost.h" 11 | #include "../../lib/communication/CommOps.h" 12 | #include "../../lib/management/Management.h" 13 | #include "../../lib/timer.h" 14 | #include "Param.h" 15 | 16 | 17 | 18 | FILE* fp; 19 | 20 | 21 | void read_csv_to_arr(FILE* fp, T* arr, int32_t len, int32_t d){ 22 | if (fp == NULL) { 23 | 
fprintf(stderr, "Error reading file\n"); 24 | return; 25 | } 26 | 27 | float tmp; 28 | for (size_t i = 0; i < len; i++){ 29 | for(size_t j = 0; j < d-1; j++){ 30 | fscanf(fp, "%f,", &tmp); 31 | arr[i*d+j] = (T)tmp; 32 | } 33 | fscanf(fp, "%f,", &tmp); 34 | arr[i*d+d-1] = (T)tmp; 35 | } 36 | 37 | fclose(fp); 38 | } 39 | 40 | void write_time_to_csv(double* arr, int32_t len){ 41 | if (fp == NULL) { 42 | fprintf(stderr, "Error reading file\n"); 43 | return; 44 | } 45 | 46 | for (size_t i = 0; i < len; i++){ 47 | fprintf(fp,"%f\n", arr[i]/1000.0); 48 | } 49 | 50 | fclose(fp); 51 | } 52 | 53 | void get_output_file(int num_dpus, int dim, int num_elem){ 54 | char str1[10]; 55 | char str2[10]; 56 | char str3[10]; 57 | sprintf(str1, "%d", num_dpus); 58 | sprintf(str2, "%d", dim); 59 | sprintf(str3, "%d", num_elem); 60 | char out[100] = "results/framework_"; 61 | strcat(out, str1); 62 | strcat(out,"_"); 63 | strcat(out, str2); 64 | strcat(out,"_"); 65 | strcat(out, str3); 66 | fp = fopen (out, "w"); 67 | } 68 | 69 | 70 | void compute_gradients(const T*arr){ 71 | 72 | // [X|Y] -> [X], [Y] 73 | T* X = malloc(num_elements*dim*sizeof(T)); 74 | T* Y = malloc(num_elements*sizeof(T)); 75 | for(uint32_t i=0; i> prevent_overflow_shift_amount; 106 | } 107 | } 108 | } 109 | 110 | printf("\nthe gradients on host: \n"); 111 | for(int i=0; iend; 158 | simplepim_broadcast("t2", weights, 1, dim*sizeof(T), table_management); 159 | 160 | handle_t* va_handle = create_handle("log_reg_funcs", REDUCE); 161 | T* res = table_gen_red("t1", "t3", dim*sizeof(T), 1, va_handle, table_management, data_offset); 162 | 163 | printf("the weights of linear model: \n"); 164 | for(int i=0; i 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | 17 | __dma_aligned T* weights_data; 18 | BARRIER_INIT(barrier_maptoval, NR_TASKLETS); 19 | 20 | void start_func(gen_red_arguments_t* args){ 21 | uint32_t total_len = args->table_len * args->output_type_size; 22 | uint32_t aligned_weights_size = total_len + 8-(total_len%8); 23 | if(me()==0){ 24 | // initialise weights 25 | fsb_allocator_t weights_allocator = fsb_alloc(aligned_weights_size, 1); 26 | weights_data = (void*)fsb_get(weights_allocator); 27 | mram_read(DPU_MRAM_HEAP_POINTER+args->info, weights_data, aligned_weights_size); 28 | } 29 | barrier_wait(&barrier_maptoval); 30 | } 31 | 32 | static inline T sigmoid_dpu(T x){ 33 | if(x >= 15) 34 | return 1.0; 35 | else if (x <= -15) 36 | return 0.0; 37 | else if (x == 0.0) 38 | return 0.5; 39 | 40 | float sum = 1.0; 41 | float temp = 1.0; 42 | // iter 100 times 43 | for(uint32_t i = 1; i < 101; ++i){ 44 | temp = temp * (-x) / i; 45 | sum = sum + temp; 46 | } 47 | return (T)(1.0 / (1.0 + sum)); 48 | } 49 | 50 | 51 | void map_to_val_func(void* input, void* grads, uint32_t* dummy){ 52 | // the data is preserved and later added to corresponding weights 53 | float* grads_ptr = (float*)grads; 54 | float* input_ptr = (float*)input; 55 | float* weights_data_ptr = (float*)weights_data; 56 | 57 | // calculate gradients w.r.t. 
linear weights 58 | float dot = 0; 59 | for(int i=0; i 0 else 0 52 | 53 | print(input) 54 | 55 | np.savetxt("data/args.csv", np.array([dim, num_elements, iter, lr]), delimiter=",", fmt='%s') 56 | np.savetxt("data/input.csv", input, delimiter=",", fmt='%f') 57 | 58 | x_train, y_train = (input.transpose(1, 0)[0:-1]).transpose(1, 0), (input.transpose(1, 0)[-1]) 59 | 60 | if torch.cuda.is_available(): 61 | inputs = Variable(torch.from_numpy(x_train).cuda(), requires_grad=True) 62 | labels = Variable(torch.from_numpy(y_train).cuda(), requires_grad=True) 63 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector).cuda(), requires_grad=True)) 64 | else: 65 | inputs = Variable(torch.from_numpy(x_train), requires_grad=True) 66 | labels = Variable(torch.from_numpy(y_train), requires_grad=True) 67 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector), requires_grad=True)) 68 | 69 | 70 | model = logisticRegression(dim, init_weights) 71 | optimizer = torch.optim.SGD(model.parameters(), lr=lr) 72 | 73 | start = time.time() 74 | for epoch in range(iter): 75 | # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients 76 | optimizer.zero_grad() 77 | 78 | # get output from the model, given the inputs 79 | loss = model(inputs, labels) 80 | loss.backward() 81 | 82 | # update parameters 83 | optimizer.step() 84 | end = time.time() 85 | t = (end-start) *1000 86 | print("the time consumed is "+str(t)+" ms") 87 | print("linear model weights: ") 88 | print(model.weights.detach().numpy()) 89 | print("the groud truth: ") 90 | print(groud_truth) 91 | 92 | if not os.path.exists("results/"): 93 | os.makedirs("results/") 94 | 95 | path = 'results/cpu_'+str(dim)+"_"+str(num_elements)+".csv" 96 | np.savetxt(path, np.array([t])) 97 | 98 | 99 | if __name__ == "__main__": 100 | if not os.path.exists("data/"): 101 | os.mkdir("data/") 102 | with parallel_backend('threading', n_jobs=num_threads): 103 | main() -------------------------------------------------------------------------------- /benchmarks/log_reg/plot.py: -------------------------------------------------------------------------------- 1 | from cProfile import label 2 | import matplotlib.pyplot as plt 3 | import json 4 | import numpy as np 5 | import scipy.stats 6 | from math import log2, exp 7 | 8 | colors=['#23ef68','#32efff','#2eaf9f','#22222f','#eeeff1','#eee112','#00ef00','#aa0000','#0000aa','#000999','#2e3f56','#7eef1f','#eeef11'] 9 | 10 | font = {'weight' : 'bold', 11 | 'size' : 16} 12 | plt.rcParams["figure.figsize"] = (16, 7) 13 | plt.rc('font', **font) 14 | 15 | 16 | def load_data(data, cpu_files, DPU_files, DPU_MASTER_files): 17 | for i in range(len(cpu_files)): 18 | data["CPU"][i] = np.loadtxt(cpu_files[i]) /1000 19 | 20 | for i in range(len(DPU_files)): 21 | tmp = np.loadtxt(DPU_files[i])/1000 22 | data["DPU_initial_transfer"][i] = tmp[0] 23 | data["DPU_Kernel"][i] = tmp[1] 24 | data["DPU_D2C"][i] = tmp[2] 25 | data["DPU_C2D"][i] = tmp[4] 26 | data["DPU"][i] = tmp[5] 27 | 28 | for i in range(len(DPU_MASTER_files)): 29 | tmp = np.loadtxt(DPU_MASTER_files[i])/1000 30 | data["DPU_MASTER_initial_transfer"][i] = tmp[0] 31 | data["DPU_MASTER_Kernel"][i] = tmp[1] 32 | data["DPU_MASTER_D2C"][i] = tmp[2] 33 | data["DPU_MASTER_C2D"][i] = tmp[4] 34 | data["DPU_MASTER"][i] = tmp[5] 35 | 36 | 37 | 38 | 39 | 40 | dim_data={ 41 | "title":"varing_input_dimension", 42 | "x_name":"input dimension", 43 | "x_axis":["5", "10", "20"], 44 | "CPU":np.zeros(3), 45 | 46 
| "DPU":np.zeros(3), 47 | "DPU_Kernel":np.zeros(3), 48 | "DPU_initial_transfer":np.zeros(3), 49 | "DPU_C2D":np.zeros(3), 50 | "DPU_D2C":np.zeros(3), 51 | 52 | "DPU_MASTER":np.zeros(3), 53 | "DPU_MASTER_Kernel": np.zeros(3), 54 | "DPU_MASTER_initial_transfer": np.zeros(3), 55 | "DPU_MASTER_C2D": np.zeros(3), 56 | "DPU_MASTER_D2C": np.zeros(3), 57 | 58 | } 59 | 60 | 61 | num_data={ 62 | "title":"varing_#data", 63 | "x_name":"number of input data points", 64 | "x_axis":["100000", "1000000", "10000000"], 65 | "CPU":np.zeros(3), 66 | 67 | "DPU":np.zeros(3), 68 | "DPU_Kernel":np.zeros(3), 69 | "DPU_initial_transfer":np.zeros(3), 70 | "DPU_C2D":np.zeros(3), 71 | "DPU_D2C":np.zeros(3), 72 | 73 | "DPU_MASTER":np.zeros(3), 74 | "DPU_MASTER_Kernel": np.zeros(3), 75 | "DPU_MASTER_initial_transfer": np.zeros(3), 76 | "DPU_MASTER_C2D": np.zeros(3), 77 | "DPU_MASTER_D2C": np.zeros(3), 78 | } 79 | 80 | num_dpus_data={ 81 | "title":"varing_#dpus", 82 | "x_name":"number of dpus", 83 | "x_axis":["128", "512", "2048"], 84 | "CPU":np.zeros(3), 85 | 86 | "DPU":np.zeros(3), 87 | "DPU_Kernel":np.zeros(3), 88 | "DPU_initial_transfer":np.zeros(3), 89 | "DPU_C2D":np.zeros(3), 90 | "DPU_D2C":np.zeros(3), 91 | 92 | "DPU_MASTER":np.zeros(3), 93 | "DPU_MASTER_Kernel": np.zeros(3), 94 | "DPU_MASTER_initial_transfer": np.zeros(3), 95 | "DPU_MASTER_C2D": np.zeros(3), 96 | "DPU_MASTER_D2C": np.zeros(3), 97 | } 98 | 99 | 100 | def mean_confidence_interval(data, confidence=0.99): 101 | a = 1.0 * np.array(data) 102 | n = len(a) 103 | m, se = np.mean(a), scipy.stats.sem(a) 104 | h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) 105 | return h 106 | 107 | def concate(arr_list): 108 | arr_list = [x.reshape(len(x), 1) for x in arr_list] 109 | return np.concatenate(arr_list, axis = 1) 110 | 111 | def plot_res(data): 112 | x = data["x_axis"] 113 | title = data["title"] 114 | _, ax = plt.subplots() 115 | 116 | line_width = 0.25 117 | x_pos = np.arange(len(x)) 118 | 119 | 120 | bar1 = ax.bar(x_pos - line_width + line_width*0, data["CPU"], width=line_width, edgecolor='k', color=colors[0], label ="CPU version") 121 | 122 | bar2 = ax.bar(x_pos - line_width + line_width*1, data["DPU"], width=line_width, edgecolor='k', color=colors[1], label ="CPU-reduce") 123 | bar4 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"]+data["DPU_C2D"], width=line_width, edgecolor='k', color=colors[2], label="C2D transfer") 124 | bar5 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"], width=line_width, edgecolor='k', color=colors[3], label="D2C transfer") 125 | bar6 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"], width=line_width, edgecolor='k', color=colors[4], label="initial transfer") 126 | bar7 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"], width=line_width, edgecolor='k', color=colors[5], label="DPU kernel") 127 | 128 | bar3 = ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER"], width=line_width, edgecolor='k', color=colors[1]) 129 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"]+data["DPU_MASTER_C2D"], width=line_width, edgecolor='k', color=colors[2]) 130 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"], width=line_width, edgecolor='k', color=colors[3]) 131 | ax.bar(x_pos - line_width + line_width*2, 
data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"], width=line_width, edgecolor='k', color=colors[4]) 132 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"], width=line_width, edgecolor='k', color=colors[5]) 133 | 134 | 135 | 136 | for i in range(len(bar1 + bar2 +bar3)): 137 | rect = (bar1 + bar2 +bar3)[i] 138 | height = rect.get_height() 139 | if(not height == 0): 140 | if i//3 == 0: 141 | text = "cpu" 142 | elif i//3 == 1: 143 | text = "pim-f" 144 | else: 145 | text = "pim-h" 146 | plt.text(rect.get_x()+rect.get_width() / 2, height, text, ha = 'center', va = 'bottom', fontdict={'size': 16}) 147 | 148 | #plt.yscale('log',base=2) 149 | ax.set_xticks(x_pos-line_width*0) 150 | ax.set_xticklabels(x) 151 | ax.set_title(title) 152 | plt.xlabel(data["x_name"], fontdict=font) 153 | plt.ylabel("time in s", fontdict=font) 154 | 155 | legend1 = plt.legend(handles=[bar1, bar2], loc='upper left', shadow=True, bbox_to_anchor=(0, -0.12, 0, 0)) 156 | ax.add_artist(legend1) 157 | legend2 = plt.legend(handles=[bar3, bar4, bar7], loc='upper left', shadow=True, bbox_to_anchor=(0.425, -0.12, 0, 0)) 158 | ax.add_artist(legend2) 159 | plt.legend(handles=[bar5, bar6], loc='upper left', shadow=True, bbox_to_anchor=(0.2, -0.12, 0, 0)) 160 | plt.savefig("images/"+title, bbox_inches='tight') 161 | plt.clf() 162 | plt.close() 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | if __name__=="__main__": 173 | dir = "results/" 174 | load_data(dim_data, [dir+ i for i in ["cpu_5_1000000.csv", "cpu_10_1000000.csv", "cpu_20_1000000.csv"]], [dir+i for i in ["framework_2546_5_1000000", "framework_2546_10_1000000", "framework_2546_20_1000000"]], [dir+i for i in ["human_2546_5_1000000", "human_2546_10_1000000", "human_2546_20_1000000"]]) 175 | plot_res(dim_data) 176 | load_data(num_data, [dir+ i for i in ["cpu_10_100000.csv", "cpu_10_1000000.csv", "cpu_10_10000000.csv"]], [dir+i for i in ["framework_2546_10_100000", "framework_2546_10_1000000", "framework_2546_10_10000000"]], [dir+i for i in ["human_2546_10_100000", "human_2546_10_1000000", "human_2546_10_10000000"]]) 177 | plot_res(num_data) 178 | load_data(num_dpus_data, [], [dir+i for i in ["framework_128_10_1000000", "framework_512_10_1000000", "framework_2048_10_1000000"]], [dir+i for i in ["human_128_10_1000000", "human_512_10_1000000", "human_2048_10_1000000"]]) 179 | plot_res(num_dpus_data) -------------------------------------------------------------------------------- /benchmarks/red/Makefile: -------------------------------------------------------------------------------- 1 | va: host.c 2 | @mkdir -p bin 3 | gcc --std=c99 -ldl -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c ../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/gen_red/GenRed.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/red/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | #include 4 | 5 | typedef uint32_t T; 6 | 7 | const uint32_t dpu_number = 32; //2432 8 | uint32_t print_info = 0; 9 | uint64_t nr_elements = 1000*dpu_number; //1000000*dpu_number 10 | #endif 11 | -------------------------------------------------------------------------------- /benchmarks/red/bin/host: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/red/bin/host
-------------------------------------------------------------------------------- /benchmarks/red/host.c: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | //#include "../../lib/communication/CommOps.h"
9 | //#include "../../lib/management/Management.h"
10 | #include "../../lib/processing/gen_red/GenRed.h"
11 | #include "../../lib/processing/ProcessingHelperHost.h"
12 | #include "../../lib/timer.h"
13 | #include "Param.h"
14 |
15 |
16 |
17 | void init(T* A){
18 | for (uint64_t i = 0; i < nr_elements; i++) {
19 | A[i] = i%1000;
20 | }
21 | }
22 |
23 | void add(void* p1, void* p2){
24 | T* ptr1 = (T*)(p1);
25 | T* ptr2 = (T*)(p2);
26 | *ptr1 += *ptr2;
27 | }
28 |
29 | static T reduction_host(T* A) {
30 | T count = 0;
31 | for (uint64_t i = 0; i < nr_elements; i++) {
32 | count += A[i];
33 | }
34 | return count;
35 | }
36 |
37 | void run(){
38 | simplepim_management_t* table_management = table_management_init(dpu_number);
39 | T* A = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management);
40 | init(A);
41 | T correct_res = reduction_host(A);
42 |
43 | simplepim_scatter("t1", A, nr_elements, sizeof(T), table_management);
44 | printf("end of data transfer\n");
45 |
46 | handle_t* red_handle = create_handle("red_funcs", REDUCE);
47 |
48 |
49 | T* res = table_gen_red("t1", "t2", sizeof(T), 1, red_handle, table_management, 0);
50 |
51 |
52 | if(print_info){
53 | struct dpu_set_t set = table_management->set;
54 | struct dpu_set_t dpu;
55 | DPU_FOREACH(set, dpu) {
56 | DPU_ASSERT(dpu_log_read(dpu, stdout));
57 | }
58 | }
59 |
60 |
61 | // allow a small relative error: accumulated floating-point values may differ in precision
62 | if( ((float)correct_res - (float)*res)/correct_res < 0.01 && ((float)correct_res - (float)*res)/correct_res > -0.01){
63 | printf("the result is correct \n");
64 | }
65 | else{
66 | printf("correct res : %u, got res : %u \n", correct_res, *res);
67 | printf("cpu result does not match \n");
68 | }
69 |
70 |
71 | }
72 |
73 |
74 | int main(int argc, char *argv[]){
75 | run();
76 | return 0;
77 | }
78 |
-------------------------------------------------------------------------------- /benchmarks/red/red_funcs/init_combine_func.h: --------------------------------------------------------------------------------
1 | #ifndef INIT_COMBINE_FUNC_H
2 | #define INIT_COMBINE_FUNC_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include "../Param.h"
8 |
9 |
10 | void combine_func(void* dest, void* src){
11 | *(uint32_t*)dest += *(uint32_t*)src;
12 | }
13 |
14 | void init_func(uint32_t size, void* ptr){
15 | char* casted_value_ptr = (char*) ptr;
16 | for(int i=0; i<size; i++){
17 | casted_value_ptr[i] = 0;
18 | }
19 | }
20 |
21 | #endif
-------------------------------------------------------------------------------- /benchmarks/red/red_funcs/map_to_val_func.h: --------------------------------------------------------------------------------
1 | #ifndef MAP_TO_VAL_FUNC_H
2 | #define MAP_TO_VAL_FUNC_H
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #include "../Param.h"
14 | #include "../../../lib/processing/gen_red/GenRedArgs.h"
15 |
16 |
17 | void start_func(gen_red_arguments_t* args){
18 |
19 | }
20 |
21 | void map_to_val_func(void* input, void* output, uint32_t* key){
22 | *key = 0;
23 | *(T*)output = *(T*) input;
24 | }
25 |
26 | #endif
-------------------------------------------------------------------------------- /benchmarks/va/Makefile: --------------------------------------------------------------------------------
1 | va: host.c
2 | @mkdir -p bin
3 | gcc --std=c99 -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c
../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/map/Map.c ../../lib/processing/zip/Zip.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/va/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | #include 4 | 5 | typedef uint32_t T; 6 | const uint32_t dpu_number = 5; //2432 7 | uint32_t print_info = 0; 8 | uint64_t nr_elements = dpu_number*10000; //dpu_number*1000000 9 | #endif -------------------------------------------------------------------------------- /benchmarks/va/bin/dpu_init_binary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/dpu_init_binary -------------------------------------------------------------------------------- /benchmarks/va/bin/dpu_map_va_funcs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/dpu_map_va_funcs -------------------------------------------------------------------------------- /benchmarks/va/bin/dpu_zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/dpu_zip -------------------------------------------------------------------------------- /benchmarks/va/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/host -------------------------------------------------------------------------------- /benchmarks/va/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "../../lib/processing/map/Map.h" 10 | #include "../../lib/processing/zip/Zip.h" 11 | #include "../../lib/processing/ProcessingHelperHost.h" 12 | #include "../../lib/communication/CommOps.h" 13 | #include "../../lib/management/Management.h" 14 | #include "../../lib/timer.h" 15 | #include "Param.h" 16 | 17 | 18 | 19 | void init(T* A, uint32_t salt){ 20 | for (uint64_t i = 0; i < nr_elements; i++) { 21 | A[i] = (i + salt)%128; 22 | } 23 | } 24 | 25 | void zip(T* A, T* B, T* res){ 26 | for (uint64_t i = 0; i < nr_elements; i++){ 27 | res[2*i] = A[i]; 28 | res[2*i+1] = B[i]; 29 | } 30 | } 31 | 32 | void vec_add(T* A, T* res){ 33 | for (uint64_t i = 0; i < nr_elements; i++){ 34 | res[i] = A[i*2] + A[i*2+1]; 35 | } 36 | } 37 | 38 | void vector_addition_host(T* A, T* B, T* res) { 39 | omp_set_num_threads(16); 40 | #pragma omp parallel for 41 | for (uint64_t i = 0; i < nr_elements; i++) { 42 | res[i] = A[i] + B[i]; 43 | } 44 | } 45 | 46 | 47 | void run(){ 48 | simplepim_management_t* table_management = table_management_init(dpu_number); 49 | T* A = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management); 50 | T* B = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management); 51 | 52 | T* correct_res = (T*)malloc((uint64_t)sizeof(T)*nr_elements); 53 | init(A, 0); 54 | init(B, 1); 55 | 
vector_addition_host(A, B, correct_res); 56 | 57 | Timer timer; 58 | start(&timer, 0, 0); 59 | start(&timer, 5, 0); 60 | simplepim_scatter("t1", A, nr_elements, sizeof(T), table_management); 61 | simplepim_scatter("t2", B, nr_elements, sizeof(T), table_management); 62 | stop(&timer, 0); 63 | printf("end of data transfer\n"); 64 | 65 | handle_t* add_handle = create_handle("va_funcs", MAP); 66 | handle_t* zip_handle = create_handle("", ZIP); 67 | 68 | 69 | start(&timer, 1, 0); 70 | table_zip("t1", "t2", "t3", zip_handle, table_management); 71 | table_map("t3", "t4", sizeof(T), add_handle, table_management, 0); 72 | stop(&timer, 1); 73 | 74 | 75 | if(print_info){ 76 | struct dpu_set_t set, dpu; 77 | set = table_management->set; 78 | DPU_FOREACH(set, dpu) { 79 | DPU_ASSERT(dpu_log_read(dpu, stdout)); 80 | } 81 | } 82 | 83 | 84 | start(&timer, 2, 0); 85 | T* res = simplepim_gather("t4", table_management); 86 | stop(&timer, 2); 87 | 88 | printf("the total time with timing consumed is (ms): "); 89 | print(&timer, 5, 1); 90 | printf("\n"); 91 | printf("initial CPU-DPU input transfer (ms): "); 92 | print(&timer, 0, 1); 93 | printf("\n"); 94 | printf("DPU Kernel Time (ms): "); 95 | print(&timer, 1, 1); 96 | printf("\n"); 97 | printf("DPU-CPU Time (ms): "); 98 | print(&timer, 2, 1); 99 | printf("\n"); 100 | 101 | 102 | int32_t is_correct = 1; 103 | 104 | for(int i=0; i 5 | #include 6 | #include "Param.h" 7 | #include "../../../lib/processing/map/MapArgs.h" 8 | 9 | void start_func(map_arguments_t* args){} 10 | 11 | void map_func(void* input, void* res){ 12 | *(T*)res = ((T*)input)[0] + ((T*)input)[1]; 13 | } 14 | 15 | #endif -------------------------------------------------------------------------------- /lib/Common.c: -------------------------------------------------------------------------------- 1 | #include "Common.h" 2 | void zero_init(uint32_t value_size, void* value_ptr){ 3 | char* casted_value_ptr = (char*) value_ptr; 4 | for(int i=0; i 4 | #include 5 | #include 6 | 7 | void print_int(void* value); 8 | void add_int(void* i1, void* i2); 9 | void zero_init(uint32_t value_size, void* value_ptr); 10 | #endif -------------------------------------------------------------------------------- /lib/Parallel.c: -------------------------------------------------------------------------------- 1 | #include "Parallel.h" 2 | #include "../benchmarks/UserPath.h" 3 | #include _user_h_ 4 | 5 | BARRIER_INIT(barrier_p, NR_TASKLETS); 6 | MUTEX_INIT(mutex); 7 | uint32_t curr_block; 8 | 9 | uint64_t _copy_block_size(uint32_t type_size1, uint32_t type_size2, uint32_t num_elem){ 10 | //1024 11 | uint64_t res=0; 12 | uint32_t res_arr[2]; 13 | uint32_t max_type_size = type_size1>type_size2?type_size1:type_size2; 14 | if(type_size1%8 == 0 && type_size2%8 == 0 && (num_elem <= NR_TASKLETS || max_type_size > 512)){ 15 | res_arr[0] = 1; 16 | res_arr[1] = 0; 17 | } 18 | else if(type_size1%4 == 0 && type_size2%4 == 0 &&(num_elem <= 2*NR_TASKLETS || max_type_size > 256)){ 19 | res_arr[0] = 2; 20 | res_arr[1] = 1; 21 | } 22 | else if(type_size1%2 == 0 && type_size2%2 == 0 && max_type_size > 128){ 23 | res_arr[0] = 4; 24 | res_arr[1] = 2; 25 | } 26 | else if(max_type_size < 16){ 27 | res_arr[0] = 256; 28 | res_arr[1] = 8; 29 | } 30 | else if(max_type_size < 32){ 31 | res_arr[0] = 128; 32 | res_arr[1] = 7; 33 | } 34 | else{ 35 | res_arr[0] = 16; 36 | res_arr[1] = 4; 37 | } 38 | 39 | res = *(uint64_t*)res_arr; 40 | return res; 41 | 42 | } 43 | 44 | uint32_t get_shift_bits_for_type(uint32_t value_size){ 45 | switch (value_size) { 46 | 
case 2: 47 | return 1; 48 | case 4: 49 | return 2; 50 | case 8: 51 | return 3; 52 | case 16: 53 | return 4; 54 | case 32: 55 | return 5; 56 | case 64: 57 | return 6; 58 | case 128: 59 | return 7; 60 | case 256: 61 | return 8; 62 | case 512: 63 | return 9; 64 | case 1024: 65 | return 10; 66 | case 2048: 67 | return 11; 68 | case 4096: 69 | return 12; 70 | default: 71 | return 0; 72 | } 73 | } 74 | 75 | 76 | void map_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len){ 77 | uint32_t elem_type_size = input_type; 78 | uint32_t inter_type_size = output_type; 79 | uint32_t num_tasklets = NR_TASKLETS; 80 | uint32_t pid = num_tasklets == 1 ? 0 : me(); 81 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 82 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 83 | uint32_t copy_block_size = copy_block_size_[0]; 84 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 85 | // try malloc/free for performance 86 | fsb_allocator_t elems_block_allocator = fsb_alloc(elem_type_size<>copy_block_size_shiftbits)<>2)<<2; 103 | 104 | uint32_t i_init = pid<>2)<<2; 223 | 224 | for(int i=pid*copy_block_size; iinput_len; 305 | uint32_t elem_type_size = DPU_INPUT_ARGUMENTS->input_type_size; 306 | uint32_t inter_type_size = DPU_INPUT_ARGUMENTS->table_type_size; 307 | uint32_t table_size = DPU_INPUT_ARGUMENTS->table_len; 308 | uint32_t num_tasklets = NR_TASKLETS; 309 | uint32_t pid = num_tasklets == 1 ? 0 : me(); 310 | 311 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 312 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 313 | uint32_t copy_block_size = copy_block_size_[0]; 314 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 315 | 316 | __mram_ptr void* elements = DPU_MRAM_HEAP_POINTER+DPU_INPUT_ARGUMENTS->input_start_offset; 317 | // try malloc/free for performance 318 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 319 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 320 | 321 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 322 | __dma_aligned table_t* local_table = fsb_get(table_allocator); 323 | init_table(local_table, table_size, inter_type_size, init_func); 324 | 325 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 326 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 327 | 328 | uint32_t key = 0; 329 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 330 | 331 | curr_block = 0; 332 | uint32_t i; 333 | uint32_t curr_block_local; 334 | uint32_t num_blocks = (len%copy_block_size==0)?len/copy_block_size:len/copy_block_size+1; 335 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 336 | uint32_t total_len_in_bytes = len * elem_type_size; 337 | 338 | barrier_wait(&barrier_p); 339 | while (curr_blockinput_len; 375 | uint32_t elem_type_size = DPU_INPUT_ARGUMENTS->input_type_size; 376 | uint32_t inter_type_size = DPU_INPUT_ARGUMENTS->table_type_size; 377 | uint32_t table_size = DPU_INPUT_ARGUMENTS->table_len; 378 | uint32_t num_tasklets = NR_TASKLETS; 379 | uint32_t pid = num_tasklets == 1 ? 
0 : me(); 380 | 381 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 382 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 383 | uint32_t copy_block_size = copy_block_size_[0]; 384 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 385 | 386 | 387 | __mram_ptr void* elements = DPU_MRAM_HEAP_POINTER+DPU_INPUT_ARGUMENTS->input_start_offset; 388 | // try malloc/free for performance 389 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 390 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 391 | 392 | 393 | 394 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 395 | __dma_aligned table_t* local_table = fsb_get(table_allocator); 396 | init_table(local_table, table_size, inter_type_size, init_func); 397 | 398 | 399 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 400 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 401 | 402 | uint32_t last_block = (len/copy_block_size)*copy_block_size; 403 | uint32_t key = 0; 404 | 405 | void* local_table_entries = local_table->table; 406 | uint32_t curr_entry; 407 | uint32_t shift_bits = get_shift_bits_for_type(inter_type_size); 408 | 409 | 410 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 411 | uint32_t total_len_in_bytes = len * elem_type_size; 412 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 413 | uint32_t stride = copy_block_size*num_tasklets*elem_type_size; 414 | for(int i=pid*copy_block_size_in_bytes; iinput_len; 458 | uint32_t elem_type_size = DPU_INPUT_ARGUMENTS->input_type_size; 459 | uint32_t inter_type_size = DPU_INPUT_ARGUMENTS->table_type_size; 460 | uint32_t table_size = DPU_INPUT_ARGUMENTS->table_len; 461 | uint32_t num_tasklets = NR_TASKLETS; 462 | uint32_t pid = num_tasklets == 1 ? 
0 : me(); 463 | 464 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 465 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 466 | uint32_t copy_block_size = copy_block_size_[0]; 467 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 468 | 469 | __mram_ptr void* elements = DPU_MRAM_HEAP_POINTER+DPU_INPUT_ARGUMENTS->input_start_offset; 470 | // try malloc/free for performance 471 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 472 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 473 | 474 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 475 | if(pid == 0){ 476 | t_global = fsb_get(table_allocator); 477 | } 478 | init_shared_table(t_global, table_size, inter_type_size, init_func); 479 | 480 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 481 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 482 | 483 | int last_block = (len/copy_block_size)*copy_block_size; 484 | uint32_t key = 0; 485 | 486 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 487 | uint32_t total_len_in_bytes = len * elem_type_size; 488 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 489 | uint32_t stride = copy_block_size*num_tasklets*elem_type_size; 490 | for(int i=pid*copy_block_size_in_bytes; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "mutex.h" 10 | 11 | #include "Structs.h" 12 | #include "StructsPIM.h" 13 | #include "Table.h" 14 | #include "TableShared.h" 15 | #include "Common.h" 16 | 17 | void map_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len); 18 | void combine_oncache_dpu(__mram_ptr void* table_entries_mram, dpu_arguments_t* DPU_INPUT_ARGUMENTS); 19 | void combine_shared_dpu(__mram_ptr void* table_entries_mram, void (*init_func)(uint32_t, void*), void (*key_func)(void*, void*, uint32_t*), void (*combine_func)(void*, void*), dpu_arguments_t* DPU_INPUT_ARGUMENTS); 20 | void zip_dpu(__mram_ptr void* table_entries_1, __mram_ptr void* table_entries_2, __mram_ptr void* table_entries_res, uint32_t input_type_1, uint32_t input_type_2, uint32_t len); 21 | uint32_t get_shift_bits_for_type(uint32_t value_size); 22 | 23 | #endif -------------------------------------------------------------------------------- /lib/Structs.h: -------------------------------------------------------------------------------- 1 | #ifndef STRUCTS_H 2 | #define STRUCTS_H 3 | #include 4 | #include 5 | 6 | typedef struct { 7 | uint32_t input_start_offset; 8 | uint32_t input_len; 9 | uint32_t input_type_size; 10 | uint32_t data_start_offset; 11 | uint32_t data_len; 12 | uint32_t data_type_size; 13 | uint32_t end_offset; 14 | uint32_t table_type_size; 15 | uint32_t table_len; 16 | uint32_t mode; //mode 0, all on wram; mode 1, data on mram; 17 | } dpu_arguments_t; 18 | 19 | 20 | #endif -------------------------------------------------------------------------------- /lib/StructsPIM.h: -------------------------------------------------------------------------------- 1 | #ifndef STRUCTSPIM_H 2 | #define STRUCTSPIM_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | typedef struct { 9 | uint8_t* locks; 10 | uint32_t table_size; 11 | uint32_t value_size; 12 | __mram_ptr void* heap_ptr; 13 | void* table; // table is an array of table_size, each element of size (uint32_t, uint32_t, value_size) 14 | fsb_allocator_t table_allocator; 
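/* table_allocator backs the flat `table` array above; locks_allocator
   (below) backs the per-entry lock bytes that insert_shared_table in
   TableShared.c takes and releases around each combine */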
15 | fsb_allocator_t locks_allocator; 16 | } table_t; 17 | 18 | #endif -------------------------------------------------------------------------------- /lib/Table.c: -------------------------------------------------------------------------------- 1 | #include "Table.h" 2 | 3 | 4 | void init_table(table_t* t, uint32_t table_size, uint32_t value_size, void (*init_func)(uint32_t, void*)){ 5 | t->table_size = table_size; 6 | t->value_size = value_size; 7 | t->table_allocator = fsb_alloc(table_size*value_size+(table_size*value_size)%8, 1); 8 | t->locks_allocator = NULL; 9 | t->table = fsb_get(t->table_allocator); 10 | t->locks = NULL; 11 | t->heap_ptr = NULL; 12 | void* tmp = t->table; 13 | 14 | for(int i=0; ivalue_size; 22 | uint32_t curr_entry = key*value_size; 23 | void* value_ptr_in_table = t->table+curr_entry; 24 | 25 | (*combineFunc)(value_ptr_in_table, value); 26 | 27 | } 28 | 29 | void free_table(table_t* t){ 30 | fsb_free(t->table_allocator, t->table); 31 | } 32 | 33 | 34 | void print_table(table_t* t, void (*printFunc)(void*)){ 35 | uint32_t table_size = t->table_size; 36 | uint32_t value_size = t->value_size; 37 | print_table_entries(t->table, table_size, value_size, printFunc); 38 | } 39 | 40 | 41 | void combine_tables(table_t* t1, table_t* t2, void (*combineFunc)(void*, void*)){ 42 | uint32_t table_size = t1->table_size; 43 | uint32_t value_size = t1->value_size; 44 | void* table1 = t1->table; 45 | void* table2 = t2->table; 46 | combine_table_entries(table1, table2, table_size, value_size, combineFunc); 47 | } 48 | 49 | uint32_t d = 32; 50 | void print_entry(void* p){ 51 | int* int_p = (int*)p; 52 | printf("%d, ", *int_p); 53 | 54 | p += sizeof(int32_t); 55 | int_p = (int*)p; 56 | 57 | for(int i=0; itable_size; 71 | uint32_t value_size = local_table->value_size; 72 | barrier_wait(&barrier); 73 | 74 | 75 | if(num_tasklets==1){ 76 | //global table is empty&&on heap, and store local table on heap 77 | store_shared_table_on_heap(local_table, table_entries_mram); 78 | } 79 | else{ 80 | 81 | 82 | // a helper table for global merge 83 | fsb_allocator_t global_table_allocator; 84 | if(pid==0){ 85 | global_table_allocator = fsb_alloc(sizeof(table_t), 1); 86 | global_table_helper = (table_t*)fsb_get(global_table_allocator); 87 | init_table(global_table_helper, table_size, value_size, init_func); 88 | } 89 | barrier_wait(&barrier); 90 | // 91 | uint32_t local_len = table_size / num_tasklets; 92 | uint32_t rest = table_size % num_tasklets; 93 | uint32_t curr_id; 94 | uint32_t curr_len; 95 | uint32_t start_pos; 96 | void* global_table_ptr = global_table_helper->table; 97 | void* local_table_ptr = local_table->table; 98 | 99 | for(int i=0; itable, table_entries_mram, transfer_size); 111 | //print_shared_table(global_table_helper, print_entry); 112 | if(pid == 0){ 113 | free_table(global_table_helper); 114 | fsb_free(global_table_allocator, global_table_helper); 115 | } 116 | 117 | } 118 | 119 | 120 | } 121 | 122 | 123 | -------------------------------------------------------------------------------- /lib/Table.h: -------------------------------------------------------------------------------- 1 | #ifndef TABLE_H 2 | #define TABLE_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "TableHost.h" 9 | #include "TableShared.h" 10 | #include "StructsPIM.h" 11 | 12 | void init_table(table_t* t, uint32_t table_size, uint32_t value_size, void (*init_func)(uint32_t, void*)); 13 | void free_table(table_t* t); 14 | void insert_table(table_t* t, uint32_t key, void* value, void 
(*combineFunc)(void*, void*)); 15 | void combine_tables_lockfree(__mram_ptr void* table_entries_mram, table_t* local_table, void (*init_func)(uint32_t, void*), void (*combineFunc)(void*, void*)); 16 | void print_table(table_t* t, void (*printFunc)(void*)); 17 | #endif -------------------------------------------------------------------------------- /lib/TableHost.c: -------------------------------------------------------------------------------- 1 | #include "TableHost.h" 2 | void combine_table_entries(void* table1, void* table2, uint32_t table_size, uint32_t value_size, void (*combineFunc)(void*, void*)){ 3 | 4 | uint32_t curr_entry; 5 | 6 | for(int i=0; i 4 | #include 5 | #include "Structs.h" 6 | void combine_table_entries(void* table1, void* table2, uint32_t table_size, uint32_t value_size, void (*combineFunc)(void*, void*)); 7 | void print_table_entries(void* table, uint32_t table_size, uint32_t value_size, void (*printFunc)(void*)); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /lib/TableShared.c: -------------------------------------------------------------------------------- 1 | #include "TableShared.h" 2 | 3 | // table in this file must be the same table seen by all threads 4 | 5 | BARRIER_INIT(barrier_shared, NR_TASKLETS); 6 | MUTEX_INIT(mutex_shared); 7 | 8 | // multithreaded load and store, len must be 8 bytes aligned, can operate on more than 2kB data 9 | uint32_t curr_block_shared; 10 | void load_arr_aligned(void* arr, __mram_ptr void* heap_ptr, uint32_t len){ 11 | uint32_t transfer_each_time = 256; //must be multiple of 8 12 | uint32_t transfer_size = len; 13 | curr_block_shared = 0; 14 | 15 | uint32_t curr_offset; 16 | uint32_t curr_block_local; 17 | uint32_t last_transfer_size = (transfer_size%transfer_each_time==0)?transfer_each_time:transfer_size/transfer_size%transfer_each_time; 18 | uint32_t num_blocks = (transfer_size%transfer_each_time==0)?transfer_size/transfer_each_time:transfer_size/transfer_each_time+1; 19 | 20 | 21 | barrier_wait(&barrier_shared); 22 | while (curr_block_sharedtable_size = table_size; 79 | t->value_size = value_size; 80 | t->table_allocator = fsb_alloc(table_size*value_size+(table_size*value_size)%8, 1); 81 | t->locks_allocator = fsb_alloc(table_size*sizeof(uint8_t), 1); 82 | t->table = fsb_get(t->table_allocator); 83 | t->locks = fsb_get(t->locks_allocator); 84 | t->heap_ptr = NULL; 85 | void* tmp = t->table; 86 | 87 | for(int i=0; ilocks[i])); 94 | } 95 | } 96 | 97 | barrier_wait(&barrier_shared); 98 | } 99 | 100 | void insert_shared_table(table_t* t, uint32_t key, void* value, void (*combineFunc)(void*, void*)){ 101 | uint32_t value_size = t->value_size; 102 | uint32_t curr_entry = key*value_size; 103 | void* value_ptr_in_table = (t->table)+curr_entry; 104 | mutex_lock(&(t->locks[key])); 105 | (*combineFunc)(value_ptr_in_table, value); 106 | mutex_unlock(&(t->locks[key])); 107 | } 108 | 109 | void print_shared_table(table_t* t, void (*printFunc)(void*)){ 110 | uint32_t pid = me(); 111 | if(pid==0){ 112 | uint32_t table_size = t->table_size; 113 | uint32_t value_size = t->value_size; 114 | print_table_entries(t->table, table_size, value_size, printFunc); 115 | } 116 | barrier_wait(&barrier_shared); 117 | } 118 | 119 | void free_shared_table(table_t* t){ 120 | fsb_free(t->table_allocator, t->table); 121 | fsb_free(t->locks_allocator, t->locks); 122 | } 123 | 124 | uint32_t store_shared_table_on_heap(table_t* table, __mram_ptr void* heap_ptr){ 125 | uint32_t table_size = table->table_size; 126 
| uint32_t value_size = table->value_size; 127 | uint32_t transfer_size = table_size*value_size+(table_size*value_size)%8; 128 | store_arr_aligned(table->table, heap_ptr, transfer_size); 129 | return (uint32_t)(heap_ptr-DPU_MRAM_HEAP_POINTER+transfer_size); 130 | } 131 | 132 | void load_shared_table_from_heap(table_t* table, __mram_ptr void* heap_ptr){ 133 | uint32_t table_size = table->table_size; 134 | uint32_t value_size = table->value_size; 135 | uint32_t transfer_size = table_size*value_size+(table_size*value_size)%8; 136 | load_arr_aligned(table->table, heap_ptr, transfer_size); 137 | } 138 | -------------------------------------------------------------------------------- /lib/TableShared.h: -------------------------------------------------------------------------------- 1 | #ifndef TABLESHARED_H 2 | #define TABLESHARED_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "mutex.h" 11 | #include "TableHost.h" 12 | #include "StructsPIM.h" 13 | 14 | void init_shared_table(table_t* t, uint32_t table_size, uint32_t value_size, void (*init_func)(uint32_t, void*)); 15 | void free_shared_table(table_t* t); 16 | void insert_shared_table(table_t* t, uint32_t key, void* value, void (*combineFunc)(void*, void*)); 17 | void print_shared_table(table_t* t, void (*printFunc)(void*)); 18 | 19 | uint32_t store_shared_table_on_heap(table_t* table, __mram_ptr void* heap_ptr); 20 | void load_shared_table_from_heap(table_t* table, __mram_ptr void* heap_ptr); 21 | void load_arr_aligned(void* arr, __mram_ptr void* heap_ptr, uint32_t len); 22 | void store_arr_aligned(void* arr, __mram_ptr void* heap_ptr, uint32_t len); 23 | #endif -------------------------------------------------------------------------------- /lib/UpmemCustom.c: -------------------------------------------------------------------------------- 1 | #include "UpmemCustom.h" 2 | // allocate aligned arr and zero out rest 3 | uint32_t calculate_pad_len(uint64_t len, uint32_t type_size, uint32_t num_dpus){ 4 | uint64_t len_in_byte = (uint64_t)len*type_size; 5 | 6 | // calculate lcm of typesize and 8, each dpu gets %8 7 | uint32_t lcm = (type_size > 8) ? 
type_size : 8; 8 | 9 | while (1) { 10 | if (lcm % type_size == 0 && lcm % 8 == 0) { 11 | break; 12 | } 13 | ++lcm; 14 | } 15 | 16 | // divisible by typesize 17 | uint64_t padded_len = len_in_byte; 18 | 19 | while (1) { 20 | if (padded_len % num_dpus == 0 && (padded_len/num_dpus) % lcm == 0) { 21 | break; 22 | } 23 | ++padded_len; 24 | } 25 | 26 | uint64_t pad_len = padded_len - len_in_byte; 27 | return (uint32_t)pad_len; 28 | } 29 | 30 | void* malloc_split_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus){ 31 | uint32_t pad_len = calculate_pad_len(len , type_size, num_dpus); 32 | uint64_t len_in_byte = (uint64_t)len * type_size; 33 | 34 | void* ptr = malloc(len_in_byte+pad_len); 35 | 36 | for(int i=0; iinput_args[i].input_len?max:input_args[i].input_len; 82 | } 83 | printf("assigning %d elements at max for each dpu\n", max); 84 | } 85 | 86 | 87 | 88 | 89 | uint32_t host_split_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t num_dpus, uint32_t curr_offset){ 90 | //elements must be 8 bytes aligned and divisible by num_dpus, pad not aglined, use malloc_aligned 91 | uint32_t pad_len = calculate_pad_len(len , type_size, num_dpus); 92 | 93 | //split elements to dpu and rest remains on host 94 | uint32_t len_per_dpu_in_byte = ((uint64_t)len*type_size+pad_len)/num_dpus; 95 | 96 | //transfer to dpu 97 | int i; 98 | struct dpu_set_t dpu; 99 | 100 | 101 | DPU_FOREACH(set, dpu, i) { 102 | DPU_ASSERT(dpu_prepare_xfer(dpu, &((char*)elements)[i * len_per_dpu_in_byte])); 103 | } 104 | 105 | // offset 106 | 107 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, len_per_dpu_in_byte, DPU_XFER_DEFAULT)); 108 | 109 | return curr_offset+len_per_dpu_in_byte; 110 | 111 | } 112 | 113 | uint32_t host_broadcast_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t curr_offset){ 114 | uint32_t broadcast_size = (len*type_size)+(len*type_size)%8; 115 | DPU_ASSERT(dpu_broadcast_to(set, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, elements, broadcast_size, DPU_XFER_DEFAULT)); 116 | return curr_offset+broadcast_size; 117 | } 118 | 119 | void* malloc_gather_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus){ 120 | uint32_t len_in_byte = len * type_size; 121 | uint32_t pad_len = len_in_byte%8; 122 | 123 | void* ptr = calloc((len_in_byte+pad_len)*num_dpus); 124 | 125 | return ptr; 126 | } 127 | 128 | void gather_tables_to_host(struct dpu_set_t set, void* my_table, uint32_t len, uint32_t type_size, uint32_t curr_offset_on_mram, uint32_t num_dpus, void (*init_func)(uint32_t, void*) ,void (*combineFunc)(void*, void*)){ 129 | void* tables = malloc_gather_aligned(len, type_size, num_dpus); 130 | uint32_t aligned_table_size = (len*type_size)+(len*type_size)%8; 131 | 132 | for(int i=0; ilen?max_len:len; 197 | total_size += (uint64_t)len*type_size; 198 | } 199 | aligned_max_len = (max_len*type_size)+(max_len*type_size)%8; 200 | 201 | 202 | void* tmp_buffer = malloc((uint64_t)num_dpus*aligned_max_len); 203 | void* res = malloc(total_size); 204 | printf("max len per dpu %u\n", aligned_max_len); 205 | printf("transfer buffer size %u\n", num_dpus*aligned_max_len); 206 | int i; 207 | struct dpu_set_t dpu; 208 | DPU_FOREACH(set, dpu, i) { 209 | DPU_ASSERT(dpu_prepare_xfer(dpu, tmp_buffer+i*aligned_max_len)); 210 | } 211 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, curr_offset_on_mram, aligned_max_len, DPU_XFER_DEFAULT)); 212 | 213 | 214 | void* buff_ptr = (void*)tmp_buffer; 215 
| void* ptr_in_res = (void*)res; 216 | uint32_t curr_size; 217 | 218 | //printf("\n-----\n%d\n-----\n", *(int*)buff_ptr); 219 | for(int i=0; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Structs.h" 10 | #include "TableHost.h" 11 | #include "Common.h" 12 | 13 | void* malloc_split_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus); 14 | void* malloc_broadcast_aligned(uint32_t len, uint32_t type_size); 15 | uint32_t host_split_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t num_dpus, uint32_t curr_offset); 16 | uint32_t host_broadcast_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t curr_offset); 17 | void prepare_input_len_and_parse_args(struct dpu_set_t set, dpu_arguments_t* input_args, uint32_t input_len, uint32_t input_type_size, uint32_t num_dpus); 18 | void* malloc_gather_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus); 19 | void gather_tables_to_host(struct dpu_set_t set, void* my_table, uint32_t len, uint32_t type_size, uint32_t curr_offset_on_mram, uint32_t num_dpus, void (*init_func)(uint32_t, void*) ,void (*combineFunc)(void*, void*)); 20 | void* gather_to_host(struct dpu_set_t set, uint32_t* lens, uint32_t type_size, uint32_t curr_offset_on_mram, uint32_t num_dpus); 21 | #endif -------------------------------------------------------------------------------- /lib/communication/CommHelper.c: -------------------------------------------------------------------------------- 1 | #include "CommHelper.h" 2 | uint32_t calculate_pad_len(uint32_t len, uint32_t type_size, uint32_t num_dpus){ 3 | uint64_t len_in_byte = (uint64_t)len*type_size; 4 | 5 | // calculate lcm of typesize and 8, each dpu gets %8 6 | uint32_t lcm = (type_size > 8) ? 
type_size : 8;
7 | 
8 | while (1) {
9 | if (lcm % type_size == 0 && lcm % 8 == 0) {
10 | break;
11 | }
12 | ++lcm;
13 | }
14 | 
15 | // grow the padding until the total is divisible by num_dpus and each DPU's share by the lcm
16 | uint64_t padded_len = len_in_byte;
17 | 
18 | while (1) {
19 | if (padded_len % num_dpus == 0 && (padded_len/num_dpus) % lcm == 0) {
20 | break;
21 | }
22 | ++padded_len;
23 | }
24 | 
25 | uint64_t pad_len = padded_len - len_in_byte;
26 | return (uint32_t)pad_len;
27 | }
28 | 
-------------------------------------------------------------------------------- /lib/communication/CommHelper.h: --------------------------------------------------------------------------------
1 | #ifndef COMMHELPER_H
2 | #define COMMHELPER_H
3 | #include
4 | #include
5 | #include
6 | uint32_t calculate_pad_len(uint32_t len, uint32_t type_size, uint32_t num_dpus);
7 | #endif
-------------------------------------------------------------------------------- /lib/communication/CommOps.c: --------------------------------------------------------------------------------
1 | #include "CommOps.h"
2 | 
3 | /*
4 | see description of the functions in CommOps.h
5 | */
6 | void* malloc_scatter_aligned(uint64_t len, uint32_t type_size, simplepim_management_t* table_management){
7 | uint32_t num_dpus = table_management->num_dpus;
8 | uint32_t pad_len = calculate_pad_len(len, type_size, num_dpus);
9 | uint64_t len_in_byte = len * (uint64_t)type_size;
10 | 
11 | void* ptr = calloc(1, len_in_byte+pad_len);
12 | 
13 | return ptr;
14 | }
15 | 
16 | void* malloc_reduce_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management){
17 | uint32_t num_dpus = table_management->num_dpus;
18 | uint64_t len_in_byte = len * type_size;
19 | uint64_t pad_len = 8-len_in_byte%8;
20 | 
21 | void* ptr = calloc(num_dpus, (len_in_byte+pad_len));
22 | 
23 | return ptr;
24 | }
25 | 
26 | void* malloc_broadcast_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management){
27 | uint64_t len_in_byte = (uint64_t)len * type_size;
28 | uint64_t pad_len = 8-len_in_byte%8;
29 | 
30 | void* ptr = calloc(1, len_in_byte+pad_len);
31 | 
32 | return ptr;
33 | }
34 | 
35 | void simplepim_scatter(char* const table_id, void* elements, uint64_t len, uint32_t type_size, simplepim_management_t* table_management){
36 | uint32_t curr_offset = table_management->free_space_start_pos;
37 | if(contains_table(table_id, table_management)){
38 | curr_offset = lookup_table(table_id, table_management) -> start;
39 | }
40 | // elements must be 8-byte aligned and evenly divisible among the DPUs; use malloc_scatter_aligned to obtain a correctly padded buffer (note: the offset-reuse branch above has no effect, because an already registered table_id is rejected here)
41 | if(contains_table(table_id, table_management)){
42 | printf("%s", table_id);
43 | printf(" is already contained in the table management unit, invalid scatter\n");
44 | return;
45 | }
46 | uint32_t num_dpus = table_management->num_dpus;
47 | uint32_t pad_len = calculate_pad_len(len, type_size, num_dpus);
48 | 
49 | // split elements across the DPUs; the rest remains on the host
50 | uint32_t len_per_dpu_in_byte = (len*type_size+pad_len)/num_dpus;
51 | 
52 | // transfer to dpu
53 | int i;
54 | struct dpu_set_t dpu;
55 | struct dpu_set_t set = table_management->set;
56 | 
57 | 
58 | DPU_FOREACH(set, dpu, i) {
59 | DPU_ASSERT(dpu_prepare_xfer(dpu, &((char*)elements)[i * len_per_dpu_in_byte]));
60 | }
61 | 
62 | // offset
63 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, len_per_dpu_in_byte, DPU_XFER_DEFAULT));
64 | 
65 | // calculate lens per dpu
66 | uint32_t pad_len_in_elem = pad_len/type_size;
67 | uint32_t num_transfered_elem_per_dpu = (len + 
pad_len_in_elem)/num_dpus; 68 | uint32_t* lens = malloc(sizeof(int32_t)*num_dpus); 69 | for(int i=0; iname = malloc(strlen(table_id)+1); 85 | memcpy(t->name, table_id, strlen(table_id)+1); 86 | t->start = curr_offset; 87 | t->end = curr_offset+len_per_dpu_in_byte; 88 | t->len = len; 89 | t->table_type_size = type_size; 90 | t->lens_each_dpu = lens; 91 | t->is_virtual_zipped = 0; 92 | 93 | add_table(t, table_management); 94 | table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ? table_management->free_space_start_pos : t->end; 95 | } 96 | 97 | void* simplepim_gather(char* const table_id, simplepim_management_t* table_management){ 98 | if(!contains_table(table_id, table_management)){ 99 | printf(table_id); 100 | printf(" is not contained in table management unit, invalid scatter\n"); 101 | return NULL; 102 | } 103 | 104 | uint32_t num_dpus = table_management->num_dpus; 105 | table_host_t* t = lookup_table(table_id, table_management); 106 | uint32_t* lens = t->lens_each_dpu; 107 | uint32_t type_size = t->table_type_size; 108 | uint32_t start_addr = t->start; 109 | uint32_t max_len = 0; 110 | for(int i=0; ilens[i]?max_len:lens[i]; 112 | } 113 | 114 | uint64_t aligned_max_len = (max_len*type_size)+(8-(max_len*type_size)%8); 115 | uint64_t buff_size = aligned_max_len*num_dpus; 116 | uint64_t total_size = t->len*t->table_type_size; 117 | struct dpu_set_t set = table_management->set; 118 | void* tmp_buffer = malloc((uint64_t)num_dpus*aligned_max_len+2048); 119 | void* res = malloc(total_size + 2048); 120 | 121 | int i; 122 | struct dpu_set_t dpu; 123 | DPU_FOREACH(set, dpu, i) { 124 | DPU_ASSERT(dpu_prepare_xfer(dpu, tmp_buffer+i*aligned_max_len)); 125 | } 126 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, start_addr, aligned_max_len, DPU_XFER_DEFAULT)); 127 | 128 | void* buff_ptr = (void*)tmp_buffer; 129 | void* ptr_in_res = (void*)res; 130 | uint32_t curr_size; 131 | 132 | 133 | for(int j=0; jfree_space_start_pos; 148 | if(contains_table(table_id, table_management)){ 149 | curr_offset = lookup_table(table_id, table_management) -> start; 150 | } 151 | 152 | uint64_t broadcast_size = (len*type_size)+8-(len*type_size)%8; 153 | uint32_t num_dpus = table_management->num_dpus; 154 | struct dpu_set_t set = table_management->set; 155 | DPU_ASSERT(dpu_broadcast_to(set, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, elements, broadcast_size, DPU_XFER_DEFAULT)); 156 | // table information to management unit 157 | table_host_t* t = malloc(sizeof(table_host_t)); 158 | t->name = malloc(strlen(table_id)+1); 159 | memcpy(t->name, table_id, strlen(table_id)+1); 160 | t->start = curr_offset; 161 | t->end = curr_offset+broadcast_size; 162 | t->len = len; 163 | t->table_type_size = type_size; 164 | 165 | uint32_t* lens = malloc(sizeof(int32_t)*num_dpus); 166 | for(int i=0; ilens_each_dpu = lens; 171 | t->is_virtual_zipped = 0; 172 | add_table(t, table_management); 173 | table_management->free_space_start_pos = t->end > table_management->free_space_start_pos ? 
t->end : table_management->free_space_start_pos;
174 | }
175 | 
176 | void simplepim_allgather(char* const table_id, char* const new_table_id, simplepim_management_t* table_management){
177 | if(!contains_table(table_id, table_management)){
178 | printf("%s", table_id);
179 | printf(" is not contained in the table management unit, invalid allgather\n");
180 | return;
181 | }
182 | 
183 | uint32_t num_dpus = table_management->num_dpus;
184 | table_host_t* t = lookup_table(table_id, table_management);
185 | uint32_t* lens = t->lens_each_dpu;
186 | uint32_t type_size = t->table_type_size;
187 | 
188 | uint32_t total_len = 0;
189 | 
190 | for(int i=0; i
205 | void simplepim_allreduce(char* const table_id, handle_t* binary_handle, simplepim_management_t* table_management){
206 | if(binary_handle->func_type == 1){
207 | if(!contains_table(table_id, table_management)){
208 | printf("source table ");
209 | printf("%s", table_id);
210 | printf(" is not contained in the current management unit\n");
211 | return;
212 | }
213 | 
214 | uint32_t num_dpus = table_management->num_dpus;
215 | table_host_t* t = lookup_table(table_id, table_management);
216 | uint32_t* lens = t->lens_each_dpu;
217 | uint32_t type_size = t->table_type_size;
218 | uint32_t len = lens[0];
219 | 
220 | for(int i=0; i
227 | void* lib = dlopen(binary_handle->so_bin_location, RTLD_NOW);
228 | void (*combine_func)(void*, void*) = dlsym(lib, "combine_func");
229 | 
230 | void* res = simplepim_gather(table_id, table_management);
231 | for(int i=1; i
242 | uint32_t outputs_pos = lookup_table(table_id, table_management) -> start;
243 | uint64_t broadcast_size = (len*type_size)+8-(len*type_size)%8;
244 | struct dpu_set_t set = table_management->set;
245 | DPU_ASSERT(dpu_broadcast_to(set, DPU_MRAM_HEAP_POINTER_NAME, outputs_pos, bc_buffer, broadcast_size, DPU_XFER_DEFAULT));
246 | 
247 | free(bc_buffer);
248 | 
249 | }
250 | else{
251 | printf("ERROR: compiled binary ");
252 | printf("%s", binary_handle->bin_location);
253 | printf(" does not contain general reduction functions\n");
254 | }
255 | 
256 | }
257 | 
258 | 
-------------------------------------------------------------------------------- /lib/communication/CommOps.h: --------------------------------------------------------------------------------
1 | #ifndef COMMOPS_H
2 | #define COMMOPS_H
3 | #include "CommHelper.h"
4 | #include "../management/Management.h"
5 | #include "../processing/ProcessingHelperHost.h"
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | 
12 | /*
13 | malloc_*_aligned allocate memory on the host to satisfy the alignment requirements of UPMEM; the returned pointer
14 | can be treated like normal heap memory obtained by calling "malloc"
15 | 
16 | simplepim_* (for example simplepim_scatter) implement the communication operators as described in the paper
17 | */
18 | 
19 | 
20 | void* malloc_scatter_aligned(uint64_t len, uint32_t type_size, simplepim_management_t* table_management);
21 | void* malloc_reduce_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management);
22 | void* malloc_broadcast_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management);
23 | void simplepim_scatter(char* const table_id, void* elements, uint64_t len, uint32_t type_size, simplepim_management_t* table_management);
24 | void* simplepim_gather(char* const table_id, simplepim_management_t* table_management);
25 | void simplepim_broadcast(char* const table_id, void* elements, uint64_t len, uint32_t type_size, simplepim_management_t* table_management);
26 | void simplepim_allgather(char* const table_id, char* const new_table_id, simplepim_management_t* table_management);
27 | void simplepim_allreduce(char* const table_id, handle_t* binary_handle, simplepim_management_t* table_management);
28 | #endif
29 | 
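Taken together, the communication operators are used like this from the host. The following is a minimal sketch, assuming 64 allocated DPUs, `int32_t` elements, and the relative include paths of a benchmark directory (all three are illustrative, not fixed by the framework):

```
#include <stdint.h>
#include <stdlib.h>
#include "../../lib/management/Management.h"
#include "../../lib/communication/CommOps.h"

int main(){
    const uint64_t nr_elements = 1 << 20;                       // illustrative size
    simplepim_management_t* mgmt = table_management_init(64);   // 64 DPUs, illustrative

    // malloc_scatter_aligned pads the buffer so each DPU's share meets
    // UPMEM's 8-byte transfer alignment
    int32_t* A = malloc_scatter_aligned(nr_elements, sizeof(int32_t), mgmt);
    for(uint64_t i = 0; i < nr_elements; i++)
        A[i] = (int32_t)i;

    // register the array under id "t1" and split it across the DPUs,
    // then copy it back to the host
    simplepim_scatter("t1", A, nr_elements, sizeof(int32_t), mgmt);
    int32_t* B = simplepim_gather("t1", mgmt);

    free(A);
    free(B);
    return 0;
}
```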
-------------------------------------------------------------------------------- /lib/management/Management.c: --------------------------------------------------------------------------------
1 | #include "Management.h"
2 | 
3 | /*
4 | see description of the functions in Management.h
5 | */
6 | 
7 | simplepim_management_t* table_management_init(uint32_t num_dpus){
8 | 
9 | struct dpu_set_t set;
10 | DPU_ASSERT(dpu_alloc(num_dpus, NULL, &set));
11 | 
12 | small_table_init(set);
13 | simplepim_management_t* management = malloc(sizeof(simplepim_management_t));
14 | management->set = set;
15 | management->num_dpus = num_dpus;
16 | management->num_tables = 0;
17 | management->curr_space = 16;
18 | management->tables = malloc(sizeof(table_host_t*)*16);
19 | management->zip_args = malloc(sizeof(zip_arguments_t)*num_dpus);
20 | management->map_args = malloc(sizeof(map_arguments_t)*num_dpus);
21 | management->red_args = malloc(sizeof(gen_red_arguments_t)*num_dpus);
22 | management->free_space_start_pos = 0;
23 | return management;
24 | }
25 | 
26 | void add_table(table_host_t* table, simplepim_management_t* management){
27 | uint32_t num_tables = management->num_tables;
28 | for(int i=0; i<num_tables; i++){
29 | if(strcmp(table->name, management->tables[i]->name)==0){
30 | free(management->tables[i]->lens_each_dpu);
31 | free(management->tables[i]);
32 | management->tables[i] = table;
33 | return;
34 | }
35 | }
36 | 
37 | uint32_t curr_space = management->curr_space;
38 | if(curr_space == num_tables){
39 | management->tables = realloc(management->tables, (curr_space+16)*(sizeof(table_host_t*)));
40 | management->tables[num_tables] = table;
41 | management->num_tables++;
42 | management->curr_space+=16;
43 | }
44 | else{
45 | management->tables[num_tables] = table;
46 | management->num_tables++;
47 | }
48 | }
49 | 
50 | 
51 | uint32_t contains_table(const char* name, simplepim_management_t* management){
52 | uint32_t num_tables = management->num_tables;
53 | for(int i=0; i<num_tables; i++){
54 | if(strcmp(name, management->tables[i]->name)==0){
55 | return 1;
56 | }
57 | }
58 | 
59 | return 0;
60 | }
61 | 
62 | void free_table(const char* name, simplepim_management_t* management){
63 | if(!contains_table(name, management)){
64 | return;
65 | }
66 | lookup_table(name, management)->name = ""; // unregisters the array by blanking its name; the MRAM region itself is not reclaimed
67 | }
68 | 
69 | table_host_t* lookup_table(const char* name, simplepim_management_t* management){
70 | uint32_t num_tables = management->num_tables;
71 | for(int i=0; i<num_tables; i++){
72 | if(strcmp(name, management->tables[i]->name)==0){
73 | return management->tables[i];
74 | }
75 | }
76 | 
77 | printf("table ");
78 | printf("%s", name);
79 | printf(" is not contained in the current management unit\n");
80 | return NULL;
81 | }
82 | 
83 | uint32_t max_len_dpu(uint32_t num_dpus, table_host_t* table){
84 | uint32_t max_len = 0;
85 | for(int i=0; i<num_dpus; i++){
86 | max_len = table->lens_each_dpu[i]>max_len?table->lens_each_dpu[i]:max_len;
87 | }
88 | return max_len;
89 | 
90 | }
91 | 
-------------------------------------------------------------------------------- /lib/management/Management.h: --------------------------------------------------------------------------------
1 | #ifndef MANAGEMENT_H
2 | #define MANAGEMENT_H
3 | 
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | 
10 | #include "SmallTableInit.h"
11 | #include "../processing/gen_red/GenRedArgs.h"
12 | #include "../processing/map/MapArgs.h"
13 | #include "../processing/zip/ZipArgs.h"
14 | 
15 | // table_host_t holds the framework's bookkeeping for one array; end users do not need to care about its details
16 | typedef struct {
17 | char* name;
18 | uint32_t start;
19 | uint32_t end;
20 | uint64_t len;
21 | uint32_t* lens_each_dpu;
22 | uint32_t table_type_size;
23 | 
24 | // fields handling virtual zip
25 | uint32_t is_virtual_zipped;
26 | uint32_t start1;
27 | uint32_t start2;
28 | uint32_t end1;
29 | uint32_t end2;
30 | uint32_t type1;
31 | uint32_t type2;
32 | } table_host_t;
33 | 
34 | // simplepim_management_t holds information about all registered arrays and about the PIM hardware, such as the number of PIM cores (see the paper for details)
35 | typedef struct {
36 | uint32_t curr_space;
37 | uint32_t num_tables;
38 | table_host_t** tables;
39 | struct dpu_set_t set;
40 | uint32_t num_dpus;
41 | zip_arguments_t* zip_args;
42 | map_arguments_t* map_args;
43 | gen_red_arguments_t* red_args;
44 | // memory management could use more sophisticated logic; currently the free-space counter simply grows with each newly registered array
45 | uint32_t free_space_start_pos;
46 | } simplepim_management_t;
47 | 
48 | 
49 | /*
50 | table_management_init initialises a management interface and can be called by end users
51 | add_table, contains_table, lookup_table, and free_table retrieve or update the information the management unit keeps about an array under its id "name" (see the paper for details)
52 | max_len_dpu is a helper function used by the framework
53 | */
54 | simplepim_management_t* table_management_init(uint32_t num_dpus);
55 | void add_table(table_host_t* table, simplepim_management_t* management);
56 | uint32_t contains_table(const char* name, simplepim_management_t* management);
57 | table_host_t* lookup_table(const char* name, simplepim_management_t* management);
58 | void free_table(const char* name, simplepim_management_t* management);
59 | uint32_t max_len_dpu(uint32_t num_dpus, table_host_t* table);
60 | 
61 | #endif
62 | 
-------------------------------------------------------------------------------- /lib/management/SmallTableInit.c: --------------------------------------------------------------------------------
1 | #include "SmallTableInit.h"
2 | #define DPU_BINARY "bin/dpu_init_binary"
3 | 
4 | void small_table_init(struct dpu_set_t set){
5 | system("dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=1 -o bin/dpu_init_binary ../../lib/management/SmallTableInit_dpu.c"); // note: the relative paths assume the host binary runs from a benchmark directory such as benchmarks/va/
6 | DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));
7 | DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
8 | }
-------------------------------------------------------------------------------- /lib/management/SmallTableInit.h: --------------------------------------------------------------------------------
1 | #ifndef SMALLTABLEINIT_H
2 | #define SMALLTABLEINIT_H
3 | #include
4 | #include
5 | #include
6 | #include
7 | 
8 | /*
9 | the UPMEM DPUs need to run some setup code before use;
10 | it is executed once table_management_init is called
11 | */
12 | 
13 | void small_table_init(struct dpu_set_t set);
14 | 
15 | #endif
-------------------------------------------------------------------------------- /lib/management/SmallTableInit_dpu.c: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | 
6 | #include
7 | int main() {
8 | mem_reset(); // Reset the heap
9 | //printf("\n");
10 | return 0;
11 | }
-------------------------------------------------------------------------------- /lib/processing/ProcessingHelper.c: --------------------------------------------------------------------------------
1 | #include "ProcessingHelper.h"
2 | uint64_t copy_block_size_fun(uint32_t type_size1, uint32_t type_size2, uint32_t num_elem){
3 | // packs (block size in elements, log2 of block size) into one uint64_t
4 | uint64_t res=0;
5 | uint32_t res_arr[2];
6 | uint32_t max_type_size = 
type_size1>type_size2?type_size1:type_size2; 7 | if(type_size1%8 == 0 && type_size2%8 == 0 && (num_elem <= NR_TASKLETS || max_type_size > 512)){ 8 | res_arr[0] = 1; 9 | res_arr[1] = 0; 10 | } 11 | else if(type_size1%4 == 0 && type_size2%4 == 0 &&(num_elem <= 2*NR_TASKLETS || max_type_size > 256)){ 12 | res_arr[0] = 2; 13 | res_arr[1] = 1; 14 | } 15 | else if(type_size1%2 == 0 && type_size2%2 == 0 && max_type_size > 128){ 16 | res_arr[0] = 4; 17 | res_arr[1] = 2; 18 | } 19 | else if(max_type_size < 16){ 20 | res_arr[0] = 256; 21 | res_arr[1] = 8; 22 | } 23 | else if(max_type_size < 32){ 24 | res_arr[0] = 128; 25 | res_arr[1] = 7; 26 | } 27 | else{ 28 | res_arr[0] = 16; 29 | res_arr[1] = 4; 30 | } 31 | 32 | res = *(uint64_t*)res_arr; 33 | return res; 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /lib/processing/ProcessingHelper.h: -------------------------------------------------------------------------------- 1 | #ifndef PROCESSINGHELPER_H 2 | #define PROCESSINGHELPER_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "mutex.h" 11 | 12 | uint64_t copy_block_size_fun(uint32_t type_size1, uint32_t type_size2, uint32_t num_elem); 13 | 14 | #endif -------------------------------------------------------------------------------- /lib/processing/ProcessingHelperHost.c: -------------------------------------------------------------------------------- 1 | #include "ProcessingHelperHost.h" 2 | handle_t* create_handle(const char* func_pathname, uint32_t func_type){ 3 | 4 | handle_t* handle = malloc(sizeof(handle_t)); 5 | handle->func_type = func_type; 6 | handle->bin_location = malloc(2048); 7 | char func_bodyname[2048]; 8 | strcpy(func_bodyname, func_pathname); 9 | for(int i=0; i<2048; i++){ 10 | if(func_bodyname[i] == '.'){ 11 | func_bodyname[i] = '_'; 12 | } 13 | } 14 | 15 | if(func_type == 0){ 16 | char compile_cmd[2048] = "dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=12 -I./ -D__mapfunc_filename__=\"<"; 17 | strcat(compile_cmd, func_pathname); 18 | strcat(compile_cmd, "/map.h>\" -o bin/dpu_map_"); 19 | strcat(compile_cmd, func_bodyname); 20 | strcat(compile_cmd, " ../../lib/processing/map/map_dpu.c ../../lib/processing/ProcessingHelper.c"); 21 | int succ = system(compile_cmd); 22 | 23 | char bin_location[2048] = "bin/dpu_map_"; 24 | strcat(bin_location, func_bodyname); 25 | strcpy(handle->bin_location, bin_location); 26 | } 27 | else if(func_type == 1){ 28 | char compile_cmd[2048] = "dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=12 -I./ -D__mapredfunc_pathname__=\"<"; 29 | strcat(compile_cmd, func_pathname); 30 | strcat(compile_cmd, "/map_to_val_func.h>\" -D__combinefunc_pathname__=\"<"); 31 | strcat(compile_cmd, func_pathname); 32 | strcat(compile_cmd, "/init_combine_func.h>\" -o bin/dpu_genred_"); 33 | strcat(compile_cmd, func_bodyname); 34 | strcat(compile_cmd, " ../../lib/processing/gen_red/gen_red_dpu.c ../../lib/processing/ProcessingHelper.c ../../lib/TableHost.c ../../lib/Table.c ../../lib/TableShared.c"); 35 | int succ = system(compile_cmd); 36 | 37 | char bin_location[2048] = "bin/dpu_genred_"; 38 | strcat(bin_location, func_bodyname); 39 | strcpy(handle->bin_location, bin_location); 40 | 41 | 42 | // generate .o file for dynamic linking 43 | char h_fname[2048] = ""; 44 | strcat(h_fname, func_pathname); 45 | strcat(h_fname, "/init_combine_func.h"); 46 | 47 | char c_fname[2048] = ""; 48 | strcat(c_fname, func_pathname); 49 | strcat(c_fname, "/init_combine_func.c"); 50 | 51 | char 
o_fname[2048] = ""; 52 | strcat(o_fname, func_pathname); 53 | strcat(o_fname, "_init_combine_func.o"); 54 | 55 | char so_fname[2048]; 56 | strcpy(so_fname, o_fname); 57 | so_fname[strlen(so_fname)-1] = '\0'; 58 | strcat(so_fname, "so"); 59 | 60 | char cp_cmd[2048] = "cp "; 61 | strcat(cp_cmd, h_fname); 62 | strcat(cp_cmd, " "); 63 | strcat(cp_cmd, c_fname); 64 | succ = system(cp_cmd); 65 | 66 | char compile_cmd1[2048] = "gcc -c -fPIC -o"; 67 | strcat(compile_cmd1, " bin/"); 68 | strcat(compile_cmd1, o_fname); 69 | strcat(compile_cmd1, " "); 70 | strcat(compile_cmd1, c_fname); 71 | succ = system(compile_cmd1); 72 | 73 | 74 | char compile_cmd2[2048] = "gcc -shared -o bin/"; 75 | 76 | strcat(compile_cmd2, so_fname); 77 | strcat(compile_cmd2, " bin/"); 78 | strcat(compile_cmd2, o_fname); 79 | succ = system(compile_cmd2); 80 | 81 | 82 | char compile_cmd3[2048] = "rm "; 83 | strcat(compile_cmd3, c_fname); 84 | succ = system(compile_cmd3); 85 | 86 | handle->so_bin_location = malloc(2048); 87 | char so_bin_location[2048] = "bin/"; 88 | strcat(so_bin_location, so_fname); 89 | strcpy(handle->so_bin_location, so_bin_location); 90 | 91 | 92 | 93 | } 94 | else if(func_type == 2){ 95 | char compile_cmd[2048] = "dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=12 -o bin/dpu_zip"; 96 | strcat(compile_cmd, " ../../lib/processing/zip/zip_dpu.c ../../lib/processing/zip/ZipProcessing.c ../../lib/processing/ProcessingHelper.c"); 97 | int succ = system(compile_cmd); 98 | 99 | char bin_location[2048] = "bin/dpu_zip"; 100 | strcpy(handle->bin_location, bin_location); 101 | 102 | } 103 | else{ 104 | printf("function handle not properly compiled!!!"); 105 | return NULL; 106 | } 107 | 108 | return handle; 109 | 110 | } 111 | -------------------------------------------------------------------------------- /lib/processing/ProcessingHelperHost.h: -------------------------------------------------------------------------------- 1 | #ifndef PROCESSINGHELPERHOST_H 2 | #define PROCESSINGHELPERHOST_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAP 0 10 | #define REDUCE 1 11 | #define ZIP 2 12 | 13 | // handle_t contains information of where the handle's binary is located 14 | typedef struct { 15 | char* bin_location; 16 | char* so_bin_location; 17 | uint32_t func_type; 18 | } handle_t; 19 | 20 | 21 | /* 22 | create_handle creates a handle that can be understood by communcation and processing operators (see the paper for details) 23 | */ 24 | handle_t* create_handle(const char* func_fname, uint32_t func_type); 25 | 26 | #endif -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRed.c: -------------------------------------------------------------------------------- 1 | #include "GenRed.h" 2 | 3 | /* 4 | description of table_gen_red see GenRed.h 5 | other functions are helper functions used by the framework 6 | */ 7 | 8 | void combine_table_entries(void* table1, void* table2, uint32_t table_size, uint32_t value_size, void (*combineFunc)(void*, void*)){ 9 | 10 | uint32_t curr_entry; 11 | 12 | for(int i=0; iset; 24 | uint32_t num_dpus = table_management->num_dpus; 25 | void* tables = malloc_reduce_aligned(len, type_size, table_management); 26 | uint32_t aligned_table_size = (len*type_size)+(len*type_size)%8; 27 | 28 | for(int i=0; ifree_space_start_pos; 90 | if(contains_table(dest_name, table_management)){ 91 | outputs = lookup_table(dest_name, table_management) -> start; 92 | } 93 | 94 | if(binary_handle->func_type == 1){ 95 | 
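/* func_type follows the MAP/REDUCE/ZIP macros in ProcessingHelperHost.h
   (0, 1 and 2 respectively), so this branch only runs for handles created
   with create_handle(..., REDUCE) */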
if(!contains_table(src_name, table_management)){ 96 | printf("source table "); 97 | printf(src_name); 98 | printf(" is not contains in current management unit\n"); 99 | return NULL; 100 | } 101 | 102 | //timing 103 | gettimeofday(&start_time, NULL); 104 | 105 | struct dpu_set_t set = table_management->set; 106 | uint32_t num_dpus = table_management->num_dpus; 107 | table_host_t* src_table = lookup_table(src_name, table_management); 108 | uint32_t* lens = src_table->lens_each_dpu; 109 | uint32_t input_type = src_table->table_type_size; 110 | uint32_t inputs = src_table->start; 111 | 112 | gen_red_arguments_t* input_args = table_management->red_args; 113 | // use handle for precompiled binaries 114 | const char* binary = binary_handle->bin_location; 115 | DPU_ASSERT(dpu_load(set, binary, NULL)); 116 | 117 | for(int i=0; iso_bin_location, RTLD_NOW); 153 | void (*init_func)(uint32_t, void*) = dlsym(lib,"init_func"); 154 | void (*combine_func)(void*, void*) = dlsym(lib, "combine_func"); 155 | 156 | if(lib == NULL){ 157 | printf("dynamic library linking failed!!!\n"); 158 | } 159 | 160 | 161 | gather_tables_to_host(table_management, my_table, output_len, output_type, outputs, init_func, combine_func); 162 | dlclose(lib); 163 | gettimeofday(&end_time, NULL); 164 | double host_table_reduction_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 165 | (end_time.tv_usec - start_time.tv_usec); 166 | // back to dpus 167 | 168 | // table info 169 | gettimeofday(&start_time, NULL); 170 | 171 | int32_t* red_tables_lens = malloc(sizeof(uint32_t)*num_dpus); 172 | for(int i=0; iname = malloc(strlen(dest_name)+1); 179 | memcpy(t->name, dest_name, strlen(dest_name)+1); 180 | t->start = outputs; 181 | uint32_t max_end_dpu = outputs+output_len*output_type; 182 | t->end = max_end_dpu+(8-max_end_dpu%8); 183 | t->len = output_len; 184 | t->table_type_size = output_type; 185 | t->lens_each_dpu = red_tables_lens; 186 | t->is_virtual_zipped = 0; 187 | 188 | add_table(t, table_management); 189 | table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ? 
table_management->free_space_start_pos : t->end; 190 | gettimeofday(&end_time, NULL); 191 | double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 192 | (end_time.tv_usec - start_time.tv_usec); 193 | 194 | printf("--------------\n"); 195 | printf("table reduction function : "); 196 | printf(binary); 197 | printf("\nreduction function kernel execution time : %f\n", kernel_time/1000); 198 | printf("host reduction execution time : %f\n", host_table_reduction_time/1000); 199 | printf("function call and table management time : %f\n", (register_table_time+prepare_args_time)/1000); 200 | printf("--------------\n"); 201 | 202 | return my_table; 203 | } 204 | else{ 205 | printf("ERROR: compiled binary "); 206 | printf(binary_handle->bin_location); 207 | printf(" does not contain general reduction functions\n"); 208 | } 209 | 210 | } 211 | -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRed.h: -------------------------------------------------------------------------------- 1 | #ifndef GENRED_H 2 | #define GENRED_H 3 | 4 | #include "GenRedArgs.h" 5 | #include "../ProcessingHelperHost.h" 6 | #include "../../communication/CommOps.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | /* 17 | table_gen_red implements the array reduction as in the paper 18 | It parses the function_handle and setups the host for calling the pim kernel (GenRed.c) 19 | Then it runs the array reduction pim kernel (gen_red_dpu.c and GenRedProcessing.h) 20 | */ 21 | 22 | void* table_gen_red(const char* src_name, const char* dest_name, uint32_t output_type, uint32_t output_len, handle_t* binary_handle, simplepim_management_t* table_management, uint32_t info); 23 | #endif 24 | -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRedArgs.h: -------------------------------------------------------------------------------- 1 | #ifndef GENREDARGS_H 2 | #define GENREDARGS_H 3 | #include 4 | #include 5 | 6 | typedef struct { 7 | uint32_t input_start_offset; 8 | uint32_t input_type_size; 9 | uint32_t output_start_offset; 10 | uint32_t output_type_size; 11 | uint32_t len; 12 | uint32_t table_len; 13 | uint32_t info; 14 | } gen_red_arguments_t; 15 | 16 | 17 | #endif -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRedProcessing.h: -------------------------------------------------------------------------------- 1 | #ifndef GENREDPROCESSING_H 2 | #define GENREDPROCESSING_H 3 | #include __mapredfunc_pathname__ 4 | #include __combinefunc_pathname__ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../ProcessingHelper.h" 14 | #include "../../StructsPIM.h" 15 | #include "../../Table.h" 16 | 17 | uint32_t get_shift_bits_for_type(uint32_t value_size){ 18 | switch (value_size) { 19 | case 2: 20 | return 1; 21 | case 4: 22 | return 2; 23 | case 8: 24 | return 3; 25 | case 16: 26 | return 4; 27 | case 32: 28 | return 5; 29 | case 64: 30 | return 6; 31 | case 128: 32 | return 7; 33 | case 256: 34 | return 8; 35 | case 512: 36 | return 9; 37 | case 1024: 38 | return 10; 39 | case 2048: 40 | return 11; 41 | case 4096: 42 | return 12; 43 | default: 44 | return 0; 45 | } 46 | } 47 | 48 | void gen_red_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len, uint32_t table_len){ 49 | uint32_t 
elem_type_size = input_type; 50 | uint32_t inter_type_size = output_type; 51 | uint32_t table_size = table_len; 52 | uint32_t num_tasklets = NR_TASKLETS; 53 | uint32_t pid = num_tasklets == 1 ? 0 : me(); 54 | 55 | uint64_t tuple = copy_block_size_fun(elem_type_size, inter_type_size, len); 56 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 57 | uint32_t copy_block_size = copy_block_size_[0]; 58 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 59 | 60 | 61 | __mram_ptr void* elements = inputs; 62 | // try malloc/free for performance 63 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 64 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 65 | 66 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 67 | __dma_aligned table_t* local_table = fsb_get(table_allocator); 68 | init_table(local_table, table_size, inter_type_size, init_func); 69 | 70 | 71 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 72 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 73 | 74 | uint32_t last_block = (len/copy_block_size)*copy_block_size; 75 | uint32_t key = 0; 76 | 77 | void* local_table_entries = local_table->table; 78 | uint32_t curr_entry; 79 | uint32_t shift_bits = get_shift_bits_for_type(inter_type_size); 80 | 81 | 82 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 83 | uint32_t total_len_in_bytes = len * elem_type_size; 84 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 85 | uint32_t stride = copy_block_size_in_bytes*num_tasklets; 86 | 87 | uint32_t divisible_len_in_bytes = ((len>>copy_block_size_shiftbits)< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "GenRedProcessing.h" 10 | #include "GenRedArgs.h" 11 | #include "../ProcessingHelper.h" 12 | 13 | 14 | __host gen_red_arguments_t GEN_RED_INPUT_ARGUMENTS; 15 | __dma_aligned void* aux; 16 | 17 | BARRIER_INIT(my_barrier, NR_TASKLETS); 18 | int main() { 19 | int pid = me(); 20 | if (pid == 0){ // Initialize once the cycle counter 21 | mem_reset(); // Reset the heap 22 | } 23 | barrier_wait(&my_barrier); 24 | 25 | //printf("\n"); 26 | 27 | uint32_t input_start_offset = GEN_RED_INPUT_ARGUMENTS.input_start_offset; 28 | uint32_t input_type_size = GEN_RED_INPUT_ARGUMENTS.input_type_size; 29 | uint32_t output_start_offset = GEN_RED_INPUT_ARGUMENTS.output_start_offset; 30 | uint32_t output_type_size = GEN_RED_INPUT_ARGUMENTS.output_type_size; 31 | uint32_t len = GEN_RED_INPUT_ARGUMENTS.len; 32 | uint32_t table_len = GEN_RED_INPUT_ARGUMENTS.table_len; 33 | 34 | start_func(&GEN_RED_INPUT_ARGUMENTS); 35 | gen_red_dpu(DPU_MRAM_HEAP_POINTER+input_start_offset, DPU_MRAM_HEAP_POINTER+output_start_offset, input_type_size, output_type_size, len, table_len); 36 | return 0; 37 | } -------------------------------------------------------------------------------- /lib/processing/map/Map.c: -------------------------------------------------------------------------------- 1 | #include "Map.h" 2 | 3 | void table_map(const char* src_name, const char* dest_name, uint32_t output_type, handle_t* binary_handle, simplepim_management_t* table_management, uint32_t info){ 4 | int i; 5 | struct dpu_set_t dpu; 6 | struct timeval start_time; 7 | struct timeval end_time; 8 | 9 | uint32_t outputs = table_management->free_space_start_pos; 10 | if(contains_table(dest_name, table_management)){ 11 | outputs = lookup_table(dest_name, 
table_management) -> start; 12 | } 13 | 14 | if(!contains_table(src_name, table_management)){ 15 | printf("source table "); 16 | printf(src_name); 17 | printf(" is not contains in current management unit\n"); 18 | return; 19 | } 20 | table_host_t* src_table = lookup_table(src_name, table_management); 21 | 22 | if(binary_handle->func_type == 0 && src_table->is_virtual_zipped == 0){ 23 | 24 | //timing 25 | gettimeofday(&start_time, NULL); 26 | 27 | struct dpu_set_t set = table_management->set; 28 | uint32_t num_dpus = table_management->num_dpus; 29 | uint32_t* lens = src_table->lens_each_dpu; 30 | uint32_t input_type = src_table->table_type_size; 31 | uint32_t inputs = src_table->start; 32 | 33 | map_arguments_t* input_args = table_management->map_args; 34 | // use handle for precompiled binaries 35 | const char* binary = binary_handle->bin_location; 36 | DPU_ASSERT(dpu_load(set, binary, NULL)); 37 | 38 | //parse arguments to map function call 39 | DPU_FOREACH(set, dpu, i) { 40 | input_args[i].input_start_offset = inputs; 41 | input_args[i].input_type_size = input_type; 42 | input_args[i].output_start_offset = outputs; 43 | input_args[i].output_type_size = output_type; 44 | input_args[i].len = lens[i]; 45 | input_args[i].info = info; 46 | input_args[i].is_virtually_zipped = 0; 47 | DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); 48 | } 49 | 50 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "MAP_INPUT_ARGUMENTS", 0, sizeof(map_arguments_t), DPU_XFER_DEFAULT)); 51 | 52 | gettimeofday(&end_time, NULL); 53 | double prepare_args_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 54 | (end_time.tv_usec - start_time.tv_usec); 55 | 56 | //call map function 57 | gettimeofday(&start_time, NULL); 58 | DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS)); 59 | gettimeofday(&end_time, NULL); 60 | 61 | double kernel_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 62 | (end_time.tv_usec - start_time.tv_usec); 63 | 64 | // table information to management unit 65 | // timing 66 | gettimeofday(&start_time, NULL); 67 | 68 | table_host_t* t = malloc(sizeof(table_host_t)); 69 | t->name = malloc(strlen(dest_name)+1); 70 | memcpy(t->name, dest_name, strlen(dest_name)+1); 71 | t->start = outputs; 72 | uint32_t max_end_dpu = max_len_dpu(num_dpus, src_table)*output_type+outputs; 73 | t->end = max_end_dpu+(8-max_end_dpu%8); 74 | t->len = src_table->len; 75 | t->table_type_size = output_type; 76 | t->lens_each_dpu = malloc(num_dpus*sizeof(int32_t)); 77 | t->is_virtual_zipped = 0; 78 | memcpy(t->lens_each_dpu, lens, num_dpus*sizeof(int32_t)); 79 | 80 | add_table(t, table_management); 81 | table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ? 
                                                 table_management->free_space_start_pos : t->end;

        gettimeofday(&end_time, NULL);
        double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                     (end_time.tv_usec - start_time.tv_usec);

        printf("--------------\n");
        printf("map function : %s", binary);
        printf("\nmap function kernel execution time : %f\n", kernel_time/1000);
        printf("function call and table management time : %f\n", (register_table_time+prepare_args_time)/1000);
        printf("--------------\n");
    }
    else if (binary_handle->func_type == 0 && src_table->is_virtual_zipped == 1){
        // timing
        gettimeofday(&start_time, NULL);

        struct dpu_set_t set = table_management->set;
        uint32_t num_dpus = table_management->num_dpus;
        uint32_t* lens = src_table->lens_each_dpu;
        uint32_t input_type = src_table->table_type_size;
        uint32_t inputs = src_table->start;

        map_arguments_t* input_args = table_management->map_args;
        // use handle for precompiled binaries
        const char* binary = binary_handle->bin_location;
        DPU_ASSERT(dpu_load(set, binary, NULL));

        // pass arguments to the map kernel; the source is only virtually zipped,
        // so the kernel reads the two underlying tables directly
        DPU_FOREACH(set, dpu, i) {
            input_args[i].input_start_offset = inputs;
            input_args[i].input_type_size = input_type;
            input_args[i].output_start_offset = outputs;
            input_args[i].output_type_size = output_type;
            input_args[i].len = lens[i];
            input_args[i].info = info;
            input_args[i].is_virtually_zipped = 1;
            input_args[i].start1 = src_table->start1;
            input_args[i].start2 = src_table->start2;
            input_args[i].type1 = src_table->type1;
            input_args[i].type2 = src_table->type2;
            DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
        }

        DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "MAP_INPUT_ARGUMENTS", 0, sizeof(map_arguments_t), DPU_XFER_DEFAULT));

        gettimeofday(&end_time, NULL);
        double prepare_args_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                   (end_time.tv_usec - start_time.tv_usec);

        // launch the map kernel
        gettimeofday(&start_time, NULL);
        DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
        gettimeofday(&end_time, NULL);

        double kernel_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                             (end_time.tv_usec - start_time.tv_usec);

        // register the output table with the management unit
        gettimeofday(&start_time, NULL);

        table_host_t* t = malloc(sizeof(table_host_t));
        t->name = malloc(strlen(dest_name)+1);
        memcpy(t->name, dest_name, strlen(dest_name)+1);
        t->start = outputs;
        uint32_t max_end_dpu = max_len_dpu(num_dpus, src_table)*output_type+outputs;
        t->end = max_end_dpu+(8-max_end_dpu%8);
        t->len = src_table->len;
        t->table_type_size = output_type;
        t->lens_each_dpu = malloc(num_dpus*sizeof(int32_t));
        t->is_virtual_zipped = 0; // the map output is a plain, materialised table
        memcpy(t->lens_each_dpu, lens, num_dpus*sizeof(int32_t));

        add_table(t, table_management);
        // keep the free-space watermark up to date, mirroring the non-zipped branch
        table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ?
                                                 table_management->free_space_start_pos : t->end;

        gettimeofday(&end_time, NULL);
        double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                     (end_time.tv_usec - start_time.tv_usec);

        printf("--------------\n");
        printf("map function : %s", binary);
        printf("\nmap function kernel execution time : %f\n", kernel_time/1000);
        printf("function call and table management time : %f\n", (register_table_time+prepare_args_time)/1000);
        printf("--------------\n");
    }
    else{
        printf("ERROR: compiled binary %s does not contain a map function\n", binary_handle->bin_location);
    }
}
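For orientation, a host program drives `table_map` roughly as sketched below. The helper names used here (`table_management_init`, `create_handle`) are assumptions standing in for the management and handle-creation APIs in lib/management and lib/processing; only `table_map` itself is shown above.

```c
// Hypothetical host-side driver for table_map; helper names are placeholders,
// not the framework's verbatim API.
#include "lib/processing/map/Map.h"

int main(){
    simplepim_management_t* mgmt = table_management_init(64); // assumed: allocate 64 DPUs
    // ... transfer an input array to PIM and register it as "t1" (API assumed) ...

    // handle for a precompiled map binary (func_type 0)
    handle_t* h = create_handle("va_funcs/map.h", 0);         // assumed signature

    // apply the map kernel: read "t1", write 4-byte outputs into "t2"
    table_map("t1", "t2", sizeof(uint32_t), h, mgmt, 0);
    return 0;
}
```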
--------------------------------------------------------------------------------
/lib/processing/map/Map.h:
--------------------------------------------------------------------------------
#ifndef MAP_H
#define MAP_H
#include "MapArgs.h"
#include "../ProcessingHelperHost.h"
#include "../../management/Management.h"
// includes restored (original targets lost in extraction)
#include <dpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

/*
    table_map implements the array map operator as in the paper.
    It parses the function handle and sets up the host for calling the PIM kernel (Map.c).
    Then it runs the array map PIM kernel (map_dpu.c and MapProcessing.h).
*/

void table_map(const char* src_name, const char* dest_name, uint32_t output_type, handle_t* binary_handle, simplepim_management_t* table_management, uint32_t info);
#endif

--------------------------------------------------------------------------------
/lib/processing/map/MapArgs.h:
--------------------------------------------------------------------------------
#ifndef MAPARGS_H
#define MAPARGS_H
#include <stdint.h>

typedef struct {
    uint32_t input_start_offset;
    uint32_t input_type_size;
    uint32_t output_start_offset;
    uint32_t output_type_size;
    uint32_t len;
    uint32_t info;

    // handle virtual zip
    uint32_t is_virtually_zipped;
    uint32_t start1;
    uint32_t start2;
    uint32_t type1;
    uint32_t type2;
} map_arguments_t;


#endif

--------------------------------------------------------------------------------
/lib/processing/map/MapProcessing.h:
--------------------------------------------------------------------------------
#ifndef MAPPROCESSING_H
#define MAPPROCESSING_H
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include "../ProcessingHelper.h"
#include __mapfunc_filename__

BARRIER_INIT(barrier_p, NR_TASKLETS);
void map_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len){
    uint32_t elem_type_size = input_type;
    uint32_t inter_type_size = output_type;
    uint32_t num_tasklets = NR_TASKLETS;
    uint32_t pid = me();
    uint64_t tuple = copy_block_size_fun(elem_type_size, inter_type_size, len);
    uint32_t* copy_block_size_ = (uint32_t*)&tuple;
    uint32_t copy_block_size = copy_block_size_[0];
    uint32_t copy_block_size_shiftbits = copy_block_size_[1];
    // try malloc/free for performance
    fsb_allocator_t elems_block_allocator = fsb_alloc(elem_type_size<<copy_block_size_shiftbits, 1);
    /* ... the remaining WRAM buffer setup is truncated in the source; the
       surviving fragments below follow the same blocked-streaming pattern as
       zip_dpu in ZipProcessing.c ... */
    uint32_t divisible_len = (len>>copy_block_size_shiftbits)<<copy_block_size_shiftbits;
    uint32_t unroll_block_size = (copy_block_size>>2)<<2;
    uint32_t unroll_block_rest = copy_block_size-unroll_block_size;

    uint32_t i_init = pid<<copy_block_size_shiftbits;
    /* ... the main map loop is truncated in the source ... */
}

void zip_map_dpu(__mram_ptr void* inputs1, __mram_ptr void* inputs2, __mram_ptr void* outputs, uint32_t input_type1, uint32_t input_type2, uint32_t output_type, uint32_t len){
    /* ... the prologue (WRAM buffers, copy_block_size, elem, pid) is truncated
       in the source ... */
    uint32_t input_type_1_div_4 = input_type1 >> 2;
    uint32_t input_type_2_div_4 = input_type2 >> 2;
    uint32_t input_type_1_rest_4 = input_type1 - (input_type_1_div_4<<2);
    uint32_t input_type_2_rest_4 = input_type2 - (input_type_2_div_4<<2);

    void* elem_plus_input1 = elem + input_type1;

    uint32_t unroll_block_size = (copy_block_size>>2)<<2;
    uint32_t unroll_block_rest = copy_block_size-unroll_block_size;

    if(input_type1 == 4 && input_type2 == 4){
        for(int i=pid*copy_block_size; /* ... loop bound truncated ... */; i++){
            /* ... 4-byte fast-path body truncated in the source ... */
        }
    }
    /* ... remaining copy paths and the function epilogue are truncated in the
       source ... */
}
#endif
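MapProcessing.h pulls the user's map function into the kernel at compile time via `#include __mapfunc_filename__`; in the benchmarks this is a header such as benchmarks/va/va_funcs/map.h. A minimal sketch of such a user function follows, assuming a one-element-in, one-element-out signature (the exact signature SimplePIM expects is an assumption here):

```c
// Hypothetical user-side map function of the kind __mapfunc_filename__ points at.
// The signature is an assumption: one input element in, one output element out.
#include <stdint.h>

// Vector-addition-style map over a zipped pair of int32 values: the input
// element holds the two source values back to back.
void map_func(void* input, void* output){
    int32_t* in = (int32_t*)input;
    ((int32_t*)output)[0] = in[0] + in[1];
}
```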
--------------------------------------------------------------------------------
/lib/processing/map/map_dpu.c:
--------------------------------------------------------------------------------
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include <perfcounter.h>

#include "MapProcessing.h"
#include "MapArgs.h"
#include "../ProcessingHelper.h"

__host map_arguments_t MAP_INPUT_ARGUMENTS;
__dma_aligned void* aux;

BARRIER_INIT(my_barrier, NR_TASKLETS);
int main() {
    int pid = me();
    if (pid == 0){
        mem_reset(); // Reset the WRAM heap once, before any tasklet allocates
    }
    barrier_wait(&my_barrier);

    uint32_t input_start_offset = MAP_INPUT_ARGUMENTS.input_start_offset;
    uint32_t input_type_size = MAP_INPUT_ARGUMENTS.input_type_size;
    uint32_t output_start_offset = MAP_INPUT_ARGUMENTS.output_start_offset;
    uint32_t output_type_size = MAP_INPUT_ARGUMENTS.output_type_size;
    uint32_t len = MAP_INPUT_ARGUMENTS.len;
    uint32_t is_zipped = MAP_INPUT_ARGUMENTS.is_virtually_zipped;

    // for a virtually zipped source table
    uint32_t start1 = MAP_INPUT_ARGUMENTS.start1;
    uint32_t start2 = MAP_INPUT_ARGUMENTS.start2;
    uint32_t type1 = MAP_INPUT_ARGUMENTS.type1;
    uint32_t type2 = MAP_INPUT_ARGUMENTS.type2;
    start_func(&MAP_INPUT_ARGUMENTS);
    if(is_zipped){
        zip_map_dpu(DPU_MRAM_HEAP_POINTER+start1, DPU_MRAM_HEAP_POINTER+start2, DPU_MRAM_HEAP_POINTER+output_start_offset, type1, type2, output_type_size, len);
    }
    else{
        map_dpu(DPU_MRAM_HEAP_POINTER+input_start_offset, DPU_MRAM_HEAP_POINTER+output_start_offset, input_type_size, output_type_size, len);
    }
    return 0;
}
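All three kernels share one idiom: `copy_block_size_fun` returns two `uint32_t` results packed into a single `uint64_t`, which the caller unpacks by reinterpreting the value's address. A standalone illustration of the idiom, with `pack_pair` as a hypothetical stand-in for `copy_block_size_fun`:

```c
// Illustration of the uint64_t pack/unpack idiom used around copy_block_size_fun.
// pack_pair is a hypothetical stand-in, not the framework function itself.
#include <stdint.h>
#include <stdio.h>

uint64_t pack_pair(uint32_t block_size, uint32_t shift_bits){
    uint64_t tuple = 0;
    uint32_t* slots = (uint32_t*)&tuple;  // same reinterpretation the kernels use
    slots[0] = block_size;   // e.g. elements per MRAM->WRAM transfer
    slots[1] = shift_bits;   // log2(block_size), so x*block_size == x<<shift_bits
    return tuple;
}

int main(){
    uint64_t tuple = pack_pair(256, 8);
    uint32_t* fields = (uint32_t*)&tuple;
    printf("block=%u shift=%u\n", fields[0], fields[1]); // block=256 shift=8
    return 0;
}
```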
--------------------------------------------------------------------------------
/lib/processing/zip/Zip.c:
--------------------------------------------------------------------------------
#include "Zip.h"

void table_zip(const char* src1_name, const char* src2_name, const char* dest_name, handle_t* binary_handle, simplepim_management_t* table_management){
    int i;
    struct dpu_set_t dpu;
    struct timeval start_time;
    struct timeval end_time;
    uint32_t outputs = table_management->free_space_start_pos;
    if(contains_table(dest_name, table_management)){
        outputs = lookup_table(dest_name, table_management)->start;
    }

    if(binary_handle->func_type == 2){
        // timing
        double kernel_time = 0;

        gettimeofday(&start_time, NULL);

        if(!contains_table(src1_name, table_management)){
            printf("source table %s is not contained in the current management unit\n", src1_name);
            return;
        }

        if(!contains_table(src2_name, table_management)){
            printf("source table %s is not contained in the current management unit\n", src2_name);
            return;
        }

        struct dpu_set_t set = table_management->set;
        uint32_t num_dpus = table_management->num_dpus;
        table_host_t* src1_table = lookup_table(src1_name, table_management);
        table_host_t* src2_table = lookup_table(src2_name, table_management);

        if(src1_table->len != src2_table->len){
            printf("zip lengths do not match\n");
            return;
        }

        zip_arguments_t* input_args = table_management->zip_args;

        // if a source table is itself only virtually zipped, materialise it first
        if(src1_table->is_virtual_zipped == 1){
            src1_table->is_virtual_zipped = 0;
            const char* binary = binary_handle->bin_location;
            DPU_ASSERT(dpu_load(set, binary, NULL));

            for(uint32_t i=0; i<num_dpus; i++){
                input_args[i].input_start_offset1 = src1_table->start1;
                input_args[i].input_start_offset2 = src1_table->start2;
                input_args[i].input_type_size1 = src1_table->type1;
                input_args[i].input_type_size2 = src1_table->type2;
                input_args[i].outputs = src1_table->start;
                input_args[i].len = src1_table->lens_each_dpu[i]; // per-DPU partition length for the zip kernel
            }

            DPU_FOREACH(set, dpu, i) {
                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
            }

            DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "ZIP_INPUT_ARGUMENTS", 0, sizeof(zip_arguments_t), DPU_XFER_DEFAULT));
            gettimeofday(&start_time, NULL);
            DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
            gettimeofday(&end_time, NULL);

            kernel_time += (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                           (end_time.tv_usec - start_time.tv_usec);
        }

        if(src2_table->is_virtual_zipped == 1){
            src2_table->is_virtual_zipped = 0;
            const char* binary = binary_handle->bin_location;
            DPU_ASSERT(dpu_load(set, binary, NULL));

            for(uint32_t i=0; i<num_dpus; i++){
                input_args[i].input_start_offset1 = src2_table->start1;
                input_args[i].input_start_offset2 = src2_table->start2;
                input_args[i].input_type_size1 = src2_table->type1;
                input_args[i].input_type_size2 = src2_table->type2;
                input_args[i].outputs = src2_table->start;
                input_args[i].len = src2_table->lens_each_dpu[i]; // per-DPU partition length for the zip kernel
            }

            DPU_FOREACH(set, dpu, i) {
                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
            }

            DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "ZIP_INPUT_ARGUMENTS", 0, sizeof(zip_arguments_t), DPU_XFER_DEFAULT));
            gettimeofday(&start_time, NULL);
            DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
            gettimeofday(&end_time, NULL);

            kernel_time += (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                           (end_time.tv_usec - start_time.tv_usec);
        }

        // virtually zip the two tables: record where each half lives instead of copying
        uint32_t* lens = src2_table->lens_each_dpu;
        uint32_t* lens_ = src1_table->lens_each_dpu;

        uint32_t input_type1 = src1_table->table_type_size;
        uint32_t start1 = src1_table->start;
        uint32_t end1 = src1_table->end;
        uint32_t input_type2 = src2_table->table_type_size;
        uint32_t start2 = src2_table->start;
        uint32_t end2 = src2_table->end;

        for(uint32_t i=0; i<num_dpus; i++){
            /* ... per-DPU check truncated in the source (presumably verifying
               lens[i] == lens_[i], i.e. matching partitions on every DPU) ... */
        }

        table_host_t* t = malloc(sizeof(table_host_t));
        t->name = malloc(strlen(dest_name)+1);
        memcpy(t->name, dest_name, strlen(dest_name)+1);
        uint32_t output_type = input_type1+input_type2;
        t->start = outputs;
        uint32_t max_end_dpu = max_len_dpu(num_dpus, src1_table)*output_type+outputs;
        t->end = max_end_dpu+(8-max_end_dpu%8);
        t->len = src1_table->len;
        t->table_type_size = output_type;
        t->lens_each_dpu = malloc(num_dpus*sizeof(int32_t));
        memcpy(t->lens_each_dpu, lens, num_dpus*sizeof(int32_t));

        t->is_virtual_zipped = 1;
        t->start1 = start1;
        t->end1 = end1;
        t->start2 = start2;
        t->end2 = end2;
        t->type1 = input_type1;
        t->type2 = input_type2;

        add_table(t, table_management);

        table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ?
                                                 table_management->free_space_start_pos : t->end;

        gettimeofday(&end_time, NULL);
        printf("\nzip function kernel execution time : %f\n", kernel_time/1000);
        double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                     (end_time.tv_usec - start_time.tv_usec);
        printf("--------------\n");
        printf("zip function call and table management time : %f\n", (register_table_time)/1000);
        printf("--------------\n");
    }
    else{
        printf("ERROR: compiled binary %s is not a zip function\n", binary_handle->bin_location);
    }
}
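Note that `table_zip` normally copies nothing: it records `start1`/`start2` and `type1`/`type2` and defers the interleaving to whichever kernel later consumes the zipped view. The address arithmetic behind that view is sketched below; `virtual_zip_view_t` and `zipped_elem_addrs` are illustrative names, not framework API.

```c
// Address arithmetic behind a virtually zipped table: element i of the zipped
// view is the concatenation of one type1-byte element from the first table and
// one type2-byte element from the second. Names here are illustrative only.
#include <stdint.h>

typedef struct {
    uint32_t start1, start2;   // MRAM heap offsets of the two source tables
    uint32_t type1, type2;     // element sizes of the two source tables
} virtual_zip_view_t;

// Compute where the two halves of zipped element i live in MRAM.
static inline void zipped_elem_addrs(const virtual_zip_view_t* v, uint32_t i,
                                     uint32_t* addr1, uint32_t* addr2){
    *addr1 = v->start1 + i * v->type1;
    *addr2 = v->start2 + i * v->type2;
    // a materialising zip kernel copies type1 bytes from *addr1 and type2 bytes
    // from *addr2 into offset outputs + i*(type1+type2)
}
```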
--------------------------------------------------------------------------------
/lib/processing/zip/Zip.h:
--------------------------------------------------------------------------------
#ifndef ZIP_H
#define ZIP_H
#include "ZipArgs.h"
#include "../ProcessingHelperHost.h"
#include "../../management/Management.h"
// includes restored (original targets lost in extraction)
#include <dpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

/*
    table_zip implements the array zip operator as in the paper.
    It sets up the host for calling the PIM kernel (Zip.c).
    Then it runs the array zip PIM kernel (zip_dpu.c and ZipProcessing.c/.h).
*/

void table_zip(const char* src1_name, const char* src2_name, const char* dest_name, handle_t* binary_handle, simplepim_management_t* table_management);
#endif

--------------------------------------------------------------------------------
/lib/processing/zip/ZipArgs.h:
--------------------------------------------------------------------------------
#ifndef ZIPARGS_H
#define ZIPARGS_H
#include <stdint.h>

typedef struct {
    uint32_t input_start_offset1;
    uint32_t input_start_offset2;
    uint32_t input_type_size1;
    uint32_t input_type_size2;
    uint32_t outputs;
    uint32_t len;
} zip_arguments_t;


#endif

--------------------------------------------------------------------------------
/lib/processing/zip/ZipProcessing.c:
--------------------------------------------------------------------------------
#include "ZipProcessing.h"
BARRIER_INIT(barrier_p, NR_TASKLETS);

void zip_dpu(__mram_ptr void* table_entries_1, __mram_ptr void* table_entries_2, __mram_ptr void* table_entries_res, uint32_t input_type_1, uint32_t input_type_2, uint32_t len){
    uint32_t num_tasklets = NR_TASKLETS;
    uint32_t pid = me();
    uint64_t tuple = copy_block_size_fun(input_type_1, input_type_2, len);
    uint32_t* copy_block_size_ = (uint32_t*)&tuple;
    uint32_t copy_block_size = copy_block_size_[0];
    uint32_t copy_block_size_shiftbits = copy_block_size_[1];

    uint32_t input_block_size_1 = input_type_1<<copy_block_size_shiftbits;
    /* ... the WRAM buffer allocation and per-tasklet bounds (including
       pid_times_block_size and the (len>>copy_block_size_shiftbits)<<copy_block_size_shiftbits
       divisible-length computation) are truncated in the source ... */

    // fast-path selection: when both element sizes are multiples of 4 bytes,
    // the interleaving copy can run word by word instead of byte by byte
    uint32_t input_type_1_div_4 = input_type_1 >> 2;
    uint32_t input_type_2_div_4 = input_type_2 >> 2;
    uint32_t input_type_1_rest_4 = input_type_1 - (input_type_1_div_4<<2);
    uint32_t input_type_2_rest_4 = input_type_2 - (input_type_2_div_4<<2);

    uint32_t types_div_4 = (input_type_1_rest_4==0)&&(input_type_2_rest_4==0);
    uint32_t types_are_ints = (input_type_1_div_4==1)&&(input_type_2_div_4==1);

    if(types_div_4){
        if(types_are_ints){
            for(int i=pid_times_block_size; /* ... loop bound truncated ... */; i++){
                /* ... two-int interleave body truncated in the source ... */
            }
        }
        /* ... remaining copy paths and the function epilogue are truncated in
           the source ... */
    }
}
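The surviving fragments of `zip_dpu` show it selecting a copy path by element size. A self-contained sketch of the interleaving copy inside one WRAM block, under the assumption (suggested by `types_are_ints` above) that the 4-byte/4-byte case is special-cased into word copies; the buffer names are illustrative only:

```c
// Sketch of the per-block interleaving copy behind zip_dpu; names are
// illustrative, and the 4/4-byte fast path is an assumption from the fragments.
#include <stdint.h>
#include <string.h>

void interleave_block(const void* in1, const void* in2, void* out,
                      uint32_t type1, uint32_t type2, uint32_t n){
    if(type1 == 4 && type2 == 4){
        // fast path: one word from each input per element
        const uint32_t* a = (const uint32_t*)in1;
        const uint32_t* b = (const uint32_t*)in2;
        uint32_t* o = (uint32_t*)out;
        for(uint32_t i = 0; i < n; i++){
            o[2*i]   = a[i];
            o[2*i+1] = b[i];
        }
    } else {
        // generic path: byte-wise concatenation of the two elements
        const char* a = (const char*)in1;
        const char* b = (const char*)in2;
        char* o = (char*)out;
        for(uint32_t i = 0; i < n; i++){
            memcpy(o, a + i*type1, type1); o += type1;
            memcpy(o, b + i*type2, type2); o += type2;
        }
    }
}
```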
--------------------------------------------------------------------------------
/lib/processing/zip/ZipProcessing.h:
--------------------------------------------------------------------------------
#ifndef ZIPPROCESSING_H
#define ZIPPROCESSING_H
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include "../ProcessingHelper.h"

void zip_dpu(__mram_ptr void* table_entries_1, __mram_ptr void* table_entries_2, __mram_ptr void* table_entries_res, uint32_t input_type_1, uint32_t input_type_2, uint32_t len);
#endif

--------------------------------------------------------------------------------
/lib/processing/zip/zip_dpu.c:
--------------------------------------------------------------------------------
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include <perfcounter.h>

#include "ZipProcessing.h"
#include "ZipArgs.h"
#include "../ProcessingHelper.h"

__host zip_arguments_t ZIP_INPUT_ARGUMENTS;
__dma_aligned void* aux;

BARRIER_INIT(my_barrier, NR_TASKLETS);
int main() {
    int pid = me();
    if (pid == 0){
        mem_reset(); // Reset the WRAM heap once, before any tasklet allocates
    }
    barrier_wait(&my_barrier);

    uint32_t input_start_offset1 = ZIP_INPUT_ARGUMENTS.input_start_offset1;
    uint32_t input_start_offset2 = ZIP_INPUT_ARGUMENTS.input_start_offset2;
    uint32_t input_type_size1 = ZIP_INPUT_ARGUMENTS.input_type_size1;
    uint32_t input_type_size2 = ZIP_INPUT_ARGUMENTS.input_type_size2;
    uint32_t len = ZIP_INPUT_ARGUMENTS.len;
    uint32_t outputs = ZIP_INPUT_ARGUMENTS.outputs;

    zip_dpu(DPU_MRAM_HEAP_POINTER+input_start_offset1, DPU_MRAM_HEAP_POINTER+input_start_offset2, DPU_MRAM_HEAP_POINTER+outputs, input_type_size1, input_type_size2, len);
    return 0;
}

--------------------------------------------------------------------------------
/lib/timer.h:
--------------------------------------------------------------------------------
#ifndef TIMER_H
#define TIMER_H
#include <stdio.h>
#include <sys/time.h>

typedef struct Timer{
    struct timeval startTime[6];
    struct timeval stopTime[6];
    double time[6];
}Timer;

// start timing slot i; on the first repetition (rep == 0) the accumulator is cleared
void start(Timer *timer, int i, int rep) {
    if(rep == 0) {
        timer->time[i] = 0.0;
    }
    gettimeofday(&timer->startTime[i], NULL);
}

// stop timing slot i and add the elapsed microseconds to its running total
void stop(Timer *timer, int i) {
    gettimeofday(&timer->stopTime[i], NULL);
    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
}

// print the average time of slot i over REP repetitions, in milliseconds
void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
#endif
--------------------------------------------------------------------------------
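The `Timer` utility above keeps six independent slots, each accumulating microseconds across repetitions. A minimal, self-contained usage sketch, grounded directly in the functions defined in timer.h:

```c
// Minimal usage of lib/timer.h: accumulate one timing slot across repetitions,
// then print the per-repetition average in milliseconds.
#include "lib/timer.h"

int main(){
    Timer t;
    const int REP = 10;
    for(int rep = 0; rep < REP; rep++){
        start(&t, 0, rep);   // rep == 0 resets slot 0's accumulator
        // ... code under measurement ...
        stop(&t, 0);         // adds this repetition's elapsed time to slot 0
    }
    print(&t, 0, REP);       // average milliseconds per repetition
    return 0;
}
```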