├── .gitignore
├── LICENSE
├── README.md
├── benchmarks
│   ├── hist
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── hist_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   └── host.c
│   ├── kmeans
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   ├── kmeans.py
│   │   ├── kmeans_dpu.py
│   │   ├── kmeans_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   └── plot.py
│   ├── linear_reg
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   ├── lin_reg_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   ├── linear_reg.py
│   │   └── plot.py
│   ├── log_reg
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   ├── log_reg_funcs
│   │   │   ├── init_combine_func.h
│   │   │   └── map_to_val_func.h
│   │   ├── logistic_reg.py
│   │   └── plot.py
│   ├── red
│   │   ├── Makefile
│   │   ├── Param.h
│   │   ├── bin
│   │   │   └── host
│   │   ├── host.c
│   │   └── red_funcs
│   │       ├── init_combine_func.h
│   │       └── map_to_val_func.h
│   └── va
│       ├── Makefile
│       ├── Param.h
│       ├── bin
│       │   ├── dpu_init_binary
│       │   ├── dpu_map_va_funcs
│       │   ├── dpu_zip
│       │   └── host
│       ├── host.c
│       └── va_funcs
│           └── map.h
└── lib
    ├── Common.c
    ├── Common.h
    ├── Parallel.c
    ├── Parallel.h
    ├── Structs.h
    ├── StructsPIM.h
    ├── Table.c
    ├── Table.h
    ├── TableHost.c
    ├── TableHost.h
    ├── TableShared.c
    ├── TableShared.h
    ├── UpmemCustom.c
    ├── UpmemCustom.h
    ├── communication
    │   ├── CommHelper.c
    │   ├── CommHelper.h
    │   ├── CommOps.c
    │   └── CommOps.h
    ├── management
    │   ├── Management.c
    │   ├── Management.h
    │   ├── SmallTableInit.c
    │   ├── SmallTableInit.h
    │   └── SmallTableInit_dpu.c
    ├── processing
    │   ├── ProcessingHelper.c
    │   ├── ProcessingHelper.h
    │   ├── ProcessingHelperHost.c
    │   ├── ProcessingHelperHost.h
    │   ├── gen_red
    │   │   ├── GenRed.c
    │   │   ├── GenRed.h
    │   │   ├── GenRedArgs.h
    │   │   ├── GenRedProcessing.h
    │   │   └── gen_red_dpu.c
    │   ├── map
    │   │   ├── Map.c
    │   │   ├── Map.h
    │   │   ├── MapArgs.h
    │   │   ├── MapProcessing.h
    │   │   └── map_dpu.c
    │   └── zip
    │       ├── Zip.c
    │       ├── Zip.h
    │       ├── ZipArgs.h
    │       ├── ZipProcessing.c
    │       ├── ZipProcessing.h
    │       └── zip_dpu.c
    └── timer.h
/.gitignore: --------------------------------------------------------------------------------
1 | .DS_Store
2 | *.csv
3 | *.so
4 | *.json
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 SAFARI Research Group at ETH Zürich
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # SimplePIM: A Software Framework for Productive and Efficient In-Memory Processing
2 | This project implements SimplePIM, a software framework for easy and efficient programming of in-memory hardware. The code targets UPMEM, a real, commercially available PIM architecture that combines traditional DRAM memory with general-purpose in-order cores inside the same chip. SimplePIM processes arrays of arbitrary elements on a PIM device by calling iterator functions from the host, and provides primitives for communication among PIM cores and between PIM and the host system.
3 |
4 | We implement six applications with SimplePIM on UPMEM:
5 | - Vector Addition
6 | - Reduction
7 | - K-Means Clustering
8 | - Histogram
9 | - Linear Regression
10 | - Logistic Regression
11 |
12 | Previous manual UPMEM implementations of the same applications can be found in [PrIM Benchmark](https://github.com/CMU-SAFARI/prim-benchmarks), [dpu_kmeans](https://github.com/upmem/dpu_kmeans) and [pim-ml](https://github.com/CMU-SAFARI/pim-ml). These previous implementations can serve as baselines for measuring SimplePIM's performance as well as its productivity improvements.
13 |
14 | ## Citation
15 | Please cite the following paper if you find this repository useful.
16 | Jinfan Chen, Juan Gómez-Luna, Izzat El Hajj, Yuxin Guo and Onur Mutlu, "[SimplePIM: A Software Framework for Productive and Efficient In-Memory Processing](https://arxiv.org/abs/2310.01893)", International Conference on Parallel Architectures and Compilation Techniques (PACT), 2023.
17 |
18 | Bibtex entry for citation:
19 | ```
20 | @inproceedings{Chen2023SimplePIMPACT,
21 |   title={SimplePIM: A Software Framework for Productive and Efficient In-Memory Processing},
22 |   author={Jinfan Chen and Juan G{\'o}mez-Luna and Izzat El Hajj and Yuxin Guo and Onur Mutlu},
23 |   year={2023},
24 |   booktitle = {PACT}
25 | }
26 | ```
27 |
28 | ## Installation
29 |
30 | ### Prerequisites
31 | Running SimplePIM requires installing the [UPMEM SDK](https://sdk.upmem.com). The benchmarks are designed to run on a server with real UPMEM modules, but they can also be run with the functional simulator included in the UPMEM SDK.
32 |
33 | ### Getting Started
34 | Clone the repository:
35 | ```
36 | $ git clone https://github.com/CMU-SAFARI/SimplePIM.git
37 | $ cd SimplePIM
38 | ```
39 |
40 | ## Repository Structure
41 | ```
42 | .
43 | +-- LICENSE
44 | +-- README.md
45 | +-- .gitignore
46 | +-- benchmarks/
47 | |   +-- hist/
48 | |   +-- kmeans/
49 | |   +-- linear_reg/
50 | |   +-- log_reg/
51 | |   +-- red/
52 | |   +-- va/
53 | +-- lib/
54 | |   +-- communication/
55 | |       +-- CommOps.c
56 | |       +-- CommOps.h
57 | |   +-- management/
58 | |       +-- Management.c
59 | |       +-- Management.h
60 | |   +-- processing/
61 | |       +-- gen_red/
62 | |       +-- map/
63 | |       +-- zip/
64 | ```
65 |
66 | ## APIs
67 | SimplePIM provides three APIs to users. The management interface, under SimplePIM/lib/management/, sets up the UPMEM hardware and records and manages information about the PIM-resident arrays. The communication interface, under SimplePIM/lib/communication/, contains the PIM-to-PIM and host-PIM communication operators (gather, scatter, broadcast, allreduce, and allgather). Finally, the processing interface, under SimplePIM/lib/processing/, contains the UPMEM implementations of array map, array zip, and array reduction.
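As a concrete illustration of how the three interfaces compose, the following condensed sketch follows benchmarks/va/host.c (listed in full further below); timing, result verification, and cleanup are omitted, and `T`, `dpu_number` and `nr_elements` come from the benchmark's Param.h:
```
#include "../../lib/processing/map/Map.h"
#include "../../lib/processing/zip/Zip.h"
#include "../../lib/communication/CommOps.h"
#include "../../lib/management/Management.h"
#include "Param.h" // defines T, dpu_number, nr_elements

void run(){
    // management interface: allocate the DPUs and the bookkeeping structures
    simplepim_management_t* table_management = table_management_init(dpu_number);
    T* A = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management);
    T* B = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management);

    // communication interface: scatter both input arrays across the DPUs
    simplepim_scatter("t1", A, nr_elements, sizeof(T), table_management);
    simplepim_scatter("t2", B, nr_elements, sizeof(T), table_management);

    // processing interface: zip the two PIM-resident arrays, then map the
    // user-defined add function (compiled from the va_funcs directory) over them
    handle_t* add_handle = create_handle("va_funcs", MAP);
    handle_t* zip_handle = create_handle("", ZIP);
    table_zip("t1", "t2", "t3", zip_handle, table_management);
    table_map("t3", "t4", sizeof(T), add_handle, table_management, 0);

    // communication interface: gather the element-wise sums back to the host
    T* res = simplepim_gather("t4", table_management);
    (void)res;
}
```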
Many workloads, such as histogram, k-means, and vector addition, can be expressed as a combination of these communication and processing operators.
68 |
69 | SimplePIM/lib/ also contains helper files that ease framework development.
70 |
71 | ## Running SimplePIM
72 | Each benchmark folder includes a Makefile for building and running the experiments.
73 | To run vector addition, reduction, and histogram, simply go to the corresponding benchmark folder and run make. For example, for vector addition:
74 | ```
75 | $ cd benchmarks/va
76 | $ make
77 | $ ./bin/host
78 | ```
79 | One can observe that SimplePIM produces exactly the same result as the CPU code. The parameters (number of elements, number of DPUs used) can be changed in the Param.h file.
80 |
81 | To run Linear Regression, Logistic Regression, and K-Means, one first needs to generate the input data with a Python script located in each benchmark folder. For example, to run linear regression, first run
82 | ```
83 | $ cd benchmarks/linear_reg
84 | $ python linear_reg.py
85 | ```
86 | Then build and run the actual SimplePIM code as before with
87 | ```
88 | $ make
89 | $ ./bin/host
90 | ```
91 |
92 | ## Getting Help
93 | If you have any suggestions for improvement, please contact georgcjf at gmail dot com. If you find any bugs or have further questions or requests, please post an issue on the [issue page](https://github.com/CMU-SAFARI/SimplePIM/issues).
94 |
95 | ## Acknowledgement
96 | We acknowledge support from the SAFARI Research Group’s industrial partners, especially Google, Huawei, Intel, Microsoft, VMware, and the Semiconductor Research Corporation. This research was partially supported by the ETH Future Computing Laboratory and the European Union’s Horizon programme for research and innovation under grant agreement No. 101047160, project BioPIM (Processing-in-memory architectures and programming libraries for bioinformatics algorithms). This research was also partially supported by ACCESS – AI Chip Center for Emerging Smart Systems, sponsored by InnoHK funding, Hong Kong SAR.
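The per-file listing of the benchmarks and the library follows. As a quick orientation before the full sources: the iterator functions that parameterize the processing operators are plain C functions, placed in a per-benchmark functions directory and referenced by name in create_handle (T is the element type defined in each benchmark's Param.h). For example, the map function for vector addition and the combine function for reduction, reproduced from benchmarks/va/va_funcs/map.h and benchmarks/red/red_funcs/init_combine_func.h below, are simply:
```
// vector addition: each zipped input element holds two T values; emit their sum
void map_func(void* input, void* res){
    *(T*)res = ((T*)input)[0] + ((T*)input)[1];
}

// reduction: fold a source value into the destination table entry
void combine_func(void* dest, void* src){
    *(uint32_t*)dest += *(uint32_t*)src;
}
```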
97 | -------------------------------------------------------------------------------- /benchmarks/hist/Makefile: -------------------------------------------------------------------------------- 1 | va: host.c 2 | @mkdir -p bin 3 | gcc --std=c99 -ldl -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c ../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/gen_red/GenRed.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/hist/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | #include 4 | uint32_t print_info = 0; 5 | typedef uint32_t T; 6 | const uint32_t dpu_number = 3; //2432 7 | 8 | #define DEPTH 12 // 2^12 = 4096 9 | #define bins 256 10 | 11 | uint64_t nr_elements = dpu_number*128; //64*1536*1024 12 | #endif 13 | -------------------------------------------------------------------------------- /benchmarks/hist/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/hist/bin/host -------------------------------------------------------------------------------- /benchmarks/hist/hist_funcs/init_combine_func.h: -------------------------------------------------------------------------------- 1 | #ifndef INIT_COMBINE_FUNC_H 2 | #define INIT_COMBINE_FUNC_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | 17 | void start_func(gen_red_arguments_t* args){} 18 | 19 | void map_to_val_func(void* input, void* output, uint32_t* key){ 20 | uint32_t d = *((uint32_t*)input); 21 | *(uint32_t*)output = 1; 22 | *key = d*bins >> 12; 23 | } 24 | 25 | #endif -------------------------------------------------------------------------------- /benchmarks/hist/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../lib/processing/gen_red/GenRed.h" 8 | #include "../../lib/processing/ProcessingHelperHost.h" 9 | #include "../../lib/timer.h" 10 | #include "Param.h" 11 | 12 | 13 | 14 | void init_data(T* A){ 15 | for(unsigned long i=0; i> DEPTH] += 1; 30 | } 31 | } 32 | 33 | void printf_hist(uint32_t* histo){ 34 | printf("the bins :\n"); 35 | for(int i=0; i 5 | #include 6 | uint32_t print_info = 0; 7 | typedef int32_t T; 8 | const uint32_t dpu_number = 5; // 2432 9 | const uint32_t k = 10; 10 | const uint32_t dim = 10; 11 | const uint64_t num_elements = 1000*dpu_number; 12 | const uint32_t iter = 1; 13 | 14 | #endif -------------------------------------------------------------------------------- /benchmarks/kmeans/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/kmeans/bin/host -------------------------------------------------------------------------------- /benchmarks/kmeans/host.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "../../lib/processing/gen_red/GenRed.h" 8 | #include "../../lib/processing/ProcessingHelperHost.h" 9 | #include "../../lib/communication/CommOps.h" 10 | #include "../../lib/management/Management.h" 11 | #include "../../lib/timer.h" 12 | #include "Param.h" 13 | 14 | 15 | 16 | FILE* fp; 17 | 18 | void add(void* p1, void* p2){ 19 | uint32_t* times1 = (uint32_t*)p1; 20 | uint32_t* times2 = (uint32_t*)p2; 21 | *times1 += *times2; 22 | T* ptr1 = (T*)(p1+sizeof(uint32_t)); 23 | T* ptr2 = (T*)(p2+sizeof(uint32_t)); 24 | 25 | for(int i=0; iend; 139 | // main loop 140 | for(int m=0; m 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | __dma_aligned void* centroids_data; 17 | 18 | 19 | BARRIER_INIT(barrier_maptoval, NR_TASKLETS); 20 | void start_func(gen_red_arguments_t* args){ 21 | uint32_t total_len = args->table_len * args->output_type_size; 22 | uint32_t aligned_weights_size = total_len + 8-(total_len%8); 23 | if(me()==0){ 24 | // initialise weights 25 | fsb_allocator_t weights_allocator = fsb_alloc(aligned_weights_size, 1); 26 | centroids_data = (void*)fsb_get(weights_allocator); 27 | mram_read(DPU_MRAM_HEAP_POINTER+args->info, centroids_data, aligned_weights_size); 28 | } 29 | barrier_wait(&barrier_maptoval); 30 | } 31 | 32 | void map_to_val_func(void* input_point, void* intermediate_input, uint32_t* centroid){ 33 | // the data is preserved and later added to corresponding centroid 34 | int32_t* times = (int32_t*)intermediate_input; 35 | *times = 1; 36 | intermediate_input+=sizeof(uint32_t); 37 | 38 | T* intermediate_ptr = (T*)intermediate_input; 39 | T* input_point_ptr = (T*)input_point; 40 | T* centroids_data_ptr = (T*)centroids_data; 41 | 42 | for(int i=0; i 5 | #include 6 | uint32_t print_info = 0; 7 | typedef int T; 8 | const uint32_t dpu_number = 5; // 2432 9 | const uint32_t dim = 10; 10 | const uint64_t num_elements = 1000*dpu_number;//10000*dpu_number; 11 | const uint32_t iter = 1; 12 | const float lr = 1e-4; 13 | const uint32_t shift_amount = 5;//10; 14 | const uint32_t prevent_overflow_shift_amount = 8;//15; 15 | #endif -------------------------------------------------------------------------------- /benchmarks/linear_reg/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/linear_reg/bin/host -------------------------------------------------------------------------------- /benchmarks/linear_reg/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../../lib/processing/gen_red/GenRed.h" 9 | #include "../../lib/processing/ProcessingHelperHost.h" 10 | #include "../../lib/communication/CommOps.h" 11 | #include "../../lib/management/Management.h" 12 | #include "../../lib/timer.h" 13 | #include "Param.h" 14 | 15 | FILE* fp; 16 | 17 | 18 | void read_csv_to_arr(FILE* fp, T* arr, int32_t len, int32_t d){ 19 | if (fp == NULL) { 20 | fprintf(stderr, "Error 
reading file\n"); 21 | return; 22 | } 23 | 24 | float tmp; 25 | for (size_t i = 0; i < len; i++){ 26 | for(size_t j = 0; j < d-1; j++){ 27 | fscanf(fp, "%f,", &tmp); 28 | arr[i*d+j] = (T)tmp; 29 | } 30 | fscanf(fp, "%f\n", &tmp); 31 | arr[i*d+d-1] = (T)tmp; 32 | } 33 | 34 | fclose(fp); 35 | } 36 | 37 | void write_time_to_csv(double* arr, int32_t len){ 38 | if (fp == NULL) { 39 | fprintf(stderr, "Error reading file\n"); 40 | return; 41 | } 42 | 43 | 44 | for (size_t i = 0; i < len; i++){ 45 | fprintf(fp,"%f\n", arr[i]/1000.0); 46 | } 47 | 48 | fclose(fp); 49 | } 50 | 51 | void compute_gradients(const T*arr){ 52 | 53 | // [X|Y] -> [X], [Y] 54 | T* X = malloc(num_elements*dim*sizeof(T)); 55 | T* Y = malloc(num_elements*sizeof(T)); 56 | for(uint32_t i=0; i> prevent_overflow_shift_amount; 87 | } 88 | } 89 | } 90 | 91 | printf("\nthe gradients on host: \n"); 92 | for(int i=0; iend; 171 | simplepim_broadcast("t2", weights, 1, dim*sizeof(T), table_management); 172 | uint32_t weights_offset = lookup_table("t2", table_management)->end; 173 | 174 | handle_t* va_handle = create_handle("lin_reg_funcs", REDUCE); 175 | 176 | for(int l=0; l 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | 17 | __dma_aligned T* weights_data; 18 | BARRIER_INIT(barrier_maptoval, NR_TASKLETS); 19 | 20 | void start_func(gen_red_arguments_t* args){ 21 | uint32_t total_len = args->table_len * args->output_type_size; 22 | uint32_t aligned_weights_size = total_len + 8-(total_len%8); 23 | if(me()==0){ 24 | // initialise weights 25 | fsb_allocator_t weights_allocator = fsb_alloc(aligned_weights_size, 1); 26 | weights_data = (void*)fsb_get(weights_allocator); 27 | mram_read(DPU_MRAM_HEAP_POINTER+args->info, weights_data, aligned_weights_size); 28 | } 29 | barrier_wait(&barrier_maptoval); 30 | } 31 | 32 | 33 | 34 | void map_to_val_func(void* input, void* grads, uint32_t* dummy){ 35 | // the data is preserved and later added to corresponding weights 36 | int64_t* grads_ptr = (int64_t*)grads; 37 | T* input_ptr = (T*)input; 38 | T* weights_data_ptr = (T*)weights_data; 39 | 40 | // calculate gradients w.r.t. 
linear weights 41 | int64_t dot_prod = 0; 42 | for(int i=0; i>prevent_overflow_shift_amount; 50 | //printf("%f ", grads_ptr[i]); 51 | grads_ptr[i] = input_ptr[i] * e >> prevent_overflow_shift_amount; 52 | } 53 | //printf("\n"); 54 | 55 | // put weight gradients to the 0th entry 56 | *dummy = 0; 57 | 58 | } 59 | 60 | #endif -------------------------------------------------------------------------------- /benchmarks/linear_reg/linear_reg.py: -------------------------------------------------------------------------------- 1 | num_threads=32 2 | import os 3 | from joblib import parallel_backend 4 | os.environ["OMP_NUM_THREADS"] = str(num_threads) 5 | os.environ["OPENBLAS_NUM_THREADS"] = str(num_threads) 6 | os.environ["MKL_NUM_THREADS"] = str(num_threads) 7 | os.environ["BLIS_NUM_THREADS"] = str(num_threads) 8 | 9 | import time 10 | import random 11 | import numpy as np 12 | import pandas as pd 13 | import torch 14 | from torch import float32 15 | from torch.autograd import Variable 16 | from torch.nn.functional import linear 17 | random.seed(10) 18 | np.set_printoptions(precision=4) 19 | torch.set_printoptions(precision=4) 20 | torch.set_default_dtype(float32) 21 | torch.set_num_threads(num_threads) 22 | 23 | class linearRegression(torch.nn.Module): 24 | def __init__(self, inputSize, init_weight): 25 | super(linearRegression, self).__init__() 26 | self.inputSize = inputSize 27 | self.weights = init_weight 28 | self.criterion = torch.nn.MSELoss(reduction='mean') 29 | 30 | def forward(self, x, y): 31 | out = torch.squeeze(linear(x, self.weights)) 32 | loss = self.criterion(out, y) 33 | return loss 34 | 35 | def main(): 36 | num_dpus = 5 37 | dim, num_elements, iter, lr = 10, 1000*num_dpus, 1, 1e-4 38 | 39 | df = pd.DataFrame([dim, num_elements, iter, lr]) 40 | init_vector = np.zeros((dim), dtype=np.float32) 41 | input = np.zeros((num_elements, dim+1), dtype=np.float32) 42 | 43 | groud_truth = np.zeros((dim), dtype=np.float32) 44 | for i in range(dim): 45 | groud_truth[i] = random.randint(-2, 2) 46 | 47 | for i in range(num_elements): 48 | for j in range(dim): 49 | r1, r2 = random.uniform(0, 1), random.uniform(0, 1)/dim 50 | input[i][j] = (int)((i-num_elements/2)*r1 + j*r2)%10 if j%2 == 0 else (int)(-1*((i-num_elements/2)*r1 + j*r2))%10 51 | input[i][dim] = groud_truth.dot(input[i][:-1]) 52 | 53 | 54 | 55 | #np.savetxt("data/args.csv", np.array([dim, num_elements, iter, lr]), delimiter=",", fmt='%s') 56 | np.savetxt("data/input.csv", input, delimiter=",", fmt='%f') 57 | 58 | x_train, y_train = (input.transpose(1, 0)[0:-1]).transpose(1, 0), (input.transpose(1, 0)[-1]) 59 | 60 | if torch.cuda.is_available(): 61 | inputs = Variable(torch.from_numpy(x_train).cuda(), requires_grad=True) 62 | labels = Variable(torch.from_numpy(y_train).cuda(), requires_grad=True) 63 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector).cuda(), requires_grad=True)) 64 | else: 65 | inputs = Variable(torch.from_numpy(x_train), requires_grad=True) 66 | labels = Variable(torch.from_numpy(y_train), requires_grad=True) 67 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector), requires_grad=True)) 68 | 69 | 70 | model = linearRegression(dim, init_weights) 71 | optimizer = torch.optim.SGD(model.parameters(), lr=lr) 72 | 73 | start = time.time() 74 | for epoch in range(iter): 75 | # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients 76 | optimizer.zero_grad() 77 | 78 | # get output from the model, given the 
inputs 79 | loss = model(inputs, labels) 80 | loss.backward() 81 | 82 | # update parameters 83 | optimizer.step() 84 | end = time.time() 85 | 86 | t = (end-start)*1000 87 | print("the time consumed is "+str(t)+"ms") 88 | print("linear model weights: ") 89 | print(model.weights.detach().numpy()) 90 | 91 | print("groud truth: "+str(groud_truth)) 92 | 93 | ''' 94 | print("$$$$$") 95 | print((x_train[0].dot(init_vector)-y_train[0])*x_train[0]) 96 | print((x_train[1].dot(init_vector)-y_train[1])*x_train[1]) 97 | print((x_train[0].dot(init_vector)-y_train[0])*x_train[0] + (x_train[1].dot(init_vector)*init_vector-y_train[1])*x_train[1]) 98 | print((x_train@init_vector-y_train)@x_train) 99 | ''' 100 | if not os.path.exists("results/"): 101 | os.makedirs("results/") 102 | 103 | path = 'results/cpu_'+str(dim)+"_"+str(num_elements)+".csv" 104 | np.savetxt(path, np.array([t])) 105 | 106 | if __name__ == "__main__": 107 | if not os.path.exists("data/"): 108 | os.mkdir("data/") 109 | with parallel_backend('threading', n_jobs=num_threads): 110 | main() 111 | 112 | -------------------------------------------------------------------------------- /benchmarks/linear_reg/plot.py: -------------------------------------------------------------------------------- 1 | from cProfile import label 2 | import matplotlib.pyplot as plt 3 | import json 4 | import numpy as np 5 | import scipy.stats 6 | from math import log2, exp 7 | 8 | colors=['#23ef68','#32efff','#2eaf9f','#22222f','#eeeff1','#eee112','#00ef00','#aa0000','#0000aa','#000999','#2e3f56','#7eef1f','#eeef11'] 9 | 10 | font = {'weight' : 'bold', 11 | 'size' : 16} 12 | plt.rcParams["figure.figsize"] = (16, 7) 13 | plt.rc('font', **font) 14 | 15 | 16 | def load_data(data, cpu_files, DPU_files, DPU_MASTER_files): 17 | for i in range(len(cpu_files)): 18 | data["CPU"][i] = np.loadtxt(cpu_files[i]) /1000 19 | 20 | for i in range(len(DPU_files)): 21 | tmp = np.loadtxt(DPU_files[i])/1000 22 | data["DPU_initial_transfer"][i] = tmp[0] 23 | data["DPU_Kernel"][i] = tmp[1] 24 | data["DPU_D2C"][i] = tmp[2] 25 | data["DPU_C2D"][i] = tmp[4] 26 | data["DPU"][i] = tmp[5] 27 | 28 | for i in range(len(DPU_MASTER_files)): 29 | tmp = np.loadtxt(DPU_MASTER_files[i])/1000 30 | data["DPU_MASTER_initial_transfer"][i] = tmp[0] 31 | data["DPU_MASTER_Kernel"][i] = tmp[1] 32 | data["DPU_MASTER_D2C"][i] = tmp[2] 33 | data["DPU_MASTER_C2D"][i] = tmp[4] 34 | data["DPU_MASTER"][i] = tmp[5] 35 | 36 | 37 | 38 | 39 | 40 | dim_data={ 41 | "title":"varing_input_dimension", 42 | "x_name":"input dimension", 43 | "x_axis":["5", "10", "20"], 44 | "CPU":np.zeros(3), 45 | 46 | "DPU":np.zeros(3), 47 | "DPU_Kernel":np.zeros(3), 48 | "DPU_initial_transfer":np.zeros(3), 49 | "DPU_C2D":np.zeros(3), 50 | "DPU_D2C":np.zeros(3), 51 | 52 | "DPU_MASTER":np.zeros(3), 53 | "DPU_MASTER_Kernel": np.zeros(3), 54 | "DPU_MASTER_initial_transfer": np.zeros(3), 55 | "DPU_MASTER_C2D": np.zeros(3), 56 | "DPU_MASTER_D2C": np.zeros(3), 57 | 58 | } 59 | 60 | 61 | num_data={ 62 | "title":"varing_#data", 63 | "x_name":"number of input data points", 64 | "x_axis":["100000", "1000000", "10000000"], 65 | "CPU":np.zeros(3), 66 | 67 | "DPU":np.zeros(3), 68 | "DPU_Kernel":np.zeros(3), 69 | "DPU_initial_transfer":np.zeros(3), 70 | "DPU_C2D":np.zeros(3), 71 | "DPU_D2C":np.zeros(3), 72 | 73 | "DPU_MASTER":np.zeros(3), 74 | "DPU_MASTER_Kernel": np.zeros(3), 75 | "DPU_MASTER_initial_transfer": np.zeros(3), 76 | "DPU_MASTER_C2D": np.zeros(3), 77 | "DPU_MASTER_D2C": np.zeros(3), 78 | } 79 | 80 | num_dpus_data={ 81 | "title":"varing_#dpus", 
82 | "x_name":"number of dpus", 83 | "x_axis":["128", "512", "2048"], 84 | "CPU":np.zeros(3), 85 | 86 | "DPU":np.zeros(3), 87 | "DPU_Kernel":np.zeros(3), 88 | "DPU_initial_transfer":np.zeros(3), 89 | "DPU_C2D":np.zeros(3), 90 | "DPU_D2C":np.zeros(3), 91 | 92 | "DPU_MASTER":np.zeros(3), 93 | "DPU_MASTER_Kernel": np.zeros(3), 94 | "DPU_MASTER_initial_transfer": np.zeros(3), 95 | "DPU_MASTER_C2D": np.zeros(3), 96 | "DPU_MASTER_D2C": np.zeros(3), 97 | } 98 | 99 | 100 | def mean_confidence_interval(data, confidence=0.99): 101 | a = 1.0 * np.array(data) 102 | n = len(a) 103 | m, se = np.mean(a), scipy.stats.sem(a) 104 | h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) 105 | return h 106 | 107 | def concate(arr_list): 108 | arr_list = [x.reshape(len(x), 1) for x in arr_list] 109 | return np.concatenate(arr_list, axis = 1) 110 | 111 | def plot_res(data): 112 | x = data["x_axis"] 113 | title = data["title"] 114 | _, ax = plt.subplots() 115 | 116 | line_width = 0.25 117 | x_pos = np.arange(len(x)) 118 | 119 | 120 | bar1 = ax.bar(x_pos - line_width + line_width*0, data["CPU"], width=line_width, edgecolor='k', color=colors[0], label ="CPU version") 121 | 122 | bar2 = ax.bar(x_pos - line_width + line_width*1, data["DPU"], width=line_width, edgecolor='k', color=colors[1], label ="CPU-reduce") 123 | bar4 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"]+data["DPU_C2D"], width=line_width, edgecolor='k', color=colors[2], label="C2D transfer") 124 | bar5 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"], width=line_width, edgecolor='k', color=colors[3], label="D2C transfer") 125 | bar6 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"], width=line_width, edgecolor='k', color=colors[4], label="initial transfer") 126 | bar7 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"], width=line_width, edgecolor='k', color=colors[5], label="DPU kernel") 127 | 128 | bar3 = ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER"], width=line_width, edgecolor='k', color=colors[1]) 129 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"]+data["DPU_MASTER_C2D"], width=line_width, edgecolor='k', color=colors[2]) 130 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"], width=line_width, edgecolor='k', color=colors[3]) 131 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"], width=line_width, edgecolor='k', color=colors[4]) 132 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"], width=line_width, edgecolor='k', color=colors[5]) 133 | 134 | 135 | 136 | for i in range(len(bar1 + bar2 +bar3)): 137 | rect = (bar1 + bar2 +bar3)[i] 138 | height = rect.get_height() 139 | if(not height == 0): 140 | if i//3 == 0: 141 | text = "cpu" 142 | elif i//3 == 1: 143 | text = "pim-f" 144 | else: 145 | text = "pim-h" 146 | plt.text(rect.get_x()+rect.get_width() / 2, height, text, ha = 'center', va = 'bottom', fontdict={'size': 16}) 147 | 148 | #plt.yscale('log',base=2) 149 | ax.set_xticks(x_pos-line_width*0) 150 | ax.set_xticklabels(x) 151 | ax.set_title(title) 152 | plt.xlabel(data["x_name"], fontdict=font) 153 | plt.ylabel("time in s", fontdict=font) 154 | 155 | legend1 = plt.legend(handles=[bar1, bar2], loc='upper left', shadow=True, 
bbox_to_anchor=(0, -0.12, 0, 0)) 156 | ax.add_artist(legend1) 157 | legend2 = plt.legend(handles=[bar3, bar4, bar7], loc='upper left', shadow=True, bbox_to_anchor=(0.425, -0.12, 0, 0)) 158 | ax.add_artist(legend2) 159 | plt.legend(handles=[bar5, bar6], loc='upper left', shadow=True, bbox_to_anchor=(0.2, -0.12, 0, 0)) 160 | plt.savefig("images/"+title, bbox_inches='tight') 161 | plt.clf() 162 | plt.close() 163 | 164 | 165 | 166 | 167 | if __name__=="__main__": 168 | dir = "results/" 169 | load_data(dim_data, [dir+ i for i in ["cpu_5_1000000.csv", "cpu_10_1000000.csv", "cpu_20_1000000.csv"]], [dir+i for i in ["framework_2523_5_1000000", "framework_2523_10_1000000", "framework_2523_20_1000000"]], [dir+i for i in ["human_2523_5_1000000", "human_2523_10_1000000"]]) 170 | plot_res(dim_data) 171 | load_data(num_data, [dir+ i for i in ["cpu_10_100000.csv", "cpu_10_1000000.csv", "cpu_10_10000000.csv"]], [dir+i for i in ["framework_2523_10_100000", "framework_2523_10_1000000", "framework_2523_10_10000000"]], [dir+i for i in ["human_2523_10_100000", "human_2523_10_1000000", "human_2523_10_10000000"]]) 172 | plot_res(num_data) 173 | load_data(num_dpus_data, [], [dir+i for i in ["framework_128_10_1000000", "framework_512_10_1000000", "framework_2048_10_1000000"]], [dir+i for i in ["human_128_10_1000000", "human_512_10_1000000", "human_2048_10_1000000"]]) 174 | plot_res(num_dpus_data) 175 | -------------------------------------------------------------------------------- /benchmarks/log_reg/Makefile: -------------------------------------------------------------------------------- 1 | log: host.c 2 | @mkdir -p bin 3 | gcc --std=c99 -ldl -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c ../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/gen_red/GenRed.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/log_reg/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | 4 | #include 5 | #include 6 | uint32_t print_info = 0; 7 | typedef int T; 8 | const uint32_t dpu_number = 5; // 2432 9 | const uint32_t dim = 10; 10 | const uint64_t num_elements = 1000*dpu_number; 11 | const uint32_t iter = 1; 12 | const float lr = 1e-4; 13 | const uint32_t prevent_overflow_shift_amount = 3; 14 | const uint32_t shift_amount = 5; 15 | #endif 16 | -------------------------------------------------------------------------------- /benchmarks/log_reg/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/log_reg/bin/host -------------------------------------------------------------------------------- /benchmarks/log_reg/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include "../../lib/processing/gen_red/GenRed.h" 10 | #include "../../lib/processing/ProcessingHelperHost.h" 11 | #include "../../lib/communication/CommOps.h" 12 | #include "../../lib/management/Management.h" 13 | #include "../../lib/timer.h" 14 | #include "Param.h" 15 | 16 | 17 | 18 | FILE* fp; 19 | 20 | 21 | void read_csv_to_arr(FILE* fp, T* arr, int32_t len, int32_t d){ 22 | if (fp == NULL) { 23 | 
fprintf(stderr, "Error reading file\n"); 24 | return; 25 | } 26 | 27 | float tmp; 28 | for (size_t i = 0; i < len; i++){ 29 | for(size_t j = 0; j < d-1; j++){ 30 | fscanf(fp, "%f,", &tmp); 31 | arr[i*d+j] = (T)tmp; 32 | } 33 | fscanf(fp, "%f,", &tmp); 34 | arr[i*d+d-1] = (T)tmp; 35 | } 36 | 37 | fclose(fp); 38 | } 39 | 40 | void write_time_to_csv(double* arr, int32_t len){ 41 | if (fp == NULL) { 42 | fprintf(stderr, "Error reading file\n"); 43 | return; 44 | } 45 | 46 | for (size_t i = 0; i < len; i++){ 47 | fprintf(fp,"%f\n", arr[i]/1000.0); 48 | } 49 | 50 | fclose(fp); 51 | } 52 | 53 | void get_output_file(int num_dpus, int dim, int num_elem){ 54 | char str1[10]; 55 | char str2[10]; 56 | char str3[10]; 57 | sprintf(str1, "%d", num_dpus); 58 | sprintf(str2, "%d", dim); 59 | sprintf(str3, "%d", num_elem); 60 | char out[100] = "results/framework_"; 61 | strcat(out, str1); 62 | strcat(out,"_"); 63 | strcat(out, str2); 64 | strcat(out,"_"); 65 | strcat(out, str3); 66 | fp = fopen (out, "w"); 67 | } 68 | 69 | 70 | void compute_gradients(const T*arr){ 71 | 72 | // [X|Y] -> [X], [Y] 73 | T* X = malloc(num_elements*dim*sizeof(T)); 74 | T* Y = malloc(num_elements*sizeof(T)); 75 | for(uint32_t i=0; i> prevent_overflow_shift_amount; 106 | } 107 | } 108 | } 109 | 110 | printf("\nthe gradients on host: \n"); 111 | for(int i=0; iend; 158 | simplepim_broadcast("t2", weights, 1, dim*sizeof(T), table_management); 159 | 160 | handle_t* va_handle = create_handle("log_reg_funcs", REDUCE); 161 | T* res = table_gen_red("t1", "t3", dim*sizeof(T), 1, va_handle, table_management, data_offset); 162 | 163 | printf("the weights of linear model: \n"); 164 | for(int i=0; i 5 | #include 6 | #include 7 | #include "../Param.h" 8 | 9 | 10 | void init_func(uint32_t size, void* ptr){ 11 | char* casted_value_ptr = (char*) ptr; 12 | for(int i=0; i 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../Param.h" 14 | #include "../../../lib/processing/gen_red/GenRedArgs.h" 15 | 16 | 17 | __dma_aligned T* weights_data; 18 | BARRIER_INIT(barrier_maptoval, NR_TASKLETS); 19 | 20 | void start_func(gen_red_arguments_t* args){ 21 | uint32_t total_len = args->table_len * args->output_type_size; 22 | uint32_t aligned_weights_size = total_len + 8-(total_len%8); 23 | if(me()==0){ 24 | // initialise weights 25 | fsb_allocator_t weights_allocator = fsb_alloc(aligned_weights_size, 1); 26 | weights_data = (void*)fsb_get(weights_allocator); 27 | mram_read(DPU_MRAM_HEAP_POINTER+args->info, weights_data, aligned_weights_size); 28 | } 29 | barrier_wait(&barrier_maptoval); 30 | } 31 | 32 | static inline T sigmoid_dpu(T x){ 33 | if(x >= 15) 34 | return 1.0; 35 | else if (x <= -15) 36 | return 0.0; 37 | else if (x == 0.0) 38 | return 0.5; 39 | 40 | float sum = 1.0; 41 | float temp = 1.0; 42 | // iter 100 times 43 | for(uint32_t i = 1; i < 101; ++i){ 44 | temp = temp * (-x) / i; 45 | sum = sum + temp; 46 | } 47 | return (T)(1.0 / (1.0 + sum)); 48 | } 49 | 50 | 51 | void map_to_val_func(void* input, void* grads, uint32_t* dummy){ 52 | // the data is preserved and later added to corresponding weights 53 | float* grads_ptr = (float*)grads; 54 | float* input_ptr = (float*)input; 55 | float* weights_data_ptr = (float*)weights_data; 56 | 57 | // calculate gradients w.r.t. 
linear weights 58 | float dot = 0; 59 | for(int i=0; i 0 else 0 52 | 53 | print(input) 54 | 55 | np.savetxt("data/args.csv", np.array([dim, num_elements, iter, lr]), delimiter=",", fmt='%s') 56 | np.savetxt("data/input.csv", input, delimiter=",", fmt='%f') 57 | 58 | x_train, y_train = (input.transpose(1, 0)[0:-1]).transpose(1, 0), (input.transpose(1, 0)[-1]) 59 | 60 | if torch.cuda.is_available(): 61 | inputs = Variable(torch.from_numpy(x_train).cuda(), requires_grad=True) 62 | labels = Variable(torch.from_numpy(y_train).cuda(), requires_grad=True) 63 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector).cuda(), requires_grad=True)) 64 | else: 65 | inputs = Variable(torch.from_numpy(x_train), requires_grad=True) 66 | labels = Variable(torch.from_numpy(y_train), requires_grad=True) 67 | init_weights = torch.nn.Parameter(Variable(torch.from_numpy(init_vector), requires_grad=True)) 68 | 69 | 70 | model = logisticRegression(dim, init_weights) 71 | optimizer = torch.optim.SGD(model.parameters(), lr=lr) 72 | 73 | start = time.time() 74 | for epoch in range(iter): 75 | # Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients 76 | optimizer.zero_grad() 77 | 78 | # get output from the model, given the inputs 79 | loss = model(inputs, labels) 80 | loss.backward() 81 | 82 | # update parameters 83 | optimizer.step() 84 | end = time.time() 85 | t = (end-start) *1000 86 | print("the time consumed is "+str(t)+" ms") 87 | print("linear model weights: ") 88 | print(model.weights.detach().numpy()) 89 | print("the groud truth: ") 90 | print(groud_truth) 91 | 92 | if not os.path.exists("results/"): 93 | os.makedirs("results/") 94 | 95 | path = 'results/cpu_'+str(dim)+"_"+str(num_elements)+".csv" 96 | np.savetxt(path, np.array([t])) 97 | 98 | 99 | if __name__ == "__main__": 100 | if not os.path.exists("data/"): 101 | os.mkdir("data/") 102 | with parallel_backend('threading', n_jobs=num_threads): 103 | main() -------------------------------------------------------------------------------- /benchmarks/log_reg/plot.py: -------------------------------------------------------------------------------- 1 | from cProfile import label 2 | import matplotlib.pyplot as plt 3 | import json 4 | import numpy as np 5 | import scipy.stats 6 | from math import log2, exp 7 | 8 | colors=['#23ef68','#32efff','#2eaf9f','#22222f','#eeeff1','#eee112','#00ef00','#aa0000','#0000aa','#000999','#2e3f56','#7eef1f','#eeef11'] 9 | 10 | font = {'weight' : 'bold', 11 | 'size' : 16} 12 | plt.rcParams["figure.figsize"] = (16, 7) 13 | plt.rc('font', **font) 14 | 15 | 16 | def load_data(data, cpu_files, DPU_files, DPU_MASTER_files): 17 | for i in range(len(cpu_files)): 18 | data["CPU"][i] = np.loadtxt(cpu_files[i]) /1000 19 | 20 | for i in range(len(DPU_files)): 21 | tmp = np.loadtxt(DPU_files[i])/1000 22 | data["DPU_initial_transfer"][i] = tmp[0] 23 | data["DPU_Kernel"][i] = tmp[1] 24 | data["DPU_D2C"][i] = tmp[2] 25 | data["DPU_C2D"][i] = tmp[4] 26 | data["DPU"][i] = tmp[5] 27 | 28 | for i in range(len(DPU_MASTER_files)): 29 | tmp = np.loadtxt(DPU_MASTER_files[i])/1000 30 | data["DPU_MASTER_initial_transfer"][i] = tmp[0] 31 | data["DPU_MASTER_Kernel"][i] = tmp[1] 32 | data["DPU_MASTER_D2C"][i] = tmp[2] 33 | data["DPU_MASTER_C2D"][i] = tmp[4] 34 | data["DPU_MASTER"][i] = tmp[5] 35 | 36 | 37 | 38 | 39 | 40 | dim_data={ 41 | "title":"varing_input_dimension", 42 | "x_name":"input dimension", 43 | "x_axis":["5", "10", "20"], 44 | "CPU":np.zeros(3), 45 | 46 
| "DPU":np.zeros(3), 47 | "DPU_Kernel":np.zeros(3), 48 | "DPU_initial_transfer":np.zeros(3), 49 | "DPU_C2D":np.zeros(3), 50 | "DPU_D2C":np.zeros(3), 51 | 52 | "DPU_MASTER":np.zeros(3), 53 | "DPU_MASTER_Kernel": np.zeros(3), 54 | "DPU_MASTER_initial_transfer": np.zeros(3), 55 | "DPU_MASTER_C2D": np.zeros(3), 56 | "DPU_MASTER_D2C": np.zeros(3), 57 | 58 | } 59 | 60 | 61 | num_data={ 62 | "title":"varing_#data", 63 | "x_name":"number of input data points", 64 | "x_axis":["100000", "1000000", "10000000"], 65 | "CPU":np.zeros(3), 66 | 67 | "DPU":np.zeros(3), 68 | "DPU_Kernel":np.zeros(3), 69 | "DPU_initial_transfer":np.zeros(3), 70 | "DPU_C2D":np.zeros(3), 71 | "DPU_D2C":np.zeros(3), 72 | 73 | "DPU_MASTER":np.zeros(3), 74 | "DPU_MASTER_Kernel": np.zeros(3), 75 | "DPU_MASTER_initial_transfer": np.zeros(3), 76 | "DPU_MASTER_C2D": np.zeros(3), 77 | "DPU_MASTER_D2C": np.zeros(3), 78 | } 79 | 80 | num_dpus_data={ 81 | "title":"varing_#dpus", 82 | "x_name":"number of dpus", 83 | "x_axis":["128", "512", "2048"], 84 | "CPU":np.zeros(3), 85 | 86 | "DPU":np.zeros(3), 87 | "DPU_Kernel":np.zeros(3), 88 | "DPU_initial_transfer":np.zeros(3), 89 | "DPU_C2D":np.zeros(3), 90 | "DPU_D2C":np.zeros(3), 91 | 92 | "DPU_MASTER":np.zeros(3), 93 | "DPU_MASTER_Kernel": np.zeros(3), 94 | "DPU_MASTER_initial_transfer": np.zeros(3), 95 | "DPU_MASTER_C2D": np.zeros(3), 96 | "DPU_MASTER_D2C": np.zeros(3), 97 | } 98 | 99 | 100 | def mean_confidence_interval(data, confidence=0.99): 101 | a = 1.0 * np.array(data) 102 | n = len(a) 103 | m, se = np.mean(a), scipy.stats.sem(a) 104 | h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1) 105 | return h 106 | 107 | def concate(arr_list): 108 | arr_list = [x.reshape(len(x), 1) for x in arr_list] 109 | return np.concatenate(arr_list, axis = 1) 110 | 111 | def plot_res(data): 112 | x = data["x_axis"] 113 | title = data["title"] 114 | _, ax = plt.subplots() 115 | 116 | line_width = 0.25 117 | x_pos = np.arange(len(x)) 118 | 119 | 120 | bar1 = ax.bar(x_pos - line_width + line_width*0, data["CPU"], width=line_width, edgecolor='k', color=colors[0], label ="CPU version") 121 | 122 | bar2 = ax.bar(x_pos - line_width + line_width*1, data["DPU"], width=line_width, edgecolor='k', color=colors[1], label ="CPU-reduce") 123 | bar4 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"]+data["DPU_C2D"], width=line_width, edgecolor='k', color=colors[2], label="C2D transfer") 124 | bar5 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"]+data["DPU_D2C"], width=line_width, edgecolor='k', color=colors[3], label="D2C transfer") 125 | bar6 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"]+data["DPU_initial_transfer"], width=line_width, edgecolor='k', color=colors[4], label="initial transfer") 126 | bar7 = ax.bar(x_pos - line_width + line_width*1, data["DPU_Kernel"], width=line_width, edgecolor='k', color=colors[5], label="DPU kernel") 127 | 128 | bar3 = ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER"], width=line_width, edgecolor='k', color=colors[1]) 129 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"]+data["DPU_MASTER_C2D"], width=line_width, edgecolor='k', color=colors[2]) 130 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"]+data["DPU_MASTER_D2C"], width=line_width, edgecolor='k', color=colors[3]) 131 | ax.bar(x_pos - line_width + line_width*2, 
data["DPU_MASTER_Kernel"]+data["DPU_MASTER_initial_transfer"], width=line_width, edgecolor='k', color=colors[4]) 132 | ax.bar(x_pos - line_width + line_width*2, data["DPU_MASTER_Kernel"], width=line_width, edgecolor='k', color=colors[5]) 133 | 134 | 135 | 136 | for i in range(len(bar1 + bar2 +bar3)): 137 | rect = (bar1 + bar2 +bar3)[i] 138 | height = rect.get_height() 139 | if(not height == 0): 140 | if i//3 == 0: 141 | text = "cpu" 142 | elif i//3 == 1: 143 | text = "pim-f" 144 | else: 145 | text = "pim-h" 146 | plt.text(rect.get_x()+rect.get_width() / 2, height, text, ha = 'center', va = 'bottom', fontdict={'size': 16}) 147 | 148 | #plt.yscale('log',base=2) 149 | ax.set_xticks(x_pos-line_width*0) 150 | ax.set_xticklabels(x) 151 | ax.set_title(title) 152 | plt.xlabel(data["x_name"], fontdict=font) 153 | plt.ylabel("time in s", fontdict=font) 154 | 155 | legend1 = plt.legend(handles=[bar1, bar2], loc='upper left', shadow=True, bbox_to_anchor=(0, -0.12, 0, 0)) 156 | ax.add_artist(legend1) 157 | legend2 = plt.legend(handles=[bar3, bar4, bar7], loc='upper left', shadow=True, bbox_to_anchor=(0.425, -0.12, 0, 0)) 158 | ax.add_artist(legend2) 159 | plt.legend(handles=[bar5, bar6], loc='upper left', shadow=True, bbox_to_anchor=(0.2, -0.12, 0, 0)) 160 | plt.savefig("images/"+title, bbox_inches='tight') 161 | plt.clf() 162 | plt.close() 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | if __name__=="__main__": 173 | dir = "results/" 174 | load_data(dim_data, [dir+ i for i in ["cpu_5_1000000.csv", "cpu_10_1000000.csv", "cpu_20_1000000.csv"]], [dir+i for i in ["framework_2546_5_1000000", "framework_2546_10_1000000", "framework_2546_20_1000000"]], [dir+i for i in ["human_2546_5_1000000", "human_2546_10_1000000", "human_2546_20_1000000"]]) 175 | plot_res(dim_data) 176 | load_data(num_data, [dir+ i for i in ["cpu_10_100000.csv", "cpu_10_1000000.csv", "cpu_10_10000000.csv"]], [dir+i for i in ["framework_2546_10_100000", "framework_2546_10_1000000", "framework_2546_10_10000000"]], [dir+i for i in ["human_2546_10_100000", "human_2546_10_1000000", "human_2546_10_10000000"]]) 177 | plot_res(num_data) 178 | load_data(num_dpus_data, [], [dir+i for i in ["framework_128_10_1000000", "framework_512_10_1000000", "framework_2048_10_1000000"]], [dir+i for i in ["human_128_10_1000000", "human_512_10_1000000", "human_2048_10_1000000"]]) 179 | plot_res(num_dpus_data) -------------------------------------------------------------------------------- /benchmarks/red/Makefile: -------------------------------------------------------------------------------- 1 | va: host.c 2 | @mkdir -p bin 3 | gcc --std=c99 -ldl -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c ../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/gen_red/GenRed.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/red/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | #include 4 | 5 | typedef uint32_t T; 6 | 7 | const uint32_t dpu_number = 32; //2432 8 | uint32_t print_info = 0; 9 | uint64_t nr_elements = 1000*dpu_number; //1000000*dpu_number 10 | #endif 11 | -------------------------------------------------------------------------------- /benchmarks/red/bin/host: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/red/bin/host
-------------------------------------------------------------------------------- /benchmarks/red/host.c: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | //#include "../../lib/communication/CommOps.h"
9 | //#include "../../lib/management/Management.h"
10 | #include "../../lib/processing/gen_red/GenRed.h"
11 | #include "../../lib/processing/ProcessingHelperHost.h"
12 | #include "../../lib/timer.h"
13 | #include "Param.h"
14 |
15 |
16 |
17 | void init(T* A){
18 | for (uint64_t i = 0; i < nr_elements; i++) {
19 | A[i] = i%1000;
20 | }
21 | }
22 |
23 | void add(void* p1, void* p2){
24 | T* ptr1 = (T*)(p1);
25 | T* ptr2 = (T*)(p2);
26 | *ptr1 += *ptr2;
27 | }
28 |
29 | static T reduction_host(T* A) {
30 | T count = 0;
31 | for (uint64_t i = 0; i < nr_elements; i++) {
32 | count += A[i];
33 | }
34 | return count;
35 | }
36 |
37 | void run(){
38 | simplepim_management_t* table_management = table_management_init(dpu_number);
39 | T* A = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management);
40 | init(A);
41 | T correct_res = reduction_host(A);
42 |
43 | simplepim_scatter("t1", A, nr_elements, sizeof(T), table_management);
44 | printf("end of data transfer\n");
45 |
46 | handle_t* red_handle = create_handle("red_funcs", REDUCE);
47 |
48 |
49 | T* res = table_gen_red("t1", "t2", sizeof(T), 1, red_handle, table_management, 0);
50 |
51 |
52 | if(print_info){
53 | struct dpu_set_t set = table_management->set;
54 | struct dpu_set_t dpu;
55 | DPU_FOREACH(set, dpu) {
56 | DPU_ASSERT(dpu_log_read(dpu, stdout));
57 | }
58 | }
59 |
60 |
61 | // allow a small relative error: accumulated floating-point values may differ in precision
62 | if( ((float)correct_res - (float)*res)/correct_res < 0.01 && ((float)correct_res - (float)*res)/correct_res > -0.01){
63 | printf("the result is correct \n");
64 | }
65 | else{
66 | printf("correct res : %u, got res : %u \n", correct_res, *res);
67 | printf("cpu result does not match \n");
68 | }
69 |
70 |
71 | }
72 |
73 |
74 | int main(int argc, char *argv[]){
75 | run();
76 | return 0;
77 | }
78 |
-------------------------------------------------------------------------------- /benchmarks/red/red_funcs/init_combine_func.h: --------------------------------------------------------------------------------
1 | #ifndef INIT_COMBINE_FUNC_H
2 | #define INIT_COMBINE_FUNC_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include "../Param.h"
8 |
9 |
10 | void combine_func(void* dest, void* src){
11 | *(uint32_t*)dest += *(uint32_t*)src;
12 | }
13 |
14 | void init_func(uint32_t size, void* ptr){
15 | char* casted_value_ptr = (char*) ptr;
16 | for(int i=0; i<size; i++){
17 | casted_value_ptr[i] = 0;
18 | }
19 | }
20 |
21 | #endif
-------------------------------------------------------------------------------- /benchmarks/red/red_funcs/map_to_val_func.h: --------------------------------------------------------------------------------
1 | #ifndef MAP_TO_VAL_FUNC_H
2 | #define MAP_TO_VAL_FUNC_H
3 |
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #include "../Param.h"
14 | #include "../../../lib/processing/gen_red/GenRedArgs.h"
15 |
16 |
17 | void start_func(gen_red_arguments_t* args){
18 |
19 | }
20 |
21 | void map_to_val_func(void* input, void* output, uint32_t* key){
22 | *key = 0;
23 | *(T*)output = *(T*) input;
24 | }
25 |
26 | #endif
-------------------------------------------------------------------------------- /benchmarks/va/Makefile: --------------------------------------------------------------------------------
1 | va: host.c
2 | @mkdir -p bin
3 | gcc --std=c99 -lm -fopenmp -O3 host.c -o bin/host ../../lib/processing/ProcessingHelperHost.c
../../lib/communication/CommHelper.c ../../lib/communication/CommOps.c ../../lib/management/SmallTableInit.c ../../lib/management/Management.c ../../lib/processing/map/Map.c ../../lib/processing/zip/Zip.c `dpu-pkg-config --cflags --libs dpu` -------------------------------------------------------------------------------- /benchmarks/va/Param.h: -------------------------------------------------------------------------------- 1 | #ifndef PARAM_H 2 | #define PARAM_H 3 | #include 4 | 5 | typedef uint32_t T; 6 | const uint32_t dpu_number = 5; //2432 7 | uint32_t print_info = 0; 8 | uint64_t nr_elements = dpu_number*10000; //dpu_number*1000000 9 | #endif -------------------------------------------------------------------------------- /benchmarks/va/bin/dpu_init_binary: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/dpu_init_binary -------------------------------------------------------------------------------- /benchmarks/va/bin/dpu_map_va_funcs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/dpu_map_va_funcs -------------------------------------------------------------------------------- /benchmarks/va/bin/dpu_zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/dpu_zip -------------------------------------------------------------------------------- /benchmarks/va/bin/host: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CMU-SAFARI/SimplePIM/1d639c53532555f01e9f71d872e7712b166d6cba/benchmarks/va/bin/host -------------------------------------------------------------------------------- /benchmarks/va/host.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "../../lib/processing/map/Map.h" 10 | #include "../../lib/processing/zip/Zip.h" 11 | #include "../../lib/processing/ProcessingHelperHost.h" 12 | #include "../../lib/communication/CommOps.h" 13 | #include "../../lib/management/Management.h" 14 | #include "../../lib/timer.h" 15 | #include "Param.h" 16 | 17 | 18 | 19 | void init(T* A, uint32_t salt){ 20 | for (uint64_t i = 0; i < nr_elements; i++) { 21 | A[i] = (i + salt)%128; 22 | } 23 | } 24 | 25 | void zip(T* A, T* B, T* res){ 26 | for (uint64_t i = 0; i < nr_elements; i++){ 27 | res[2*i] = A[i]; 28 | res[2*i+1] = B[i]; 29 | } 30 | } 31 | 32 | void vec_add(T* A, T* res){ 33 | for (uint64_t i = 0; i < nr_elements; i++){ 34 | res[i] = A[i*2] + A[i*2+1]; 35 | } 36 | } 37 | 38 | void vector_addition_host(T* A, T* B, T* res) { 39 | omp_set_num_threads(16); 40 | #pragma omp parallel for 41 | for (uint64_t i = 0; i < nr_elements; i++) { 42 | res[i] = A[i] + B[i]; 43 | } 44 | } 45 | 46 | 47 | void run(){ 48 | simplepim_management_t* table_management = table_management_init(dpu_number); 49 | T* A = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management); 50 | T* B = (T*)malloc_scatter_aligned(nr_elements, sizeof(T), table_management); 51 | 52 | T* correct_res = (T*)malloc((uint64_t)sizeof(T)*nr_elements); 53 | init(A, 0); 54 | init(B, 1); 55 | 
vector_addition_host(A, B, correct_res); 56 | 57 | Timer timer; 58 | start(&timer, 0, 0); 59 | start(&timer, 5, 0); 60 | simplepim_scatter("t1", A, nr_elements, sizeof(T), table_management); 61 | simplepim_scatter("t2", B, nr_elements, sizeof(T), table_management); 62 | stop(&timer, 0); 63 | printf("end of data transfer\n"); 64 | 65 | handle_t* add_handle = create_handle("va_funcs", MAP); 66 | handle_t* zip_handle = create_handle("", ZIP); 67 | 68 | 69 | start(&timer, 1, 0); 70 | table_zip("t1", "t2", "t3", zip_handle, table_management); 71 | table_map("t3", "t4", sizeof(T), add_handle, table_management, 0); 72 | stop(&timer, 1); 73 | 74 | 75 | if(print_info){ 76 | struct dpu_set_t set, dpu; 77 | set = table_management->set; 78 | DPU_FOREACH(set, dpu) { 79 | DPU_ASSERT(dpu_log_read(dpu, stdout)); 80 | } 81 | } 82 | 83 | 84 | start(&timer, 2, 0); 85 | T* res = simplepim_gather("t4", table_management); 86 | stop(&timer, 2); 87 | 88 | printf("the total time with timing consumed is (ms): "); 89 | print(&timer, 5, 1); 90 | printf("\n"); 91 | printf("initial CPU-DPU input transfer (ms): "); 92 | print(&timer, 0, 1); 93 | printf("\n"); 94 | printf("DPU Kernel Time (ms): "); 95 | print(&timer, 1, 1); 96 | printf("\n"); 97 | printf("DPU-CPU Time (ms): "); 98 | print(&timer, 2, 1); 99 | printf("\n"); 100 | 101 | 102 | int32_t is_correct = 1; 103 | 104 | for(int i=0; i 5 | #include 6 | #include "Param.h" 7 | #include "../../../lib/processing/map/MapArgs.h" 8 | 9 | void start_func(map_arguments_t* args){} 10 | 11 | void map_func(void* input, void* res){ 12 | *(T*)res = ((T*)input)[0] + ((T*)input)[1]; 13 | } 14 | 15 | #endif -------------------------------------------------------------------------------- /lib/Common.c: -------------------------------------------------------------------------------- 1 | #include "Common.h" 2 | void zero_init(uint32_t value_size, void* value_ptr){ 3 | char* casted_value_ptr = (char*) value_ptr; 4 | for(int i=0; i 4 | #include 5 | #include 6 | 7 | void print_int(void* value); 8 | void add_int(void* i1, void* i2); 9 | void zero_init(uint32_t value_size, void* value_ptr); 10 | #endif -------------------------------------------------------------------------------- /lib/Parallel.c: -------------------------------------------------------------------------------- 1 | #include "Parallel.h" 2 | #include "../benchmarks/UserPath.h" 3 | #include _user_h_ 4 | 5 | BARRIER_INIT(barrier_p, NR_TASKLETS); 6 | MUTEX_INIT(mutex); 7 | uint32_t curr_block; 8 | 9 | uint64_t _copy_block_size(uint32_t type_size1, uint32_t type_size2, uint32_t num_elem){ 10 | //1024 11 | uint64_t res=0; 12 | uint32_t res_arr[2]; 13 | uint32_t max_type_size = type_size1>type_size2?type_size1:type_size2; 14 | if(type_size1%8 == 0 && type_size2%8 == 0 && (num_elem <= NR_TASKLETS || max_type_size > 512)){ 15 | res_arr[0] = 1; 16 | res_arr[1] = 0; 17 | } 18 | else if(type_size1%4 == 0 && type_size2%4 == 0 &&(num_elem <= 2*NR_TASKLETS || max_type_size > 256)){ 19 | res_arr[0] = 2; 20 | res_arr[1] = 1; 21 | } 22 | else if(type_size1%2 == 0 && type_size2%2 == 0 && max_type_size > 128){ 23 | res_arr[0] = 4; 24 | res_arr[1] = 2; 25 | } 26 | else if(max_type_size < 16){ 27 | res_arr[0] = 256; 28 | res_arr[1] = 8; 29 | } 30 | else if(max_type_size < 32){ 31 | res_arr[0] = 128; 32 | res_arr[1] = 7; 33 | } 34 | else{ 35 | res_arr[0] = 16; 36 | res_arr[1] = 4; 37 | } 38 | 39 | res = *(uint64_t*)res_arr; 40 | return res; 41 | 42 | } 43 | 44 | uint32_t get_shift_bits_for_type(uint32_t value_size){ 45 | switch (value_size) { 46 | 
case 2: 47 | return 1; 48 | case 4: 49 | return 2; 50 | case 8: 51 | return 3; 52 | case 16: 53 | return 4; 54 | case 32: 55 | return 5; 56 | case 64: 57 | return 6; 58 | case 128: 59 | return 7; 60 | case 256: 61 | return 8; 62 | case 512: 63 | return 9; 64 | case 1024: 65 | return 10; 66 | case 2048: 67 | return 11; 68 | case 4096: 69 | return 12; 70 | default: 71 | return 0; 72 | } 73 | } 74 | 75 | 76 | void map_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len){ 77 | uint32_t elem_type_size = input_type; 78 | uint32_t inter_type_size = output_type; 79 | uint32_t num_tasklets = NR_TASKLETS; 80 | uint32_t pid = num_tasklets == 1 ? 0 : me(); 81 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 82 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 83 | uint32_t copy_block_size = copy_block_size_[0]; 84 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 85 | // try malloc/free for performance 86 | fsb_allocator_t elems_block_allocator = fsb_alloc(elem_type_size<>copy_block_size_shiftbits)<>2)<<2; 103 | 104 | uint32_t i_init = pid<>2)<<2; 223 | 224 | for(int i=pid*copy_block_size; iinput_len; 305 | uint32_t elem_type_size = DPU_INPUT_ARGUMENTS->input_type_size; 306 | uint32_t inter_type_size = DPU_INPUT_ARGUMENTS->table_type_size; 307 | uint32_t table_size = DPU_INPUT_ARGUMENTS->table_len; 308 | uint32_t num_tasklets = NR_TASKLETS; 309 | uint32_t pid = num_tasklets == 1 ? 0 : me(); 310 | 311 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 312 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 313 | uint32_t copy_block_size = copy_block_size_[0]; 314 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 315 | 316 | __mram_ptr void* elements = DPU_MRAM_HEAP_POINTER+DPU_INPUT_ARGUMENTS->input_start_offset; 317 | // try malloc/free for performance 318 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 319 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 320 | 321 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 322 | __dma_aligned table_t* local_table = fsb_get(table_allocator); 323 | init_table(local_table, table_size, inter_type_size, init_func); 324 | 325 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 326 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 327 | 328 | uint32_t key = 0; 329 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 330 | 331 | curr_block = 0; 332 | uint32_t i; 333 | uint32_t curr_block_local; 334 | uint32_t num_blocks = (len%copy_block_size==0)?len/copy_block_size:len/copy_block_size+1; 335 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 336 | uint32_t total_len_in_bytes = len * elem_type_size; 337 | 338 | barrier_wait(&barrier_p); 339 | while (curr_blockinput_len; 375 | uint32_t elem_type_size = DPU_INPUT_ARGUMENTS->input_type_size; 376 | uint32_t inter_type_size = DPU_INPUT_ARGUMENTS->table_type_size; 377 | uint32_t table_size = DPU_INPUT_ARGUMENTS->table_len; 378 | uint32_t num_tasklets = NR_TASKLETS; 379 | uint32_t pid = num_tasklets == 1 ? 
0 : me(); 380 | 381 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 382 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 383 | uint32_t copy_block_size = copy_block_size_[0]; 384 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 385 | 386 | 387 | __mram_ptr void* elements = DPU_MRAM_HEAP_POINTER+DPU_INPUT_ARGUMENTS->input_start_offset; 388 | // try malloc/free for performance 389 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 390 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 391 | 392 | 393 | 394 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 395 | __dma_aligned table_t* local_table = fsb_get(table_allocator); 396 | init_table(local_table, table_size, inter_type_size, init_func); 397 | 398 | 399 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 400 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 401 | 402 | uint32_t last_block = (len/copy_block_size)*copy_block_size; 403 | uint32_t key = 0; 404 | 405 | void* local_table_entries = local_table->table; 406 | uint32_t curr_entry; 407 | uint32_t shift_bits = get_shift_bits_for_type(inter_type_size); 408 | 409 | 410 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 411 | uint32_t total_len_in_bytes = len * elem_type_size; 412 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 413 | uint32_t stride = copy_block_size*num_tasklets*elem_type_size; 414 | for(int i=pid*copy_block_size_in_bytes; iinput_len; 458 | uint32_t elem_type_size = DPU_INPUT_ARGUMENTS->input_type_size; 459 | uint32_t inter_type_size = DPU_INPUT_ARGUMENTS->table_type_size; 460 | uint32_t table_size = DPU_INPUT_ARGUMENTS->table_len; 461 | uint32_t num_tasklets = NR_TASKLETS; 462 | uint32_t pid = num_tasklets == 1 ? 
0 : me(); 463 | 464 | uint64_t tuple = _copy_block_size(elem_type_size, inter_type_size, len); 465 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 466 | uint32_t copy_block_size = copy_block_size_[0]; 467 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 468 | 469 | __mram_ptr void* elements = DPU_MRAM_HEAP_POINTER+DPU_INPUT_ARGUMENTS->input_start_offset; 470 | // try malloc/free for performance 471 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 472 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 473 | 474 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 475 | if(pid == 0){ 476 | t_global = fsb_get(table_allocator); 477 | } 478 | init_shared_table(t_global, table_size, inter_type_size, init_func); 479 | 480 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 481 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 482 | 483 | int last_block = (len/copy_block_size)*copy_block_size; 484 | uint32_t key = 0; 485 | 486 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 487 | uint32_t total_len_in_bytes = len * elem_type_size; 488 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 489 | uint32_t stride = copy_block_size*num_tasklets*elem_type_size; 490 | for(int i=pid*copy_block_size_in_bytes; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "mutex.h" 10 | 11 | #include "Structs.h" 12 | #include "StructsPIM.h" 13 | #include "Table.h" 14 | #include "TableShared.h" 15 | #include "Common.h" 16 | 17 | void map_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len); 18 | void combine_oncache_dpu(__mram_ptr void* table_entries_mram, dpu_arguments_t* DPU_INPUT_ARGUMENTS); 19 | void combine_shared_dpu(__mram_ptr void* table_entries_mram, void (*init_func)(uint32_t, void*), void (*key_func)(void*, void*, uint32_t*), void (*combine_func)(void*, void*), dpu_arguments_t* DPU_INPUT_ARGUMENTS); 20 | void zip_dpu(__mram_ptr void* table_entries_1, __mram_ptr void* table_entries_2, __mram_ptr void* table_entries_res, uint32_t input_type_1, uint32_t input_type_2, uint32_t len); 21 | uint32_t get_shift_bits_for_type(uint32_t value_size); 22 | 23 | #endif -------------------------------------------------------------------------------- /lib/Structs.h: -------------------------------------------------------------------------------- 1 | #ifndef STRUCTS_H 2 | #define STRUCTS_H 3 | #include 4 | #include 5 | 6 | typedef struct { 7 | uint32_t input_start_offset; 8 | uint32_t input_len; 9 | uint32_t input_type_size; 10 | uint32_t data_start_offset; 11 | uint32_t data_len; 12 | uint32_t data_type_size; 13 | uint32_t end_offset; 14 | uint32_t table_type_size; 15 | uint32_t table_len; 16 | uint32_t mode; //mode 0, all on wram; mode 1, data on mram; 17 | } dpu_arguments_t; 18 | 19 | 20 | #endif -------------------------------------------------------------------------------- /lib/StructsPIM.h: -------------------------------------------------------------------------------- 1 | #ifndef STRUCTSPIM_H 2 | #define STRUCTSPIM_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | typedef struct { 9 | uint8_t* locks; 10 | uint32_t table_size; 11 | uint32_t value_size; 12 | __mram_ptr void* heap_ptr; 13 | void* table; // table is an array of table_size, each element of size (uint32_t, uint32_t, value_size) 14 | fsb_allocator_t table_allocator; 
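/* table_allocator backs the flat `table` array above; locks_allocator
   (below) backs the per-entry lock bytes that insert_shared_table in
   TableShared.c takes and releases around each combine */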
15 | fsb_allocator_t locks_allocator; 16 | } table_t; 17 | 18 | #endif -------------------------------------------------------------------------------- /lib/Table.c: -------------------------------------------------------------------------------- 1 | #include "Table.h" 2 | 3 | 4 | void init_table(table_t* t, uint32_t table_size, uint32_t value_size, void (*init_func)(uint32_t, void*)){ 5 | t->table_size = table_size; 6 | t->value_size = value_size; 7 | t->table_allocator = fsb_alloc(table_size*value_size+(table_size*value_size)%8, 1); 8 | t->locks_allocator = NULL; 9 | t->table = fsb_get(t->table_allocator); 10 | t->locks = NULL; 11 | t->heap_ptr = NULL; 12 | void* tmp = t->table; 13 | 14 | for(int i=0; ivalue_size; 22 | uint32_t curr_entry = key*value_size; 23 | void* value_ptr_in_table = t->table+curr_entry; 24 | 25 | (*combineFunc)(value_ptr_in_table, value); 26 | 27 | } 28 | 29 | void free_table(table_t* t){ 30 | fsb_free(t->table_allocator, t->table); 31 | } 32 | 33 | 34 | void print_table(table_t* t, void (*printFunc)(void*)){ 35 | uint32_t table_size = t->table_size; 36 | uint32_t value_size = t->value_size; 37 | print_table_entries(t->table, table_size, value_size, printFunc); 38 | } 39 | 40 | 41 | void combine_tables(table_t* t1, table_t* t2, void (*combineFunc)(void*, void*)){ 42 | uint32_t table_size = t1->table_size; 43 | uint32_t value_size = t1->value_size; 44 | void* table1 = t1->table; 45 | void* table2 = t2->table; 46 | combine_table_entries(table1, table2, table_size, value_size, combineFunc); 47 | } 48 | 49 | uint32_t d = 32; 50 | void print_entry(void* p){ 51 | int* int_p = (int*)p; 52 | printf("%d, ", *int_p); 53 | 54 | p += sizeof(int32_t); 55 | int_p = (int*)p; 56 | 57 | for(int i=0; itable_size; 71 | uint32_t value_size = local_table->value_size; 72 | barrier_wait(&barrier); 73 | 74 | 75 | if(num_tasklets==1){ 76 | //global table is empty&&on heap, and store local table on heap 77 | store_shared_table_on_heap(local_table, table_entries_mram); 78 | } 79 | else{ 80 | 81 | 82 | // a helper table for global merge 83 | fsb_allocator_t global_table_allocator; 84 | if(pid==0){ 85 | global_table_allocator = fsb_alloc(sizeof(table_t), 1); 86 | global_table_helper = (table_t*)fsb_get(global_table_allocator); 87 | init_table(global_table_helper, table_size, value_size, init_func); 88 | } 89 | barrier_wait(&barrier); 90 | // 91 | uint32_t local_len = table_size / num_tasklets; 92 | uint32_t rest = table_size % num_tasklets; 93 | uint32_t curr_id; 94 | uint32_t curr_len; 95 | uint32_t start_pos; 96 | void* global_table_ptr = global_table_helper->table; 97 | void* local_table_ptr = local_table->table; 98 | 99 | for(int i=0; itable, table_entries_mram, transfer_size); 111 | //print_shared_table(global_table_helper, print_entry); 112 | if(pid == 0){ 113 | free_table(global_table_helper); 114 | fsb_free(global_table_allocator, global_table_helper); 115 | } 116 | 117 | } 118 | 119 | 120 | } 121 | 122 | 123 | -------------------------------------------------------------------------------- /lib/Table.h: -------------------------------------------------------------------------------- 1 | #ifndef TABLE_H 2 | #define TABLE_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "TableHost.h" 9 | #include "TableShared.h" 10 | #include "StructsPIM.h" 11 | 12 | void init_table(table_t* t, uint32_t table_size, uint32_t value_size, void (*init_func)(uint32_t, void*)); 13 | void free_table(table_t* t); 14 | void insert_table(table_t* t, uint32_t key, void* value, void 
(*combineFunc)(void*, void*)); 15 | void combine_tables_lockfree(__mram_ptr void* table_entries_mram, table_t* local_table, void (*init_func)(uint32_t, void*), void (*combineFunc)(void*, void*)); 16 | void print_table(table_t* t, void (*printFunc)(void*)); 17 | #endif -------------------------------------------------------------------------------- /lib/TableHost.c: -------------------------------------------------------------------------------- 1 | #include "TableHost.h" 2 | void combine_table_entries(void* table1, void* table2, uint32_t table_size, uint32_t value_size, void (*combineFunc)(void*, void*)){ 3 | 4 | uint32_t curr_entry; 5 | 6 | for(int i=0; i 4 | #include 5 | #include "Structs.h" 6 | void combine_table_entries(void* table1, void* table2, uint32_t table_size, uint32_t value_size, void (*combineFunc)(void*, void*)); 7 | void print_table_entries(void* table, uint32_t table_size, uint32_t value_size, void (*printFunc)(void*)); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /lib/TableShared.c: -------------------------------------------------------------------------------- 1 | #include "TableShared.h" 2 | 3 | // table in this file must be the same table seen by all threads 4 | 5 | BARRIER_INIT(barrier_shared, NR_TASKLETS); 6 | MUTEX_INIT(mutex_shared); 7 | 8 | // multithreaded load and store, len must be 8 bytes aligned, can operate on more than 2kB data 9 | uint32_t curr_block_shared; 10 | void load_arr_aligned(void* arr, __mram_ptr void* heap_ptr, uint32_t len){ 11 | uint32_t transfer_each_time = 256; //must be multiple of 8 12 | uint32_t transfer_size = len; 13 | curr_block_shared = 0; 14 | 15 | uint32_t curr_offset; 16 | uint32_t curr_block_local; 17 | uint32_t last_transfer_size = (transfer_size%transfer_each_time==0)?transfer_each_time:transfer_size/transfer_size%transfer_each_time; 18 | uint32_t num_blocks = (transfer_size%transfer_each_time==0)?transfer_size/transfer_each_time:transfer_size/transfer_each_time+1; 19 | 20 | 21 | barrier_wait(&barrier_shared); 22 | while (curr_block_sharedtable_size = table_size; 79 | t->value_size = value_size; 80 | t->table_allocator = fsb_alloc(table_size*value_size+(table_size*value_size)%8, 1); 81 | t->locks_allocator = fsb_alloc(table_size*sizeof(uint8_t), 1); 82 | t->table = fsb_get(t->table_allocator); 83 | t->locks = fsb_get(t->locks_allocator); 84 | t->heap_ptr = NULL; 85 | void* tmp = t->table; 86 | 87 | for(int i=0; ilocks[i])); 94 | } 95 | } 96 | 97 | barrier_wait(&barrier_shared); 98 | } 99 | 100 | void insert_shared_table(table_t* t, uint32_t key, void* value, void (*combineFunc)(void*, void*)){ 101 | uint32_t value_size = t->value_size; 102 | uint32_t curr_entry = key*value_size; 103 | void* value_ptr_in_table = (t->table)+curr_entry; 104 | mutex_lock(&(t->locks[key])); 105 | (*combineFunc)(value_ptr_in_table, value); 106 | mutex_unlock(&(t->locks[key])); 107 | } 108 | 109 | void print_shared_table(table_t* t, void (*printFunc)(void*)){ 110 | uint32_t pid = me(); 111 | if(pid==0){ 112 | uint32_t table_size = t->table_size; 113 | uint32_t value_size = t->value_size; 114 | print_table_entries(t->table, table_size, value_size, printFunc); 115 | } 116 | barrier_wait(&barrier_shared); 117 | } 118 | 119 | void free_shared_table(table_t* t){ 120 | fsb_free(t->table_allocator, t->table); 121 | fsb_free(t->locks_allocator, t->locks); 122 | } 123 | 124 | uint32_t store_shared_table_on_heap(table_t* table, __mram_ptr void* heap_ptr){ 125 | uint32_t table_size = table->table_size; 126 
| uint32_t value_size = table->value_size; 127 | uint32_t transfer_size = table_size*value_size+(table_size*value_size)%8; 128 | store_arr_aligned(table->table, heap_ptr, transfer_size); 129 | return (uint32_t)(heap_ptr-DPU_MRAM_HEAP_POINTER+transfer_size); 130 | } 131 | 132 | void load_shared_table_from_heap(table_t* table, __mram_ptr void* heap_ptr){ 133 | uint32_t table_size = table->table_size; 134 | uint32_t value_size = table->value_size; 135 | uint32_t transfer_size = table_size*value_size+(table_size*value_size)%8; 136 | load_arr_aligned(table->table, heap_ptr, transfer_size); 137 | } 138 | -------------------------------------------------------------------------------- /lib/TableShared.h: -------------------------------------------------------------------------------- 1 | #ifndef TABLESHARED_H 2 | #define TABLESHARED_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "mutex.h" 11 | #include "TableHost.h" 12 | #include "StructsPIM.h" 13 | 14 | void init_shared_table(table_t* t, uint32_t table_size, uint32_t value_size, void (*init_func)(uint32_t, void*)); 15 | void free_shared_table(table_t* t); 16 | void insert_shared_table(table_t* t, uint32_t key, void* value, void (*combineFunc)(void*, void*)); 17 | void print_shared_table(table_t* t, void (*printFunc)(void*)); 18 | 19 | uint32_t store_shared_table_on_heap(table_t* table, __mram_ptr void* heap_ptr); 20 | void load_shared_table_from_heap(table_t* table, __mram_ptr void* heap_ptr); 21 | void load_arr_aligned(void* arr, __mram_ptr void* heap_ptr, uint32_t len); 22 | void store_arr_aligned(void* arr, __mram_ptr void* heap_ptr, uint32_t len); 23 | #endif -------------------------------------------------------------------------------- /lib/UpmemCustom.c: -------------------------------------------------------------------------------- 1 | #include "UpmemCustom.h" 2 | // allocate aligned arr and zero out rest 3 | uint32_t calculate_pad_len(uint64_t len, uint32_t type_size, uint32_t num_dpus){ 4 | uint64_t len_in_byte = (uint64_t)len*type_size; 5 | 6 | // calculate lcm of typesize and 8, each dpu gets %8 7 | uint32_t lcm = (type_size > 8) ? 
type_size : 8; 8 | 9 | while (1) { 10 | if (lcm % type_size == 0 && lcm % 8 == 0) { 11 | break; 12 | } 13 | ++lcm; 14 | } 15 | 16 | // divisible by typesize 17 | uint64_t padded_len = len_in_byte; 18 | 19 | while (1) { 20 | if (padded_len % num_dpus == 0 && (padded_len/num_dpus) % lcm == 0) { 21 | break; 22 | } 23 | ++padded_len; 24 | } 25 | 26 | uint64_t pad_len = padded_len - len_in_byte; 27 | return (uint32_t)pad_len; 28 | } 29 | 30 | void* malloc_split_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus){ 31 | uint32_t pad_len = calculate_pad_len(len , type_size, num_dpus); 32 | uint64_t len_in_byte = (uint64_t)len * type_size; 33 | 34 | void* ptr = malloc(len_in_byte+pad_len); 35 | 36 | for(int i=0; iinput_args[i].input_len?max:input_args[i].input_len; 82 | } 83 | printf("assigning %d elements at max for each dpu\n", max); 84 | } 85 | 86 | 87 | 88 | 89 | uint32_t host_split_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t num_dpus, uint32_t curr_offset){ 90 | //elements must be 8 bytes aligned and divisible by num_dpus, pad not aglined, use malloc_aligned 91 | uint32_t pad_len = calculate_pad_len(len , type_size, num_dpus); 92 | 93 | //split elements to dpu and rest remains on host 94 | uint32_t len_per_dpu_in_byte = ((uint64_t)len*type_size+pad_len)/num_dpus; 95 | 96 | //transfer to dpu 97 | int i; 98 | struct dpu_set_t dpu; 99 | 100 | 101 | DPU_FOREACH(set, dpu, i) { 102 | DPU_ASSERT(dpu_prepare_xfer(dpu, &((char*)elements)[i * len_per_dpu_in_byte])); 103 | } 104 | 105 | // offset 106 | 107 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, len_per_dpu_in_byte, DPU_XFER_DEFAULT)); 108 | 109 | return curr_offset+len_per_dpu_in_byte; 110 | 111 | } 112 | 113 | uint32_t host_broadcast_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t curr_offset){ 114 | uint32_t broadcast_size = (len*type_size)+(len*type_size)%8; 115 | DPU_ASSERT(dpu_broadcast_to(set, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, elements, broadcast_size, DPU_XFER_DEFAULT)); 116 | return curr_offset+broadcast_size; 117 | } 118 | 119 | void* malloc_gather_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus){ 120 | uint32_t len_in_byte = len * type_size; 121 | uint32_t pad_len = len_in_byte%8; 122 | 123 | void* ptr = calloc((len_in_byte+pad_len)*num_dpus); 124 | 125 | return ptr; 126 | } 127 | 128 | void gather_tables_to_host(struct dpu_set_t set, void* my_table, uint32_t len, uint32_t type_size, uint32_t curr_offset_on_mram, uint32_t num_dpus, void (*init_func)(uint32_t, void*) ,void (*combineFunc)(void*, void*)){ 129 | void* tables = malloc_gather_aligned(len, type_size, num_dpus); 130 | uint32_t aligned_table_size = (len*type_size)+(len*type_size)%8; 131 | 132 | for(int i=0; ilen?max_len:len; 197 | total_size += (uint64_t)len*type_size; 198 | } 199 | aligned_max_len = (max_len*type_size)+(max_len*type_size)%8; 200 | 201 | 202 | void* tmp_buffer = malloc((uint64_t)num_dpus*aligned_max_len); 203 | void* res = malloc(total_size); 204 | printf("max len per dpu %u\n", aligned_max_len); 205 | printf("transfer buffer size %u\n", num_dpus*aligned_max_len); 206 | int i; 207 | struct dpu_set_t dpu; 208 | DPU_FOREACH(set, dpu, i) { 209 | DPU_ASSERT(dpu_prepare_xfer(dpu, tmp_buffer+i*aligned_max_len)); 210 | } 211 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, curr_offset_on_mram, aligned_max_len, DPU_XFER_DEFAULT)); 212 | 213 | 214 | void* buff_ptr = (void*)tmp_buffer; 215 
| void* ptr_in_res = (void*)res; 216 | uint32_t curr_size; 217 | 218 | //printf("\n-----\n%d\n-----\n", *(int*)buff_ptr); 219 | for(int i=0; i 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Structs.h" 10 | #include "TableHost.h" 11 | #include "Common.h" 12 | 13 | void* malloc_split_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus); 14 | void* malloc_broadcast_aligned(uint32_t len, uint32_t type_size); 15 | uint32_t host_split_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t num_dpus, uint32_t curr_offset); 16 | uint32_t host_broadcast_to_dpu(struct dpu_set_t set, void* elements, uint32_t len, uint32_t type_size, uint32_t curr_offset); 17 | void prepare_input_len_and_parse_args(struct dpu_set_t set, dpu_arguments_t* input_args, uint32_t input_len, uint32_t input_type_size, uint32_t num_dpus); 18 | void* malloc_gather_aligned(uint32_t len, uint32_t type_size, uint32_t num_dpus); 19 | void gather_tables_to_host(struct dpu_set_t set, void* my_table, uint32_t len, uint32_t type_size, uint32_t curr_offset_on_mram, uint32_t num_dpus, void (*init_func)(uint32_t, void*) ,void (*combineFunc)(void*, void*)); 20 | void* gather_to_host(struct dpu_set_t set, uint32_t* lens, uint32_t type_size, uint32_t curr_offset_on_mram, uint32_t num_dpus); 21 | #endif -------------------------------------------------------------------------------- /lib/communication/CommHelper.c: -------------------------------------------------------------------------------- 1 | #include "CommHelper.h" 2 | uint32_t calculate_pad_len(uint32_t len, uint32_t type_size, uint32_t num_dpus){ 3 | uint64_t len_in_byte = (uint64_t)len*type_size; 4 | 5 | // calculate lcm of typesize and 8, each dpu gets %8 6 | uint32_t lcm = (type_size > 8) ? 
type_size : 8;
7 | 
8 | while (1) {
9 | if (lcm % type_size == 0 && lcm % 8 == 0) {
10 | break;
11 | }
12 | ++lcm;
13 | }
14 | 
15 | // grow the padding until the total is divisible by num_dpus and each DPU's share by the lcm
16 | uint64_t padded_len = len_in_byte;
17 | 
18 | while (1) {
19 | if (padded_len % num_dpus == 0 && (padded_len/num_dpus) % lcm == 0) {
20 | break;
21 | }
22 | ++padded_len;
23 | }
24 | 
25 | uint64_t pad_len = padded_len - len_in_byte;
26 | return (uint32_t)pad_len;
27 | }
28 | 
-------------------------------------------------------------------------------- /lib/communication/CommHelper.h: --------------------------------------------------------------------------------
1 | #ifndef COMMHELPER_H
2 | #define COMMHELPER_H
3 | #include
4 | #include
5 | #include
6 | uint32_t calculate_pad_len(uint32_t len, uint32_t type_size, uint32_t num_dpus);
7 | #endif
-------------------------------------------------------------------------------- /lib/communication/CommOps.c: --------------------------------------------------------------------------------
1 | #include "CommOps.h"
2 | 
3 | /*
4 | see description of the functions in CommOps.h
5 | */
6 | void* malloc_scatter_aligned(uint64_t len, uint32_t type_size, simplepim_management_t* table_management){
7 | uint32_t num_dpus = table_management->num_dpus;
8 | uint32_t pad_len = calculate_pad_len(len, type_size, num_dpus);
9 | uint64_t len_in_byte = len * (uint64_t)type_size;
10 | 
11 | void* ptr = calloc(1, len_in_byte+pad_len);
12 | 
13 | return ptr;
14 | }
15 | 
16 | void* malloc_reduce_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management){
17 | uint32_t num_dpus = table_management->num_dpus;
18 | uint64_t len_in_byte = len * type_size;
19 | uint64_t pad_len = 8-len_in_byte%8;
20 | 
21 | void* ptr = calloc(num_dpus, (len_in_byte+pad_len));
22 | 
23 | return ptr;
24 | }
25 | 
26 | void* malloc_broadcast_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management){
27 | uint64_t len_in_byte = (uint64_t)len * type_size;
28 | uint64_t pad_len = 8-len_in_byte%8;
29 | 
30 | void* ptr = calloc(1, len_in_byte+pad_len);
31 | 
32 | return ptr;
33 | }
34 | 
35 | void simplepim_scatter(char* const table_id, void* elements, uint64_t len, uint32_t type_size, simplepim_management_t* table_management){
36 | uint32_t curr_offset = table_management->free_space_start_pos;
37 | if(contains_table(table_id, table_management)){
38 | curr_offset = lookup_table(table_id, table_management) -> start;
39 | }
40 | // elements must be 8-byte aligned and evenly divisible among the DPUs; use malloc_scatter_aligned to obtain a correctly padded buffer (note: the offset-reuse branch above has no effect, because an already registered table_id is rejected here)
41 | if(contains_table(table_id, table_management)){
42 | printf("%s", table_id);
43 | printf(" is already contained in the table management unit, invalid scatter\n");
44 | return;
45 | }
46 | uint32_t num_dpus = table_management->num_dpus;
47 | uint32_t pad_len = calculate_pad_len(len, type_size, num_dpus);
48 | 
49 | // split elements across the DPUs; the rest remains on the host
50 | uint32_t len_per_dpu_in_byte = (len*type_size+pad_len)/num_dpus;
51 | 
52 | // transfer to dpu
53 | int i;
54 | struct dpu_set_t dpu;
55 | struct dpu_set_t set = table_management->set;
56 | 
57 | 
58 | DPU_FOREACH(set, dpu, i) {
59 | DPU_ASSERT(dpu_prepare_xfer(dpu, &((char*)elements)[i * len_per_dpu_in_byte]));
60 | }
61 | 
62 | // offset
63 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, len_per_dpu_in_byte, DPU_XFER_DEFAULT));
64 | 
65 | // calculate lens per dpu
66 | uint32_t pad_len_in_elem = pad_len/type_size;
67 | uint32_t num_transfered_elem_per_dpu = (len + 
pad_len_in_elem)/num_dpus; 68 | uint32_t* lens = malloc(sizeof(int32_t)*num_dpus); 69 | for(int i=0; iname = malloc(strlen(table_id)+1); 85 | memcpy(t->name, table_id, strlen(table_id)+1); 86 | t->start = curr_offset; 87 | t->end = curr_offset+len_per_dpu_in_byte; 88 | t->len = len; 89 | t->table_type_size = type_size; 90 | t->lens_each_dpu = lens; 91 | t->is_virtual_zipped = 0; 92 | 93 | add_table(t, table_management); 94 | table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ? table_management->free_space_start_pos : t->end; 95 | } 96 | 97 | void* simplepim_gather(char* const table_id, simplepim_management_t* table_management){ 98 | if(!contains_table(table_id, table_management)){ 99 | printf(table_id); 100 | printf(" is not contained in table management unit, invalid scatter\n"); 101 | return NULL; 102 | } 103 | 104 | uint32_t num_dpus = table_management->num_dpus; 105 | table_host_t* t = lookup_table(table_id, table_management); 106 | uint32_t* lens = t->lens_each_dpu; 107 | uint32_t type_size = t->table_type_size; 108 | uint32_t start_addr = t->start; 109 | uint32_t max_len = 0; 110 | for(int i=0; ilens[i]?max_len:lens[i]; 112 | } 113 | 114 | uint64_t aligned_max_len = (max_len*type_size)+(8-(max_len*type_size)%8); 115 | uint64_t buff_size = aligned_max_len*num_dpus; 116 | uint64_t total_size = t->len*t->table_type_size; 117 | struct dpu_set_t set = table_management->set; 118 | void* tmp_buffer = malloc((uint64_t)num_dpus*aligned_max_len+2048); 119 | void* res = malloc(total_size + 2048); 120 | 121 | int i; 122 | struct dpu_set_t dpu; 123 | DPU_FOREACH(set, dpu, i) { 124 | DPU_ASSERT(dpu_prepare_xfer(dpu, tmp_buffer+i*aligned_max_len)); 125 | } 126 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, start_addr, aligned_max_len, DPU_XFER_DEFAULT)); 127 | 128 | void* buff_ptr = (void*)tmp_buffer; 129 | void* ptr_in_res = (void*)res; 130 | uint32_t curr_size; 131 | 132 | 133 | for(int j=0; jfree_space_start_pos; 148 | if(contains_table(table_id, table_management)){ 149 | curr_offset = lookup_table(table_id, table_management) -> start; 150 | } 151 | 152 | uint64_t broadcast_size = (len*type_size)+8-(len*type_size)%8; 153 | uint32_t num_dpus = table_management->num_dpus; 154 | struct dpu_set_t set = table_management->set; 155 | DPU_ASSERT(dpu_broadcast_to(set, DPU_MRAM_HEAP_POINTER_NAME, curr_offset, elements, broadcast_size, DPU_XFER_DEFAULT)); 156 | // table information to management unit 157 | table_host_t* t = malloc(sizeof(table_host_t)); 158 | t->name = malloc(strlen(table_id)+1); 159 | memcpy(t->name, table_id, strlen(table_id)+1); 160 | t->start = curr_offset; 161 | t->end = curr_offset+broadcast_size; 162 | t->len = len; 163 | t->table_type_size = type_size; 164 | 165 | uint32_t* lens = malloc(sizeof(int32_t)*num_dpus); 166 | for(int i=0; ilens_each_dpu = lens; 171 | t->is_virtual_zipped = 0; 172 | add_table(t, table_management); 173 | table_management->free_space_start_pos = t->end > table_management->free_space_start_pos ? 
t->end : table_management->free_space_start_pos;
174 | }
175 | 
176 | void simplepim_allgather(char* const table_id, char* const new_table_id, simplepim_management_t* table_management){
177 | if(!contains_table(table_id, table_management)){
178 | printf("%s", table_id);
179 | printf(" is not contained in the table management unit, invalid allgather\n");
180 | return;
181 | }
182 | 
183 | uint32_t num_dpus = table_management->num_dpus;
184 | table_host_t* t = lookup_table(table_id, table_management);
185 | uint32_t* lens = t->lens_each_dpu;
186 | uint32_t type_size = t->table_type_size;
187 | 
188 | uint32_t total_len = 0;
189 | 
190 | for(int i=0; i
205 | void simplepim_allreduce(char* const table_id, handle_t* binary_handle, simplepim_management_t* table_management){
206 | if(binary_handle->func_type == 1){
207 | if(!contains_table(table_id, table_management)){
208 | printf("source table ");
209 | printf("%s", table_id);
210 | printf(" is not contained in the current management unit\n");
211 | return;
212 | }
213 | 
214 | uint32_t num_dpus = table_management->num_dpus;
215 | table_host_t* t = lookup_table(table_id, table_management);
216 | uint32_t* lens = t->lens_each_dpu;
217 | uint32_t type_size = t->table_type_size;
218 | uint32_t len = lens[0];
219 | 
220 | for(int i=0; i
227 | void* lib = dlopen(binary_handle->so_bin_location, RTLD_NOW);
228 | void (*combine_func)(void*, void*) = dlsym(lib, "combine_func");
229 | 
230 | void* res = simplepim_gather(table_id, table_management);
231 | for(int i=1; i
242 | uint32_t outputs_pos = lookup_table(table_id, table_management) -> start;
243 | uint64_t broadcast_size = (len*type_size)+8-(len*type_size)%8;
244 | struct dpu_set_t set = table_management->set;
245 | DPU_ASSERT(dpu_broadcast_to(set, DPU_MRAM_HEAP_POINTER_NAME, outputs_pos, bc_buffer, broadcast_size, DPU_XFER_DEFAULT));
246 | 
247 | free(bc_buffer);
248 | 
249 | }
250 | else{
251 | printf("ERROR: compiled binary ");
252 | printf("%s", binary_handle->bin_location);
253 | printf(" does not contain general reduction functions\n");
254 | }
255 | 
256 | }
257 | 
258 | 
-------------------------------------------------------------------------------- /lib/communication/CommOps.h: --------------------------------------------------------------------------------
1 | #ifndef COMMOPS_H
2 | #define COMMOPS_H
3 | #include "CommHelper.h"
4 | #include "../management/Management.h"
5 | #include "../processing/ProcessingHelperHost.h"
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | 
12 | /*
13 | malloc_*_aligned allocate memory on the host to satisfy the alignment requirements of UPMEM; the returned pointer
14 | can be treated like normal heap memory obtained by calling "malloc"
15 | 
16 | simplepim_* (for example simplepim_scatter) implement the communication operators as described in the paper
17 | */
18 | 
19 | 
20 | void* malloc_scatter_aligned(uint64_t len, uint32_t type_size, simplepim_management_t* table_management);
21 | void* malloc_reduce_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management);
22 | void* malloc_broadcast_aligned(uint32_t len, uint32_t type_size, simplepim_management_t* table_management);
23 | void simplepim_scatter(char* const table_id, void* elements, uint64_t len, uint32_t type_size, simplepim_management_t* table_management);
24 | void* simplepim_gather(char* const table_id, simplepim_management_t* table_management);
25 | void simplepim_broadcast(char* const table_id, void* elements, uint64_t len, uint32_t type_size, simplepim_management_t* table_management);
26 | void simplepim_allgather(char* const table_id, char* const new_table_id, simplepim_management_t* table_management);
27 | void simplepim_allreduce(char* const table_id, handle_t* binary_handle, simplepim_management_t* table_management);
28 | #endif
29 | 
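Taken together, the communication operators are used like this from the host. The following is a minimal sketch, assuming 64 allocated DPUs, `int32_t` elements, and the relative include paths of a benchmark directory (all three are illustrative, not fixed by the framework):

```
#include <stdint.h>
#include <stdlib.h>
#include "../../lib/management/Management.h"
#include "../../lib/communication/CommOps.h"

int main(){
    const uint64_t nr_elements = 1 << 20;                       // illustrative size
    simplepim_management_t* mgmt = table_management_init(64);   // 64 DPUs, illustrative

    // malloc_scatter_aligned pads the buffer so each DPU's share meets
    // UPMEM's 8-byte transfer alignment
    int32_t* A = malloc_scatter_aligned(nr_elements, sizeof(int32_t), mgmt);
    for(uint64_t i = 0; i < nr_elements; i++)
        A[i] = (int32_t)i;

    // register the array under id "t1" and split it across the DPUs,
    // then copy it back to the host
    simplepim_scatter("t1", A, nr_elements, sizeof(int32_t), mgmt);
    int32_t* B = simplepim_gather("t1", mgmt);

    free(A);
    free(B);
    return 0;
}
```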
-------------------------------------------------------------------------------- /lib/management/Management.c: --------------------------------------------------------------------------------
1 | #include "Management.h"
2 | 
3 | /*
4 | see description of the functions in Management.h
5 | */
6 | 
7 | simplepim_management_t* table_management_init(uint32_t num_dpus){
8 | 
9 | struct dpu_set_t set;
10 | DPU_ASSERT(dpu_alloc(num_dpus, NULL, &set));
11 | 
12 | small_table_init(set);
13 | simplepim_management_t* management = malloc(sizeof(simplepim_management_t));
14 | management->set = set;
15 | management->num_dpus = num_dpus;
16 | management->num_tables = 0;
17 | management->curr_space = 16;
18 | management->tables = malloc(sizeof(table_host_t*)*16);
19 | management->zip_args = malloc(sizeof(zip_arguments_t)*num_dpus);
20 | management->map_args = malloc(sizeof(map_arguments_t)*num_dpus);
21 | management->red_args = malloc(sizeof(gen_red_arguments_t)*num_dpus);
22 | management->free_space_start_pos = 0;
23 | return management;
24 | }
25 | 
26 | void add_table(table_host_t* table, simplepim_management_t* management){
27 | uint32_t num_tables = management->num_tables;
28 | for(int i=0; i<num_tables; i++){
29 | if(strcmp(table->name, management->tables[i]->name)==0){
30 | free(management->tables[i]->lens_each_dpu);
31 | free(management->tables[i]);
32 | management->tables[i] = table;
33 | return;
34 | }
35 | }
36 | 
37 | uint32_t curr_space = management->curr_space;
38 | if(curr_space == num_tables){
39 | management->tables = realloc(management->tables, (curr_space+16)*(sizeof(table_host_t*)));
40 | management->tables[num_tables] = table;
41 | management->num_tables++;
42 | management->curr_space+=16;
43 | }
44 | else{
45 | management->tables[num_tables] = table;
46 | management->num_tables++;
47 | }
48 | }
49 | 
50 | 
51 | uint32_t contains_table(const char* name, simplepim_management_t* management){
52 | uint32_t num_tables = management->num_tables;
53 | for(int i=0; i<num_tables; i++){
54 | if(strcmp(name, management->tables[i]->name)==0){
55 | return 1;
56 | }
57 | }
58 | 
59 | return 0;
60 | }
61 | 
62 | void free_table(const char* name, simplepim_management_t* management){
63 | if(!contains_table(name, management)){
64 | return;
65 | }
66 | lookup_table(name, management)->name = ""; // unregisters the array by blanking its name; the MRAM region itself is not reclaimed
67 | }
68 | 
69 | table_host_t* lookup_table(const char* name, simplepim_management_t* management){
70 | uint32_t num_tables = management->num_tables;
71 | for(int i=0; i<num_tables; i++){
72 | if(strcmp(name, management->tables[i]->name)==0){
73 | return management->tables[i];
74 | }
75 | }
76 | 
77 | printf("table ");
78 | printf("%s", name);
79 | printf(" is not contained in the current management unit\n");
80 | return NULL;
81 | }
82 | 
83 | uint32_t max_len_dpu(uint32_t num_dpus, table_host_t* table){
84 | uint32_t max_len = 0;
85 | for(int i=0; i<num_dpus; i++){
86 | max_len = table->lens_each_dpu[i]>max_len?table->lens_each_dpu[i]:max_len;
87 | }
88 | return max_len;
89 | 
90 | }
91 | 
-------------------------------------------------------------------------------- /lib/management/Management.h: --------------------------------------------------------------------------------
1 | #ifndef MANAGEMENT_H
2 | #define MANAGEMENT_H
3 | 
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | 
10 | #include "SmallTableInit.h"
11 | #include "../processing/gen_red/GenRedArgs.h"
12 | #include "../processing/map/MapArgs.h"
13 | #include "../processing/zip/ZipArgs.h"
14 | 
15 | // table_host_t holds the framework's bookkeeping for one array; end users do not need to care about its details
16 | typedef struct {
17 | char* name;
18 | uint32_t start;
19 | uint32_t end;
20 | uint64_t len;
21 | uint32_t* lens_each_dpu;
22 | uint32_t table_type_size;
23 | 
24 | // fields handling virtual zip
25 | uint32_t is_virtual_zipped;
26 | uint32_t start1;
27 | uint32_t start2;
28 | uint32_t end1;
29 | uint32_t end2;
30 | uint32_t type1;
31 | uint32_t type2;
32 | } table_host_t;
33 | 
34 | // simplepim_management_t holds information about all registered arrays and about the PIM hardware, such as the number of PIM cores (see the paper for details)
35 | typedef struct {
36 | uint32_t curr_space;
37 | uint32_t num_tables;
38 | table_host_t** tables;
39 | struct dpu_set_t set;
40 | uint32_t num_dpus;
41 | zip_arguments_t* zip_args;
42 | map_arguments_t* map_args;
43 | gen_red_arguments_t* red_args;
44 | // memory management could use more sophisticated logic; currently the free-space counter simply grows with each newly registered array
45 | uint32_t free_space_start_pos;
46 | } simplepim_management_t;
47 | 
48 | 
49 | /*
50 | table_management_init initialises a management interface and can be called by end users
51 | add_table, contains_table, lookup_table, and free_table retrieve or update the information the management unit keeps about an array under its id "name" (see the paper for details)
52 | max_len_dpu is a helper function used by the framework
53 | */
54 | simplepim_management_t* table_management_init(uint32_t num_dpus);
55 | void add_table(table_host_t* table, simplepim_management_t* management);
56 | uint32_t contains_table(const char* name, simplepim_management_t* management);
57 | table_host_t* lookup_table(const char* name, simplepim_management_t* management);
58 | void free_table(const char* name, simplepim_management_t* management);
59 | uint32_t max_len_dpu(uint32_t num_dpus, table_host_t* table);
60 | 
61 | #endif
62 | 
-------------------------------------------------------------------------------- /lib/management/SmallTableInit.c: --------------------------------------------------------------------------------
1 | #include "SmallTableInit.h"
2 | #define DPU_BINARY "bin/dpu_init_binary"
3 | 
4 | void small_table_init(struct dpu_set_t set){
5 | system("dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=1 -o bin/dpu_init_binary ../../lib/management/SmallTableInit_dpu.c"); // note: the relative paths assume the host binary runs from a benchmark directory such as benchmarks/va/
6 | DPU_ASSERT(dpu_load(set, DPU_BINARY, NULL));
7 | DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
8 | }
-------------------------------------------------------------------------------- /lib/management/SmallTableInit.h: --------------------------------------------------------------------------------
1 | #ifndef SMALLTABLEINIT_H
2 | #define SMALLTABLEINIT_H
3 | #include
4 | #include
5 | #include
6 | #include
7 | 
8 | /*
9 | the UPMEM DPUs need to run some setup code before use;
10 | it is executed once table_management_init is called
11 | */
12 | 
13 | void small_table_init(struct dpu_set_t set);
14 | 
15 | #endif
-------------------------------------------------------------------------------- /lib/management/SmallTableInit_dpu.c: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | 
6 | #include
7 | int main() {
8 | mem_reset(); // Reset the heap
9 | //printf("\n");
10 | return 0;
11 | }
-------------------------------------------------------------------------------- /lib/processing/ProcessingHelper.c: --------------------------------------------------------------------------------
1 | #include "ProcessingHelper.h"
2 | uint64_t copy_block_size_fun(uint32_t type_size1, uint32_t type_size2, uint32_t num_elem){
3 | // packs (block size in elements, log2 of block size) into one uint64_t
4 | uint64_t res=0;
5 | uint32_t res_arr[2];
6 | uint32_t max_type_size = 
type_size1>type_size2?type_size1:type_size2; 7 | if(type_size1%8 == 0 && type_size2%8 == 0 && (num_elem <= NR_TASKLETS || max_type_size > 512)){ 8 | res_arr[0] = 1; 9 | res_arr[1] = 0; 10 | } 11 | else if(type_size1%4 == 0 && type_size2%4 == 0 &&(num_elem <= 2*NR_TASKLETS || max_type_size > 256)){ 12 | res_arr[0] = 2; 13 | res_arr[1] = 1; 14 | } 15 | else if(type_size1%2 == 0 && type_size2%2 == 0 && max_type_size > 128){ 16 | res_arr[0] = 4; 17 | res_arr[1] = 2; 18 | } 19 | else if(max_type_size < 16){ 20 | res_arr[0] = 256; 21 | res_arr[1] = 8; 22 | } 23 | else if(max_type_size < 32){ 24 | res_arr[0] = 128; 25 | res_arr[1] = 7; 26 | } 27 | else{ 28 | res_arr[0] = 16; 29 | res_arr[1] = 4; 30 | } 31 | 32 | res = *(uint64_t*)res_arr; 33 | return res; 34 | 35 | } 36 | 37 | -------------------------------------------------------------------------------- /lib/processing/ProcessingHelper.h: -------------------------------------------------------------------------------- 1 | #ifndef PROCESSINGHELPER_H 2 | #define PROCESSINGHELPER_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include "mutex.h" 11 | 12 | uint64_t copy_block_size_fun(uint32_t type_size1, uint32_t type_size2, uint32_t num_elem); 13 | 14 | #endif -------------------------------------------------------------------------------- /lib/processing/ProcessingHelperHost.c: -------------------------------------------------------------------------------- 1 | #include "ProcessingHelperHost.h" 2 | handle_t* create_handle(const char* func_pathname, uint32_t func_type){ 3 | 4 | handle_t* handle = malloc(sizeof(handle_t)); 5 | handle->func_type = func_type; 6 | handle->bin_location = malloc(2048); 7 | char func_bodyname[2048]; 8 | strcpy(func_bodyname, func_pathname); 9 | for(int i=0; i<2048; i++){ 10 | if(func_bodyname[i] == '.'){ 11 | func_bodyname[i] = '_'; 12 | } 13 | } 14 | 15 | if(func_type == 0){ 16 | char compile_cmd[2048] = "dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=12 -I./ -D__mapfunc_filename__=\"<"; 17 | strcat(compile_cmd, func_pathname); 18 | strcat(compile_cmd, "/map.h>\" -o bin/dpu_map_"); 19 | strcat(compile_cmd, func_bodyname); 20 | strcat(compile_cmd, " ../../lib/processing/map/map_dpu.c ../../lib/processing/ProcessingHelper.c"); 21 | int succ = system(compile_cmd); 22 | 23 | char bin_location[2048] = "bin/dpu_map_"; 24 | strcat(bin_location, func_bodyname); 25 | strcpy(handle->bin_location, bin_location); 26 | } 27 | else if(func_type == 1){ 28 | char compile_cmd[2048] = "dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=12 -I./ -D__mapredfunc_pathname__=\"<"; 29 | strcat(compile_cmd, func_pathname); 30 | strcat(compile_cmd, "/map_to_val_func.h>\" -D__combinefunc_pathname__=\"<"); 31 | strcat(compile_cmd, func_pathname); 32 | strcat(compile_cmd, "/init_combine_func.h>\" -o bin/dpu_genred_"); 33 | strcat(compile_cmd, func_bodyname); 34 | strcat(compile_cmd, " ../../lib/processing/gen_red/gen_red_dpu.c ../../lib/processing/ProcessingHelper.c ../../lib/TableHost.c ../../lib/Table.c ../../lib/TableShared.c"); 35 | int succ = system(compile_cmd); 36 | 37 | char bin_location[2048] = "bin/dpu_genred_"; 38 | strcat(bin_location, func_bodyname); 39 | strcpy(handle->bin_location, bin_location); 40 | 41 | 42 | // generate .o file for dynamic linking 43 | char h_fname[2048] = ""; 44 | strcat(h_fname, func_pathname); 45 | strcat(h_fname, "/init_combine_func.h"); 46 | 47 | char c_fname[2048] = ""; 48 | strcat(c_fname, func_pathname); 49 | strcat(c_fname, "/init_combine_func.c"); 50 | 51 | char 
o_fname[2048] = ""; 52 | strcat(o_fname, func_pathname); 53 | strcat(o_fname, "_init_combine_func.o"); 54 | 55 | char so_fname[2048]; 56 | strcpy(so_fname, o_fname); 57 | so_fname[strlen(so_fname)-1] = '\0'; 58 | strcat(so_fname, "so"); 59 | 60 | char cp_cmd[2048] = "cp "; 61 | strcat(cp_cmd, h_fname); 62 | strcat(cp_cmd, " "); 63 | strcat(cp_cmd, c_fname); 64 | succ = system(cp_cmd); 65 | 66 | char compile_cmd1[2048] = "gcc -c -fPIC -o"; 67 | strcat(compile_cmd1, " bin/"); 68 | strcat(compile_cmd1, o_fname); 69 | strcat(compile_cmd1, " "); 70 | strcat(compile_cmd1, c_fname); 71 | succ = system(compile_cmd1); 72 | 73 | 74 | char compile_cmd2[2048] = "gcc -shared -o bin/"; 75 | 76 | strcat(compile_cmd2, so_fname); 77 | strcat(compile_cmd2, " bin/"); 78 | strcat(compile_cmd2, o_fname); 79 | succ = system(compile_cmd2); 80 | 81 | 82 | char compile_cmd3[2048] = "rm "; 83 | strcat(compile_cmd3, c_fname); 84 | succ = system(compile_cmd3); 85 | 86 | handle->so_bin_location = malloc(2048); 87 | char so_bin_location[2048] = "bin/"; 88 | strcat(so_bin_location, so_fname); 89 | strcpy(handle->so_bin_location, so_bin_location); 90 | 91 | 92 | 93 | } 94 | else if(func_type == 2){ 95 | char compile_cmd[2048] = "dpu-upmem-dpurte-clang -O2 -DNR_TASKLETS=12 -o bin/dpu_zip"; 96 | strcat(compile_cmd, " ../../lib/processing/zip/zip_dpu.c ../../lib/processing/zip/ZipProcessing.c ../../lib/processing/ProcessingHelper.c"); 97 | int succ = system(compile_cmd); 98 | 99 | char bin_location[2048] = "bin/dpu_zip"; 100 | strcpy(handle->bin_location, bin_location); 101 | 102 | } 103 | else{ 104 | printf("function handle not properly compiled!!!"); 105 | return NULL; 106 | } 107 | 108 | return handle; 109 | 110 | } 111 | -------------------------------------------------------------------------------- /lib/processing/ProcessingHelperHost.h: -------------------------------------------------------------------------------- 1 | #ifndef PROCESSINGHELPERHOST_H 2 | #define PROCESSINGHELPERHOST_H 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAP 0 10 | #define REDUCE 1 11 | #define ZIP 2 12 | 13 | // handle_t contains information of where the handle's binary is located 14 | typedef struct { 15 | char* bin_location; 16 | char* so_bin_location; 17 | uint32_t func_type; 18 | } handle_t; 19 | 20 | 21 | /* 22 | create_handle creates a handle that can be understood by communcation and processing operators (see the paper for details) 23 | */ 24 | handle_t* create_handle(const char* func_fname, uint32_t func_type); 25 | 26 | #endif -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRed.c: -------------------------------------------------------------------------------- 1 | #include "GenRed.h" 2 | 3 | /* 4 | description of table_gen_red see GenRed.h 5 | other functions are helper functions used by the framework 6 | */ 7 | 8 | void combine_table_entries(void* table1, void* table2, uint32_t table_size, uint32_t value_size, void (*combineFunc)(void*, void*)){ 9 | 10 | uint32_t curr_entry; 11 | 12 | for(int i=0; iset; 24 | uint32_t num_dpus = table_management->num_dpus; 25 | void* tables = malloc_reduce_aligned(len, type_size, table_management); 26 | uint32_t aligned_table_size = (len*type_size)+(len*type_size)%8; 27 | 28 | for(int i=0; ifree_space_start_pos; 90 | if(contains_table(dest_name, table_management)){ 91 | outputs = lookup_table(dest_name, table_management) -> start; 92 | } 93 | 94 | if(binary_handle->func_type == 1){ 95 | 
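/* func_type follows the MAP/REDUCE/ZIP macros in ProcessingHelperHost.h
   (0, 1 and 2 respectively), so this branch only runs for handles created
   with create_handle(..., REDUCE) */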
if(!contains_table(src_name, table_management)){ 96 | printf("source table "); 97 | printf(src_name); 98 | printf(" is not contains in current management unit\n"); 99 | return NULL; 100 | } 101 | 102 | //timing 103 | gettimeofday(&start_time, NULL); 104 | 105 | struct dpu_set_t set = table_management->set; 106 | uint32_t num_dpus = table_management->num_dpus; 107 | table_host_t* src_table = lookup_table(src_name, table_management); 108 | uint32_t* lens = src_table->lens_each_dpu; 109 | uint32_t input_type = src_table->table_type_size; 110 | uint32_t inputs = src_table->start; 111 | 112 | gen_red_arguments_t* input_args = table_management->red_args; 113 | // use handle for precompiled binaries 114 | const char* binary = binary_handle->bin_location; 115 | DPU_ASSERT(dpu_load(set, binary, NULL)); 116 | 117 | for(int i=0; iso_bin_location, RTLD_NOW); 153 | void (*init_func)(uint32_t, void*) = dlsym(lib,"init_func"); 154 | void (*combine_func)(void*, void*) = dlsym(lib, "combine_func"); 155 | 156 | if(lib == NULL){ 157 | printf("dynamic library linking failed!!!\n"); 158 | } 159 | 160 | 161 | gather_tables_to_host(table_management, my_table, output_len, output_type, outputs, init_func, combine_func); 162 | dlclose(lib); 163 | gettimeofday(&end_time, NULL); 164 | double host_table_reduction_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 165 | (end_time.tv_usec - start_time.tv_usec); 166 | // back to dpus 167 | 168 | // table info 169 | gettimeofday(&start_time, NULL); 170 | 171 | int32_t* red_tables_lens = malloc(sizeof(uint32_t)*num_dpus); 172 | for(int i=0; iname = malloc(strlen(dest_name)+1); 179 | memcpy(t->name, dest_name, strlen(dest_name)+1); 180 | t->start = outputs; 181 | uint32_t max_end_dpu = outputs+output_len*output_type; 182 | t->end = max_end_dpu+(8-max_end_dpu%8); 183 | t->len = output_len; 184 | t->table_type_size = output_type; 185 | t->lens_each_dpu = red_tables_lens; 186 | t->is_virtual_zipped = 0; 187 | 188 | add_table(t, table_management); 189 | table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ? 
table_management->free_space_start_pos : t->end; 190 | gettimeofday(&end_time, NULL); 191 | double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 192 | (end_time.tv_usec - start_time.tv_usec); 193 | 194 | printf("--------------\n"); 195 | printf("table reduction function : "); 196 | printf(binary); 197 | printf("\nreduction function kernel execution time : %f\n", kernel_time/1000); 198 | printf("host reduction execution time : %f\n", host_table_reduction_time/1000); 199 | printf("function call and table management time : %f\n", (register_table_time+prepare_args_time)/1000); 200 | printf("--------------\n"); 201 | 202 | return my_table; 203 | } 204 | else{ 205 | printf("ERROR: compiled binary "); 206 | printf(binary_handle->bin_location); 207 | printf(" does not contain general reduction functions\n"); 208 | } 209 | 210 | } 211 | -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRed.h: -------------------------------------------------------------------------------- 1 | #ifndef GENRED_H 2 | #define GENRED_H 3 | 4 | #include "GenRedArgs.h" 5 | #include "../ProcessingHelperHost.h" 6 | #include "../../communication/CommOps.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | /* 17 | table_gen_red implements the array reduction as in the paper 18 | It parses the function_handle and setups the host for calling the pim kernel (GenRed.c) 19 | Then it runs the array reduction pim kernel (gen_red_dpu.c and GenRedProcessing.h) 20 | */ 21 | 22 | void* table_gen_red(const char* src_name, const char* dest_name, uint32_t output_type, uint32_t output_len, handle_t* binary_handle, simplepim_management_t* table_management, uint32_t info); 23 | #endif 24 | -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRedArgs.h: -------------------------------------------------------------------------------- 1 | #ifndef GENREDARGS_H 2 | #define GENREDARGS_H 3 | #include 4 | #include 5 | 6 | typedef struct { 7 | uint32_t input_start_offset; 8 | uint32_t input_type_size; 9 | uint32_t output_start_offset; 10 | uint32_t output_type_size; 11 | uint32_t len; 12 | uint32_t table_len; 13 | uint32_t info; 14 | } gen_red_arguments_t; 15 | 16 | 17 | #endif -------------------------------------------------------------------------------- /lib/processing/gen_red/GenRedProcessing.h: -------------------------------------------------------------------------------- 1 | #ifndef GENREDPROCESSING_H 2 | #define GENREDPROCESSING_H 3 | #include __mapredfunc_pathname__ 4 | #include __combinefunc_pathname__ 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "../ProcessingHelper.h" 14 | #include "../../StructsPIM.h" 15 | #include "../../Table.h" 16 | 17 | uint32_t get_shift_bits_for_type(uint32_t value_size){ 18 | switch (value_size) { 19 | case 2: 20 | return 1; 21 | case 4: 22 | return 2; 23 | case 8: 24 | return 3; 25 | case 16: 26 | return 4; 27 | case 32: 28 | return 5; 29 | case 64: 30 | return 6; 31 | case 128: 32 | return 7; 33 | case 256: 34 | return 8; 35 | case 512: 36 | return 9; 37 | case 1024: 38 | return 10; 39 | case 2048: 40 | return 11; 41 | case 4096: 42 | return 12; 43 | default: 44 | return 0; 45 | } 46 | } 47 | 48 | void gen_red_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len, uint32_t table_len){ 49 | uint32_t 
elem_type_size = input_type; 50 | uint32_t inter_type_size = output_type; 51 | uint32_t table_size = table_len; 52 | uint32_t num_tasklets = NR_TASKLETS; 53 | uint32_t pid = num_tasklets == 1 ? 0 : me(); 54 | 55 | uint64_t tuple = copy_block_size_fun(elem_type_size, inter_type_size, len); 56 | uint32_t* copy_block_size_ = (uint32_t*)&tuple; 57 | uint32_t copy_block_size = copy_block_size_[0]; 58 | uint32_t copy_block_size_shiftbits = copy_block_size_[1]; 59 | 60 | 61 | __mram_ptr void* elements = inputs; 62 | // try malloc/free for performance 63 | fsb_allocator_t elems_block_allocator = fsb_alloc(copy_block_size*elem_type_size, 1); 64 | __dma_aligned void* elems_block = fsb_get(elems_block_allocator); 65 | 66 | fsb_allocator_t table_allocator = fsb_alloc(sizeof(table_t), 1); 67 | __dma_aligned table_t* local_table = fsb_get(table_allocator); 68 | init_table(local_table, table_size, inter_type_size, init_func); 69 | 70 | 71 | fsb_allocator_t tmp_intermediate_allocator = fsb_alloc(inter_type_size, 1); 72 | __dma_aligned void* tmp_intermediate = fsb_get(tmp_intermediate_allocator); 73 | 74 | uint32_t last_block = (len/copy_block_size)*copy_block_size; 75 | uint32_t key = 0; 76 | 77 | void* local_table_entries = local_table->table; 78 | uint32_t curr_entry; 79 | uint32_t shift_bits = get_shift_bits_for_type(inter_type_size); 80 | 81 | 82 | // TODO : possible optimisation : shifts instead of mult, check bounds for last iterations (later) 83 | uint32_t total_len_in_bytes = len * elem_type_size; 84 | uint32_t copy_block_size_in_bytes = copy_block_size*elem_type_size; 85 | uint32_t stride = copy_block_size_in_bytes*num_tasklets; 86 | 87 | uint32_t divisible_len_in_bytes = ((len>>copy_block_size_shiftbits)< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "GenRedProcessing.h" 10 | #include "GenRedArgs.h" 11 | #include "../ProcessingHelper.h" 12 | 13 | 14 | __host gen_red_arguments_t GEN_RED_INPUT_ARGUMENTS; 15 | __dma_aligned void* aux; 16 | 17 | BARRIER_INIT(my_barrier, NR_TASKLETS); 18 | int main() { 19 | int pid = me(); 20 | if (pid == 0){ // Initialize once the cycle counter 21 | mem_reset(); // Reset the heap 22 | } 23 | barrier_wait(&my_barrier); 24 | 25 | //printf("\n"); 26 | 27 | uint32_t input_start_offset = GEN_RED_INPUT_ARGUMENTS.input_start_offset; 28 | uint32_t input_type_size = GEN_RED_INPUT_ARGUMENTS.input_type_size; 29 | uint32_t output_start_offset = GEN_RED_INPUT_ARGUMENTS.output_start_offset; 30 | uint32_t output_type_size = GEN_RED_INPUT_ARGUMENTS.output_type_size; 31 | uint32_t len = GEN_RED_INPUT_ARGUMENTS.len; 32 | uint32_t table_len = GEN_RED_INPUT_ARGUMENTS.table_len; 33 | 34 | start_func(&GEN_RED_INPUT_ARGUMENTS); 35 | gen_red_dpu(DPU_MRAM_HEAP_POINTER+input_start_offset, DPU_MRAM_HEAP_POINTER+output_start_offset, input_type_size, output_type_size, len, table_len); 36 | return 0; 37 | } -------------------------------------------------------------------------------- /lib/processing/map/Map.c: -------------------------------------------------------------------------------- 1 | #include "Map.h" 2 | 3 | void table_map(const char* src_name, const char* dest_name, uint32_t output_type, handle_t* binary_handle, simplepim_management_t* table_management, uint32_t info){ 4 | int i; 5 | struct dpu_set_t dpu; 6 | struct timeval start_time; 7 | struct timeval end_time; 8 | 9 | uint32_t outputs = table_management->free_space_start_pos; 10 | if(contains_table(dest_name, table_management)){ 11 | outputs = lookup_table(dest_name, 
table_management) -> start; 12 | } 13 | 14 | if(!contains_table(src_name, table_management)){ 15 | printf("source table "); 16 | printf(src_name); 17 | printf(" is not contains in current management unit\n"); 18 | return; 19 | } 20 | table_host_t* src_table = lookup_table(src_name, table_management); 21 | 22 | if(binary_handle->func_type == 0 && src_table->is_virtual_zipped == 0){ 23 | 24 | //timing 25 | gettimeofday(&start_time, NULL); 26 | 27 | struct dpu_set_t set = table_management->set; 28 | uint32_t num_dpus = table_management->num_dpus; 29 | uint32_t* lens = src_table->lens_each_dpu; 30 | uint32_t input_type = src_table->table_type_size; 31 | uint32_t inputs = src_table->start; 32 | 33 | map_arguments_t* input_args = table_management->map_args; 34 | // use handle for precompiled binaries 35 | const char* binary = binary_handle->bin_location; 36 | DPU_ASSERT(dpu_load(set, binary, NULL)); 37 | 38 | //parse arguments to map function call 39 | DPU_FOREACH(set, dpu, i) { 40 | input_args[i].input_start_offset = inputs; 41 | input_args[i].input_type_size = input_type; 42 | input_args[i].output_start_offset = outputs; 43 | input_args[i].output_type_size = output_type; 44 | input_args[i].len = lens[i]; 45 | input_args[i].info = info; 46 | input_args[i].is_virtually_zipped = 0; 47 | DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i)); 48 | } 49 | 50 | DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "MAP_INPUT_ARGUMENTS", 0, sizeof(map_arguments_t), DPU_XFER_DEFAULT)); 51 | 52 | gettimeofday(&end_time, NULL); 53 | double prepare_args_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 54 | (end_time.tv_usec - start_time.tv_usec); 55 | 56 | //call map function 57 | gettimeofday(&start_time, NULL); 58 | DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS)); 59 | gettimeofday(&end_time, NULL); 60 | 61 | double kernel_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 + 62 | (end_time.tv_usec - start_time.tv_usec); 63 | 64 | // table information to management unit 65 | // timing 66 | gettimeofday(&start_time, NULL); 67 | 68 | table_host_t* t = malloc(sizeof(table_host_t)); 69 | t->name = malloc(strlen(dest_name)+1); 70 | memcpy(t->name, dest_name, strlen(dest_name)+1); 71 | t->start = outputs; 72 | uint32_t max_end_dpu = max_len_dpu(num_dpus, src_table)*output_type+outputs; 73 | t->end = max_end_dpu+(8-max_end_dpu%8); 74 | t->len = src_table->len; 75 | t->table_type_size = output_type; 76 | t->lens_each_dpu = malloc(num_dpus*sizeof(int32_t)); 77 | t->is_virtual_zipped = 0; 78 | memcpy(t->lens_each_dpu, lens, num_dpus*sizeof(int32_t)); 79 | 80 | add_table(t, table_management); 81 | table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ? 
                                                 table_management->free_space_start_pos : t->end;

        gettimeofday(&end_time, NULL);
        double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                     (end_time.tv_usec - start_time.tv_usec);

        printf("--------------\n");
        printf("map function : %s", binary);
        printf("\nmap function kernel execution time : %f\n", kernel_time/1000);
        printf("function call and table management time : %f\n", (register_table_time+prepare_args_time)/1000);
        printf("--------------\n");
    }
    else if (binary_handle->func_type == 0 && src_table->is_virtual_zipped == 1){
        // timing
        gettimeofday(&start_time, NULL);

        struct dpu_set_t set = table_management->set;
        uint32_t num_dpus = table_management->num_dpus;
        uint32_t* lens = src_table->lens_each_dpu;
        uint32_t input_type = src_table->table_type_size;
        uint32_t inputs = src_table->start;

        map_arguments_t* input_args = table_management->map_args;
        // use handle for precompiled binaries
        const char* binary = binary_handle->bin_location;
        DPU_ASSERT(dpu_load(set, binary, NULL));

        // pass arguments to the map kernel; the source is only virtually zipped,
        // so the kernel reads the two underlying tables directly
        DPU_FOREACH(set, dpu, i) {
            input_args[i].input_start_offset = inputs;
            input_args[i].input_type_size = input_type;
            input_args[i].output_start_offset = outputs;
            input_args[i].output_type_size = output_type;
            input_args[i].len = lens[i];
            input_args[i].info = info;
            input_args[i].is_virtually_zipped = 1;
            input_args[i].start1 = src_table->start1;
            input_args[i].start2 = src_table->start2;
            input_args[i].type1 = src_table->type1;
            input_args[i].type2 = src_table->type2;
            DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
        }

        DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "MAP_INPUT_ARGUMENTS", 0, sizeof(map_arguments_t), DPU_XFER_DEFAULT));

        gettimeofday(&end_time, NULL);
        double prepare_args_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                   (end_time.tv_usec - start_time.tv_usec);

        // launch the map kernel
        gettimeofday(&start_time, NULL);
        DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
        gettimeofday(&end_time, NULL);

        double kernel_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                             (end_time.tv_usec - start_time.tv_usec);

        // register the output table with the management unit
        gettimeofday(&start_time, NULL);

        table_host_t* t = malloc(sizeof(table_host_t));
        t->name = malloc(strlen(dest_name)+1);
        memcpy(t->name, dest_name, strlen(dest_name)+1);
        t->start = outputs;
        uint32_t max_end_dpu = max_len_dpu(num_dpus, src_table)*output_type+outputs;
        t->end = max_end_dpu+(8-max_end_dpu%8);
        t->len = src_table->len;
        t->table_type_size = output_type;
        t->lens_each_dpu = malloc(num_dpus*sizeof(int32_t));
        t->is_virtual_zipped = 0; // the map output is a plain, materialised table
        memcpy(t->lens_each_dpu, lens, num_dpus*sizeof(int32_t));

        add_table(t, table_management);
        // keep the free-space watermark up to date, mirroring the non-zipped branch
        table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ?
                                                 table_management->free_space_start_pos : t->end;

        gettimeofday(&end_time, NULL);
        double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                     (end_time.tv_usec - start_time.tv_usec);

        printf("--------------\n");
        printf("map function : %s", binary);
        printf("\nmap function kernel execution time : %f\n", kernel_time/1000);
        printf("function call and table management time : %f\n", (register_table_time+prepare_args_time)/1000);
        printf("--------------\n");
    }
    else{
        printf("ERROR: compiled binary %s does not contain a map function\n", binary_handle->bin_location);
    }
}
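For orientation, a host program drives `table_map` roughly as sketched below. The helper names used here (`table_management_init`, `create_handle`) are assumptions standing in for the management and handle-creation APIs in lib/management and lib/processing; only `table_map` itself is shown above.

```c
// Hypothetical host-side driver for table_map; helper names are placeholders,
// not the framework's verbatim API.
#include "lib/processing/map/Map.h"

int main(){
    simplepim_management_t* mgmt = table_management_init(64); // assumed: allocate 64 DPUs
    // ... transfer an input array to PIM and register it as "t1" (API assumed) ...

    // handle for a precompiled map binary (func_type 0)
    handle_t* h = create_handle("va_funcs/map.h", 0);         // assumed signature

    // apply the map kernel: read "t1", write 4-byte outputs into "t2"
    table_map("t1", "t2", sizeof(uint32_t), h, mgmt, 0);
    return 0;
}
```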
--------------------------------------------------------------------------------
/lib/processing/map/Map.h:
--------------------------------------------------------------------------------
#ifndef MAP_H
#define MAP_H
#include "MapArgs.h"
#include "../ProcessingHelperHost.h"
#include "../../management/Management.h"
// includes restored (original targets lost in extraction)
#include <dpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

/*
    table_map implements the array map operator as in the paper.
    It parses the function handle and sets up the host for calling the PIM kernel (Map.c).
    Then it runs the array map PIM kernel (map_dpu.c and MapProcessing.h).
*/

void table_map(const char* src_name, const char* dest_name, uint32_t output_type, handle_t* binary_handle, simplepim_management_t* table_management, uint32_t info);
#endif

--------------------------------------------------------------------------------
/lib/processing/map/MapArgs.h:
--------------------------------------------------------------------------------
#ifndef MAPARGS_H
#define MAPARGS_H
#include <stdint.h>

typedef struct {
    uint32_t input_start_offset;
    uint32_t input_type_size;
    uint32_t output_start_offset;
    uint32_t output_type_size;
    uint32_t len;
    uint32_t info;

    // handle virtual zip
    uint32_t is_virtually_zipped;
    uint32_t start1;
    uint32_t start2;
    uint32_t type1;
    uint32_t type2;
} map_arguments_t;


#endif

--------------------------------------------------------------------------------
/lib/processing/map/MapProcessing.h:
--------------------------------------------------------------------------------
#ifndef MAPPROCESSING_H
#define MAPPROCESSING_H
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include "../ProcessingHelper.h"
#include __mapfunc_filename__

BARRIER_INIT(barrier_p, NR_TASKLETS);
void map_dpu(__mram_ptr void* inputs, __mram_ptr void* outputs, uint32_t input_type, uint32_t output_type, uint32_t len){
    uint32_t elem_type_size = input_type;
    uint32_t inter_type_size = output_type;
    uint32_t num_tasklets = NR_TASKLETS;
    uint32_t pid = me();
    uint64_t tuple = copy_block_size_fun(elem_type_size, inter_type_size, len);
    uint32_t* copy_block_size_ = (uint32_t*)&tuple;
    uint32_t copy_block_size = copy_block_size_[0];
    uint32_t copy_block_size_shiftbits = copy_block_size_[1];
    // try malloc/free for performance
    fsb_allocator_t elems_block_allocator = fsb_alloc(elem_type_size<<copy_block_size_shiftbits, 1);
    /* ... the remaining WRAM buffer setup is truncated in the source; the
       surviving fragments below follow the same blocked-streaming pattern as
       zip_dpu in ZipProcessing.c ... */
    uint32_t divisible_len = (len>>copy_block_size_shiftbits)<<copy_block_size_shiftbits;
    uint32_t unroll_block_size = (copy_block_size>>2)<<2;
    uint32_t unroll_block_rest = copy_block_size-unroll_block_size;

    uint32_t i_init = pid<<copy_block_size_shiftbits;
    /* ... the main map loop is truncated in the source ... */
}

void zip_map_dpu(__mram_ptr void* inputs1, __mram_ptr void* inputs2, __mram_ptr void* outputs, uint32_t input_type1, uint32_t input_type2, uint32_t output_type, uint32_t len){
    /* ... the prologue (WRAM buffers, copy_block_size, elem, pid) is truncated
       in the source ... */
    uint32_t input_type_1_div_4 = input_type1 >> 2;
    uint32_t input_type_2_div_4 = input_type2 >> 2;
    uint32_t input_type_1_rest_4 = input_type1 - (input_type_1_div_4<<2);
    uint32_t input_type_2_rest_4 = input_type2 - (input_type_2_div_4<<2);

    void* elem_plus_input1 = elem + input_type1;

    uint32_t unroll_block_size = (copy_block_size>>2)<<2;
    uint32_t unroll_block_rest = copy_block_size-unroll_block_size;

    if(input_type1 == 4 && input_type2 == 4){
        for(int i=pid*copy_block_size; /* ... loop bound truncated ... */; i++){
            /* ... 4-byte fast-path body truncated in the source ... */
        }
    }
    /* ... remaining copy paths and the function epilogue are truncated in the
       source ... */
}
#endif
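MapProcessing.h pulls the user's map function into the kernel at compile time via `#include __mapfunc_filename__`; in the benchmarks this is a header such as benchmarks/va/va_funcs/map.h. A minimal sketch of such a user function follows, assuming a one-element-in, one-element-out signature (the exact signature SimplePIM expects is an assumption here):

```c
// Hypothetical user-side map function of the kind __mapfunc_filename__ points at.
// The signature is an assumption: one input element in, one output element out.
#include <stdint.h>

// Vector-addition-style map over a zipped pair of int32 values: the input
// element holds the two source values back to back.
void map_func(void* input, void* output){
    int32_t* in = (int32_t*)input;
    ((int32_t*)output)[0] = in[0] + in[1];
}
```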
--------------------------------------------------------------------------------
/lib/processing/map/map_dpu.c:
--------------------------------------------------------------------------------
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include <perfcounter.h>

#include "MapProcessing.h"
#include "MapArgs.h"
#include "../ProcessingHelper.h"

__host map_arguments_t MAP_INPUT_ARGUMENTS;
__dma_aligned void* aux;

BARRIER_INIT(my_barrier, NR_TASKLETS);
int main() {
    int pid = me();
    if (pid == 0){
        mem_reset(); // Reset the WRAM heap once, before any tasklet allocates
    }
    barrier_wait(&my_barrier);

    uint32_t input_start_offset = MAP_INPUT_ARGUMENTS.input_start_offset;
    uint32_t input_type_size = MAP_INPUT_ARGUMENTS.input_type_size;
    uint32_t output_start_offset = MAP_INPUT_ARGUMENTS.output_start_offset;
    uint32_t output_type_size = MAP_INPUT_ARGUMENTS.output_type_size;
    uint32_t len = MAP_INPUT_ARGUMENTS.len;
    uint32_t is_zipped = MAP_INPUT_ARGUMENTS.is_virtually_zipped;

    // for a virtually zipped source table
    uint32_t start1 = MAP_INPUT_ARGUMENTS.start1;
    uint32_t start2 = MAP_INPUT_ARGUMENTS.start2;
    uint32_t type1 = MAP_INPUT_ARGUMENTS.type1;
    uint32_t type2 = MAP_INPUT_ARGUMENTS.type2;
    start_func(&MAP_INPUT_ARGUMENTS);
    if(is_zipped){
        zip_map_dpu(DPU_MRAM_HEAP_POINTER+start1, DPU_MRAM_HEAP_POINTER+start2, DPU_MRAM_HEAP_POINTER+output_start_offset, type1, type2, output_type_size, len);
    }
    else{
        map_dpu(DPU_MRAM_HEAP_POINTER+input_start_offset, DPU_MRAM_HEAP_POINTER+output_start_offset, input_type_size, output_type_size, len);
    }
    return 0;
}
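All three kernels share one idiom: `copy_block_size_fun` returns two `uint32_t` results packed into a single `uint64_t`, which the caller unpacks by reinterpreting the value's address. A standalone illustration of the idiom, with `pack_pair` as a hypothetical stand-in for `copy_block_size_fun`:

```c
// Illustration of the uint64_t pack/unpack idiom used around copy_block_size_fun.
// pack_pair is a hypothetical stand-in, not the framework function itself.
#include <stdint.h>
#include <stdio.h>

uint64_t pack_pair(uint32_t block_size, uint32_t shift_bits){
    uint64_t tuple = 0;
    uint32_t* slots = (uint32_t*)&tuple;  // same reinterpretation the kernels use
    slots[0] = block_size;   // e.g. elements per MRAM->WRAM transfer
    slots[1] = shift_bits;   // log2(block_size), so x*block_size == x<<shift_bits
    return tuple;
}

int main(){
    uint64_t tuple = pack_pair(256, 8);
    uint32_t* fields = (uint32_t*)&tuple;
    printf("block=%u shift=%u\n", fields[0], fields[1]); // block=256 shift=8
    return 0;
}
```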
--------------------------------------------------------------------------------
/lib/processing/zip/Zip.c:
--------------------------------------------------------------------------------
#include "Zip.h"

void table_zip(const char* src1_name, const char* src2_name, const char* dest_name, handle_t* binary_handle, simplepim_management_t* table_management){
    int i;
    struct dpu_set_t dpu;
    struct timeval start_time;
    struct timeval end_time;
    uint32_t outputs = table_management->free_space_start_pos;
    if(contains_table(dest_name, table_management)){
        outputs = lookup_table(dest_name, table_management)->start;
    }

    if(binary_handle->func_type == 2){
        // timing
        double kernel_time = 0;

        gettimeofday(&start_time, NULL);

        if(!contains_table(src1_name, table_management)){
            printf("source table %s is not contained in the current management unit\n", src1_name);
            return;
        }

        if(!contains_table(src2_name, table_management)){
            printf("source table %s is not contained in the current management unit\n", src2_name);
            return;
        }

        struct dpu_set_t set = table_management->set;
        uint32_t num_dpus = table_management->num_dpus;
        table_host_t* src1_table = lookup_table(src1_name, table_management);
        table_host_t* src2_table = lookup_table(src2_name, table_management);

        if(src1_table->len != src2_table->len){
            printf("zip lengths do not match\n");
            return;
        }

        zip_arguments_t* input_args = table_management->zip_args;

        // if a source table is itself only virtually zipped, materialise it first
        if(src1_table->is_virtual_zipped == 1){
            src1_table->is_virtual_zipped = 0;
            const char* binary = binary_handle->bin_location;
            DPU_ASSERT(dpu_load(set, binary, NULL));

            for(uint32_t i=0; i<num_dpus; i++){
                input_args[i].input_start_offset1 = src1_table->start1;
                input_args[i].input_start_offset2 = src1_table->start2;
                input_args[i].input_type_size1 = src1_table->type1;
                input_args[i].input_type_size2 = src1_table->type2;
                input_args[i].outputs = src1_table->start;
                input_args[i].len = src1_table->lens_each_dpu[i]; // per-DPU partition length for the zip kernel
            }

            DPU_FOREACH(set, dpu, i) {
                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
            }

            DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "ZIP_INPUT_ARGUMENTS", 0, sizeof(zip_arguments_t), DPU_XFER_DEFAULT));
            gettimeofday(&start_time, NULL);
            DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
            gettimeofday(&end_time, NULL);

            kernel_time += (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                           (end_time.tv_usec - start_time.tv_usec);
        }

        if(src2_table->is_virtual_zipped == 1){
            src2_table->is_virtual_zipped = 0;
            const char* binary = binary_handle->bin_location;
            DPU_ASSERT(dpu_load(set, binary, NULL));

            for(uint32_t i=0; i<num_dpus; i++){
                input_args[i].input_start_offset1 = src2_table->start1;
                input_args[i].input_start_offset2 = src2_table->start2;
                input_args[i].input_type_size1 = src2_table->type1;
                input_args[i].input_type_size2 = src2_table->type2;
                input_args[i].outputs = src2_table->start;
                input_args[i].len = src2_table->lens_each_dpu[i]; // per-DPU partition length for the zip kernel
            }

            DPU_FOREACH(set, dpu, i) {
                DPU_ASSERT(dpu_prepare_xfer(dpu, input_args + i));
            }

            DPU_ASSERT(dpu_push_xfer(set, DPU_XFER_TO_DPU, "ZIP_INPUT_ARGUMENTS", 0, sizeof(zip_arguments_t), DPU_XFER_DEFAULT));
            gettimeofday(&start_time, NULL);
            DPU_ASSERT(dpu_launch(set, DPU_SYNCHRONOUS));
            gettimeofday(&end_time, NULL);

            kernel_time += (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                           (end_time.tv_usec - start_time.tv_usec);
        }

        // virtually zip the two tables: record where each half lives instead of copying
        uint32_t* lens = src2_table->lens_each_dpu;
        uint32_t* lens_ = src1_table->lens_each_dpu;

        uint32_t input_type1 = src1_table->table_type_size;
        uint32_t start1 = src1_table->start;
        uint32_t end1 = src1_table->end;
        uint32_t input_type2 = src2_table->table_type_size;
        uint32_t start2 = src2_table->start;
        uint32_t end2 = src2_table->end;

        for(uint32_t i=0; i<num_dpus; i++){
            /* ... per-DPU check truncated in the source (presumably verifying
               lens[i] == lens_[i], i.e. matching partitions on every DPU) ... */
        }

        table_host_t* t = malloc(sizeof(table_host_t));
        t->name = malloc(strlen(dest_name)+1);
        memcpy(t->name, dest_name, strlen(dest_name)+1);
        uint32_t output_type = input_type1+input_type2;
        t->start = outputs;
        uint32_t max_end_dpu = max_len_dpu(num_dpus, src1_table)*output_type+outputs;
        t->end = max_end_dpu+(8-max_end_dpu%8);
        t->len = src1_table->len;
        t->table_type_size = output_type;
        t->lens_each_dpu = malloc(num_dpus*sizeof(int32_t));
        memcpy(t->lens_each_dpu, lens, num_dpus*sizeof(int32_t));

        t->is_virtual_zipped = 1;
        t->start1 = start1;
        t->end1 = end1;
        t->start2 = start2;
        t->end2 = end2;
        t->type1 = input_type1;
        t->type2 = input_type2;

        add_table(t, table_management);

        table_management->free_space_start_pos = table_management->free_space_start_pos > t->end ?
                                                 table_management->free_space_start_pos : t->end;

        gettimeofday(&end_time, NULL);
        printf("\nzip function kernel execution time : %f\n", kernel_time/1000);
        double register_table_time = (end_time.tv_sec - start_time.tv_sec) * 1000000.0 +
                                     (end_time.tv_usec - start_time.tv_usec);
        printf("--------------\n");
        printf("zip function call and table management time : %f\n", (register_table_time)/1000);
        printf("--------------\n");
    }
    else{
        printf("ERROR: compiled binary %s is not a zip function\n", binary_handle->bin_location);
    }
}
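Note that `table_zip` normally copies nothing: it records `start1`/`start2` and `type1`/`type2` and defers the interleaving to whichever kernel later consumes the zipped view. The address arithmetic behind that view is sketched below; `virtual_zip_view_t` and `zipped_elem_addrs` are illustrative names, not framework API.

```c
// Address arithmetic behind a virtually zipped table: element i of the zipped
// view is the concatenation of one type1-byte element from the first table and
// one type2-byte element from the second. Names here are illustrative only.
#include <stdint.h>

typedef struct {
    uint32_t start1, start2;   // MRAM heap offsets of the two source tables
    uint32_t type1, type2;     // element sizes of the two source tables
} virtual_zip_view_t;

// Compute where the two halves of zipped element i live in MRAM.
static inline void zipped_elem_addrs(const virtual_zip_view_t* v, uint32_t i,
                                     uint32_t* addr1, uint32_t* addr2){
    *addr1 = v->start1 + i * v->type1;
    *addr2 = v->start2 + i * v->type2;
    // a materialising zip kernel copies type1 bytes from *addr1 and type2 bytes
    // from *addr2 into offset outputs + i*(type1+type2)
}
```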
--------------------------------------------------------------------------------
/lib/processing/zip/Zip.h:
--------------------------------------------------------------------------------
#ifndef ZIP_H
#define ZIP_H
#include "ZipArgs.h"
#include "../ProcessingHelperHost.h"
#include "../../management/Management.h"
// includes restored (original targets lost in extraction)
#include <dpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/time.h>

/*
    table_zip implements the array zip operator as in the paper.
    It sets up the host for calling the PIM kernel (Zip.c).
    Then it runs the array zip PIM kernel (zip_dpu.c and ZipProcessing.c/.h).
*/

void table_zip(const char* src1_name, const char* src2_name, const char* dest_name, handle_t* binary_handle, simplepim_management_t* table_management);
#endif

--------------------------------------------------------------------------------
/lib/processing/zip/ZipArgs.h:
--------------------------------------------------------------------------------
#ifndef ZIPARGS_H
#define ZIPARGS_H
#include <stdint.h>

typedef struct {
    uint32_t input_start_offset1;
    uint32_t input_start_offset2;
    uint32_t input_type_size1;
    uint32_t input_type_size2;
    uint32_t outputs;
    uint32_t len;
} zip_arguments_t;


#endif

--------------------------------------------------------------------------------
/lib/processing/zip/ZipProcessing.c:
--------------------------------------------------------------------------------
#include "ZipProcessing.h"
BARRIER_INIT(barrier_p, NR_TASKLETS);

void zip_dpu(__mram_ptr void* table_entries_1, __mram_ptr void* table_entries_2, __mram_ptr void* table_entries_res, uint32_t input_type_1, uint32_t input_type_2, uint32_t len){
    uint32_t num_tasklets = NR_TASKLETS;
    uint32_t pid = me();
    uint64_t tuple = copy_block_size_fun(input_type_1, input_type_2, len);
    uint32_t* copy_block_size_ = (uint32_t*)&tuple;
    uint32_t copy_block_size = copy_block_size_[0];
    uint32_t copy_block_size_shiftbits = copy_block_size_[1];

    uint32_t input_block_size_1 = input_type_1<<copy_block_size_shiftbits;
    /* ... the WRAM buffer allocation and per-tasklet bounds (including
       pid_times_block_size and the (len>>copy_block_size_shiftbits)<<copy_block_size_shiftbits
       divisible-length computation) are truncated in the source ... */

    // fast-path selection: when both element sizes are multiples of 4 bytes,
    // the interleaving copy can run word by word instead of byte by byte
    uint32_t input_type_1_div_4 = input_type_1 >> 2;
    uint32_t input_type_2_div_4 = input_type_2 >> 2;
    uint32_t input_type_1_rest_4 = input_type_1 - (input_type_1_div_4<<2);
    uint32_t input_type_2_rest_4 = input_type_2 - (input_type_2_div_4<<2);

    uint32_t types_div_4 = (input_type_1_rest_4==0)&&(input_type_2_rest_4==0);
    uint32_t types_are_ints = (input_type_1_div_4==1)&&(input_type_2_div_4==1);

    if(types_div_4){
        if(types_are_ints){
            for(int i=pid_times_block_size; /* ... loop bound truncated ... */; i++){
                /* ... two-int interleave body truncated in the source ... */
            }
        }
        /* ... remaining copy paths and the function epilogue are truncated in
           the source ... */
    }
}
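The surviving fragments of `zip_dpu` show it selecting a copy path by element size. A self-contained sketch of the interleaving copy inside one WRAM block, under the assumption (suggested by `types_are_ints` above) that the 4-byte/4-byte case is special-cased into word copies; the buffer names are illustrative only:

```c
// Sketch of the per-block interleaving copy behind zip_dpu; names are
// illustrative, and the 4/4-byte fast path is an assumption from the fragments.
#include <stdint.h>
#include <string.h>

void interleave_block(const void* in1, const void* in2, void* out,
                      uint32_t type1, uint32_t type2, uint32_t n){
    if(type1 == 4 && type2 == 4){
        // fast path: one word from each input per element
        const uint32_t* a = (const uint32_t*)in1;
        const uint32_t* b = (const uint32_t*)in2;
        uint32_t* o = (uint32_t*)out;
        for(uint32_t i = 0; i < n; i++){
            o[2*i]   = a[i];
            o[2*i+1] = b[i];
        }
    } else {
        // generic path: byte-wise concatenation of the two elements
        const char* a = (const char*)in1;
        const char* b = (const char*)in2;
        char* o = (char*)out;
        for(uint32_t i = 0; i < n; i++){
            memcpy(o, a + i*type1, type1); o += type1;
            memcpy(o, b + i*type2, type2); o += type2;
        }
    }
}
```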
--------------------------------------------------------------------------------
/lib/processing/zip/ZipProcessing.h:
--------------------------------------------------------------------------------
#ifndef ZIPPROCESSING_H
#define ZIPPROCESSING_H
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include "../ProcessingHelper.h"

void zip_dpu(__mram_ptr void* table_entries_1, __mram_ptr void* table_entries_2, __mram_ptr void* table_entries_res, uint32_t input_type_1, uint32_t input_type_2, uint32_t len);
#endif

--------------------------------------------------------------------------------
/lib/processing/zip/zip_dpu.c:
--------------------------------------------------------------------------------
// includes restored (original targets lost in extraction)
#include <stdio.h>
#include <stdint.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <barrier.h>
#include <perfcounter.h>

#include "ZipProcessing.h"
#include "ZipArgs.h"
#include "../ProcessingHelper.h"

__host zip_arguments_t ZIP_INPUT_ARGUMENTS;
__dma_aligned void* aux;

BARRIER_INIT(my_barrier, NR_TASKLETS);
int main() {
    int pid = me();
    if (pid == 0){
        mem_reset(); // Reset the WRAM heap once, before any tasklet allocates
    }
    barrier_wait(&my_barrier);

    uint32_t input_start_offset1 = ZIP_INPUT_ARGUMENTS.input_start_offset1;
    uint32_t input_start_offset2 = ZIP_INPUT_ARGUMENTS.input_start_offset2;
    uint32_t input_type_size1 = ZIP_INPUT_ARGUMENTS.input_type_size1;
    uint32_t input_type_size2 = ZIP_INPUT_ARGUMENTS.input_type_size2;
    uint32_t len = ZIP_INPUT_ARGUMENTS.len;
    uint32_t outputs = ZIP_INPUT_ARGUMENTS.outputs;

    zip_dpu(DPU_MRAM_HEAP_POINTER+input_start_offset1, DPU_MRAM_HEAP_POINTER+input_start_offset2, DPU_MRAM_HEAP_POINTER+outputs, input_type_size1, input_type_size2, len);
    return 0;
}

--------------------------------------------------------------------------------
/lib/timer.h:
--------------------------------------------------------------------------------
#ifndef TIMER_H
#define TIMER_H
#include <stdio.h>
#include <sys/time.h>

typedef struct Timer{
    struct timeval startTime[6];
    struct timeval stopTime[6];
    double time[6];
}Timer;

// start timing slot i; on the first repetition (rep == 0) the accumulator is cleared
void start(Timer *timer, int i, int rep) {
    if(rep == 0) {
        timer->time[i] = 0.0;
    }
    gettimeofday(&timer->startTime[i], NULL);
}

// stop timing slot i and add the elapsed microseconds to its running total
void stop(Timer *timer, int i) {
    gettimeofday(&timer->stopTime[i], NULL);
    timer->time[i] += (timer->stopTime[i].tv_sec - timer->startTime[i].tv_sec) * 1000000.0 +
                      (timer->stopTime[i].tv_usec - timer->startTime[i].tv_usec);
}

// print the average time of slot i over REP repetitions, in milliseconds
void print(Timer *timer, int i, int REP) { printf("%f\t", timer->time[i] / (1000 * REP)); }
#endif
--------------------------------------------------------------------------------
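The `Timer` utility above keeps six independent slots, each accumulating microseconds across repetitions. A minimal, self-contained usage sketch, grounded directly in the functions defined in timer.h:

```c
// Minimal usage of lib/timer.h: accumulate one timing slot across repetitions,
// then print the per-repetition average in milliseconds.
#include "lib/timer.h"

int main(){
    Timer t;
    const int REP = 10;
    for(int rep = 0; rep < REP; rep++){
        start(&t, 0, rep);   // rep == 0 resets slot 0's accumulator
        // ... code under measurement ...
        stop(&t, 0);         // adds this repetition's elapsed time to slot 0
    }
    print(&t, 0, REP);       // average milliseconds per repetition
    return 0;
}
```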