├── .gitignore
├── Makefile
├── README.md
├── example00
│   ├── BLOG_POST.md
│   └── main.cpp
├── example01
│   ├── README.md
│   └── main.cpp
├── example02
│   ├── main.c
│   └── main.cpp
├── example03
│   └── main.c
├── example04
│   ├── README.md
│   ├── main.c
│   └── main.py
└── example05
    ├── main.c
    └── main.py

/.gitignore:
--------------------------------------------------------------------------------
# compiled files
*.out
*/bin
*/build

# clFFT library
clFFT

# THIS IS MY SWAP
# (vim swap files)
.*.s??

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
CXX = gcc

# clFFT lib & inc
# (set CLFFT_INCLUDE, e.g. -I/path/to/clFFT/include, if the clFFT headers are not on the default include path)
CLFFT_LIB = -L/usr/local/cuda-7.0/targets/x86_64-linux/lib -lOpenCL -L/usr/local/lib64 -lclFFT

# compile and link flags; the standard math library is linked last
CXXFLAGS = -c $(CLFFT_INCLUDE)
LDFLAGS = $(CLFFT_LIB) -lfftw3 -lm
EXE = Example

# ignore warnings when compiling if warn=0
ifeq ($(warn), 0)
CXXFLAGS += -w
endif


all: ex04 ex05

ex04: example04/build/main.o
	@if [ ! -d "./example04/bin" ]; then mkdir ./example04/bin; fi
	$(CXX) $< $(LDFLAGS) -o example04/bin/$(EXE)

example04/build/main.o: example04/main.c
	@if [ ! -d "./example04/build" ]; then mkdir ./example04/build; fi
	$(CXX) $(CXXFLAGS) $< -o $@


ex05: example05/build/main.o
	@if [ ! -d "./example05/bin" ]; then mkdir ./example05/bin; fi
	$(CXX) $< $(LDFLAGS) -o example05/bin/$(EXE)

example05/build/main.o: example05/main.c
	@if [ ! -d "./example05/build" ]; then mkdir ./example05/build; fi
	$(CXX) $(CXXFLAGS) $< -o $@


# cleaning (remove executables and what not)
clean:
	$(RM) -r ./example04/build/ ./example04/bin/
	$(RM) -r ./example05/build/ ./example05/bin/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# OpenCL basic examples
Here is my feeble attempt at learning OpenCL; please don't make fun of me too much :hamburger:

## Configuration
This code uses OpenCL 1.1 on an NVIDIA GPU.

### Linux
(Only tested on Ubuntu.) For NVIDIA GPUs, I've installed the following packages: `nvidia-346 nvidia-346-dev nvidia-346-uvm nvidia-libopencl1-346 nvidia-modprobe nvidia-opencl-icd-346 nvidia-settings`. Since the `opencl-headers` package in the main repository is for OpenCL 1.2, you can get the OpenCL 1.1 header files from [here](http://packages.ubuntu.com/precise/opencl-headers).

Then to compile the C++ code:

```
g++ -std=c++0x main.cpp -o main.out -lOpenCL
```

To compile the C code:

```
gcc main.c -o main.out -lOpenCL
```

For examples 04 and 05, you can run

```bash
make ex04   # executable is ./example04/bin/Example
make ex05   # executable is ./example05/bin/Example
make        # makes both!
```
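If you want to check that the OpenCL driver, headers, and ICD loader are working before building the examples, a minimal platform query along these lines should compile with the same `gcc ... -lOpenCL` command as above. This is just a sketch of my own, not a file in this repo:

```c
// sanity-check sketch -- not part of this repository
#include <stdio.h>
#include <CL/cl.h>

int main(void) {
    cl_platform_id platform;
    cl_uint num_platforms = 0;
    char name[256] = {0};

    // ask the ICD loader for the first available platform
    if (clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS || num_platforms == 0) {
        printf("No platforms found. Check OpenCL installation!\n");
        return 1;
    }

    // print the platform's name (e.g. "NVIDIA CUDA")
    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(name), name, NULL);
    printf("Found %u platform(s); the first one is: %s\n", num_platforms, name);
    return 0;
}
```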
### OS X
OpenCL is installed on OS X by default, but since this code uses the C++ bindings, you'll need to get those yourself. Get the [official C++ bindings from the OpenCL registry](https://www.khronos.org/registry/cl/api/1.1/cl.hpp) and copy the header to the OpenCL framework directory, or do the following:

```
wget https://www.khronos.org/registry/cl/api/1.1/cl.hpp
sudo cp cl.hpp /System/Library/Frameworks/OpenCL.framework/Headers/
```

To compile:

```
clang++ -std=c++0x -framework OpenCL main.cpp -o main.out
```

### Windows
For some reason, the makefile didn't want to work for Windows. I have no idea why.

For example 04, run (inside the directory):

```
gcc -I/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v7.5/include -I/c/PATH/TO/CLFFT/include main.c -o main.exe -L/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v7.5/lib/x64 -lOpenCL -L/c/PATH/TO/CLFFT/lib64/import -lclFFT
```

where `PATH/TO/CLFFT` is the path to the clFFT library.

For example 05, run (inside the directory):

```
gcc -I/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v7.5/include -I/c/PATH/TO/CLFFT/include -I/c/PATH/TO/FFTW main.c -o main.exe -L/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v7.5/lib/x64 -lOpenCL -L/c/PATH/TO/CLFFT/lib64/import -lclFFT -L/c/PATH/TO/FFTW -lfftw3-3
```

where `PATH/TO/FFTW` is the path to the FFTW3 library.

## example 00
This example is based on [this example](http://simpleopencl.blogspot.ca/2013/06/tutorial-simple-start-with-opencl-and-c.html) (example-ception), but it goes a bit further. In the blogspot example, two 10-element vectors are created and a thread is used for each pair of elements. In this example, 10 threads are spawned but two 100-element vectors are used, showing how to split a specific number of elements across each thread.

## example 01
Measures the duration of adding two vectors. See the README in the folder for more details.

## example 02
Demonstrates that one array can be modified several times on the device without having to re-read and re-write the data between the GPU and the host.

## example 03
A simple example using the `cl_khr_fp64` extension, which allows doubles to be used instead of floats.

## example 04
An example using the clFFT library to perform an in-place complex-planar transform. There is also Python code to check the answer; FFTW code will be added later, probably.

- clFFT is required; installation instructions can be found inside example04/README.md
- for Python, numpy and scipy are required

## example 05
Another clFFT example, where an in-place real transform and an out-of-place real transform are performed. There is also FFTW code and Python code for checking the answer.

- clFFT is required; installation instructions can be found inside example04/README.md
- FFTW is required; installation is as simple as extracting FFTW's tar file, then running `./configure && sudo make && sudo make install`
- for Python, numpy and scipy are required

## Some Notes
From the [guide on programming OpenCL for NVIDIA](http://www.nvidia.com/content/cudazone/download/OpenCL/NVIDIA_OpenCL_ProgrammingGuide.pdf):

- **CUDA streaming multiprocessor** corresponds to an OpenCL compute unit
- **CUDA thread** corresponds to an OpenCL work-item
- **CUDA thread block** corresponds to an OpenCL work-group
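To make that mapping concrete, here is a small illustrative OpenCL C kernel sketch that reads the various work-item IDs from inside a kernel. It is not one of the kernels in these examples; the kernel name and output buffer are made up for illustration:

```c
// illustrative only -- not a kernel used by the examples in this repo
__kernel void show_ids(__global int* out) {
    int gid = get_global_id(0);    // unique index of this work-item across the whole launch
    int lid = get_local_id(0);     // index within the work-group (CUDA: threadIdx.x)
    int grp = get_group_id(0);     // which work-group this is    (CUDA: blockIdx.x)
    int lsz = get_local_size(0);   // work-items per work-group   (CUDA: blockDim.x)

    // same relationship as CUDA's blockIdx.x * blockDim.x + threadIdx.x
    out[gid] = grp * lsz + lid;
}
```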
--------------------------------------------------------------------------------
/example00/BLOG_POST.md:
--------------------------------------------------------------------------------
 1 | (NOTE: this content is from http://simpleopencl.blogspot.com/2013/06/tutorial-simple-start-with-opencl-and-c.html -- I am copying the content here so that it doesn't get lost to the sands of time.)
 2 | 
 3 | Saturday, June 1, 2013
 4 | Tutorial: Simple start with OpenCL and C++
 5 | Getting started with OpenCL programming is always hard, so let's try a basic example: we want to sum two arrays together.
 6 | 
 7 | At first you need to install the OpenCL libraries and other files. AMD provides the AMD APP SDK for its CPUs and GPUs: http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/downloads/. Intel has its OpenCL SDK at http://software.intel.com/en-us/vcsource/tools/opencl-sdk. And Nvidia has everything at https://developer.nvidia.com/cuda-downloads. In some cases the graphics drivers already include all the files you need. I recommend that you continue with the next step, and if anything goes wrong, return to this step and install the needed OpenCL SDK toolkit.
 8 | 
 9 | 
10 | We will program in C++11. To ease everything we will use the OpenCL C++ binding 1.1 from www.khronos.org/registry/cl/api/1.1/cl.hpp. The manual for this binding is available at www.khronos.org/registry/cl/specs/opencl-cplusplus-1.1.pdf. It might happen that cl.hpp is already installed on your computer; if not, simply download the C++ binding to the folder of your project. Don't forget to turn on C++11. In the case of QtCreator, add the next line to the .pro file:
11 | 
12 | ```bash
13 | QMAKE_CXXFLAGS += -std=c++0x
14 | ```
15 | 
16 | Also don't forget to link the OpenCL library. In the case of QtCreator, add the next line to the .pro file:
17 | 
18 | ```bash
19 | LIBS+= -lOpenCL
20 | ```
21 | 
22 | If you get any errors, you need to adjust the system variable to point to the folder of the OpenCL installation. You can also manually set the path to the OpenCL libraries:
23 | 
24 | ```bash
25 | LIBS+= -Lpath_to_openCL_libraries
26 | ```
27 | 
28 | Or you can simply write a hard-coded path to the OpenCL library:
29 | 
30 | ```bash
31 | LIBS+=/usr/.../libOpenCL.so
32 | ```
33 | 
34 | Let's start with the coding. We will create a simple console program which will use OpenCL to sum two arrays like C = A + B. For our simple sample we will need only two headers:
35 | 
36 | ```c
37 | #include <iostream>
38 | #include <CL/cl.hpp>
39 | ```
40 | 
41 | Everything else will happen inside the main function. At the start we need to get one of the OpenCL platforms. This is actually a driver you had previously installed. So the platform can be from Nvidia, Intel, AMD....
42 | 43 | ```c 44 | int main(){ 45 | //get all platforms (drivers) 46 | std::vector all_platforms; 47 | cl::Platform::get(&all_platforms); 48 | if(all_platforms.size()==0){ 49 | std::cout<<" No platforms found. Check OpenCL installation!\n"; 50 | exit(1); 51 | } 52 | cl::Platform default_platform=all_platforms[0]; 53 | std::cout << "Using platform: "<()<<"\n"; 54 | ``` 55 | 56 | Once we selected the first platform (default_platform) we will use it in the next steps. Now we need to get device of our platform. For example AMD's platform has support for multiple devices (CPU's and GPU's). We will now select the first device (default_device): 57 | 58 | ```c 59 | //get default device of the default platform 60 | std::vector all_devices; 61 | default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); 62 | if(all_devices.size()==0){ 63 | std::cout<<" No devices found. Check OpenCL installation!\n"; 64 | exit(1); 65 | } 66 | cl::Device default_device=all_devices[0]; 67 | std::cout<< "Using device: "<()<<"\n"; 68 | ``` 69 | 70 | Now we need to create a Context. Imagine the Context as the runtime link to the our device and platform: 71 | 72 | ```c 73 | cl::Context context({default_device}); 74 | ``` 75 | 76 | Next we need to create the program which we want to execute on our device: 77 | 78 | ```c 79 | cl::Program::Sources sources; 80 | ``` 81 | 82 | Actual source of our program(kernel) is there: 83 | 84 | ```c 85 | // kernel calculates for each element C=A+B 86 | std::string kernel_code= 87 | " void kernel simple_add(global const int* A, global const int* B, global int* C){ " 88 | " C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; " 89 | " } "; 90 | ``` 91 | 92 | This code simply calculates C=A+B. As we want that one thread calculates sum of only one element, we use get_global_id(0). get_global_id(0) means get id of current thread. Id's can go from 0 to get_global_size(0) - 1. get_global_size(0) means number of threads. What is 0? 0 means first dimension. OpenCL supports running kernels on 1D, 2D and 3D problems. We will use 1D array! This means 1D problem. 93 | 94 | Next we need our kernel sources to build. We also check for the errors at building: 95 | 96 | ```c 97 | sources.push_back({kernel_code.c_str(),kernel_code.length()}); 98 | 99 | cl::Program program(context,sources); 100 | if(program.build({default_device})!=CL_SUCCESS){ 101 | std::cout<<" Error building: "<(default_device)<<"\n"; 102 | exit(1); 103 | } 104 | ``` 105 | 106 | For arrays A, B, C we need to allocate the space on the device: 107 | 108 | ```c 109 | // create buffers on the device 110 | cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(int)*10); 111 | cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(int)*10); 112 | cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(int)*10); 113 | ``` 114 | 115 | Arrays will have 10 element. We want to calculate sum of next arrays (A, B). 116 | 117 | ```c 118 | int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 119 | int B[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0}; 120 | ``` 121 | 122 | We need to copy arrays from A and B to the device. This means that we will copy arrays from the host to the device. Host represents our main. At first we need to create a queue which is the queue to the commands we will send to the our device: 123 | 124 | ```c 125 | //create queue to which we will push commands for the device. 
126 | cl::CommandQueue queue(context,default_device); Now we can copy data from arrays A and B to buffer_A and buffer_B which represent memory on the device: 127 | //write arrays A and B to the device 128 | queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(int)*10,A); 129 | queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(int)*10,B); 130 | ``` 131 | 132 | Now we can run the kernel which in parallel sums A and B and writes to C. We do this with KernelFunctor which runs the kernel on the device. Take a look at the "simple_add" this is the name of our kernel we wrote before. You can see the number 10. This corresponds to number of threads we want to run (our array size is 10): 133 | 134 | ```c 135 | cl::KernelFunctor simple_add(cl::Kernel(program,"simple_add"),queue,cl::NullRange,cl::NDRange(10),cl::NullRange); Here we actually set the arguments to kernel simple_add and run the kernel: 136 | simple_add(buffer_A, buffer_B, buffer_C); 137 | ``` 138 | 139 | At the end we want to print memory C on our device. At first we need to transfer data from the device to our program (host): 140 | 141 | ```c 142 | int C[10]; 143 | //read result C from the device to array C 144 | queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(int)*10,C); 145 | 146 | std::cout<<" result: \n"; 147 | for(int i=0;i<10;i++){ 148 | std::cout< 158 | #include 159 | 160 | int main(){ 161 | //get all platforms (drivers) 162 | std::vector all_platforms; 163 | cl::Platform::get(&all_platforms); 164 | if(all_platforms.size()==0){ 165 | std::cout<<" No platforms found. Check OpenCL installation!\n"; 166 | exit(1); 167 | } 168 | cl::Platform default_platform=all_platforms[0]; 169 | std::cout << "Using platform: "<()<<"\n"; 170 | 171 | //get default device of the default platform 172 | std::vector all_devices; 173 | default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); 174 | if(all_devices.size()==0){ 175 | std::cout<<" No devices found. Check OpenCL installation!\n"; 176 | exit(1); 177 | } 178 | cl::Device default_device=all_devices[0]; 179 | std::cout<< "Using device: "<()<<"\n"; 180 | 181 | 182 | cl::Context context({default_device}); 183 | 184 | cl::Program::Sources sources; 185 | 186 | // kernel calculates for each element C=A+B 187 | std::string kernel_code= 188 | " void kernel simple_add(global const int* A, global const int* B, global int* C){ " 189 | " C[get_global_id(0)]=A[get_global_id(0)]+B[get_global_id(0)]; " 190 | " } "; 191 | sources.push_back({kernel_code.c_str(),kernel_code.length()}); 192 | 193 | cl::Program program(context,sources); 194 | if(program.build({default_device})!=CL_SUCCESS){ 195 | std::cout<<" Error building: "<(default_device)<<"\n"; 196 | exit(1); 197 | } 198 | 199 | 200 | // create buffers on the device 201 | cl::Buffer buffer_A(context,CL_MEM_READ_WRITE,sizeof(int)*10); 202 | cl::Buffer buffer_B(context,CL_MEM_READ_WRITE,sizeof(int)*10); 203 | cl::Buffer buffer_C(context,CL_MEM_READ_WRITE,sizeof(int)*10); 204 | 205 | int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; 206 | int B[] = {0, 1, 2, 0, 1, 2, 0, 1, 2, 0}; 207 | 208 | //create queue to which we will push commands for the device. 
209 | cl::CommandQueue queue(context,default_device); 210 | 211 | //write arrays A and B to the device 212 | queue.enqueueWriteBuffer(buffer_A,CL_TRUE,0,sizeof(int)*10,A); 213 | queue.enqueueWriteBuffer(buffer_B,CL_TRUE,0,sizeof(int)*10,B); 214 | 215 | 216 | //run the kernel 217 | cl::KernelFunctor simple_add(cl::Kernel(program,"simple_add"),queue,cl::NullRange,cl::NDRange(10),cl::NullRange); 218 | simple_add(buffer_A,buffer_B,buffer_C); 219 | 220 | //alternative way to run the kernel 221 | /*cl::Kernel kernel_add=cl::Kernel(program,"simple_add"); 222 | kernel_add.setArg(0,buffer_A); 223 | kernel_add.setArg(1,buffer_B); 224 | kernel_add.setArg(2,buffer_C); 225 | queue.enqueueNDRangeKernel(kernel_add,cl::NullRange,cl::NDRange(10),cl::NullRange); 226 | queue.finish();*/ 227 | 228 | int C[10]; 229 | //read result C from the device to array C 230 | queue.enqueueReadBuffer(buffer_C,CL_TRUE,0,sizeof(int)*10,C); 231 | 232 | std::cout<<" result: \n"; 233 | for(int i=0;i<10;i++){ 234 | std::cout< 2 | #ifdef __APPLE__ 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | int main() { 9 | // get all platforms (drivers), e.g. NVIDIA 10 | std::vector all_platforms; 11 | cl::Platform::get(&all_platforms); 12 | 13 | if (all_platforms.size()==0) { 14 | std::cout<<" No platforms found. Check OpenCL installation!\n"; 15 | exit(1); 16 | } 17 | cl::Platform default_platform=all_platforms[0]; 18 | std::cout << "Using platform: "<()<<"\n"; 19 | 20 | // get default device (CPUs, GPUs) of the default platform 21 | std::vector all_devices; 22 | default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); 23 | if(all_devices.size()==0){ 24 | std::cout<<" No devices found. Check OpenCL installation!\n"; 25 | exit(1); 26 | } 27 | 28 | // use device[1] because that's a GPU; device[0] is the CPU 29 | cl::Device default_device=all_devices[1]; 30 | std::cout<< "Using device: "<()<<"\n"; 31 | 32 | // a context is like a "runtime link" to the device and platform; 33 | // i.e. communication is possible 34 | cl::Context context({default_device}); 35 | 36 | // create the program that we want to execute on the device 37 | cl::Program::Sources sources; 38 | 39 | // calculates for each element; C = A + B 40 | std::string kernel_code= 41 | " void kernel simple_add(global const int* A, global const int* B, global int* C, " 42 | " global const int* N) {" 43 | " int ID, Nthreads, n, ratio, start, stop;" 44 | "" 45 | " ID = get_global_id(0);" 46 | " Nthreads = get_global_size(0);" 47 | " n = N[0];" 48 | "" 49 | " ratio = (n / Nthreads);" // number of elements for each thread 50 | " start = ratio * ID;" 51 | " stop = ratio * (ID + 1);" 52 | "" 53 | " for (int i=start; i(default_device) << std::endl; 61 | exit(1); 62 | } 63 | 64 | // apparently OpenCL only likes arrays ... 
65 | // N holds the number of elements in the vectors we want to add 66 | int N[1] = {100}; 67 | int n = N[0]; 68 | 69 | // create buffers on device (allocate space on GPU) 70 | cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n); 71 | cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(int) * n); 72 | cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(int) * n); 73 | cl::Buffer buffer_N(context, CL_MEM_READ_ONLY, sizeof(int)); 74 | 75 | // create things on here (CPU) 76 | int A[n], B[n]; 77 | for (int i=0; i 2 | #include 3 | #ifdef __APPLE__ 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | #define NUM_GLOBAL_WITEMS 1024 10 | 11 | void compareResults (double CPUtime, double GPUtime, int trial) { 12 | double time_ratio = (CPUtime / GPUtime); 13 | std::cout << "VERSION " << trial << " -----------" << std::endl; 14 | std::cout << "CPU time: " << CPUtime << std::endl; 15 | std::cout << "GPU time: " << GPUtime << std::endl; 16 | std::cout << "GPU is "; 17 | if (time_ratio > 1) 18 | std::cout << time_ratio << " times faster!" << std::endl; 19 | else 20 | std::cout << (1/time_ratio) << " times slower :(" << std::endl; 21 | } 22 | 23 | 24 | double timeAddVectorsCPU(int n, int k) { 25 | // adds two vectors of size n, k times, returns total duration 26 | std::clock_t start; 27 | double duration; 28 | 29 | int A[n], B[n], C[n]; 30 | for (int i=0; i all_platforms; 85 | cl::Platform::get(&all_platforms); 86 | 87 | if (all_platforms.size()==0) { 88 | std::cout<<" No platforms found. Check OpenCL installation!\n"; 89 | exit(1); 90 | } 91 | cl::Platform default_platform=all_platforms[0]; 92 | // std::cout << "Using platform: "<()<<"\n"; 93 | 94 | // get default device (CPUs, GPUs) of the default platform 95 | std::vector all_devices; 96 | default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); 97 | if(all_devices.size()==0){ 98 | std::cout<<" No devices found. Check OpenCL installation!\n"; 99 | exit(1); 100 | } 101 | 102 | // use device[1] because that's a GPU; device[0] is the CPU 103 | cl::Device default_device=all_devices[1]; 104 | // std::cout<< "Using device: "<()<<"\n"; 105 | 106 | cl::Context context({default_device}); 107 | cl::Program::Sources sources; 108 | 109 | // calculates for each element; C = A + B 110 | std::string kernel_code= 111 | " void kernel add(global const int* v1, global const int* v2, global int* v3) {" 112 | " int ID;" 113 | " ID = get_global_id(0);" 114 | " v3[ID] = v1[ID] + v2[ID];" 115 | " }" 116 | "" 117 | " void kernel add_looped_1(global const int* v1, global const int* v2, global int* v3, " 118 | " const int n, const int k) {" 119 | " int ID, NUM_GLOBAL_WITEMS, ratio, start, stop;" 120 | " ID = get_global_id(0);" 121 | " NUM_GLOBAL_WITEMS = get_global_size(0);" 122 | "" 123 | " ratio = (n / NUM_GLOBAL_WITEMS);" // elements per thread 124 | " start = ratio * ID;" 125 | " stop = ratio * (ID+1);" 126 | "" 127 | " int i, j;" // will the compiler optimize this anyway? probably. 
128 | " for (i=0; i(default_device) << std::endl; 159 | exit(1); 160 | } 161 | 162 | // run the CPU code 163 | float CPUtime = timeAddVectorsCPU(n, k); 164 | 165 | // set up kernels and vectors for GPU code 166 | cl::CommandQueue queue(context, default_device); 167 | cl::Kernel add = cl::Kernel(program, "add"); 168 | cl::Kernel add_looped_1 = cl::Kernel(program, "add_looped_1"); 169 | cl::Kernel add_looped_2 = cl::Kernel(program, "add_looped_2"); 170 | cl::Kernel add_single = cl::Kernel(program, "add_single"); 171 | 172 | // construct vectors 173 | int A[n], B[n], C[n]; 174 | for (int i=0; i 2 | #include 3 | #include 4 | 5 | const char *kernel_code = 6 | "__kernel void multiply_by(__global int* A, const int c) {" 7 | " A[get_global_id(0)] = c * A[get_global_id(0)];" 8 | "}"; 9 | 10 | 11 | int factorial(int n) { 12 | return (n <= 1) ? 1 : n * factorial(n-1); 13 | } 14 | 15 | 16 | int main( void ) { 17 | // OpenCL related declarations 18 | cl_int err; 19 | cl_platform_id platform; 20 | cl_device_id device; 21 | cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; 22 | cl_context ctx; 23 | cl_program program; 24 | cl_command_queue queue; 25 | cl_event event = NULL; 26 | cl_kernel k_multiplyby; 27 | int i; 28 | 29 | // 30 | const size_t N = 1024; // vector size 31 | const int c_max = 5; // max value to iterate to 32 | const int coeff = factorial(c_max); 33 | 34 | int *A, *B, *C; // A is initial, B is result, C is expected result 35 | A = (int*) malloc(N * sizeof(*A)); 36 | B = (int*) malloc(N * sizeof(*B)); 37 | C = (int*) malloc(N * sizeof(*C)); 38 | for (i=0; i 2 | #include 3 | #include 4 | #ifdef __APPLE__ 5 | #include 6 | #else 7 | #include 8 | #endif 9 | 10 | using namespace std; 11 | using namespace cl; 12 | 13 | 14 | int factorial(int n) { 15 | return (n <= 1) ? 1 : n * factorial(n-1); 16 | } 17 | 18 | 19 | Platform getPlatform() { 20 | /* Returns the first platform found. */ 21 | std::vector all_platforms; 22 | Platform::get(&all_platforms); 23 | 24 | if (all_platforms.size()==0) { 25 | cout << "No platforms found. Check OpenCL installation!\n"; 26 | exit(1); 27 | } 28 | return all_platforms[0]; 29 | } 30 | 31 | 32 | Device getDevice(Platform platform, int i, bool display=false) { 33 | /* Returns the deviced specified by the index i on platform. 34 | * If display is true, then all of the platforms are listed. 35 | */ 36 | std::vector all_devices; 37 | platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); 38 | if(all_devices.size()==0){ 39 | cout << "No devices found. 
Check OpenCL installation!\n"; 40 | exit(1); 41 | } 42 | 43 | if (display) { 44 | for (int j=0; j().c_str()); 46 | } 47 | return all_devices[i]; 48 | } 49 | 50 | 51 | int main() { 52 | const int n = 1024; // size of vectors 53 | const int c_max = 5; // max value to iterate to 54 | const int coeff = factorial(c_max); 55 | 56 | int A[n], B[n], C[n]; // A is initial, B is result, C is expected result 57 | for (int i=0; i(default_device) << std::endl; 75 | exit(1); 76 | } 77 | 78 | Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(int) * n); 79 | CommandQueue queue(context, default_device); 80 | queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(int)*n, A); 81 | 82 | Kernel multiply_by = Kernel(program, "multiply_by"); 83 | multiply_by.setArg(0, buffer_A); 84 | 85 | for (int c=2; c<=c_max; c++) { 86 | multiply_by.setArg(1, c); 87 | queue.enqueueNDRangeKernel(multiply_by, NullRange, NDRange(n), NDRange(32)); 88 | } 89 | 90 | queue.enqueueReadBuffer(buffer_A, CL_TRUE, 0, sizeof(int)*n, B); 91 | 92 | if (std::equal(std::begin(B), std::end(B), std::begin(C))) 93 | cout << "Arrays are equal!" << endl; 94 | else 95 | cout << "Uh-oh, the arrays aren't equal!" << endl; 96 | 97 | return 0; 98 | } 99 | 100 | -------------------------------------------------------------------------------- /example03/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 6 | 7 | const char *kernelSource = 8 | "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \ 9 | "__kernel void mult(__global double *v) { \n" \ 10 | " int id; \n" \ 11 | " id = get_global_id(0); \n" \ 12 | " v[id] = 2*v[id]; \n" \ 13 | "} \n" \ 14 | "\n" ; 15 | 16 | int main( int argc, char* argv[] ) { 17 | // problem-related declarations 18 | unsigned int N = 128; 19 | size_t N_bytes = N * sizeof(double); 20 | 21 | // openCL declarations 22 | cl_platform_id platform; 23 | cl_device_id device_id; 24 | cl_context context; 25 | cl_command_queue queue; 26 | cl_program program; 27 | cl_kernel k_mult; 28 | 29 | // host version of v 30 | double *h_v; // real & imaginary parts 31 | h_v = (double*) malloc(N_bytes); 32 | 33 | // initialize v on host 34 | int i; 35 | for (i = 0; i < N; i++) { 36 | h_v[i] = i; 37 | } 38 | 39 | // global & local number of threads 40 | size_t globalSize, localSize; 41 | globalSize = N; 42 | localSize = 32; 43 | 44 | // show the extensions that are supported 45 | /* 46 | cl_char extensions[2048] = {0}; 47 | clGetDeviceInfo(device_id, CL_DEVICE_EXTENSIONS, sizeof(extensions), &extensions, NULL); 48 | printf("%s\n", extensions); 49 | */ 50 | 51 | // setup OpenCL stuff 52 | cl_int err; 53 | err = clGetPlatformIDs(1, &platform, NULL); 54 | err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); 55 | context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); 56 | queue = clCreateCommandQueue(context, device_id, 0, &err); 57 | program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); 58 | 59 | // Build the program executable 60 | err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 61 | if (err != CL_SUCCESS) { 62 | printf("building program failed\n"); 63 | if (err == CL_BUILD_PROGRAM_FAILURE) { 64 | size_t log_size; 65 | clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); 66 | char *log = (char *) malloc(log_size); 67 | clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); 68 | 
printf("%s\n", log); 69 | } 70 | } 71 | k_mult = clCreateKernel(program, "mult", &err); 72 | 73 | // create arrays on host and write them 74 | cl_mem d_v; 75 | d_v = clCreateBuffer(context, CL_MEM_READ_WRITE, N_bytes, NULL, NULL); 76 | err = clEnqueueWriteBuffer(queue, d_v, CL_TRUE, 0, N_bytes, h_v, 0, NULL, NULL); 77 | 78 | err = clSetKernelArg(k_mult, 0, sizeof(cl_mem), &d_v); 79 | 80 | err = clEnqueueNDRangeKernel(queue, k_mult, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); 81 | clFinish(queue); 82 | 83 | // transfer back 84 | clEnqueueReadBuffer(queue, d_v, CL_TRUE, 0, N_bytes, h_v, 0, NULL, NULL ); 85 | clFinish(queue); 86 | 87 | 88 | int correct = 1; 89 | for (i=0; i 2 | #include 3 | #include 4 | #include 5 | 6 | const char *kernelSource = 7 | "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \ 8 | "__kernel void mult(__global double *vR, __global double *vI) { \n" \ 9 | " int id; \n" \ 10 | " id = get_global_id(0); \n" \ 11 | " vR[id] = 2*vR[id]; \n" \ 12 | " vI[id] = 2*vI[id]; \n" \ 13 | "} \n" \ 14 | "\n" ; 15 | 16 | int main( int argc, char* argv[] ) { 17 | // problem-related declarations 18 | unsigned int N = 128; 19 | size_t N_bytes = N * sizeof(double); 20 | 21 | // openCL declarations 22 | cl_platform_id platform; 23 | cl_device_id device_id; 24 | cl_context context; 25 | cl_command_queue queue; 26 | cl_program program; 27 | cl_kernel k_mult; 28 | 29 | // clFFT declarations 30 | clfftPlanHandle planHandleForward, planHandleBackward; 31 | clfftDim dim = CLFFT_1D; 32 | size_t clLengths[1] = {N}; 33 | clfftSetupData fftSetup; 34 | clfftInitSetupData(&fftSetup); 35 | clfftSetup(&fftSetup); 36 | 37 | // host version of v 38 | double *h_vR, *h_vI; // real & imaginary parts 39 | h_vR = (double*) malloc(N_bytes); 40 | h_vI = (double*) malloc(N_bytes); 41 | 42 | // initialize v on host 43 | int i; 44 | for (i = 0; i < N; i++) { 45 | h_vR[i] = i; 46 | h_vI[i] = 2*i; 47 | } 48 | 49 | // global & local number of threads 50 | size_t globalSize, localSize; 51 | globalSize = N; 52 | localSize = 32; 53 | 54 | // setup OpenCL stuff 55 | cl_int err; 56 | err = clGetPlatformIDs(1, &platform, NULL); 57 | err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); 58 | context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); 59 | queue = clCreateCommandQueue(context, device_id, 0, &err); 60 | program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); 61 | 62 | // Build the program executable 63 | err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 64 | if (err != CL_SUCCESS) { 65 | printf("building program failed\n"); 66 | if (err == CL_BUILD_PROGRAM_FAILURE) { 67 | size_t log_size; 68 | clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); 69 | char *log = (char *) malloc(log_size); 70 | clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); 71 | printf("%s\n", log); 72 | } 73 | } 74 | k_mult = clCreateKernel(program, "mult", &err); 75 | 76 | // create arrays on host and write them 77 | cl_mem d_vR, d_vI; 78 | d_vR = clCreateBuffer(context, CL_MEM_READ_WRITE, N_bytes, NULL, NULL); 79 | d_vI = clCreateBuffer(context, CL_MEM_READ_WRITE, N_bytes, NULL, NULL); 80 | err = clEnqueueWriteBuffer(queue, d_vR, CL_TRUE, 0, N_bytes, h_vR, 0, NULL, NULL); 81 | err |= clEnqueueWriteBuffer(queue, d_vI, CL_TRUE, 0, N_bytes, h_vI, 0, NULL, NULL); 82 | 83 | // create forward plan and set its params 84 | clfftCreateDefaultPlan(&planHandleForward, context, dim, clLengths); 85 | 
clfftSetPlanPrecision(planHandleForward, CLFFT_DOUBLE); 86 | clfftSetLayout(planHandleForward, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); 87 | clfftSetResultLocation(planHandleForward, CLFFT_INPLACE); 88 | clfftBakePlan(planHandleForward, 1, &queue, NULL, NULL); 89 | 90 | // create backward plan and set its params 91 | clfftCreateDefaultPlan(&planHandleBackward, context, dim, clLengths); 92 | clfftSetPlanPrecision(planHandleBackward, CLFFT_DOUBLE); 93 | clfftSetLayout(planHandleBackward, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); 94 | clfftSetResultLocation(planHandleBackward, CLFFT_INPLACE); 95 | clfftBakePlan(planHandleBackward, 1, &queue, NULL, NULL); 96 | 97 | // set all of ze kernel args... 98 | err = clSetKernelArg(k_mult, 0, sizeof(cl_mem), &d_vR); 99 | err |= clSetKernelArg(k_mult, 1, sizeof(cl_mem), &d_vI); 100 | 101 | // cl_mem array allows for complex_planar transform 102 | cl_mem inputBuffers[2] = {0, 0}; 103 | inputBuffers[0] = d_vR; 104 | inputBuffers[1] = d_vI; 105 | 106 | // FFT data, apply psi, IFFT data 107 | clfftEnqueueTransform(planHandleForward, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &inputBuffers, NULL, NULL); 108 | clFinish(queue); 109 | 110 | err = clEnqueueNDRangeKernel(queue, k_mult, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); 111 | 112 | clfftEnqueueTransform(planHandleBackward, CLFFT_BACKWARD, 1, &queue, 0, NULL, NULL, &inputBuffers, NULL, NULL); 113 | 114 | // transfer back 115 | clEnqueueReadBuffer(queue, d_vR, CL_TRUE, 0, N_bytes, h_vR, 0, NULL, NULL ); 116 | clEnqueueReadBuffer(queue, d_vI, CL_TRUE, 0, N_bytes, h_vI, 0, NULL, NULL ); 117 | clFinish(queue); 118 | 119 | printf("[ "); 120 | for (i=0; i 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | static const char *kernelSource = 9 | "#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n" \ 10 | "__kernel void mult(__global double *v) { \n" \ 11 | " int id, v_re, v_im; \n" \ 12 | " id = get_global_id(0); \n" \ 13 | " v_re = 2*id; \n" \ 14 | " v_im = v_re + 1; \n" \ 15 | " \n" \ 16 | " v[v_re] = 2*v[v_re]; \n" \ 17 | " v[v_im] = 4*v[v_im]; \n" \ 18 | "} \n" \ 19 | "\n" ; 20 | 21 | 22 | int roundUpToNearest(int x, int n) { 23 | /* Rounds x UP to nearest multiple of n. */ 24 | int x_rem = x % n; 25 | if (x_rem == 0) 26 | return x; 27 | 28 | return x + (n - x_rem); 29 | } 30 | 31 | 32 | void checkIfArraysEqual(double *h_v, double *v, int N, double epsilon) { 33 | int arrays_equal = 1; 34 | int i; 35 | 36 | for (i=0; i epsilon) 39 | arrays_equal = 0; 40 | } 41 | 42 | if (arrays_equal) 43 | printf("Arrays are equal!\n"); 44 | else 45 | printf("Arrays are NOT equal!\n"); 46 | } 47 | 48 | 49 | int main( int argc, char* argv[] ) { 50 | /* This setup is a bit tricky. Since we're doing a real transform, CLFFT 51 | * requires N+2 elements in the array. This is because only N/2 + 1 numbers 52 | * are calculated, and since each number is complex, it requires 2 elements 53 | * for space. 54 | * 55 | * To avoid warp divergence, we want to avoid any conditionals in the 56 | * kernel. Thus we cannot check to see if the thread ID is even or odd to 57 | * act on a real number or imaginary number. To do this, one thread should 58 | * handle one complex number (one real, one imag), i.e. ID_j should handle 59 | * array elements j, j+1. 60 | * 61 | * But we also need the number of global items to be a multiple of 32 (warp 62 | * size). What we can do, for example, N = 128, is pad it by 2 (130), 63 | * divide it by 2 (65), round that UP to the nearest 32 (96), multiply that 64 | * by 2 (192). 
The kernel will operate on zeros, but it should be faster 65 | * than the scenario with warp divergence. */ 66 | 67 | unsigned int N = 4096; 68 | unsigned int N_pad = 2*roundUpToNearest( (N+2)/2, 32 ); 69 | size_t N_bytes = N_pad * sizeof(double); 70 | 71 | // openCL declarations 72 | cl_platform_id platform; 73 | cl_device_id device_id; 74 | cl_context context; 75 | cl_command_queue queue; 76 | cl_program program; 77 | cl_kernel k_mult; 78 | 79 | // clFFT declarations 80 | clfftPlanHandle planHandleForward, planHandleBackward; 81 | clfftDim dim = CLFFT_1D; 82 | size_t clLengths[1] = {N}; 83 | clfftSetupData fftSetup; 84 | clfftInitSetupData(&fftSetup); 85 | clfftSetup(&fftSetup); 86 | 87 | // host version of v 88 | double *h_v; 89 | h_v = (double*) malloc(N_bytes); 90 | 91 | // initialize v on host (GPU and CPU) 92 | int i; 93 | for (i = 0; i < N; i++) 94 | h_v[i] = i; 95 | 96 | // CPU TRANSFORM ---------------------------------------------------------- 97 | double *v; 98 | fftw_complex *V; 99 | int N_COMPLEX = N/2 + 1; 100 | int REAL = 0; 101 | int IMAG = 1; 102 | 103 | v = (double*) malloc(N * sizeof(double)); 104 | V = (fftw_complex*) malloc(N_COMPLEX * sizeof(fftw_complex)); 105 | 106 | fftw_plan fft = fftw_plan_dft_r2c_1d(N, v, V, FFTW_MEASURE); 107 | fftw_plan ifft = fftw_plan_dft_c2r_1d(N, V, v, FFTW_MEASURE); 108 | 109 | // initialize v here because otherwise fftw_execute will run before we 110 | // initialize the plan... for some reason. 111 | for (i=0; i