├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── c++
│   ├── CMakeLists.txt
│   ├── mcpi-mpi.cpp
│   ├── mcpi-par.cpp
│   └── mcpi-seq.cpp
├── cuda
│   ├── CMakeLists.txt
│   └── mcpi-cuda.cu
├── jupyter
│   └── mcpi.ipynb
├── python
│   └── mcpi.py
└── scripts
    ├── run-mcpi-cuda.sh
    ├── run-mcpi-mpi.sh
    ├── run-mcpi-par.sh
    └── run-mcpi-seq.sh

/.gitignore:
--------------------------------------------------------------------------------
build*
bin
*.out
.vscode
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.18...3.20 FATAL_ERROR)

project(vscode-hpc CXX)

add_subdirectory(c++)

include(CheckLanguage)
check_language(CUDA)
if (CMAKE_CUDA_COMPILER)
    add_subdirectory(cuda)
endif()
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Armin Sobhani

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
CXX=g++
MPICXX=mpicxx
CXX_FLAGS = -std=c++14 -O3 -Wall -DNDEBUG
BIN=bin
SRC=c++

dir_guard=@mkdir -p $(BIN)

all: mcpi-seq mcpi-par mcpi-mpi

mcpi-seq: $(SRC)/mcpi-seq.cpp
	$(dir_guard)
	@$(CXX) -o $(BIN)/$@ $^ $(CXX_FLAGS)

mcpi-par: $(SRC)/mcpi-par.cpp
	$(dir_guard)
	@$(CXX) -fopenmp -o $(BIN)/$@ $^ $(CXX_FLAGS)

mcpi-mpi: $(SRC)/mcpi-mpi.cpp
	$(dir_guard)
	@$(MPICXX) -o $(BIN)/$@ $^ $(CXX_FLAGS)

.PHONY: clean

clean:
	rm -f $(BIN)/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# vscode-hpc
This is sample code accompanying the SHARCNET General Interest Seminars entitled *Remote Development on HPC Clusters with VSCode*.
You can find recorded versions of the talks on the [SHARCNET YouTube channel](https://youtube.sharcnet.ca):

* [Part I](https://www.youtube.com/watch?v=u9k6HikDyqk)
* [Part II](https://www.youtube.com/watch?v=CsgBGpampvc)

## The setup

* [Visual Studio Code (VSCode)](https://code.visualstudio.com/) on the system you work on, along with the following extensions:
  - [C/C++ Extension Pack](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools-extension-pack)
  - [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python)
    * [Jupyter](https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter)
    * [Pylance](https://marketplace.visualstudio.com/items?itemName=ms-python.vscode-pylance) (*optional dependency*)
    * [Visual Studio IntelliCode](https://marketplace.visualstudio.com/items?itemName=VisualStudioExptTeam.vscodeintellicode) (*optional*)
  - [Nsight Visual Studio Code Edition](https://marketplace.visualstudio.com/items?itemName=NVIDIA.nsight-vscode-edition)
  - [Makefile Tools](https://marketplace.visualstudio.com/items?itemName=ms-vscode.makefile-tools) (*optional*)
  - [GitHub Pull Requests and Issues](https://marketplace.visualstudio.com/items?itemName=GitHub.vscode-pull-request-github) (*optional*)
  - [GitLens](https://marketplace.visualstudio.com/items?itemName=eamodio.gitlens) (*optional*)

And the following on the platform(s) where you want to do either *local* or *remote* development:
### The Alliance Clusters
For remote development on the
[Digital Research Alliance of Canada (the Alliance)](https://alliancecan.ca)
clusters, the following ```module``` command will do the trick; you will typically want to add it at the end of your ```~/.bashrc``` file:

```
module load cmake cuda scipy-stack/2022a ipykernel
```
### Linux
* C++ compiler supporting the ```C++14``` standard (e.g. ```gcc``` 9.3)
* [Python 3](https://www.python.org/downloads/)
* [Git](https://git-scm.com/download/linux) for *Linux*
* [CMake](https://cmake.org/) 3.18 or higher for *Linux*
* An MPI implementation (e.g. ```OpenMPI``` or ```MPICH```)
* [CUDA toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Linux) for *Linux*

### Windows
* [Visual Studio Community Edition](https://visualstudio.microsoft.com/vs/community/) with ```C++``` and ```Python``` support
* [Git](https://git-scm.com/download/win) for *Windows*
* [CMake](https://cmake.org/download/) 3.18 or higher for *Windows*
* [Windows Terminal](https://aka.ms/terminal) or [Windows Terminal Preview](https://aka.ms/terminal-preview)
* [MS-MPI](https://www.microsoft.com/en-us/download/details.aspx?id=100593) (both ```msmpisetup.exe``` and ```msmpisdk.msi```)
* [CUDA toolkit](https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64) for *Windows*

On Windows systems, if you want to do both local development on *Windows* and remote development on [WSL2](https://docs.microsoft.com/en-us/windows/wsl/), you have to first install the [NVIDIA drivers for WSL with CUDA and DirectML support](https://developer.nvidia.com/cuda/wsl/download) on *Windows* and then follow these [instructions](https://docs.nvidia.com/cuda/wsl-user-guide/index.html#setting-up-linux-dev-env) in order to install the ```CUDA toolkit``` on *WSL2*.

### macOS
* C++ compiler supporting the ```C++14``` standard (e.g. ```clang``` 3.4)
* [Python 3](https://www.python.org/downloads/)
* [Git](https://git-scm.com/download/mac) for *macOS*
* [CMake](https://cmake.org/download/) 3.18 or higher for *macOS*
* An MPI implementation (e.g. ```OpenMPI``` or ```MPICH```)
* [CUDA toolkit](https://developer.nvidia.com/nvidia-cuda-toolkit-developer-tools-mac-hosts) for *macOS*

## Get started
Just run *VSCode* on the system you work on, then select ```Clone Git Repository...``` from the ```Get Started``` page or type ```git: clone``` in the *command palette* (Ctrl+Shift+P or F1). Then paste ```https://github.com/sharcnet/vscode-hpc.git``` and hit ```Enter```.
--------------------------------------------------------------------------------
/c++/CMakeLists.txt:
--------------------------------------------------------------------------------
add_executable(mcpi-seq mcpi-seq.cpp)
target_compile_features(mcpi-seq PRIVATE cxx_std_14)
install(TARGETS mcpi-seq RUNTIME DESTINATION bin COMPONENT mcpi-seq)

find_package(OpenMP)
if (OpenMP_FOUND)
    add_executable(mcpi-par mcpi-par.cpp)
    target_compile_features(mcpi-par PRIVATE cxx_std_14)
    target_link_libraries(mcpi-par PUBLIC OpenMP::OpenMP_CXX)
    install(TARGETS mcpi-par RUNTIME DESTINATION bin COMPONENT mcpi-par)
endif()

find_package(MPI)
if (MPI_FOUND)
    add_executable(mcpi-mpi mcpi-mpi.cpp)
    target_compile_features(mcpi-mpi PRIVATE cxx_std_14)
    target_link_libraries(mcpi-mpi PUBLIC MPI::MPI_CXX)
    install(TARGETS mcpi-mpi RUNTIME DESTINATION bin COMPONENT mcpi-mpi)
endif()
--------------------------------------------------------------------------------
/c++/mcpi-mpi.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#include <random>

#include <mpi.h>

using namespace std;

typedef chrono::high_resolution_clock timer;

// get input from the command line for total number of tosses
size_t process_cmdline(int argc, char* argv[])
{
    if (argc > 2)
    {
        cout << "Usage: "
             << argv[0]
             << " [number of tosses]"
             << endl;
        return 0;
    }
    else if (1 == argc)
        return 10'000'000;
    else
        return atoll(argv[1]);
}

// perform Monte Carlo version of tossing darts at a board
size_t toss(size_t n, int size, int rank)
{
    size_t in{};
    std::random_device rx, ry;
    std::uniform_real_distribution<float> u(0, 1);

    for (size_t i{size_t(rank)}; i < n; i += size)
    {
        float x{u(rx)}, y{u(ry)};   // choose random x- and y-coordinates
        if (x * x + y * y <= 1.0)   // is point in circle?
            ++in;                   // increase counter
    }
    return in;
}

int main(int argc, char* argv[])
{
    // initialize MPI environment
    int size, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);   // get total number of processes
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   // get rank of current process

    // read total number of tosses from the command line
    size_t n_tosses = process_cmdline(argc, argv);
    if (0 == n_tosses)
        return -1;

    if (0 == rank)
        cout << "Monte-Carlo Pi Estimator\n"
             << "Method: MPI -- "
             << size << " process(es)\n"
             << "Number of tosses: " << n_tosses << endl;

    // run the simulation and time it...
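    // (each rank starts at its own index and strides by 'size', so the
    //  n_tosses darts are divided evenly across the MPI processes; the
    //  per-rank hit counts are then summed onto rank 0 by MPI_Reduce below)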

    timer::time_point start = timer::now();
    size_t n_partial = toss(n_tosses, size, rank);
    // calculate the sum of all the local counts 'n_partial' and store the
    // result in 'n_in_circle' on process 0
    size_t n_in_circle;
    MPI_Reduce(&n_partial, &n_in_circle, 1,
        MPI_UNSIGNED_LONG, MPI_SUM, 0, MPI_COMM_WORLD);

    // output the results
    if (0 == rank)
    {
        timer::duration elapsed = timer::now() - start;
        const long double pi = 3.141592653589793238462643L;   // 25-digit Pi
        long double pi_estimate = 4.0L * n_in_circle / n_tosses;
        cout << "Estimated Pi: " << fixed << setw(17) << setprecision(15)
             << pi_estimate << endl;
        cout << "Percent error: " << setprecision(3)
             << abs(pi_estimate - pi) / pi * 100.0 << '%' << endl;
        cout << "Elapsed time: "
             << chrono::duration_cast<chrono::milliseconds>(elapsed).count()
             << " ms" << endl;
    }

    MPI_Finalize();   // quit MPI
}
--------------------------------------------------------------------------------
/c++/mcpi-par.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#include <random>

#include <omp.h>

using namespace std;

typedef chrono::high_resolution_clock timer;

// get input from the command line for total number of tosses
size_t process_cmdline(int argc, char* argv[])
{
    if (argc > 3)
    {
        cout << "Usage: "
             << argv[0]
             << " [number of tosses] [number of threads]"
             << endl;
        return 0;
    }
    else if (1 == argc)
        return 10'000'000;
    else
        return atoll(argv[1]);
}

// perform Monte Carlo version of tossing darts at a board
size_t toss(size_t n)
{
    size_t in{};
    std::random_device rx, ry;
    std::uniform_real_distribution<float> u(0, 1);

    // distribute the workload over all threads and make a global reduction
    #pragma omp parallel for reduction(+ : in)
    for (int i = 0; i < int(n); ++i)
    {
        const float x{u(rx)}, y{u(ry)};   // choose random x- and y-coords
        if (x * x + y * y <= 1.0)         // is point in circle?
            ++in;                         // increase counter
    }

    return in;
}

int main(int argc, char* argv[])
{
    // set the number of threads
    auto n_threads = omp_get_max_threads();
    if (3 == argc)
    {
        int n = atoi(argv[2]);
        if (0 == n)
        {
            cout << "Usage: "
                 << argv[0]
                 << " [number of tosses] [number of threads]"
                 << endl;
            return -1;
        }
        if (n < n_threads)
            n_threads = n;
    }

    // read total number of tosses from the command line
    size_t n_tosses = process_cmdline(argc, argv);
    if (0 == n_tosses)
        return -1;
    cout << "Monte-Carlo Pi Estimator\n"
         << "Method: Parallel (OpenMP) -- "
         << n_threads << " thread(s)\n"
         << "Number of tosses: " << n_tosses << endl;

    // run the simulation and time it...
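    // (omp_set_num_threads() caps the team size used by the parallel-for
    //  inside toss(); the reduction(+ : in) clause there combines the
    //  per-thread hit counters into a single total)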
    omp_set_num_threads(n_threads);
    timer::time_point start = timer::now();
    size_t n_in_circle = toss(n_tosses);
    timer::duration elapsed = timer::now() - start;

    // output the results
    const long double pi = 3.141592653589793238462643L;   // 25-digit Pi
    long double pi_estimate = 4.0L * n_in_circle / n_tosses;
    cout << "Estimated Pi: " << fixed << setw(17) << setprecision(15)
         << pi_estimate << endl
         << "Percent error: " << setprecision(3)
         << abs(pi_estimate - pi) / pi * 100.0 << '%' << endl
         << "Elapsed time: "
         << chrono::duration_cast<chrono::milliseconds>(elapsed).count()
         << " ms" << endl;
}
--------------------------------------------------------------------------------
/c++/mcpi-seq.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#include <random>

using namespace std;

typedef chrono::high_resolution_clock timer;

// get input from the command line for total number of tosses
size_t process_cmdline(int argc, char* argv[])
{
    if (argc > 2)
    {
        cout << "Usage: "
             << argv[0]
             << " [number of tosses]"
             << endl;
        return 0;
    }
    else if (1 == argc)
        return 10'000'000;
    else
        return atoll(argv[1]);
}

// perform Monte Carlo version of tossing darts at a board
size_t toss(size_t n)
{
    size_t in{};
    std::random_device rx, ry;
    std::uniform_real_distribution<float> u(0, 1);
    for (size_t i{}; i < n; ++i)
    {
        float x{u(rx)}, y{u(ry)};   // choose random x- and y-coordinates
        if (x * x + y * y <= 1.0)   // is point in circle?
            ++in;                   // increase counter
    }
    return in;
}

int main(int argc, char* argv[])
{
    // read total number of tosses from the command line
    size_t n_tosses = process_cmdline(argc, argv);
    if (0 == n_tosses)
        return -1;
    cout << "Monte-Carlo Pi Estimator\n"
         << "Method: Sequential\n"
         << "Number of tosses: " << n_tosses << endl;

    // run the simulation and time it...
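    // (the fraction of darts landing inside the unit quarter circle
    //  approximates its area, pi/4, so pi is estimated as 4 * hits / tosses)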
    timer::time_point start = timer::now();
    size_t n_in_circle = toss(n_tosses);
    timer::duration elapsed = timer::now() - start;

    // output the results
    const long double pi = 3.141592653589793238462643L;   // 25-digit Pi
    long double pi_estimate = 4.0L * n_in_circle / n_tosses;
    cout << "Estimated Pi: " << fixed << setw(17) << setprecision(15)
         << pi_estimate << endl
         << "Percent error: " << setprecision(3)
         << abs(pi_estimate - pi) / pi * 100.0 << '%' << endl
         << "Elapsed time: "
         << chrono::duration_cast<chrono::milliseconds>(elapsed).count()
         << " ms" << endl;
}
--------------------------------------------------------------------------------
/cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
add_executable(mcpi-cuda mcpi-cuda.cu)
target_compile_options(mcpi-cuda PRIVATE
    "$<$<AND:$<CONFIG:Debug>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>")
target_link_libraries(mcpi-cuda PUBLIC CUDA::curand)
--------------------------------------------------------------------------------
/cuda/mcpi-cuda.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <chrono>
#include <vector>

#include <cuda_runtime.h>
#include <curand_kernel.h>

using namespace std;

typedef chrono::high_resolution_clock timer;

// check if there are any errors launching the kernel
#define cuda_error_check() { cuda_assert(__FILE__, __LINE__); }
inline void cuda_assert(const char *file, int line, bool abort = true)
{
    auto error = cudaGetLastError();
    if (error != cudaSuccess)
    {
        cerr << "CUDA error: "
             << cudaGetErrorString(error)
             << " (" << error << ") -- "
             << file << " -- line: "
             << line << endl;
        if (abort) exit(error);
    }
}

// get input from the command line for total number of tosses
size_t process_cmdline(int argc, char* argv[])
{
    if (argc > 3)
    {
        cout << "Usage: "
             << argv[0]
             << " [number of tosses] [number of threads]"
             << endl;
        return 0;
    }
    else if (1 == argc)
        return 10'000'000;
    else
        return atoll(argv[1]);
}

// device kernel to perform Monte Carlo version of tossing darts at a board
__global__ void cuda_toss(size_t n, size_t* in)
{
    size_t rank = threadIdx.x;
    size_t size = blockDim.x;

    // Initialize RNG
    curandState_t rng;
    curand_init(clock64(), threadIdx.x + blockIdx.x * blockDim.x, 0, &rng);

    in[rank] = 0;   // local number of points in circle
    for (size_t i = 0; i < n / size; ++i)
    {
        float x = curand_uniform(&rng);   // Random x position in [0,1]
        float y = curand_uniform(&rng);   // Random y position in [0,1]
        // if (x * x + y * y <= 1)        // is point in circle?
        //     ++in[rank];                // increase thread-local counter
        in[rank] += 1 - int(x * x + y * y);   // no conditional version (faster)
    }
}

int main(int argc, char* argv[])
{
    // querying device properties
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    cuda_error_check();

    // set the number of threads
    size_t n_threads = 128;
    if (3 == argc)
    {
        int n = atoi(argv[2]);
        if (0 == n)
        {
            cout << "Usage: "
                 << argv[0]
                 << " [number of tosses] [number of threads]"
                 << endl;
            return -1;
        }
        if (n <= prop.maxThreadsPerBlock)
            n_threads = n;
    }

    // read total number of tosses from the command line
    size_t n_tosses = process_cmdline(argc, argv);
    if (0 == n_tosses)
        return -1;

    cout << "Monte-Carlo Pi Estimator\n"
         << "Method: CUDA (GPU) -- "
         << n_threads << " thread(s)\n"
         << "Device name: " << prop.name
         << "\nNumber of tosses: " << n_tosses << endl;

    // run the simulation and time it...
    //------> start timer
    timer::time_point start = timer::now();

    // memory for thread local results
    size_t* in_device;
    cudaMalloc(&in_device, n_threads * sizeof(size_t));
    cuda_error_check();
    // start parallel Monte Carlo
    cuda_toss<<<1, n_threads>>>(n_tosses, in_device);
    cuda_error_check();

    // reducing...
    vector<size_t> in(n_threads);
    cudaMemcpy(
        in.data()
      , in_device
      , n_threads * sizeof(size_t)
      , cudaMemcpyDeviceToHost);
    cuda_error_check();
    cudaFree(in_device);
    size_t n_in_circle{0};
    for (size_t i{0}; i < n_threads; ++i)
        n_in_circle += in[i];

    timer::duration elapsed = timer::now() - start;
    //------> end timer

    // output the results
    const long double pi = 3.141592653589793238462643L;   // 25-digit Pi
    long double pi_estimate = 4.0L * n_in_circle / n_tosses;
    cout << "Estimated Pi: " << fixed << setw(17) << setprecision(15)
         << pi_estimate << endl
         << "Percent error: " << setprecision(3)
         << abs(pi_estimate - pi) / pi * 100.0 << '%' << endl
         << "Elapsed time: "
         << chrono::duration_cast<chrono::milliseconds>(elapsed).count()
         << " ms" << endl;
}
--------------------------------------------------------------------------------
/python/mcpi.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import numpy as np
import matplotlib.pyplot as plt
import math

from time import perf_counter

n_tosses = 10000

#<---- start timing
start = perf_counter()

x = np.random.rand(n_tosses)
y = np.random.rand(n_tosses)

n_in_circle = 0
for i in range(n_tosses):
    if (x[i]**2 + y[i]**2 <= 1):
        n_in_circle += 1

end = perf_counter()
#<---- end timing

pi_estimate = 4 * (n_in_circle / n_tosses)
print("Monte-Carlo Pi Estimator")
print("Estimated π: ", pi_estimate)
print("Percent error: ", 100 * math.fabs(math.pi - pi_estimate) / math.pi, "%")
print("Elapsed time: ", end - start, "seconds")

circle_x = x[np.sqrt(x**2 + y**2) <= 1]
circle_y = y[np.sqrt(x**2 + y**2) <= 1]

fig = plt.figure()
plot = fig.add_subplot(111)
plot.scatter(x, y, marker='.', color='blue')
plot.scatter(circle_x, circle_y, marker='.', color='red')

x = np.linspace(0, 1, 100)
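# trace the quarter-circle boundary x^2 + y^2 = 1 over the scatter plot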
y = np.sqrt(1 - x**2)
plot.plot(x, y, color='black')

plot.set_aspect(1.0)

plt.show()
--------------------------------------------------------------------------------
/scripts/run-mcpi-cuda.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --account=def-someuser
#SBATCH --time=5
#SBATCH --mem=4G
#SBATCH --gres=gpu:1
./mcpi-cuda 100000000
--------------------------------------------------------------------------------
/scripts/run-mcpi-mpi.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --account=def-someuser
#SBATCH --time=5
#SBATCH --ntasks=64               # number of MPI processes
#SBATCH --mem-per-cpu=1024M       # memory; default unit is megabytes
srun ./mcpi-mpi 100000000
--------------------------------------------------------------------------------
/scripts/run-mcpi-par.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --account=def-someuser
#SBATCH --time=5
#SBATCH --mem=4G
#SBATCH --cpus-per-task=16
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK
./mcpi-par 100000000 $SLURM_CPUS_PER_TASK
--------------------------------------------------------------------------------
/scripts/run-mcpi-seq.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#SBATCH --account=def-someuser
#SBATCH --time=5
#SBATCH --mem=4G
./mcpi-seq 100000000
--------------------------------------------------------------------------------
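A note on the job scripts above: each one runs its executable from the submission directory (e.g. ```./mcpi-seq```), and ```def-someuser``` is a placeholder for your own Slurm account. A minimal submission might look like the following sketch, assuming the binaries were built into ```bin/``` with the top-level Makefile and copied next to the scripts:

```
# from the repository root, after `make` has populated bin/
cp bin/mcpi-seq scripts/
cd scripts
sbatch run-mcpi-seq.sh        # submit the sequential job to Slurm
```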