├── .gitignore
├── Build_Library
    ├── README
    ├── bashrc_update
    ├── netCDF_build
    │   ├── install_netCDF.sh
    │   ├── netCDF_test.c
    │   └── run_netCDF_test.sh
    └── openmpi_build
    │   ├── MPI_test.c
    │   ├── install_openmpi.sh
    │   └── run_MPI_test.sh
├── Data_Analysis
    ├── README
    ├── SSW-preprocessing.ipynb
    ├── SSW.ipynb
    ├── SSWkmeans.m
    ├── covertype_cluster
    │   ├── analysis.ipynb
    │   ├── cluster_sample.m
    │   ├── cluster_sample.py
    │   ├── cluster_train_full.m
    │   ├── covertype.ipynb
    │   ├── figures
    │   │   ├── accuracy_graph.png
    │   │   ├── covertype_distribution.png
    │   │   ├── study_area_map.png
    │   │   ├── vis_label.png
    │   │   └── vis_pred.png
    │   └── preprocess.py
    ├── data
    │   ├── Label_matlab.bin
    │   ├── Label_py.bin
    │   └── SSWdata.bin
    ├── figures
    │   ├── PV.png
    │   ├── SSW.png
    │   ├── SSWsubset.png
    │   ├── T.png
    │   ├── intro1.png
    │   ├── k3_svalue.png
    │   ├── silScoreSubset.png
    │   └── svalue.png
    └── readData.py
├── Other_Image
    ├── Kmean_illustration
    │   └── Kmeans.gif
    └── pseudo.png
├── Parallel_Algorithm
    ├── Cuda
    │   ├── compile.sh
    │   ├── kmeans_cdf.cu
    │   ├── kmeans_txt.cu
    │   ├── test.txt
    │   └── test_multithreadPerBlock.sh
    ├── MPI
    │   ├── Kmean_mpi.c
    │   ├── compile.sh
    │   ├── test_hybrid.sh
    │   └── test_multiprocess.sh
    ├── OpenMP
    │   ├── Kmean_omp.c
    │   ├── compile.sh
    │   ├── pseudo.c
    │   └── test_multithread.sh
    ├── README
    ├── python_reference
    │   ├── Apply_Kmean.py
    │   ├── IO_util.py
    │   ├── Kmean_iris.py
    │   ├── check_SSWdata.py
    │   ├── check_results.py
    │   ├── convert_SSWdata.py
    │   └── make_fake_data.py
    ├── shared
    │   ├── make_2D_array.c
    │   ├── make_2D_array.h
    │   ├── math_util.c
    │   ├── math_util.h
    │   ├── ncdf_util.c
    │   ├── ncdf_util.h
    │   └── timing.h
    └── test_data
    │   ├── .gitignore
    │   └── README
├── README.md
├── Slides
    ├── CUDA_part.key
    ├── FinalPre.key
    ├── SSW part.key
    └── covertype part.key
├── Timing_Results
    ├── Blobs_smp20000_fea30_cls8.xlsx
    ├── info
    │   ├── compiler
    │   └── cpu
    ├── log
    │   ├── Blobs_Cuda.log
    │   ├── Blobs_MPI.log
    │   ├── Blobs_OpenMP.log
    │   └── Blobs_hybrid.log
    ├── plot_timing.ipynb
    └── plots
    │   ├── Cuda_scaling.jpg
    │   ├── MPI_scaling.jpg
    │   ├── OpenMP_scaling.jpg
    │   └── hybrid_scaling.jpg
└── _config.yml


/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.out
3 | *.nc
4 | __pycache__
5 | 


--------------------------------------------------------------------------------
/Build_Library/README:
--------------------------------------------------------------------------------
1 | Install OpenMPI and netCDF4 libraries
2 | 
3 | Tested on Amazon Linux AMI. Should also work on other machines with minor modifications.
4 | 


--------------------------------------------------------------------------------
/Build_Library/bashrc_update:
--------------------------------------------------------------------------------
1 | # new environment settings that need be added to bashrc after libraries are built
2 | 
3 | # Add netCDF lib to the search path
4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
5 | 
6 | # Add to mpicc to PATH
7 | export PATH=$PATH:/usr/local/openmpi/bin
8 | 


--------------------------------------------------------------------------------
/Build_Library/netCDF_build/install_netCDF.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ==================
 4 | # Install netCDF-C library
 5 | # Tested successfully on Amazon-Linux AMI
 6 | #
 7 | # Jiawei Zhuang 2017/4
 8 | # ==================
 9 | 
10 | # ==================
11 | # Note:
12 | # Use the zlib,HDF5,NetCDF4 versions specified in 
13 | # https://github.com/amznlabs/amazon-dsstne/blob/master/docs/getting_started/setup.md#openmpi-setup
14 | # but added more --prefix and include options according to
15 | # http://www.unidata.ucar.edu/software/netcdf/docs/getting_and_building_netcdf.html#build_default
16 | #
17 | # The older version(netcdf 4.1.3) seems much easier to install than the lastest version (netcdf 4.4.1) 
18 | # ==================
19 | 
20 | # ==================
21 | # for C compiler if not installed yet
22 | # ==================
23 | #sudo yum install gcc
24 | #sudo yum install gcc-c++
25 | #CC=gcc
26 | #CXX=g++
27 | 
28 | # ==================
29 | # make a new directory if not exist
30 | # ==================
31 | mkdir -p $HOME/lib
32 | cd $HOME/lib
33 | 
34 | # ==================
35 | # for zlib
36 | # ==================
37 | wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4/zlib-1.2.8.tar.gz
38 | tar xvf zlib-1.2.8.tar.gz
39 | cd zlib-1.2.8
40 | 
41 | # Build and install zlib
42 | ZDIR=/usr/local
43 | ./configure --prefix=${ZDIR}
44 | make check
45 | sudo make install
46 | 
47 | cd ..
48 | 
49 | # ==================
50 | # for HDF5 
51 | # The "make check" step takes 10~20 minutes
52 | # Some of the tests might fail, but doesn't affect netCDF functionality
53 | # ==================
54 | wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4/hdf5-1.8.12.tar.gz
55 | tar xvfz hdf5-1.8.12.tar.gz
56 | cd hdf5-1.8.12
57 | 
58 | # Build and install HDF5
59 | H5DIR=/usr/local
60 | ./configure --with-zlib=${ZDIR} --prefix=${H5DIR}
61 | make check
62 | sudo make install
63 | 
64 | cd ..
65 | 
66 | # ==================
67 | # for m4 if necessary
68 | # (https://geeksww.com/tutorials/libraries/m4/installation/installing_m4_macro_processor_ubuntu_linux.php)
69 | # ==================
70 | 
71 | # ==================
72 | # for netCDF4
73 | # The "make check" step takes 5~10 minutes
74 | # ==================
75 | 
76 | wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4.1.3.tar.gz
77 | tar xvf netcdf-4.1.3.tar.gz
78 | cd netcdf-4.1.3
79 | 
80 | #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${H5DIR}/lib
81 | 
82 | # Build and install netCDF-4. We don't need Fortran support (no gfortran installed)
83 | NCDIR=/usr/local
84 | CPPFLAGS=-I${H5DIR}/include LDFLAGS=-L${H5DIR}/lib ./configure --prefix=${NCDIR} --disable-fortran
85 | make check # will fail fortran check without "--disable-fortran" in the configure step
86 | sudo make install
87 | 
88 | # show the configure details
89 | nc-config --all
90 | 


--------------------------------------------------------------------------------
/Build_Library/netCDF_build/netCDF_test.c:
--------------------------------------------------------------------------------
 1 | #include <netcdf.h>
 2 | int main()
 3 | {
 4 |    int ncid;
 5 |    if (nc_create("tmp.nc", NC_NETCDF4, &ncid))
 6 |       return 1;
 7 |    if (nc_close(ncid))
 8 |       return 2;
 9 |    return 0;
10 | }
11 | 
12 | 
13 | 


--------------------------------------------------------------------------------
/Build_Library/netCDF_build/run_netCDF_test.sh:
--------------------------------------------------------------------------------
 1 | # all of them should pass (a single build should work for different compilers)
 2 | gcc -lnetcdf netCDF_test.c -o netCDF_test_by_gcc.out
 3 | ./netCDF_test_by_gcc.out
 4 | 
 5 | nvcc -lnetcdf netCDF_test.c -o netCDF_test_by_nvcc.out
 6 | ./netCDF_test_by_nvcc.out
 7 | 
 8 | pgcc -lnetcdf netCDF_test.c -o netCDF_test_by_pgcc.out
 9 | ./netCDF_test_by_pgcc.out
10 | 


--------------------------------------------------------------------------------
/Build_Library/openmpi_build/MPI_test.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h> /* printf and BUFSIZ defined there */
 2 | #include <stdlib.h> /* exit defined there */
 3 | #include <mpi.h> /* all MPI-2 functions defined there */
 4 | 
 5 | int main(argc, argv)
 6 |         int argc;
 7 |         char *argv[];
 8 |         {
 9 |         int rank, size, length;
10 |         char name[BUFSIZ];
11 | 
12 |         MPI_Init(&argc, &argv);
13 |         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
14 |         MPI_Comm_size(MPI_COMM_WORLD, &size);
15 |         MPI_Get_processor_name(name, &length);
16 | 
17 |         printf("%s: hello world from process %d of %d\n", name, rank, size);
18 | 
19 |         MPI_Finalize();
20 | 
21 |         exit(0);
22 | }
23 | 


--------------------------------------------------------------------------------
/Build_Library/openmpi_build/install_openmpi.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # ==================
 4 | # Install openmpi library
 5 | # Tested successfully on Amazon-Linux AMI
 6 | #
 7 | # Jiawei Zhuang 2017/4
 8 | # ==================
 9 | 
10 | # ==================
11 | # make a new directory if not exist
12 | # ==================
13 | mkdir -p $HOME/lib
14 | cd $HOME/lib
15 | 
16 | # ==================
17 | # openmpi build (make install takes many minutes)
18 | # Some of the "make check "tests might fail, 
19 | # but it doesn't affect basic MPI functionality
20 | # ==================
21 | 
22 | wget https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.0.tar.gz
23 | tar xvf openmpi-2.1.0.tar.gz 
24 | cd openmpi-2.1.0
25 | ./configure --prefix=/usr/local/openmpi
26 | make check
27 | sudo make install
28 | 


--------------------------------------------------------------------------------
/Build_Library/openmpi_build/run_MPI_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mpicc --version
4 | mpicc MPI_test.c -o MPI_test.out
5 | mpirun -np 4 ./MPI_test.out
6 | 


--------------------------------------------------------------------------------
/Data_Analysis/README:
--------------------------------------------------------------------------------
1 | Apply the k-mean algorithm to real data
2 | 


--------------------------------------------------------------------------------
/Data_Analysis/SSW-preprocessing.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "metadata": {
   3 |   "name": ""
   4 |  },
   5 |  "nbformat": 3,
   6 |  "nbformat_minor": 0,
   7 |  "worksheets": [
   8 |   {
   9 |    "cells": [
  10 |     {
  11 |      "cell_type": "code",
  12 |      "collapsed": false,
  13 |      "input": [
  14 |       "%matplotlib inline\n",
  15 |       "from netCDF4 import Dataset\n",
  16 |       "from sklearn import preprocessing\n",
  17 |       "import numpy as np\n",
  18 |       "import math\n",
  19 |       "import scipy\n",
  20 |       "import sys\n",
  21 |       "from sklearn.cluster import KMeans\n",
  22 |       "from sklearn.metrics import silhouette_samples, silhouette_score\n",
  23 |       "import matplotlib.cm as cm\n",
  24 |       "import matplotlib.pyplot as plt"
  25 |      ],
  26 |      "language": "python",
  27 |      "metadata": {},
  28 |      "outputs": [],
  29 |      "prompt_number": 1
  30 |     },
  31 |     {
  32 |      "cell_type": "code",
  33 |      "collapsed": false,
  34 |      "input": [
  35 |       "top_zn = 0\n",
  36 |       "bottom_zn = 18\n",
  37 |       "levels = bottom_zn-top_zn\n",
  38 |       "Z_lat = 79"
  39 |      ],
  40 |      "language": "python",
  41 |      "metadata": {},
  42 |      "outputs": [],
  43 |      "prompt_number": 224
  44 |     },
  45 |     {
  46 |      "cell_type": "code",
  47 |      "collapsed": false,
  48 |      "input": [
  49 |       "file_nm = '../data/T_11_1.nc'\n",
  50 |       "T = Dataset(file_nm,mode='r')\n",
  51 |       "print T.dimensions.keys()\n",
  52 |       "print T.variables.keys()\n",
  53 |       "lat = T.variables['lat'][:]\n",
  54 |       "lon = T.variables['lon'][:]\n",
  55 |       "lev = T.variables['lev'][:]\n",
  56 |       "T = T.variables['T'][:]\n",
  57 |       "T.shape\n",
  58 |       "print lev[top_zn:bottom_zn]\n",
  59 |       "lat = np.expand_dims(lat, axis=0)\n",
  60 |       "lat = np.expand_dims(lat, axis=0)\n",
  61 |       "lat = np.expand_dims(lat, axis=3)\n",
  62 |       "idx = np.where((lat>60)&(lat<=70))\n",
  63 |       "lat.shape"
  64 |      ],
  65 |      "language": "python",
  66 |      "metadata": {},
  67 |      "outputs": [
  68 |       {
  69 |        "output_type": "stream",
  70 |        "stream": "stdout",
  71 |        "text": [
  72 |         "[u'time', u'lev', u'lat', u'lon']\n",
  73 |         "[u'T', u'lat', u'lev', u'lon', u'time']\n",
  74 |         "[   1.244795    1.61285     2.079325    2.667425    3.404875    4.324575\n",
  75 |         "    5.4654      6.87285     8.599725   10.70705    13.26475    16.35175\n",
  76 |         "   20.05675    24.479      29.728      35.92325    43.19375    51.6775\n",
  77 |         "   61.5205     72.8745     85.65715   100.514695]"
  78 |        ]
  79 |       },
  80 |       {
  81 |        "output_type": "stream",
  82 |        "stream": "stdout",
  83 |        "text": [
  84 |         "\n"
  85 |        ]
  86 |       },
  87 |       {
  88 |        "metadata": {},
  89 |        "output_type": "pyout",
  90 |        "prompt_number": 225,
  91 |        "text": [
  92 |         "(1, 1, 96, 1)"
  93 |        ]
  94 |       }
  95 |      ],
  96 |      "prompt_number": 225
  97 |     },
  98 |     {
  99 |      "cell_type": "code",
 100 |      "collapsed": false,
 101 |      "input": [
 102 |       "Tmean1save = np.empty([1,levels])\n",
 103 |       "Tmean2save = np.empty([1,levels])\n",
 104 |       "Tmean3save = np.empty([1,levels])\n",
 105 |       "Tgradsave = np.empty([1,levels])\n",
 106 |       "Umean1save = np.empty([1,levels])\n",
 107 |       "Umean2save = np.empty([1,levels])\n",
 108 |       "ZAsave = np.empty([1,levels,3])\n",
 109 |       "Zphasesave = np.empty([1,levels,2])"
 110 |      ],
 111 |      "language": "python",
 112 |      "metadata": {},
 113 |      "outputs": [],
 114 |      "prompt_number": 226
 115 |     },
 116 |     {
 117 |      "cell_type": "code",
 118 |      "collapsed": false,
 119 |      "input": [
 120 |       "for year in range(11,60):\n",
 121 |       "    for mon in range(1,13):\n",
 122 |       "        \n",
 123 |       "        # Read in Temperature\n",
 124 |       "        file_nm = '../data/T_'+str(year)+'_'+str(mon)+'.nc'\n",
 125 |       "        T = Dataset(file_nm,mode='r')\n",
 126 |       "        T = T.variables['T'][:]\n",
 127 |       "        \n",
 128 |       "        # Calculte zonal averaged temperature for different latitudes and height\n",
 129 |       "        idx = np.where((lat>60)&(lat<=70))\n",
 130 |       "        Tmean1 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
 131 |       "        idx = np.where((lat>70)&(lat<=80))\n",
 132 |       "        Tmean2 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
 133 |       "        idx = np.where((lat>80)&(lat<=90))\n",
 134 |       "        Tmean3 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
 135 |       "        print Tmean3.shape\n",
 136 |       "        \n",
 137 |       "        # Calculate latitudinal temperature gradients\n",
 138 |       "        idx = np.where((lat>50)&(lat<=60))\n",
 139 |       "        Tmean4 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
 140 |       "        Tgrad = Tmean3-Tmean4\n",
 141 |       "        \n",
 142 |       "        # Calculate Zonal winds, zonally averaged for different latitudes and height\n",
 143 |       "        file_nm = '../data/U_'+str(year)+'_'+str(mon)+'.nc'\n",
 144 |       "        U = Dataset(file_nm,mode='r')\n",
 145 |       "        U = U.variables['U'][:]\n",
 146 |       "        idx = np.where((lat>60)&(lat<=70))\n",
 147 |       "        Umean1 = np.mean(np.sum(U[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
 148 |       "        idx = np.where((lat>70)&(lat<=80))\n",
 149 |       "        Umean2 = np.mean(np.sum(U[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
 150 |       "        \n",
 151 |       "        # Calculate Geopotential height\n",
 152 |       "        file_nm = '../data/Z3_'+str(year)+'_'+str(mon)+'.nc'\n",
 153 |       "        Z = Dataset(file_nm,mode='r')\n",
 154 |       "        Z = Z.variables['Z3'][:]\n",
 155 |       "        Z = Z[:,top_zn:bottom_zn,Z_lat,:]\n",
 156 |       "        # FFT to get wave number 1 and wave number 2 component\n",
 157 |       "        ZA = np.fft.fft(Z)\n",
 158 |       "        ZA = np.abs(ZA[:,:,0:3])\n",
 159 |       "        Zphase = np.angle(ZA[:,:,1:3])\n",
 160 |       "        \n",
 161 |       "        # Store features to save arrays\n",
 162 |       "        Tmean1save = np.append(Tmean1save,Tmean1,axis=0)\n",
 163 |       "        Tmean2save = np.append(Tmean2save,Tmean2,axis=0)\n",
 164 |       "        Tmean3save = np.append(Tmean3save,Tmean3,axis=0)\n",
 165 |       "        Tgradsave = np.append(Tgradsave,Tgrad,axis=0)\n",
 166 |       "        Umean1save = np.append(Umean1save,Umean1,axis=0)\n",
 167 |       "        Umean2save = np.append(Umean2save,Umean2,axis=0)\n",
 168 |       "        ZAsave = np.append(ZAsave,ZA,axis=0)\n",
 169 |       "        Zphasesave = np.append(Zphasesave,Zphase,axis=0)"
 170 |      ],
 171 |      "language": "python",
 172 |      "metadata": {},
 173 |      "outputs": [
 174 |       {
 175 |        "output_type": "stream",
 176 |        "stream": "stdout",
 177 |        "text": [
 178 |         "(31, 22)\n",
 179 |         "(28, 22)"
 180 |        ]
 181 |       },
 182 |       {
 183 |        "output_type": "stream",
 184 |        "stream": "stdout",
 185 |        "text": [
 186 |         "\n",
 187 |         "(31, 22)"
 188 |        ]
 189 |       },
 190 |       {
 191 |        "output_type": "stream",
 192 |        "stream": "stdout",
 193 |        "text": [
 194 |         "\n",
 195 |         "(31, 22)"
 196 |        ]
 197 |       },
 198 |       {
 199 |        "output_type": "stream",
 200 |        "stream": "stdout",
 201 |        "text": [
 202 |         "\n",
 203 |         "(28, 22)"
 204 |        ]
 205 |       },
 206 |       {
 207 |        "output_type": "stream",
 208 |        "stream": "stdout",
 209 |        "text": [
 210 |         "\n",
 211 |         "(31, 22)"
 212 |        ]
 213 |       },
 214 |       {
 215 |        "output_type": "stream",
 216 |        "stream": "stdout",
 217 |        "text": [
 218 |         "\n",
 219 |         "(31, 22)"
 220 |        ]
 221 |       },
 222 |       {
 223 |        "output_type": "stream",
 224 |        "stream": "stdout",
 225 |        "text": [
 226 |         "\n",
 227 |         "(28, 22)"
 228 |        ]
 229 |       },
 230 |       {
 231 |        "output_type": "stream",
 232 |        "stream": "stdout",
 233 |        "text": [
 234 |         "\n",
 235 |         "(31, 22)"
 236 |        ]
 237 |       },
 238 |       {
 239 |        "output_type": "stream",
 240 |        "stream": "stdout",
 241 |        "text": [
 242 |         "\n",
 243 |         "(31, 22)"
 244 |        ]
 245 |       },
 246 |       {
 247 |        "output_type": "stream",
 248 |        "stream": "stdout",
 249 |        "text": [
 250 |         "\n",
 251 |         "(28, 22)"
 252 |        ]
 253 |       },
 254 |       {
 255 |        "output_type": "stream",
 256 |        "stream": "stdout",
 257 |        "text": [
 258 |         "\n",
 259 |         "(31, 22)"
 260 |        ]
 261 |       },
 262 |       {
 263 |        "output_type": "stream",
 264 |        "stream": "stdout",
 265 |        "text": [
 266 |         "\n",
 267 |         "(31, 22)"
 268 |        ]
 269 |       },
 270 |       {
 271 |        "output_type": "stream",
 272 |        "stream": "stdout",
 273 |        "text": [
 274 |         "\n",
 275 |         "(28, 22)"
 276 |        ]
 277 |       },
 278 |       {
 279 |        "output_type": "stream",
 280 |        "stream": "stdout",
 281 |        "text": [
 282 |         "\n",
 283 |         "(31, 22)"
 284 |        ]
 285 |       },
 286 |       {
 287 |        "output_type": "stream",
 288 |        "stream": "stdout",
 289 |        "text": [
 290 |         "\n",
 291 |         "(31, 22)"
 292 |        ]
 293 |       },
 294 |       {
 295 |        "output_type": "stream",
 296 |        "stream": "stdout",
 297 |        "text": [
 298 |         "\n",
 299 |         "(28, 22)"
 300 |        ]
 301 |       },
 302 |       {
 303 |        "output_type": "stream",
 304 |        "stream": "stdout",
 305 |        "text": [
 306 |         "\n",
 307 |         "(31, 22)"
 308 |        ]
 309 |       },
 310 |       {
 311 |        "output_type": "stream",
 312 |        "stream": "stdout",
 313 |        "text": [
 314 |         "\n",
 315 |         "(31, 22)"
 316 |        ]
 317 |       },
 318 |       {
 319 |        "output_type": "stream",
 320 |        "stream": "stdout",
 321 |        "text": [
 322 |         "\n",
 323 |         "(28, 22)"
 324 |        ]
 325 |       },
 326 |       {
 327 |        "output_type": "stream",
 328 |        "stream": "stdout",
 329 |        "text": [
 330 |         "\n",
 331 |         "(31, 22)"
 332 |        ]
 333 |       },
 334 |       {
 335 |        "output_type": "stream",
 336 |        "stream": "stdout",
 337 |        "text": [
 338 |         "\n",
 339 |         "(31, 22)"
 340 |        ]
 341 |       },
 342 |       {
 343 |        "output_type": "stream",
 344 |        "stream": "stdout",
 345 |        "text": [
 346 |         "\n",
 347 |         "(28, 22)"
 348 |        ]
 349 |       },
 350 |       {
 351 |        "output_type": "stream",
 352 |        "stream": "stdout",
 353 |        "text": [
 354 |         "\n",
 355 |         "(31, 22)"
 356 |        ]
 357 |       },
 358 |       {
 359 |        "output_type": "stream",
 360 |        "stream": "stdout",
 361 |        "text": [
 362 |         "\n",
 363 |         "(31, 22)"
 364 |        ]
 365 |       },
 366 |       {
 367 |        "output_type": "stream",
 368 |        "stream": "stdout",
 369 |        "text": [
 370 |         "\n",
 371 |         "(28, 22)"
 372 |        ]
 373 |       },
 374 |       {
 375 |        "output_type": "stream",
 376 |        "stream": "stdout",
 377 |        "text": [
 378 |         "\n",
 379 |         "(31, 22)"
 380 |        ]
 381 |       },
 382 |       {
 383 |        "output_type": "stream",
 384 |        "stream": "stdout",
 385 |        "text": [
 386 |         "\n",
 387 |         "(31, 22)"
 388 |        ]
 389 |       },
 390 |       {
 391 |        "output_type": "stream",
 392 |        "stream": "stdout",
 393 |        "text": [
 394 |         "\n",
 395 |         "(28, 22)"
 396 |        ]
 397 |       },
 398 |       {
 399 |        "output_type": "stream",
 400 |        "stream": "stdout",
 401 |        "text": [
 402 |         "\n",
 403 |         "(31, 22)"
 404 |        ]
 405 |       },
 406 |       {
 407 |        "output_type": "stream",
 408 |        "stream": "stdout",
 409 |        "text": [
 410 |         "\n",
 411 |         "(31, 22)"
 412 |        ]
 413 |       },
 414 |       {
 415 |        "output_type": "stream",
 416 |        "stream": "stdout",
 417 |        "text": [
 418 |         "\n",
 419 |         "(28, 22)"
 420 |        ]
 421 |       },
 422 |       {
 423 |        "output_type": "stream",
 424 |        "stream": "stdout",
 425 |        "text": [
 426 |         "\n",
 427 |         "(31, 22)"
 428 |        ]
 429 |       },
 430 |       {
 431 |        "output_type": "stream",
 432 |        "stream": "stdout",
 433 |        "text": [
 434 |         "\n",
 435 |         "(31, 22)"
 436 |        ]
 437 |       },
 438 |       {
 439 |        "output_type": "stream",
 440 |        "stream": "stdout",
 441 |        "text": [
 442 |         "\n",
 443 |         "(28, 22)"
 444 |        ]
 445 |       },
 446 |       {
 447 |        "output_type": "stream",
 448 |        "stream": "stdout",
 449 |        "text": [
 450 |         "\n",
 451 |         "(31, 22)"
 452 |        ]
 453 |       },
 454 |       {
 455 |        "output_type": "stream",
 456 |        "stream": "stdout",
 457 |        "text": [
 458 |         "\n",
 459 |         "(31, 22)"
 460 |        ]
 461 |       },
 462 |       {
 463 |        "output_type": "stream",
 464 |        "stream": "stdout",
 465 |        "text": [
 466 |         "\n",
 467 |         "(28, 22)"
 468 |        ]
 469 |       },
 470 |       {
 471 |        "output_type": "stream",
 472 |        "stream": "stdout",
 473 |        "text": [
 474 |         "\n",
 475 |         "(31, 22)"
 476 |        ]
 477 |       },
 478 |       {
 479 |        "output_type": "stream",
 480 |        "stream": "stdout",
 481 |        "text": [
 482 |         "\n",
 483 |         "(31, 22)"
 484 |        ]
 485 |       },
 486 |       {
 487 |        "output_type": "stream",
 488 |        "stream": "stdout",
 489 |        "text": [
 490 |         "\n",
 491 |         "(28, 22)"
 492 |        ]
 493 |       },
 494 |       {
 495 |        "output_type": "stream",
 496 |        "stream": "stdout",
 497 |        "text": [
 498 |         "\n",
 499 |         "(31, 22)"
 500 |        ]
 501 |       },
 502 |       {
 503 |        "output_type": "stream",
 504 |        "stream": "stdout",
 505 |        "text": [
 506 |         "\n",
 507 |         "(31, 22)"
 508 |        ]
 509 |       },
 510 |       {
 511 |        "output_type": "stream",
 512 |        "stream": "stdout",
 513 |        "text": [
 514 |         "\n",
 515 |         "(28, 22)"
 516 |        ]
 517 |       },
 518 |       {
 519 |        "output_type": "stream",
 520 |        "stream": "stdout",
 521 |        "text": [
 522 |         "\n",
 523 |         "(31, 22)"
 524 |        ]
 525 |       },
 526 |       {
 527 |        "output_type": "stream",
 528 |        "stream": "stdout",
 529 |        "text": [
 530 |         "\n",
 531 |         "(31, 22)"
 532 |        ]
 533 |       },
 534 |       {
 535 |        "output_type": "stream",
 536 |        "stream": "stdout",
 537 |        "text": [
 538 |         "\n",
 539 |         "(28, 22)"
 540 |        ]
 541 |       },
 542 |       {
 543 |        "output_type": "stream",
 544 |        "stream": "stdout",
 545 |        "text": [
 546 |         "\n",
 547 |         "(31, 22)"
 548 |        ]
 549 |       },
 550 |       {
 551 |        "output_type": "stream",
 552 |        "stream": "stdout",
 553 |        "text": [
 554 |         "\n",
 555 |         "(31, 22)"
 556 |        ]
 557 |       },
 558 |       {
 559 |        "output_type": "stream",
 560 |        "stream": "stdout",
 561 |        "text": [
 562 |         "\n",
 563 |         "(28, 22)"
 564 |        ]
 565 |       },
 566 |       {
 567 |        "output_type": "stream",
 568 |        "stream": "stdout",
 569 |        "text": [
 570 |         "\n",
 571 |         "(31, 22)"
 572 |        ]
 573 |       },
 574 |       {
 575 |        "output_type": "stream",
 576 |        "stream": "stdout",
 577 |        "text": [
 578 |         "\n",
 579 |         "(31, 22)"
 580 |        ]
 581 |       },
 582 |       {
 583 |        "output_type": "stream",
 584 |        "stream": "stdout",
 585 |        "text": [
 586 |         "\n",
 587 |         "(28, 22)"
 588 |        ]
 589 |       },
 590 |       {
 591 |        "output_type": "stream",
 592 |        "stream": "stdout",
 593 |        "text": [
 594 |         "\n",
 595 |         "(31, 22)"
 596 |        ]
 597 |       },
 598 |       {
 599 |        "output_type": "stream",
 600 |        "stream": "stdout",
 601 |        "text": [
 602 |         "\n",
 603 |         "(31, 22)"
 604 |        ]
 605 |       },
 606 |       {
 607 |        "output_type": "stream",
 608 |        "stream": "stdout",
 609 |        "text": [
 610 |         "\n",
 611 |         "(28, 22)"
 612 |        ]
 613 |       },
 614 |       {
 615 |        "output_type": "stream",
 616 |        "stream": "stdout",
 617 |        "text": [
 618 |         "\n",
 619 |         "(31, 22)"
 620 |        ]
 621 |       },
 622 |       {
 623 |        "output_type": "stream",
 624 |        "stream": "stdout",
 625 |        "text": [
 626 |         "\n",
 627 |         "(31, 22)"
 628 |        ]
 629 |       },
 630 |       {
 631 |        "output_type": "stream",
 632 |        "stream": "stdout",
 633 |        "text": [
 634 |         "\n",
 635 |         "(28, 22)"
 636 |        ]
 637 |       },
 638 |       {
 639 |        "output_type": "stream",
 640 |        "stream": "stdout",
 641 |        "text": [
 642 |         "\n",
 643 |         "(31, 22)"
 644 |        ]
 645 |       },
 646 |       {
 647 |        "output_type": "stream",
 648 |        "stream": "stdout",
 649 |        "text": [
 650 |         "\n",
 651 |         "(31, 22)"
 652 |        ]
 653 |       },
 654 |       {
 655 |        "output_type": "stream",
 656 |        "stream": "stdout",
 657 |        "text": [
 658 |         "\n",
 659 |         "(28, 22)"
 660 |        ]
 661 |       },
 662 |       {
 663 |        "output_type": "stream",
 664 |        "stream": "stdout",
 665 |        "text": [
 666 |         "\n",
 667 |         "(31, 22)"
 668 |        ]
 669 |       },
 670 |       {
 671 |        "output_type": "stream",
 672 |        "stream": "stdout",
 673 |        "text": [
 674 |         "\n",
 675 |         "(31, 22)"
 676 |        ]
 677 |       },
 678 |       {
 679 |        "output_type": "stream",
 680 |        "stream": "stdout",
 681 |        "text": [
 682 |         "\n",
 683 |         "(28, 22)"
 684 |        ]
 685 |       },
 686 |       {
 687 |        "output_type": "stream",
 688 |        "stream": "stdout",
 689 |        "text": [
 690 |         "\n",
 691 |         "(31, 22)"
 692 |        ]
 693 |       },
 694 |       {
 695 |        "output_type": "stream",
 696 |        "stream": "stdout",
 697 |        "text": [
 698 |         "\n",
 699 |         "(31, 22)"
 700 |        ]
 701 |       },
 702 |       {
 703 |        "output_type": "stream",
 704 |        "stream": "stdout",
 705 |        "text": [
 706 |         "\n",
 707 |         "(28, 22)"
 708 |        ]
 709 |       },
 710 |       {
 711 |        "output_type": "stream",
 712 |        "stream": "stdout",
 713 |        "text": [
 714 |         "\n",
 715 |         "(31, 22)"
 716 |        ]
 717 |       },
 718 |       {
 719 |        "output_type": "stream",
 720 |        "stream": "stdout",
 721 |        "text": [
 722 |         "\n",
 723 |         "(31, 22)"
 724 |        ]
 725 |       },
 726 |       {
 727 |        "output_type": "stream",
 728 |        "stream": "stdout",
 729 |        "text": [
 730 |         "\n",
 731 |         "(28, 22)"
 732 |        ]
 733 |       },
 734 |       {
 735 |        "output_type": "stream",
 736 |        "stream": "stdout",
 737 |        "text": [
 738 |         "\n",
 739 |         "(31, 22)"
 740 |        ]
 741 |       },
 742 |       {
 743 |        "output_type": "stream",
 744 |        "stream": "stdout",
 745 |        "text": [
 746 |         "\n",
 747 |         "(31, 22)"
 748 |        ]
 749 |       },
 750 |       {
 751 |        "output_type": "stream",
 752 |        "stream": "stdout",
 753 |        "text": [
 754 |         "\n",
 755 |         "(28, 22)"
 756 |        ]
 757 |       },
 758 |       {
 759 |        "output_type": "stream",
 760 |        "stream": "stdout",
 761 |        "text": [
 762 |         "\n",
 763 |         "(31, 22)"
 764 |        ]
 765 |       },
 766 |       {
 767 |        "output_type": "stream",
 768 |        "stream": "stdout",
 769 |        "text": [
 770 |         "\n",
 771 |         "(31, 22)"
 772 |        ]
 773 |       },
 774 |       {
 775 |        "output_type": "stream",
 776 |        "stream": "stdout",
 777 |        "text": [
 778 |         "\n",
 779 |         "(28, 22)"
 780 |        ]
 781 |       },
 782 |       {
 783 |        "output_type": "stream",
 784 |        "stream": "stdout",
 785 |        "text": [
 786 |         "\n",
 787 |         "(31, 22)"
 788 |        ]
 789 |       },
 790 |       {
 791 |        "output_type": "stream",
 792 |        "stream": "stdout",
 793 |        "text": [
 794 |         "\n",
 795 |         "(31, 22)"
 796 |        ]
 797 |       },
 798 |       {
 799 |        "output_type": "stream",
 800 |        "stream": "stdout",
 801 |        "text": [
 802 |         "\n",
 803 |         "(28, 22)"
 804 |        ]
 805 |       },
 806 |       {
 807 |        "output_type": "stream",
 808 |        "stream": "stdout",
 809 |        "text": [
 810 |         "\n",
 811 |         "(31, 22)"
 812 |        ]
 813 |       },
 814 |       {
 815 |        "output_type": "stream",
 816 |        "stream": "stdout",
 817 |        "text": [
 818 |         "\n",
 819 |         "(31, 22)"
 820 |        ]
 821 |       },
 822 |       {
 823 |        "output_type": "stream",
 824 |        "stream": "stdout",
 825 |        "text": [
 826 |         "\n",
 827 |         "(28, 22)"
 828 |        ]
 829 |       },
 830 |       {
 831 |        "output_type": "stream",
 832 |        "stream": "stdout",
 833 |        "text": [
 834 |         "\n",
 835 |         "(31, 22)"
 836 |        ]
 837 |       },
 838 |       {
 839 |        "output_type": "stream",
 840 |        "stream": "stdout",
 841 |        "text": [
 842 |         "\n",
 843 |         "(31, 22)"
 844 |        ]
 845 |       },
 846 |       {
 847 |        "output_type": "stream",
 848 |        "stream": "stdout",
 849 |        "text": [
 850 |         "\n",
 851 |         "(28, 22)"
 852 |        ]
 853 |       },
 854 |       {
 855 |        "output_type": "stream",
 856 |        "stream": "stdout",
 857 |        "text": [
 858 |         "\n",
 859 |         "(31, 22)"
 860 |        ]
 861 |       },
 862 |       {
 863 |        "output_type": "stream",
 864 |        "stream": "stdout",
 865 |        "text": [
 866 |         "\n"
 867 |        ]
 868 |       }
 869 |      ],
 870 |      "prompt_number": 227
 871 |     },
 872 |     {
 873 |      "cell_type": "code",
 874 |      "collapsed": false,
 875 |      "input": [
 876 |       "if sys.byteorder=='little':\n",
 877 |       "    Tmean1save.byteswap(True)\n",
 878 |       "    Tmean2save.byteswap(True)\n",
 879 |       "    Tmean3save.byteswap(True)\n",
 880 |       "    Tgradsave.byteswap(True)\n",
 881 |       "    Umean1save.byteswap(True)\n",
 882 |       "    Umean2save.byteswap(True)\n",
 883 |       "    ZAsave.byteswap(True)\n",
 884 |       "    Zphasesave.byteswap(True)\n",
 885 |       "#Create the binary files of the input files\n",
 886 |       "filename=open(r'./Tmean1.bin','wb')\n",
 887 |       "Tmean1save.ravel().tofile(filename)\n",
 888 |       "filename.close()\n",
 889 |       "filename=open(r'./Tmean2.bin','wb')\n",
 890 |       "Tmean2save.ravel().tofile(filename)\n",
 891 |       "filename.close()\n",
 892 |       "filename=open(r'./Tmean3.bin','wb')\n",
 893 |       "Tmean3save.ravel().tofile(filename)\n",
 894 |       "filename.close()\n",
 895 |       "filename=open(r'./Tgrad.bin','wb')\n",
 896 |       "Tgradsave.ravel().tofile(filename)\n",
 897 |       "filename.close()\n",
 898 |       "filename=open(r'./Umean1.bin','wb')\n",
 899 |       "Umean1save.ravel().tofile(filename)\n",
 900 |       "filename.close()\n",
 901 |       "filename=open(r'./Umean2.bin','wb')\n",
 902 |       "Umean2save.ravel().tofile(filename)\n",
 903 |       "filename.close()\n",
 904 |       "filename=open(r'./ZA.bin','wb')\n",
 905 |       "ZAsave.ravel().tofile(filename)\n",
 906 |       "filename.close()\n",
 907 |       "filename=open(r'./Zphase.bin','wb')\n",
 908 |       "Zphasesave.ravel().tofile(filename)\n",
 909 |       "filename.close()"
 910 |      ],
 911 |      "language": "python",
 912 |      "metadata": {},
 913 |      "outputs": [],
 914 |      "prompt_number": 228
 915 |     },
 916 |     {
 917 |      "cell_type": "code",
 918 |      "collapsed": false,
 919 |      "input": [
 920 |       "nt = 365*49+1\n",
 921 |       "levels = 18\n",
 922 |       "file1=open('./data/Tmean1.bin','rb')\n",
 923 |       "Tmean1read=np.fromfile(file1)\n",
 924 |       "if sys.byteorder=='little':\n",
 925 |       "    Tmean1read.byteswap(True)\n",
 926 |       "Tmean1read=Tmean1read.reshape(nt,levels)\n",
 927 |       "\n",
 928 |       "file1=open('./data/Tmean2.bin','rb')\n",
 929 |       "Tmean2read=np.fromfile(file1)\n",
 930 |       "if sys.byteorder=='little':\n",
 931 |       "    Tmean2read.byteswap(True)\n",
 932 |       "Tmean2read=Tmean2read.reshape(nt,levels)\n",
 933 |       "\n",
 934 |       "file1=open('./data/Tmean3.bin','rb')\n",
 935 |       "Tmean3read=np.fromfile(file1)\n",
 936 |       "if sys.byteorder=='little':\n",
 937 |       "    Tmean3read.byteswap(True)\n",
 938 |       "Tmean3read=Tmean3read.reshape(nt,levels)\n",
 939 |       "\n",
 940 |       "file1=open('./data/Tgrad.bin','rb')\n",
 941 |       "Tgradread=np.fromfile(file1)\n",
 942 |       "if sys.byteorder=='little':\n",
 943 |       "    Tgradread.byteswap(True)\n",
 944 |       "Tgradread=Tgradread.reshape(nt,levels)\n",
 945 |       "\n",
 946 |       "file1=open('./data/Umean1.bin','rb')\n",
 947 |       "Umean1read=np.fromfile(file1)\n",
 948 |       "if sys.byteorder=='little':\n",
 949 |       "    Umean1read.byteswap(True)\n",
 950 |       "Umean1read=Umean1read.reshape(nt,levels)\n",
 951 |       "\n",
 952 |       "file1=open('./data/Umean2.bin','rb')\n",
 953 |       "Umean2read=np.fromfile(file1)\n",
 954 |       "if sys.byteorder=='little':\n",
 955 |       "    Umean2read.byteswap(True)\n",
 956 |       "Umean2read=Umean2read.reshape(nt,levels)\n",
 957 |       "\n",
 958 |       "file1=open('./data/ZA.bin','rb')\n",
 959 |       "ZAread=np.fromfile(file1)\n",
 960 |       "if sys.byteorder=='little':\n",
 961 |       "    ZAread.byteswap(True)\n",
 962 |       "ZAread=ZAread.reshape(nt,levels,3)\n",
 963 |       "\n",
 964 |       "file1=open('./data/Zphase.bin','rb')\n",
 965 |       "Zphaseread=np.fromfile(file1)\n",
 966 |       "if sys.byteorder=='little':\n",
 967 |       "    Zphaseread.byteswap(True)\n",
 968 |       "Zphaseread=Zphaseread.reshape(nt,levels,2)"
 969 |      ],
 970 |      "language": "python",
 971 |      "metadata": {},
 972 |      "outputs": [],
 973 |      "prompt_number": 2
 974 |     },
 975 |     {
 976 |      "cell_type": "code",
 977 |      "collapsed": false,
 978 |      "input": [
 979 |       "# Calculating Data Metric\n",
 980 |       "Tmean1read = Tmean1read[1:,:]-np.mean(Tmean1read[1:,:],axis=0)\n",
 981 |       "Tmean2read = Tmean2read[1:,:]-np.mean(Tmean2read[1:,:],axis=0)\n",
 982 |       "Tmean3read = Tmean3read[1:,:]-np.mean(Tmean3read[1:,:],axis=0)\n",
 983 |       "Tmean1tend = Tmean1read[7:,:]-Tmean1read[0:nt-8,:]\n",
 984 |       "Tmean2tend = Tmean2read[7:,:]-Tmean2read[0:nt-8,:]\n",
 985 |       "Tmean3tend = Tmean3read[7:,:]-Tmean3read[0:nt-8,:]\n",
 986 |       "Tgradread = Tgradread[1:,:]\n",
 987 |       "Tgradtend = Tgradread[7:,:]-Tgradread[0:nt-8,:]\n",
 988 |       "Umean1read = Umean1read[1:,:]\n",
 989 |       "Umean2read = Umean2read[1:,:]\n",
 990 |       "Umean1tend = Umean1read[7:,:]-Umean1read[0:nt-8,:]\n",
 991 |       "Umean2tend = Umean2read[7:,:]-Umean2read[0:nt-8,:]\n",
 992 |       "ZAread = ZAread[1:,:]\n",
 993 |       "ZA1read = ZAread[:,:,1]/ZAread[:,:,0]\n",
 994 |       "ZA2read = ZAread[:,:,2]/ZAread[:,:,0]\n",
 995 |       "Zphase1read = Zphaseread[1:,:,0]\n",
 996 |       "Zphase2read = Zphaseread[1:,:,1]"
 997 |      ],
 998 |      "language": "python",
 999 |      "metadata": {},
1000 |      "outputs": [],
1001 |      "prompt_number": 3
1002 |     },
1003 |     {
1004 |      "cell_type": "code",
1005 |      "collapsed": false,
1006 |      "input": [
1007 |       "X = np.concatenate((Tmean1read[7:,:],Tmean2read[7:,:],Tmean3read[7:,:],Tmean1tend,Tmean2tend,Tmean3tend,\n",
1008 |       "                    Tgradread[7:,:],Tgradtend,Umean1read[7:,:],Umean2read[7:,:],Umean1tend,Umean2tend,ZA1read[7:,:],ZA2read[7:,:]),axis=1)"
1009 |      ],
1010 |      "language": "python",
1011 |      "metadata": {},
1012 |      "outputs": [],
1013 |      "prompt_number": 4
1014 |     },
1015 |     {
1016 |      "cell_type": "code",
1017 |      "collapsed": false,
1018 |      "input": [
1019 |       "# Save data metric\n",
1020 |       "if sys.byteorder=='little':\n",
1021 |       "    X.byteswap(True)\n",
1022 |       "#Create the binary files of the input files\n",
1023 |       "filename=open(r'./SSWdata.bin','wb')\n",
1024 |       "X.ravel().tofile(filename)\n",
1025 |       "filename.close()"
1026 |      ],
1027 |      "language": "python",
1028 |      "metadata": {},
1029 |      "outputs": []
1030 |     }
1031 |    ],
1032 |    "metadata": {}
1033 |   }
1034 |  ]
1035 | }


--------------------------------------------------------------------------------
/Data_Analysis/SSWkmeans.m:
--------------------------------------------------------------------------------
 1 | nt = 365*49-7;
 2 | levels = 18;
 3 | fileID = fopen('data/SSWdata.bin');
 4 | X=fread(fileID,[nt*levels*14 1],'double');
 5 | X=reshape(swapbytes(X),[levels*14 nt]);
 6 | size(X)
 7 | tic
 8 | idx = kmeans(X',3,'Distance','correlation');
 9 | toc
10 | tic
11 | [s,h]=silhouette(X',idx,'correlation');
12 | toc
13 | 


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/cluster_sample.m:
--------------------------------------------------------------------------------
 1 | M = csvread('data/observation_sample.csv');
 2 | s = size(M);
 3 | M1= M(2:s,:);
 4 | score = zeros(30,1);
 5 | for i = 5:30
 6 |     disp(i)
 7 |     tic
 8 |     idx = kmeans(M1,i,'Distance','sqEuclidean');
 9 |     toc   
10 |     tic
11 |     [s,h]=silhouette(M1,idx,'sqEuclidean');
12 |     toc
13 |     score(i)=mean(s);
14 | end    
15 | disp(score);


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/cluster_sample.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import math
  4 | import random
  5 | 
  6 | from sklearn.cluster import KMeans
  7 | from sklearn.metrics import silhouette_samples, silhouette_score
  8 | from sklearn import mixture
  9 | 
 10 | X_normed=pd.read_csv("data/observation_sample.csv")
 11 | Y=pd.read_csv("data/label_sample.csv")
 12 | 
 13 | #use silhouette_values to choose best K
 14 | sil=[]
 15 | start = 5
 16 | end = 7
 17 | for n_clusters in range(start,end):
 18 |     kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X_normed)
 19 |     cluster_labels=kmeans.labels_
 20 |     silhouette_values = silhouette_samples(X_normed,cluster_labels)
 21 | 
 22 |     silhouette_avg = silhouette_score(X_normed, cluster_labels)
 23 |     print("For n_clusters =", n_clusters,
 24 |               "The average silhouette_score is :", silhouette_avg)
 25 |     sil.append(silhouette_avg)
 26 | 
 27 | 
 28 | 
 29 | K = start+np.array(sil).argmax()
 30 | 
 31 | #Now we've found the K, do the clustering again
 32 | kmeans = KMeans(n_clusters=K, random_state=0).fit(X_normed)
 33 | cluster_labels=kmeans.labels_
 34 | silhouette_values = silhouette_samples(X_normed,cluster_labels)
 35 | 
 36 | silhouette_avg = silhouette_score(X_normed, cluster_labels)
 37 | print("For n_clusters =", K,
 38 |           "The average silhouette_score is :", silhouette_avg)
 39 | 
 40 | 
 41 | #find the purest(has the highest percentage for a cluster) cluster for the 7 labels, use their center as our final cluster center so that future data points could
 42 | #be classified
 43 | 
 44 | #build a table so that all the percentage could be recorded
 45 | #cluster Number  label_1  label_2   ...   label_7   
 46 | #0
 47 | #1
 48 | #...
 49 | #K-1 
 50 | 
 51 | table = pd.DataFrame(np.zeros((K,8)),columns=['label_1', 'label_2', 'label_3', 'label_4','label_5','label_6','label_7','size'])
 52 | 
 53 | 
 54 | pred = pd.DataFrame(data = kmeans.labels_, columns=['cluster'])
 55 | for k in range(K):
 56 |     for i in range(1,8):  
 57 |         c = Y.iloc[pred[pred['cluster']==k].index]
 58 |         table.loc[k,'label_'+str(i)] = c[c==i].shape[0]
 59 | 
 60 | 
 61 | # print(table)
 62 | 
 63 | 
 64 | target = open("data/output.txt", 'w')
 65 | target.write(str(K))
 66 | target.write("\n")
 67 | target.write(str(sil))
 68 | target.write("\n")
 69 | 
 70 | 
 71 | 
 72 | pred.to_csv("data/pred_labels.csv")
 73 | #select 7 clusters, set their centers for use
 74 | #1: 7
 75 | #2: 4
 76 | #3: 22
 77 | #4: 23
 78 | #5: 0
 79 | #6: 3
 80 | #7: 24
 81 | # selected_cluster  = [7,4,22,23,0,3,24]
 82 | # centers = kmeans.cluster_centers_[selected_cluster]
 83 | 
 84 | 
 85 | 
 86 | #test our classification accuracy 
 87 | 
 88 | 
 89 | 
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/cluster_train_full.m:
--------------------------------------------------------------------------------
1 | M = csvread('data/observation_train_full.csv');
2 | s = size(M);
3 | M1= M(2:s,:);
4 | tic
5 | idx = kmeans(M1,23);
6 | toc 


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/figures/accuracy_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/accuracy_graph.png


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/figures/covertype_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/covertype_distribution.png


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/figures/study_area_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/study_area_map.png


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/figures/vis_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/vis_label.png


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/figures/vis_pred.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/vis_pred.png


--------------------------------------------------------------------------------
/Data_Analysis/covertype_cluster/preprocess.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import math
 4 | import random
 5 | 
 6 | df= pd.read_csv("data/covtype_data.csv", header=None)
 7 | 
 8 | #normalize
 9 | # X = df.iloc[:, :-1]
10 | # X_normed = X / X.max(axis=0)
11 | # Y = df.iloc[: ,-1]
12 | # X_normed = X_normed.fillna(0)
13 | 
14 | # X_normed.to_csv("data/observation_full.csv", index=False)
15 | # Y.to_csv("data/label_full.csv", index=False, header=True)
16 | 
17 | 
18 | 
19 | 
20 | sample = pd.DataFrame(columns = df.columns)
21 | #choose 10% for each group 
22 | for i in range(1,8):
23 |     subset = df[df[54]==i]
24 |     rows = random.sample(subset.index, int(subset.shape[0]*0.05))
25 |     sample = sample.append(df.ix[rows],ignore_index=True)
26 | 
27 | 
28 | #normalize
29 | X = sample.iloc[:, :-1]
30 | X_normed = X / X.max(axis=0)
31 | Y = sample.iloc[: ,-1]
32 | X_normed = X_normed.fillna(0)
33 | 
34 | X_normed.to_csv("data/observation_sample.csv", index=False)
35 | 
36 | Y.to_csv("data/label_sample.csv", index=False, header=True)


--------------------------------------------------------------------------------
/Data_Analysis/data/SSWdata.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/data/SSWdata.bin


--------------------------------------------------------------------------------
/Data_Analysis/figures/PV.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/PV.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/SSW.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/SSW.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/SSWsubset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/SSWsubset.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/T.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/T.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/intro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/intro1.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/k3_svalue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/k3_svalue.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/silScoreSubset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/silScoreSubset.png


--------------------------------------------------------------------------------
/Data_Analysis/figures/svalue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/svalue.png


--------------------------------------------------------------------------------
/Data_Analysis/readData.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import numpy as np
 3 | ndata = 17878
 4 | nfeatures = 252
 5 | # Read data points
 6 | file1=open('data/SSWdata.bin','rb')
 7 | X=np.fromfile(file1)
 8 | if sys.byteorder=='little':
 9 |     X.byteswap(True)
10 | X=X.reshape(ndata,nfeatures)
11 | 
12 | # Read python label
13 | file1=open('data/Label_py.bin','rb')
14 | Y_py=np.fromfile(file1,np.int32)
15 | if sys.byteorder=='little':
16 |     Y_py.byteswap(True)
17 | 
18 | # Read matlab label
19 | file1=open('data/Label_matlab.bin','rb')
20 | Y_matlab=np.fromfile(file1,np.int32)
21 | 
22 | 


--------------------------------------------------------------------------------
/Other_Image/Kmean_illustration/Kmeans.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Other_Image/Kmean_illustration/Kmeans.gif


--------------------------------------------------------------------------------
/Other_Image/pseudo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Other_Image/pseudo.png


--------------------------------------------------------------------------------
/Parallel_Algorithm/Cuda/compile.sh:
--------------------------------------------------------------------------------
1 | export PATH=/usr/local/cuda-7.5/bin${PATH:+:${PATH}}
2 | export LD_LIBRARY_PATH=/usr/local/cuda-7.5/lib64\${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
3 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
4 | 
5 | nvcc -lnetcdf kmeans_cdf.cu
6 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/Cuda/kmeans_cdf.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <iostream>
  4 | #include <fstream>
  5 | #include <string>
  6 | #include <vector>
  7 | #include <sstream>
  8 | #include <time.h>
  9 | #include <sys/time.h>
 10 | extern "C" {
 11 |   #include <netcdf.h>
 12 | }
 13 | 
 14 | using namespace std;
 15 | 
 16 | // #define FAKE_DATA "../test_data/Blobs_smp20000_fea30_cls8.nc"
 17 | #define ERRCODE 2
 18 | #define ERR(e) {printf("Error: %s\n", nc_strerror(e)); exit(ERRCODE);}
 19 | 
 20 | double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart4d, iStart5;
 21 | double iElaps1=0, iElaps2=0, iElaps3a=0, iElaps3b=0, iElaps4=0, iElaps5=0;
 22 | // Hold configurations for Kmeans
 23 | struct Info {
 24 |   int     numPoints;
 25 |   int     dim;
 26 |   int     numCentroids;
 27 |   int     numRepeats;
 28 |   int    *belongs;
 29 |   float **points;
 30 |   float **centroids;
 31 |   float **guess;
 32 |   int     thresholdLoops;
 33 |   float   thresholdFraction;
 34 |   int     threadPerBlock;
 35 | };
 36 | 
 37 | // ************************** Utils ************************** //
 38 | 
 39 | float** Make2DFloatArray(int rows, int cols) {
 40 |     float *data = (float *)malloc(rows*cols*sizeof(float));
 41 |     float **array= (float **)malloc(rows*sizeof(float*));
 42 |     for (int i=0; i<rows; i++)
 43 |         array[i] = &(data[cols*i]);
 44 | 
 45 |     return array;
 46 | }
 47 | int** Make2DIntArray(int rows, int cols) {
 48 |     int *data = (int *)malloc(rows*cols*sizeof(int));
 49 |     int **array= (int **)malloc(rows*sizeof(int*));
 50 |     for (int i=0; i<rows; i++)
 51 |         array[i] = &(data[cols*i]);
 52 | 
 53 |     return array;
 54 | }
 55 | 
 56 | int readX(char* FILE_NAME, float*** p_X,int*** p_GUESS, int* p_N_samples,int* p_N_features, int* p_N_clusters, int* p_N_repeat) {
 57 |     int ncid, varid,dimid;
 58 |     int retval;
 59 |     size_t N_temp;
 60 | 
 61 |     // printf("Reading data...\n");
 62 | 
 63 |     /* Open the file. NC_NOWRITE tells netCDF we want read-only access
 64 |      * to the file.*/
 65 |     if ((retval = nc_open(FILE_NAME, NC_NOWRITE, &ncid)))
 66 |        ERR(retval);
 67 | 
 68 |     /* Get the size of the data for dynamical allocation*/
 69 |     nc_inq_dimid(ncid,"N_samples",&dimid);
 70 |     nc_inq_dimlen(ncid,dimid,&N_temp);
 71 |     *p_N_samples = (int)N_temp;
 72 |     // printf("Number of samples: %d \n",*p_N_samples);
 73 | 
 74 |     nc_inq_dimid(ncid,"N_features",&dimid);
 75 |     nc_inq_dimlen(ncid,dimid,&N_temp);
 76 |     *p_N_features = (int)N_temp;
 77 |     // printf("Number of features: %d \n",*p_N_features);
 78 | 
 79 |     nc_inq_dimid(ncid,"N_clusters",&dimid);
 80 |     nc_inq_dimlen(ncid,dimid,&N_temp);
 81 |     *p_N_clusters = (int)N_temp;
 82 |     // printf("Number of clusters: %d \n",*p_N_clusters);
 83 | 
 84 |     nc_inq_dimid(ncid,"N_repeat",&dimid);
 85 |     nc_inq_dimlen(ncid,dimid,&N_temp);
 86 |     *p_N_repeat = (int)N_temp;
 87 |     // printf("Number of repeated runs: %d \n",*p_N_repeat);
 88 | 
 89 |      /* Get the varid of the data variable, based on its name. */
 90 |     if ((retval = nc_inq_varid(ncid, "X", &varid)))
 91 |        ERR(retval);
 92 |     /* Read the data. */
 93 |     *p_X = Make2DFloatArray(*p_N_samples,*p_N_features);
 94 |     if ((retval = nc_get_var_float(ncid, varid, (*p_X)[0])))
 95 |        ERR(retval);
 96 | 
 97 |      /* Initial Guess*/
 98 |     if ((retval = nc_inq_varid(ncid, "GUESS", &varid)))
 99 |        ERR(retval);
100 |     *p_GUESS = Make2DIntArray(*p_N_repeat,*p_N_clusters);
101 |     if ((retval = nc_get_var_int(ncid, varid, (*p_GUESS)[0])))
102 |        ERR(retval);
103 | 
104 |     /*close the netcdf file*/
105 |     if ((retval = nc_close(ncid) ))
106 |        ERR(retval);
107 | 
108 |     // printf("Reading data finished. \n");
109 | 
110 |     return 0;
111 |  }
112 | 
113 | 
114 | double cpuSecond() {
115 |     struct timeval tp;
116 |     gettimeofday(&tp,NULL);
117 |     return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
118 | }
119 | 
120 | static inline int nextPowerOfTwo(int v) {
121 |   int res = v;
122 |   for (int i = 1; i <= 16; i *= 2) {
123 |     res |= res >> i;
124 |   }
125 |   return res + 1;
126 | }
127 | 
128 | float** make2DArray(int x, int y) {
129 |   float **res = (float **)malloc(x * sizeof(float *));
130 | 
131 |   // for (int i = 0; i < x; i++) {
132 |   //   res[i] = (float *)malloc(y * sizeof(float));
133 |   // }
134 |   res[0] = (float *)malloc(x * y * sizeof(float));
135 |   for (size_t i = 1; i < x; i++) res[i] = res[i-1] + y;
136 |   for (size_t i = 0; i < x; i++) {
137 |     for (size_t j = 0; j < y; j++) {
138 |       res[i][j] = 0.0;
139 |     }
140 |   }
141 |   return res;
142 | }
143 | 
144 | void invert2DArray(float **A, float **B, int x, int y) {
145 |   for (int i = 0; i < x; i++) {
146 |     for (int j = 0; j < y; j++) {
147 |       A[i][j] = B[j][i];
148 |     }
149 |   }
150 | }
151 | 
152 | void copy2DArray(float **A, float **B, int x, int y) {
153 |   for (int i = 0; i < x; i++) {
154 |     for (int j = 0; j < y; j++) {
155 |       A[i][j] = B[i][j];
156 |     }
157 |   }
158 | }
159 | 
160 | // ************************** Utils ************************** //
161 | 
162 | __host__ __device__ inline static float
163 |     computeDist(Info* info, int pointId, int centroidId, int distType, float *gPoints, float *gCentroids) {
164 |   float res = 0;
165 |   if (distType == 0) {
166 |     for (int i = 0; i < info->dim; i++) {
167 |       res +=
168 |         (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]) *
169 |         (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]);
170 |     }
171 |   }
172 |   return res;
173 | }
174 | 
175 | // Use reduction to compute the sum of an array
176 | // Refer to
177 | // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
178 | __global__ static void reduce(int *g_idata, int l1, int l2) {
179 |   extern __shared__ unsigned int sdata[];
180 |   unsigned int tid = threadIdx.x;
181 | 
182 |   if (tid < l1) {
183 |     sdata[tid] = g_idata[tid];
184 |   } else {
185 |     sdata[tid] = 0;
186 |   }
187 |   __syncthreads();
188 | 
189 |   // Parallel Reduction (l2 must be power of 2)
190 |   for (unsigned int s = l2 / 2; s > 0; s >>= 1) {
191 |     if (tid < s)     {
192 |       sdata[tid] += sdata[tid + s];
193 |     }
194 |     __syncthreads();
195 |   }
196 | 
197 |   if (tid == 0) {
198 |     g_idata[0] = sdata[0];
199 |   }
200 | }
201 | 
202 | __global__ static void nearestCentroid(int *blockResult, int *gBelongs, float *gPoints, float *gCentroids, Info *gInfo) {
203 | 
204 |   int pointId = blockDim.x * blockIdx.x + threadIdx.x;
205 |   if (pointId >= (gInfo->numPoints)) return;
206 | 
207 |   // For test on test.txt
208 |   // printf("Thread: %d - %.2f,  %.2f, %.2f,  %.2f \n", pointId, gCentroids[0], gCentroids[1], gCentroids[2], gCentroids[3]);
209 |   // printf("Thread: %d - %.2f,  %.2f, %.2f,  %.2f, %.2f,  %.2f, %.2f,  %.2f \n",
210 |   //          pointId, gPoints[0], gPoints[1], gPoints[2], gPoints[3], gPoints[4], gPoints[5], gPoints[6], gPoints[7]);
211 | 
212 |   // Get the minimum distance
213 |   float mDist = computeDist(gInfo, pointId, 0, 0, gPoints, gCentroids);
214 | 
215 |   int tmpIdx = 0;
216 |   int numCentroids = gInfo->numCentroids;
217 |   for (int i = 0; i < numCentroids; i++) {
218 |     float tmpDist = computeDist(gInfo, pointId, i, 0, gPoints, gCentroids);
219 |     if (tmpDist < mDist) {
220 |       mDist  = tmpDist;
221 |       tmpIdx = i;
222 |     }
223 |   }
224 | 
225 |   // use reduction to add the total number of changes (change from one centroid to another) in this block
226 |   extern __shared__ int sdata2[];
227 |   sdata2[threadIdx.x] = 0;
228 |   if (gBelongs[pointId] != tmpIdx) {
229 |     sdata2[threadIdx.x] = 1;
230 |   }
231 |   gBelongs[pointId] = tmpIdx;
232 |   __syncthreads();
233 | 
234 |   // Reduction
235 |   for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
236 |     if (threadIdx.x < s) {
237 |       sdata2[threadIdx.x] += sdata2[threadIdx.x + s];
238 |     }
239 |     __syncthreads();
240 |   }
241 | 
242 |   // Put the sum to the location corresbonding to current block
243 |   if (threadIdx.x == 0) {
244 |     blockResult[blockIdx.x] = sdata2[0];
245 |   }
246 | }
247 | 
248 | void processData(char *fileName, Info *info, int i_repeat) {
249 |   float **X;
250 |   int   **GUESS;
251 | 
252 |   int N_samples, N_features, N_clusters, N_repeat;
253 | 
254 |   readX(fileName,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat);
255 | 
256 |   // cout << N_samples << "," <<  N_features << ","  <<  N_clusters << "," << N_repeat << '\n';
257 | 
258 |   // Test purpose
259 |   // N_samples  = 4;
260 |   // N_features = 2;
261 |   // N_clusters = 2;
262 |   // N_repeat   = 1;
263 | 
264 |   info->numPoints         = N_samples;
265 |   info->dim               = N_features;
266 |   info->numCentroids      = N_clusters;
267 |   info->numRepeats        = N_repeat;
268 |   info->thresholdFraction = 0.001;
269 |   info->thresholdLoops    = 200;
270 |   info->points            = X;
271 | 
272 |   float **guess = make2DArray(N_clusters, N_features);
273 |   for (int k=0; k<N_clusters; k++){
274 |        int initial_idx = GUESS[i_repeat][k];
275 |        for (int j=0; j<N_features; j++){
276 |            guess[k][j]=X[initial_idx][j];
277 |        }
278 |    }
279 |    info->guess = guess;
280 | 
281 |   /* belongs: the cluster id for each data object */
282 |   int *belongs = new int[N_samples];
283 |   for (int i = 0; i < N_samples; i++) belongs[i] = -1;
284 |   info->belongs = belongs;
285 | }
286 | 
287 | 
288 | 
289 | void cudaKmeans(Info *info) {
290 |   // Initialization
291 |   int numPoints         = info->numPoints;
292 |   int dim               = info->dim;
293 |   int numCentroids      = info->numCentroids;
294 |   int thresholdLoops    = info->thresholdLoops;
295 |   int thresholdFraction = info->thresholdFraction;
296 |   int* belongs          = info->belongs;
297 |   float **points        = info->points;
298 |   float **centroids     = info->centroids;
299 |   float **guess         = info->guess;
300 |   int threadPerBlock    = info->threadPerBlock;
301 | 
302 |   iStart4d = cpuSecond();
303 | 
304 |   // invert (transpose matrix)
305 |   float **iPoints = make2DArray(dim, numPoints);
306 |   invert2DArray(iPoints, points, dim, numPoints);
307 | 
308 |   // initial guess
309 |   float **iCentroids = make2DArray(dim, numCentroids);
310 |   // copy2DArray(iCentroids, iPoints, dim, numCentroids);
311 |   invert2DArray(iCentroids, guess, dim, numCentroids);
312 | 
313 |   // centroid -> number of points
314 |   int *pointsCount   = new int[numCentroids];
315 |   float **iNewCentroids = make2DArray(dim, numCentroids);
316 | 
317 |   iElaps4 += cpuSecond() - iStart4d;
318 | 
319 |   // Some cuda constants
320 |   const unsigned int bthreads = threadPerBlock;
321 |   const unsigned int l1       = (numPoints + bthreads - 1) / bthreads;
322 |   const unsigned int l2       = nextPowerOfTwo(l1);
323 |   const unsigned int sdsize2  = bthreads * sizeof(unsigned int); // shared memory size for sdata2
324 |   const unsigned int sdsize1  = l2 * sizeof(unsigned int); // shared memory size for sdata1
325 | 
326 |   // Cuda device Initialization
327 |   float *gPoints;
328 |   float *gCentroids;
329 |   int   *gBelongs;
330 |   Info   *gInfo;
331 |   int   *tmp;
332 | 
333 |   // Data transfer
334 |   iStart4a = cpuSecond();
335 |   cudaMalloc(&gPoints,    numPoints * dim * sizeof(float));
336 |   cudaMalloc(&gCentroids, numCentroids * dim * sizeof(float));
337 |   cudaMalloc(&gBelongs,   numPoints * sizeof(int));
338 |   cudaMalloc((void**)&gInfo,   sizeof(Info));
339 |   cudaMalloc(&tmp,        l2 * sizeof(unsigned int)); // For reduction
340 |   cudaMemcpy(gBelongs,
341 |              belongs,
342 |              numPoints * sizeof(int),
343 |              cudaMemcpyHostToDevice);
344 |   cudaMemcpy(gPoints,
345 |              iPoints[0],
346 |              numPoints * dim * sizeof(float),
347 |              cudaMemcpyHostToDevice);
348 |   cudaMemcpy(gInfo,info,sizeof(Info),cudaMemcpyHostToDevice);
349 | 
350 |   iElaps4 += cpuSecond() - iStart4a;
351 | 
352 |   int count = 0;
353 |   float frac = 1.0;
354 | 
355 |   while (count < thresholdLoops) {
356 |     iStart4b = cpuSecond();
357 |     cudaMemcpy(gCentroids, iCentroids[0], dim * numCentroids * sizeof(float), cudaMemcpyHostToDevice);
358 |     iElaps4 += cpuSecond() - iStart4b;
359 | 
360 |     // E-Step: assign points to the nearest cluster center
361 |     iStart2 = cpuSecond();
362 |     // nearestCentroid<<<l1, bthreads, sdsize2>>>(dim, numPoints, numCentroids, gPoints, gCentroids, gBelongs, tmp);
363 |     nearestCentroid<<<l1, bthreads, sdsize2>>>(tmp, gBelongs, gPoints, gCentroids, gInfo);
364 |     cudaDeviceSynchronize();
365 |     iElaps2 += (cpuSecond() - iStart2);
366 | 
367 |     // Update belongs
368 |     iStart4c = cpuSecond();
369 |     cudaMemcpy(belongs, gBelongs, numPoints * sizeof(int), cudaMemcpyDeviceToHost);
370 |     iElaps4 += cpuSecond() - iStart4c;
371 | 
372 |     // M-Step first half: set the cluster centers to the mean
373 |     iStart3a = cpuSecond();
374 | 
375 |     // Clear the two temp variables
376 |     for (int i = 0; i < numCentroids; i++) {
377 |       pointsCount[i] = 0;
378 |       for (int j = 0; j < dim; j++) {
379 |         iNewCentroids[j][i] = 0.0;
380 |       }
381 |     }
382 | 
383 |     // Add up points in each centroid
384 |     for (int i = 0; i < numPoints; i++) {
385 |       int idx = belongs[i];
386 |       pointsCount[idx] += 1;
387 |       for (int j = 0; j < dim; j++) {
388 |         iNewCentroids[j][idx] += points[i][j];
389 |       }
390 |     }
391 |     iElaps3a += cpuSecond() - iStart3a;
392 | 
393 |     // M-Step second half: convert the sum to the mean
394 |     // Update to new centroids
395 |     iStart3b = cpuSecond();
396 |     for (int i = 0; i < numCentroids; i++) {
397 |       for (int j = 0; j < dim; j++) {
398 |         if (pointsCount[i] > 0) {
399 |           iCentroids[j][i] = iNewCentroids[j][i] / pointsCount[i];
400 |         }
401 |       }
402 |     }
403 |     iElaps3b += cpuSecond() - iStart3b;
404 | 
405 |     // Check convergence
406 |     iStart5 = cpuSecond();
407 | 
408 |     // Check if too few number of points change their centroids
409 |     reduce <<<1, l2, sdsize1>>>(tmp, l1, l2);
410 |     cudaDeviceSynchronize();
411 |     int tmpFloat;
412 |     cudaMemcpy(&tmpFloat, tmp, sizeof(int), cudaMemcpyDeviceToHost);
413 |     frac = (float)tmpFloat / numPoints;
414 |     // cout << "Iteration: " << count << "," << frac << "," << tmpFloat  << "\n";
415 |     count++;
416 |     if (frac <= thresholdFraction) break;
417 | 
418 |     iElaps5 += cpuSecond() - iStart5;
419 | 
420 |   }
421 | 
422 |   iStart4d = cpuSecond();
423 |   centroids = make2DArray(numCentroids, dim);
424 |   invert2DArray(centroids, iCentroids, numCentroids, dim);
425 |   info->centroids = centroids;
426 |   iElaps4 += cpuSecond() - iStart4d;
427 | 
428 |   // Free device memory
429 |   cudaFree(gPoints);
430 |   cudaFree(gCentroids);
431 |   cudaFree(gBelongs);
432 |   cudaFree(tmp);
433 | 
434 | }
435 | 
436 | int main(int argc, char *argv[]) {
437 |   Info *info     = new Info;
438 |   info->threadPerBlock = atoi(argv[1]);
439 |   char *fileName = argv[2];
440 |   processData(fileName, info, 0);
441 | 
442 |   printf("Number of samples: %d \n",info->numPoints);
443 |   printf("Number of features: %d \n", info->dim);
444 |   printf("Number of clusters: %d \n", info->numCentroids);
445 |   printf("Number of repeated runs: %d \n", info->numRepeats);
446 |   for (int i = 0; i < info->numRepeats; i++) {
447 |     // cout << "====== Begin Loop " << i << " ======\n";
448 |     iStart1 = cpuSecond();
449 |     cudaKmeans(info);
450 |     iElaps1 += cpuSecond() - iStart1;
451 | 
452 |     // cout << "Ref: " << info->centroids[0][0] << "\n";
453 |     // cout << "====== End of Loop " << i << " ======\n";
454 |     // break;
455 | 
456 |     // Reload info
457 |     delete(info);
458 |     if (i + 1== info->numRepeats) break;
459 |     info     = new Info;
460 |     info->threadPerBlock = atoi(argv[1]);
461 |     processData(fileName, info, i+1);
462 |   }
463 | 
464 | 
465 |   cout << "Total time: " << iElaps1*1000 << "\n";
466 |   cout << "E-step time use (ms): " << iElaps2*1000 << "\n";
467 |   cout << "M-step-1st-half time use (ms): " << iElaps3a*1000 << "\n";
468 |   cout << "M-step-2nd-half time use (ms): " << iElaps3b*1000 << "\n";
469 |   cout << "Cuda Data IO (ms): " << iElaps4*1000 << "\n";
470 |   cout << "Check Convergence (ms): " << iElaps5*1000 << "\n";
471 | }
472 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/Cuda/kmeans_txt.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <iostream>
  4 | #include <fstream>
  5 | #include <string>
  6 | #include <vector>
  7 | #include <sstream>
  8 | #include <time.h>
  9 | #include <sys/time.h>
 10 | 
 11 | using namespace std;
 12 | double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart5;
 13 | double iElaps1=0, iElaps2=0, iElaps3a=0, iElaps3b=0, iElaps4=0, iElaps5=0;
 14 | // Hold configurations for Kmeans
 15 | struct Info {
 16 |   int     numPoints;
 17 |   int     dim;
 18 |   int     numCentroids;
 19 |   int     numRepeats;
 20 |   int    *belongs;
 21 |   float **points;
 22 |   float **centroids;
 23 |   int     thresholdLoops;
 24 |   float   thresholdFraction;
 25 |   int     threadPerBlock;
 26 | };
 27 | 
 28 | // ************* Utils ************* //
 29 | 
 30 | double cpuSecond() {
 31 |     struct timeval tp;
 32 |     gettimeofday(&tp,NULL);
 33 |     return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
 34 | }
 35 | 
 36 | static inline int nextPowerOfTwo(int v) {
 37 |   int res = v;
 38 |   for (int i = 1; i <= 16; i *= 2) {
 39 |     res |= res >> i;
 40 |   }
 41 |   return res + 1;
 42 | }
 43 | 
 44 | float** make2DArray(int x, int y) {
 45 |   float **res = (float **)malloc(x * sizeof(float *));
 46 | 
 47 |   // for (int i = 0; i < x; i++) {
 48 |   //   res[i] = (float *)malloc(y * sizeof(float));
 49 |   // }
 50 |   res[0] = (float *)malloc(x * y * sizeof(float));
 51 |   for (size_t i = 1; i < x; i++) res[i] = res[i-1] + y;
 52 |   for (size_t i = 0; i < x; i++) {
 53 |     for (size_t j = 0; j < y; j++) {
 54 |       res[i][j] = 0.0;
 55 |     }
 56 |   }
 57 |   return res;
 58 | }
 59 | 
 60 | void invert2DArray(float **A, float **B, int x, int y) {
 61 |   for (int i = 0; i < x; i++) {
 62 |     for (int j = 0; j < y; j++) {
 63 |       A[i][j] = B[j][i];
 64 |     }
 65 |   }
 66 | }
 67 | 
 68 | void copy2DArray(float **A, float **B, int x, int y) {
 69 |   for (int i = 0; i < x; i++) {
 70 |     for (int j = 0; j < y; j++) {
 71 |       A[i][j] = B[i][j];
 72 |     }
 73 |   }
 74 | }
 75 | 
 76 | // ************* Utils ************* //
 77 | 
 78 | __host__ __device__ inline static float
 79 |     computeDist(Info* info, int pointId, int centroidId, int distType, float *gPoints, float *gCentroids) {
 80 |   float res = 0;
 81 |   if (distType == 0) {
 82 |     for (int i = 0; i < info->dim; i++) {
 83 |       res +=
 84 |         (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]) *
 85 |         (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]);
 86 |     }
 87 |   }
 88 |   return res;
 89 | }
 90 | 
 91 | // Use reduction to compute the sum of an array
 92 | // Refer to
 93 | // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
 94 | __global__ static void reduce(int *g_idata, int l1, int l2) {
 95 |   extern __shared__ unsigned int sdata[];
 96 |   unsigned int tid = threadIdx.x;
 97 | 
 98 |   if (tid < l1) {
 99 |     sdata[tid] = g_idata[tid];
100 |   } else {
101 |     sdata[tid] = 0;
102 |   }
103 |   __syncthreads();
104 | 
105 |   // Parallel Reduction (l2 must be power of 2)
106 |   for (unsigned int s = l2 / 2; s > 0; s >>= 1) {
107 |     if (tid < s)     {
108 |       sdata[tid] += sdata[tid + s];
109 |     }
110 |     __syncthreads();
111 |   }
112 | 
113 |   if (tid == 0) {
114 |     g_idata[0] = sdata[0];
115 |   }
116 | }
117 | 
118 | __global__ static void nearestCentroid(int *blockResult, int *gBelongs, float *gPoints, float *gCentroids, Info *gInfo) {
119 | 
120 |   int pointId = blockDim.x * blockIdx.x + threadIdx.x;
121 |   if (pointId >= (gInfo->numPoints)) return;
122 | 
123 |   // For test on test.txt
124 |   // printf("Thread: %d - %.2f,  %.2f, %.2f,  %.2f \n", pointId, gCentroids[0], gCentroids[1], gCentroids[2], gCentroids[3]);
125 |   // printf("Thread: %d - %.2f,  %.2f, %.2f,  %.2f, %.2f,  %.2f, %.2f,  %.2f \n",
126 |   //          pointId, gPoints[0], gPoints[1], gPoints[2], gPoints[3], gPoints[4], gPoints[5], gPoints[6], gPoints[7]);
127 | 
128 |   // Get the minimum distance
129 |   float mDist = computeDist(gInfo, pointId, 0, 0, gPoints, gCentroids);
130 | 
131 |   int tmpIdx = 0;
132 |   int numCentroids = gInfo->numCentroids;
133 |   for (int i = 0; i < numCentroids; i++) {
134 |     float tmpDist = computeDist(gInfo, pointId, i, 0, gPoints, gCentroids);
135 |     if (tmpDist < mDist) {
136 |       mDist  = tmpDist;
137 |       tmpIdx = i;
138 |     }
139 |   }
140 | 
141 |   // use reduction to add the total number of changes (change from one centroid to another) in this block
142 |   extern __shared__ int sdata2[];
143 |   sdata2[threadIdx.x] = 0;
144 |   if (gBelongs[pointId] != tmpIdx) {
145 |     sdata2[threadIdx.x] = 1;
146 |   }
147 |   gBelongs[pointId] = tmpIdx;
148 |   __syncthreads();
149 | 
150 |   // Reduction
151 |   for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
152 |     if (threadIdx.x < s) {
153 |       sdata2[threadIdx.x] += sdata2[threadIdx.x + s];
154 |     }
155 |     __syncthreads();
156 |   }
157 | 
158 |   // Put the sum to the location corresbonding to current block
159 |   if (threadIdx.x == 0) {
160 |     blockResult[blockIdx.x] = sdata2[0];
161 |   }
162 | }
163 | 
164 | void processData(char *fileName, Info *info) {
165 |   float **X;
166 |   int   **GUESS;
167 | 
168 |   int N_samples, N_features, N_clusters, N_repeat;
169 | 
170 |   // readX(FILE_NAME,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat);
171 | 
172 |   // Test purpose
173 |   N_samples  = 100;
174 |   N_features = 9;
175 |   N_clusters = 4;
176 |   N_repeat   = 10;
177 | 
178 |   info->numPoints         = N_samples;
179 |   info->dim               = N_features;
180 |   info->numCentroids      = N_clusters;
181 |   info->numRepeats        = N_repeat;
182 |   info->thresholdFraction = 0.005;
183 |   info->thresholdLoops    = 200;
184 | 
185 |   // Process data point
186 |   X = make2DArray(N_samples, N_features);
187 | 
188 |   string str(fileName);
189 |   ifstream file(str);
190 |      string line1;
191 |      int i = 0;
192 |      while (getline(file, line1)) {
193 |      std::istringstream iss(line1);
194 |      int j = -1;
195 |      for(string s; iss >> s;) {
196 |              if (j == -1) {
197 |                      j++;
198 |                      continue;
199 |              }
200 |              // cout << s << " ";
201 |              X[i][j] = stof(s);
202 |              j++;
203 |      }
204 |      i++;
205 |   }
206 |   info->points = X;
207 | 
208 |   /* belongs: the cluster id for each data object */
209 |   int *belongs = new int[N_samples];
210 |   for (i = 0; i < N_samples; i++) belongs[i] = -1;
211 |   info->belongs = belongs;
212 | }
213 | 
214 | 
215 | 
216 | void cudaKmeans(Info *info) {
217 |   // Initialization
218 |   int numPoints         = info->numPoints;
219 |   int dim               = info->dim;
220 |   int numCentroids      = info->numCentroids;
221 |   int thresholdLoops    = info->thresholdLoops;
222 |   int thresholdFraction = info->thresholdFraction;
223 |   int* belongs          = info->belongs;
224 |   float **points        = info->points;
225 |   float **centroids     = info->centroids;
226 |   int threadPerBlock    = info->threadPerBlock;
227 | 
228 |   // invert (transpose matrix)
229 |   float **iPoints = make2DArray(dim, numPoints);
230 |   invert2DArray(iPoints, points, dim, numPoints);
231 | 
232 |   // initial guess
233 |   float **iCentroids = make2DArray(dim, numCentroids);
234 |   copy2DArray(iCentroids, iPoints, dim, numCentroids);
235 |   // invert2DArray(iCentroids, points, dim, numCentroids);
236 | 
237 |   // centroid -> number of points
238 |   int *pointsCount   = new int[numCentroids];
239 |   float **iNewCentroids = make2DArray(dim, numCentroids);
240 | 
241 |   // Some cuda constants
242 |   const unsigned int bthreads = threadPerBlock;
243 |   const unsigned int l1       = (numPoints + bthreads - 1) / bthreads;
244 |   const unsigned int l2       = nextPowerOfTwo(l1);
245 |   const unsigned int sdsize2  = bthreads * sizeof(unsigned int); // shared memory size for sdata2
246 |   const unsigned int sdsize1  = l2 * sizeof(unsigned int); // shared memory size for sdata1
247 | 
248 |   // Cuda device Initialization
249 |   float *gPoints;
250 |   float *gCentroids;
251 |   int   *gBelongs;
252 |   Info   *gInfo;
253 |   int   *tmp;
254 | 
255 |   // Data transfer
256 |   iStart4a = cpuSecond();
257 |   cudaMalloc(&gPoints,    numPoints * dim * sizeof(float));
258 |   cudaMalloc(&gCentroids, numCentroids * dim * sizeof(float));
259 |   cudaMalloc(&gBelongs,   numPoints * sizeof(int));
260 |   cudaMalloc((void**)&gInfo,   sizeof(Info));
261 |   cudaMalloc(&tmp,        l2 * sizeof(unsigned int)); // For reduction
262 |   cudaMemcpy(gBelongs,
263 |              belongs,
264 |              numPoints * sizeof(int),
265 |              cudaMemcpyHostToDevice);
266 |   cudaMemcpy(gPoints,
267 |              iPoints[0],
268 |              numPoints * dim * sizeof(float),
269 |              cudaMemcpyHostToDevice);
270 |   cudaMemcpy(gInfo,info,sizeof(Info),cudaMemcpyHostToDevice);
271 | 
272 |   iElaps4 += cpuSecond() - iStart4a;
273 | 
274 |   int count = 0;
275 |   float frac = 1.0;
276 | 
277 |   while (count < thresholdLoops) {
278 |     iStart4b = cpuSecond();
279 |     cudaMemcpy(gCentroids, iCentroids[0], dim * numCentroids * sizeof(float), cudaMemcpyHostToDevice);
280 |     iElaps4 += cpuSecond() - iStart4b;
281 | 
282 |     // E-Step: assign points to the nearest cluster center
283 |     iStart2 = cpuSecond();
284 |     // nearestCentroid<<<l1, bthreads, sdsize2>>>(dim, numPoints, numCentroids, gPoints, gCentroids, gBelongs, tmp);
285 |     nearestCentroid<<<l1, bthreads, sdsize2>>>(tmp, gBelongs, gPoints, gCentroids, gInfo);
286 |     cudaDeviceSynchronize();
287 |     iElaps2 += (cpuSecond() - iStart2);
288 | 
289 |     // Update belongs
290 |     iStart4c = cpuSecond();
291 |     cudaMemcpy(belongs, gBelongs, numPoints * sizeof(int), cudaMemcpyDeviceToHost);
292 |     iElaps4 += cpuSecond() - iStart4c;
293 | 
294 |     // M-Step first half: set the cluster centers to the mean
295 |     iStart3a = cpuSecond();
296 | 
297 |     // Clear the two temp variables
298 |     for (int i = 0; i < numCentroids; i++) {
299 |       pointsCount[i] = 0;
300 |       for (int j = 0; j < dim; j++) {
301 |         iNewCentroids[j][i] = 0.0;
302 |       }
303 |     }
304 | 
305 |     // Add up points in each centroid
306 |     for (int i = 0; i < numPoints; i++) {
307 |       int idx = belongs[i];
308 |       pointsCount[idx] += 1;
309 |       for (int j = 0; j < dim; j++) {
310 |         iNewCentroids[j][idx] += points[i][j];
311 |       }
312 |     }
313 |     iElaps3a += cpuSecond() - iStart3a;
314 | 
315 |     // M-Step second half: convert the sum to the mean
316 |     // Update to new centroids
317 |     iStart3b = cpuSecond();
318 |     for (int i = 0; i < numCentroids; i++) {
319 |       for (int j = 0; j < dim; j++) {
320 |         if (pointsCount[i] > 0) {
321 |           iCentroids[j][i] = iNewCentroids[j][i] / pointsCount[i];
322 |         }
323 |       }
324 |     }
325 |     iElaps3b += cpuSecond() - iStart3b;
326 | 
327 |     // Check convergence
328 |     iStart5 = cpuSecond();
329 | 
330 |     // Check if too few number of points change their centroids
331 |     reduce <<<1, l2, sdsize1>>>(tmp, l1, l2);
332 |     cudaDeviceSynchronize();
333 |     int tmpFloat;
334 |     cudaMemcpy(&tmpFloat, tmp, sizeof(int), cudaMemcpyDeviceToHost);
335 |     frac = (float)tmpFloat / numPoints;
336 |     cout << "Iteration: " << count << "," << frac << "," << tmpFloat  << "\n";
337 |     count++;
338 |     if (frac <= thresholdFraction) break;
339 | 
340 |     iElaps5 += cpuSecond() - iStart5;
341 | 
342 |   }
343 | 
344 |   centroids = make2DArray(numCentroids, dim);
345 |   invert2DArray(centroids, iCentroids, numCentroids, dim);
346 |   info->centroids = centroids;
347 | 
348 |   // Free device memory
349 |   cudaFree(gPoints);
350 |   cudaFree(gCentroids);
351 |   cudaFree(gBelongs);
352 |   cudaFree(tmp);
353 | 
354 | }
355 | 
356 | int main(int argc, char *argv[]) {
357 |   Info *info     = new Info;
358 |   info->threadPerBlock = atoi(argv[1]);
359 |   char *fileName = argv[2];
360 |   processData(fileName, info);
361 | 
362 |   for (int i = 0; i < info->numRepeats; i++) {
363 |     iStart1 = cpuSecond();
364 |     cudaKmeans(info);
365 |     iElaps1 += cpuSecond() - iStart1;
366 | 
367 |     // cout << info->centroids[0][0] << "," << info->centroids[0][1] << ","
368 |     //      << info->centroids[1][0] << "," << info->centroids[1][1] << "\n";
369 | 
370 |     // Reload info
371 |     delete(info);
372 |     info     = new Info;
373 |     info->threadPerBlock = atoi(argv[1]);
374 |     processData(fileName, info);
375 |   }
376 | 
377 | 
378 |   cout << "Total time: " << iElaps1*1000 << "\n";
379 |   cout << "E-step time use (ms): " << iElaps2*1000 << "\n";
380 |   cout << "M-step-1st-half time use (ms): " << iElaps3a*1000 << "\n";
381 |   cout << "M-step-2nd-half time use (ms): " << iElaps3b*1000 << "\n";
382 |   cout << "Cuda Data IO (ms): " << iElaps4*1000 << "\n";
383 |   cout << "Other (ms): " << iElaps5*1000 << "\n";
384 | }
385 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/Cuda/test.txt:
--------------------------------------------------------------------------------
1 |   1 2.0 2.0
2 |   2 2.0 -2.0
3 |   3 -3.0 -2.0
4 |   4 -3.0 2.0


--------------------------------------------------------------------------------
/Parallel_Algorithm/Cuda/test_multithreadPerBlock.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | export PATH=/usr/local/cuda-7.5/bin${PATH:+:${PATH}}
 3 | export LD_LIBRARY_PATH=/usr/local/cuda-7.5/lib64\${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
 4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
 5 | 
 6 | thread_list='1 2 4 8 16 32 64'
 7 | 
 8 | for thread in $thread_list
 9 | do
10 |     echo " "
11 |     echo =========================================
12 |     echo =========================================
13 |     echo testing with $thread threads per block on device
14 |     ./a.out $thread ../test_data/Blobs_smp20000_fea30_cls8.nc
15 | done
16 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/MPI/Kmean_mpi.c:
--------------------------------------------------------------------------------
  1 | //#include "../shared/timing.h" //for timer seconds()
  2 | #include <stdio.h>
  3 | #include <stdlib.h>
  4 | #include <float.h> //for FLT_MAX
  5 | #include <mpi.h>
  6 | #include "../shared/make_2D_array.h"
  7 | #include "../shared/ncdf_util.h"
  8 | #include "../shared/math_util.h"
  9 | 
 10 | /* This is the name of the data file we will read. */
 11 | //#define FILE_NAME "../test_data/Blobs_smp20000_fea30_cls8.nc"
 12 | #define FILE_NAME "../../Data_Analysis/data/SSWdata.nc"
 13 | #define TOL 0.0001
 14 | #define MAX_ITER 100 
 15 | 
 16 | int main() {
 17 | 
 18 |     /*
 19 |     ======================================================
 20 |     ----------------  Initialization ---------------------
 21 |     ======================================================
 22 |     */
 23 | 
 24 |     int rank, size;
 25 |     MPI_Init(NULL,NULL);
 26 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 27 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
 28 |     //printf("hello world from process %d of %d\n", rank, size);
 29 | 
 30 |     int N_samples_all,N_samples,N_features,N_clusters,N_repeat;
 31 |     //i for samples; j for features; k for clusters (typically)
 32 |     int i,j,k;
 33 |     int k_best,initial_idx;
 34 |     float** X; //unlike in serial/OpenMP versions, here X is local data
 35 |     float** X_all; //only master node holds the full data
 36 |     int** GUESS;
 37 |     float dist,dist_min,dist_sum_old,dist_sum_new,inert_best=FLT_MAX;
 38 | 
 39 |     /*
 40 |     ======================================================
 41 |     -- Read data by master node and distribute over processes --
 42 |     ======================================================
 43 |     */
 44 |     
 45 |     double iStart1 = MPI_Wtime(); 
 46 |     // let master core read data and broadcast to other cores
 47 | 
 48 |     if (rank == 0){
 49 |     // get input data and its size
 50 |     readX(FILE_NAME,&X_all,&GUESS,&N_samples_all,&N_features,&N_clusters,&N_repeat);
 51 |     }
 52 |     else{
 53 |     /*
 54 |     MPI_Scatter needs to access *X_all in all processes
 55 |     For non-root, we need to assign NULL to prevent memory error
 56 |     */
 57 |     float* dummy_for_X_all=NULL;
 58 |     X_all = &dummy_for_X_all;
 59 |     }
 60 | 
 61 |     MPI_Bcast(&N_samples_all,1,MPI_INT,0,MPI_COMM_WORLD);
 62 |     MPI_Bcast(&N_features,1,MPI_INT,0,MPI_COMM_WORLD);
 63 |     MPI_Bcast(&N_clusters,1,MPI_INT,0,MPI_COMM_WORLD);
 64 |     MPI_Bcast(&N_repeat,1,MPI_INT,0,MPI_COMM_WORLD);
 65 |     //printf("%d: %d,%d,%d,%d\n",rank,N_samples_all,N_features,N_clusters,N_repeat);
 66 | 
 67 |     if (rank==0){
 68 |         printf("Last element in global array: %f \n",X_all[N_samples_all-1][N_features-1]);
 69 |     }
 70 | 
 71 | 
 72 |     // Naive Scatter: Assume N_sample_all is divisible by size   
 73 |     /*
 74 |     N_samples = N_samples_all / size; 
 75 |     X = Make2DFloatArray(N_samples,N_features);
 76 |     MPI_Scatter(*X_all, N_samples*N_features, MPI_FLOAT, *X,
 77 |            N_samples*N_features, MPI_FLOAT, 0, MPI_COMM_WORLD);
 78 |     */
 79 | 
 80 |     // Correct sactter: works for any numbers
 81 | 
 82 |     int *sendcounts,*displs;
 83 |     if (rank == 0){
 84 |         int N_samples_slave = N_samples_all/size; //master node needs to know the data size for other nodes
 85 |         N_samples = N_samples_all - N_samples_slave*(size-1);// the remaining data
 86 |     
 87 |         sendcounts = (int *)malloc(size*sizeof(int)); // the number of elements to send to each processor
 88 |         displs = (int *)malloc(size*sizeof(int)); //displacement relative to sendbuf for data sent to process i
 89 | 
 90 |         sendcounts[0]=N_samples*N_features;
 91 |         displs[0]=0;
 92 |         for (i=1; i<size; i++) { 
 93 |             displs[i] = (N_samples+(i-1)*N_samples_slave)*N_features; // continous data 
 94 |             sendcounts[i] = N_samples_slave*N_features; 
 95 |         } 
 96 |     }
 97 |     else{
 98 |         //other nodes
 99 |         N_samples = N_samples_all/size;
100 | 
101 |         sendcounts = NULL; 
102 |         displs = NULL;
103 |     }
104 | 
105 |     // This allocation works for all nodes
106 |     X = Make2DFloatArray(N_samples,N_features);
107 | 
108 |     MPI_Scatterv(*X_all, sendcounts, displs,
109 |                  MPI_FLOAT, *X,  N_samples*N_features,
110 |                  MPI_FLOAT, 0, MPI_COMM_WORLD);
111 | 
112 |     //printf("%d, Local samples: %d \n",rank,N_samples);
113 | 
114 |     // check scattered results
115 |     if (rank==size-1){
116 |         printf("Last element after scattering %d: %f \n",rank,X[N_samples-1][N_features-1]);
117 |     }
118 | 
119 |     double iElaps1 = MPI_Wtime() - iStart1;
120 | 
121 |     /*
122 |     ======================================================
123 |     -------  Continue to initialize variables
124 |     ======================================================
125 |     */
126 | 
127 |     // each data point belongs to which cluster
128 |     // values range from 0 to N_cluster-1
129 |     int* labels = (int *)malloc(N_samples*sizeof(int));
130 |     int* labels_best = (int *)malloc(N_samples*sizeof(int));
131 | 
132 |     // The position of each cluster center.
133 |     // Two arrays are needed as we are calculating the distance to the
134 |     // old centers and accumulating the new centers in the same iteration.
135 |     float** old_cluster_centers = Make2DFloatArray(N_clusters,N_features);
136 |     float** new_cluster_centers = Make2DFloatArray(N_clusters,N_features);
137 | 
138 |     // how many data points in the cluster
139 |     // needed by calculating the average position of data points in each cluster
140 |     int* cluster_sizes = (int *)malloc(N_clusters*sizeof(int));
141 | 
142 |     /*
143 |     ======================================================
144 |     ----------------  Kmean initial centers --------------
145 |     ======================================================
146 |     */
147 |     MPI_Barrier(MPI_COMM_WORLD);
148 |     if (rank == 0)
149 |         printf("=====Applying K-mean======\n");
150 | 
151 |     // record timing results
152 |     double iStart2,iElaps2;
153 |     double iStart3a,iStart3b,iStart3c;
154 |     double iElaps3a=0,iElaps3b=0,iElaps3c=0;
155 | 
156 |     /* Run the K-mean algorithm for N_repeat times with
157 |      * different starting points
158 |      */
159 |     iStart2 = MPI_Wtime();
160 |     for (int i_repeat=0; i_repeat < N_repeat; i_repeat++){
161 | 
162 |     // guess initial centers
163 |     if (rank==0) {
164 |     // only master node holds the full data X_all and the initial GUESS
165 |     for (k=0; k<N_clusters; k++){
166 |         cluster_sizes[k] = 0; // for accumulating
167 |         // the index of data points as the initial guess for cluster centers
168 |         initial_idx = GUESS[i_repeat][k];
169 |         for (j=0; j<N_features; j++){
170 |             old_cluster_centers[k][j]=X_all[initial_idx][j];
171 |             //set the "new" array to 0 for accumulating
172 |             new_cluster_centers[k][j] = 0.0;
173 |             }
174 |         }
175 |     }
176 |     else{
177 |     // initialize other nodes
178 |     for (k=0; k<N_clusters; k++){
179 |         cluster_sizes[k] = 0; 
180 |         for (j=0; j<N_features; j++){
181 |             new_cluster_centers[k][j] = 0.0;
182 |             }
183 |         }
184 |     }
185 | 
186 |     //if(rank==0)
187 |     //    printf("master node: %f \n",old_cluster_centers[(int)N_clusters-1][(int)N_features-1]);
188 |     MPI_Bcast(*old_cluster_centers,N_clusters*N_features,MPI_FLOAT,0,MPI_COMM_WORLD);
189 | 
190 |     // check broadcast results
191 |     // printf("%d : %f \n",rank,old_cluster_centers[(int)N_clusters-1][(int)N_features-1]);
192 | 
193 |     /*
194 |     ======================================================
195 |     ----------------  core Kmean stepping ---------------------
196 |     ======================================================
197 |     */
198 | 
199 |     int i_iter = 0;//record iteration counts
200 |     dist_sum_new = 0.0;//prevent the firt iteration error
201 |     do {
202 |     i_iter++;
203 |     dist_sum_old = dist_sum_new;
204 |     dist_sum_new = 0.0;
205 | 
206 |     // E-Step: assign points to the nearest cluster center
207 |     iStart3a = MPI_Wtime();
208 |     #pragma omp parallel for default(shared) schedule(static)\
209 |             private(i,j,k,k_best,dist,dist_min) \
210 |             reduction(+:dist_sum_new)
211 |     for (i = 0; i < N_samples; i++) {
212 |         k_best = 0;//assume cluster no.0 is the nearest
213 |         //dist_min = distance(N_features, X[i], old_cluster_centers[k_best]); 
214 |         dist_min = correlation(N_features, X[i], old_cluster_centers[k_best]);
215 |         for (k = 1; k < N_clusters; k++){
216 |             //dist = distance(N_features, X[i], old_cluster_centers[k]); 
217 |             dist = correlation(N_features, X[i], old_cluster_centers[k]);
218 |             if (dist < dist_min){
219 |                 dist_min = dist;
220 |                 k_best = k;  
221 |             }
222 |         }
223 |        labels[i] = k_best;
224 |        dist_sum_new += dist_min;
225 |     } // end of E-step loop
226 |     iElaps3a += (MPI_Wtime()-iStart3a);
227 | 
228 |     // M-Step first half: set the cluster centers to the mean
229 |     iStart3b = MPI_Wtime();
230 |     for (i = 0; i < N_samples; i++) {
231 |         k_best = labels[i];
232 |         cluster_sizes[k_best]++; // add one more points to this cluster
233 |         // As the total number of samples in each cluster is not known yet,
234 |         // here we are just calculating the sum, not the mean.
235 |         for (j=0; j<N_features; j++)
236 |             new_cluster_centers[k_best][j] += X[i][j];
237 |     } // end of M-Step first half
238 | 
239 |     /* Before converting sum to mean, different processes need to talk
240 |        to each other to get the full cluster center information.
241 |        However, there's no need to share the "label" variable, which can
242 |        keep local till the writing back stage.
243 |     */
244 |     MPI_Allreduce(MPI_IN_PLACE, *new_cluster_centers, N_clusters*N_features,
245 |                   MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
246 |     MPI_Allreduce(MPI_IN_PLACE, cluster_sizes, N_clusters, MPI_INT,
247 |                   MPI_SUM, MPI_COMM_WORLD);
248 | 
249 |     iElaps3b += (MPI_Wtime()-iStart3b);
250 | 
251 |     // M-Step second half: convert the sum to the mean
252 |     iStart3c = MPI_Wtime();
253 |     for (k=0; k<N_clusters; k++) {
254 |             for (j=0; j<N_features; j++) {
255 | 
256 |                 if (cluster_sizes[k] > 0) //avoid divide-by-zero error
257 |                     // sum -> mean
258 |                     old_cluster_centers[k][j] = new_cluster_centers[k][j] / cluster_sizes[k];
259 | 
260 |                new_cluster_centers[k][j] = 0.0;//for the next iteration
261 |             }
262 |             cluster_sizes[k] = 0;//for the next iteration
263 |     } // end of M-Step second half
264 | 
265 |     iElaps3c += (MPI_Wtime()-iStart3c);
266 | 
267 |     // To test convergence, we need the global sum of distances
268 |     MPI_Allreduce(MPI_IN_PLACE,&dist_sum_new, 1, MPI_FLOAT, 
269 |                   MPI_SUM, MPI_COMM_WORLD);
270 | 
271 |     } while( i_iter==1 || ((dist_sum_old - dist_sum_new > TOL)&&i_iter<MAX_ITER) );
272 |     //end of K-mean stepping
273 |    
274 |     //MPI_Barrier(MPI_COMM_WORLD);
275 |     //if (rank==0)
276 |     //    printf("Final inertia: %f, iteration: %d \n",dist_sum_new,i_iter);
277 | 
278 |     // record the best results
279 |     // non-root processes don't need this data, but they don't have 
280 |     // other thing else to do.
281 |     if (dist_sum_new < inert_best) {
282 |         inert_best = dist_sum_new;
283 |         for (i = 0; i < N_samples; i++)
284 |             labels_best[i] = labels[i];
285 |     }
286 | 
287 |     } //end of one repeated run
288 |     iElaps2 = MPI_Wtime() - iStart2;
289 | 
290 |     /*
291 |     ======================================================
292 |     ----------------  Finalization ---------------------
293 |     ======================================================
294 |     */
295 | 
296 |     // write data back to NetCDF file
297 |     // writeY(FILE_NAME,labels_best, inert_best);
298 | 
299 | 
300 |     /* get the max timing measured among all processes */
301 |     double iElaps1_max;
302 |     MPI_Reduce(&iElaps1, &iElaps1_max, 1, MPI_DOUBLE,
303 |                 MPI_MAX, 0, MPI_COMM_WORLD);
304 | 
305 |     // print summary
306 |     if (rank == 0){
307 |     printf("Best inertia: %f \n",inert_best);
308 |     printf("I/O time use (ms): %f \n", iElaps1_max*1000.0);
309 |     printf("Kmean total time use (ms): %f \n", iElaps2*1000.0);
310 |     printf("\n(sub-component timing not accurate) \n");
311 |     printf("E-step time use (ms): %f \n", iElaps3a*1000.0);
312 |     printf("M-step-1st-half time use (ms): %f \n", iElaps3b*1000.0);
313 |     printf("M-step-2nd-half time use (ms): %f \n", iElaps3c*1000.0);
314 |     }
315 | 
316 |     MPI_Finalize();
317 | 
318 |     return 0;
319 | }
320 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/MPI/compile.sh:
--------------------------------------------------------------------------------
1 | mpicc -o Kmean_mpi.out -O2 -std=c99 -lm -lnetcdf -fopenmp -lpthread ../shared/make_2D_array.c ../shared/ncdf_util.c ../shared/math_util.c Kmean_mpi.c 
2 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/MPI/test_hybrid.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | np_list='1 2 4 8 16 32 64'
 4 | thread_list='1 2 4 8'
 5 | 
 6 | for np in $np_list
 7 | do
 8 |     for thread in $thread_list
 9 |     do
10 |     echo " "
11 |     echo =========================================
12 |     echo =========================================
13 |         echo testing with $np processes, $thread threads
14 |         export OMP_NUM_THREADS=$thread
15 |         mpiexec -np $np ./Kmean_mpi.out
16 |     done
17 | done
18 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/MPI/test_multiprocess.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | np_list='1 2 4 8 16 32 64'
 4 | 
 5 | export OMP_NUM_THREADS=1 # make sure OpenMP disabled 
 6 | for np in $np_list
 7 | do
 8 |     echo " "
 9 |     echo =========================================
10 |     echo =========================================
11 |     echo testing with $np processes
12 |     mpiexec -np $np ./Kmean_mpi.out 
13 | done
14 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/OpenMP/Kmean_omp.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <float.h> //for FLT_MAX
  4 | #include "../shared/timing.h" //for timer seconds()
  5 | #include "../shared/make_2D_array.h"
  6 | #include "../shared/ncdf_util.h"
  7 | #include "../shared/math_util.h"
  8 | 
  9 | /* This is the name of the data file we will read. */
 10 | //#define FILE_NAME "../test_data/Blobs_smp20000_fea30_cls8.nc"
 11 | #define FILE_NAME "../../Data_Analysis/data/SSWdata.nc"
 12 | #define TOL 0.0001 
 13 | #define MAX_ITER 100 
 14 | 
 15 | int main() {
 16 | 
 17 |     /*
 18 |     ======================================================
 19 |     ----------------  Initialization ---------------------
 20 |     ======================================================
 21 |     */
 22 |     int N_samples,N_features,N_clusters,N_repeat;
 23 |     //i for samples; j for features; k for clusters (typically)
 24 |     int i,j,k;
 25 |     int k_best,initial_idx;
 26 |     float** X;
 27 |     int** GUESS;
 28 |     float dist,dist_min,dist_sum_old,dist_sum_new,inert_best=FLT_MAX;
 29 |     
 30 |     // get input data and its size
 31 |     double iStart1 = seconds();
 32 |     readX(FILE_NAME,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat);
 33 |     double iElaps1 = seconds() - iStart1;
 34 | 
 35 |     // each data point belongs to which cluster
 36 |     // values range from 0 to N_cluster-1
 37 |     int* labels = (int *)malloc(N_samples*sizeof(int));
 38 |     int* labels_best = (int *)malloc(N_samples*sizeof(int));
 39 | 
 40 |     // The position of each cluster center.
 41 |     // Two arrays are needed as we are calculating the distance to the
 42 |     // old centers and accumulating the new centers in the same iteration.
 43 |     float** old_cluster_centers = Make2DFloatArray(N_clusters,N_features);
 44 |     float** new_cluster_centers = Make2DFloatArray(N_clusters,N_features);
 45 | 
 46 |     // how many data points in the cluster
 47 |     // needed by calculating the average position of data points in each cluster
 48 |     int* cluster_sizes = (int *)malloc(N_clusters*sizeof(int)); 
 49 | 
 50 |     /*
 51 |     ======================================================
 52 |     ----------------  Kmean stepping ---------------------
 53 |     ======================================================
 54 |     */
 55 |     printf("=====Applying K-mean======\n");
 56 |    
 57 |     // record timing results 
 58 |     double iStart2,iElaps2;
 59 |     double iStart3a,iStart3b,iStart3c;
 60 |     double iElaps3a=0,iElaps3b=0,iElaps3c=0;
 61 | 
 62 |     /* Run the K-mean algorithm for N_repeat times with 
 63 |      * different starting points
 64 |      */
 65 |     iStart2 = seconds();
 66 |     for (int i_repeat=0; i_repeat < N_repeat; i_repeat++){
 67 | 
 68 |     // guess initial centers
 69 |     for (k=0; k<N_clusters; k++){
 70 |         cluster_sizes[k] = 0; // for accumulating 
 71 |         // the index of data points as the initial guess for cluster centers
 72 |         initial_idx = GUESS[i_repeat][k]; 
 73 |         for (j=0; j<N_features; j++){
 74 |             old_cluster_centers[k][j]=X[initial_idx][j];
 75 |             //set the "new" array to 0 for accumulating
 76 |             new_cluster_centers[k][j] = 0.0;
 77 |     }
 78 |     }
 79 | 
 80 |     // core K-mean stepping (Expectation-Maximization) begins here!!
 81 |     int i_iter = 0;//record iteration counts
 82 |     dist_sum_new = 0.0;//prevent the firt iteration error
 83 |     do {
 84 |     i_iter++;
 85 |     dist_sum_old = dist_sum_new; 
 86 |     dist_sum_new = 0.0;
 87 | 
 88 |     // E-Step: assign points to the nearest cluster center
 89 |     iStart3a = seconds();
 90 |     #pragma omp parallel default(shared) \
 91 |             private(i,j,k,k_best,dist,dist_min) \
 92 |             reduction(+:dist_sum_new)
 93 |     {
 94 |     #pragma omp for schedule(static)
 95 |     for (i = 0; i < N_samples; i++) {
 96 |         k_best = 0;//assume cluster no.0 is the nearest
 97 |         //dist_min = distance(N_features, X[i], old_cluster_centers[k_best]); 
 98 |         dist_min = correlation(N_features, X[i], old_cluster_centers[k_best]); 
 99 |         for (k = 1; k < N_clusters; k++){
100 |             //dist = distance(N_features, X[i], old_cluster_centers[k]); 
101 |             dist = correlation(N_features, X[i], old_cluster_centers[k]); 
102 |             if (dist < dist_min){
103 |                 dist_min = dist;
104 |                 k_best = k; 
105 |             }
106 |         }
107 |        labels[i] = k_best;
108 |        dist_sum_new += dist_min;
109 |     } // end of omp for loop
110 |     } // end of omp parallel region
111 |     iElaps3a += (seconds()-iStart3a);
112 | 
113 |     // M-Step first half: set the cluster centers to the mean
114 |     iStart3b = seconds();
115 |     for (i = 0; i < N_samples; i++) {
116 |         k_best = labels[i];
117 |         cluster_sizes[k_best]++; // add one more points to this cluster
118 |         // As the total number of samples in each cluster is not known yet,
119 |         // here we are just calculating the sum, not the mean.
120 |         for (j=0; j<N_features; j++)
121 |             new_cluster_centers[k_best][j] += X[i][j]; 
122 |     } // end of M-Step first half
123 |     iElaps3b += (seconds()-iStart3b);
124 | 
125 |     iStart3c = seconds();
126 |     // M-Step second half: convert the sum to the mean
127 |     for (k=0; k<N_clusters; k++) {
128 |             for (j=0; j<N_features; j++) {
129 | 
130 |                 if (cluster_sizes[k] > 0) //avoid divide-by-zero error
131 |                     // sum -> mean
132 |                     old_cluster_centers[k][j] = new_cluster_centers[k][j] / cluster_sizes[k];
133 |                
134 |                new_cluster_centers[k][j] = 0.0;//for the next iteration
135 |             }
136 |             cluster_sizes[k] = 0;//for the next iteration
137 |     } // end of M-Step second half
138 |     iElaps3c += (seconds()-iStart3c);
139 | 
140 |     } while( i_iter==1 || ((dist_sum_old - dist_sum_new > TOL)&&i_iter<MAX_ITER) ); 
141 |     //end of K-mean stepping
142 | 
143 |     //printf("Final inertia: %f, iteration: %d \n",dist_sum_new,i_iter);
144 | 
145 |     // record the best results
146 |     if (dist_sum_new < inert_best) {
147 |         inert_best = dist_sum_new;
148 |         for (i = 0; i < N_samples; i++)
149 |             labels_best[i] = labels[i];
150 |     }
151 | 
152 |     } //end of one repeated run
153 |     iElaps2 = seconds() - iStart2;
154 | 
155 |     /*
156 |     ======================================================
157 |     ----------------  Finalization ---------------------
158 |     ======================================================
159 |     */
160 | 
161 |     // write data back to NetCDF file
162 |     writeY(FILE_NAME,labels_best, inert_best);
163 | 
164 |     // print summary
165 |     printf("Best inertia: %f \n",inert_best);
166 |     printf("Kmean total time use (ms): %f \n", iElaps2*1000.0);
167 |     printf("E-step time use (ms): %f \n", iElaps3a*1000.0);
168 |     printf("M-step-1st-half time use (ms): %f \n", iElaps3b*1000.0);
169 |     printf("M-step-2nd-half time use (ms): %f \n", iElaps3c*1000.0);
170 |     printf("I/O time use (ms): %f \n", iElaps1*1000.0);
171 | 
172 |     return 0;
173 | }
174 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/OpenMP/compile.sh:
--------------------------------------------------------------------------------
1 | gcc -o Kmean_omp.out -O2 -std=c99 -lm -lnetcdf -fopenmp -lpthread ../shared/make_2D_array.c ../shared/ncdf_util.c ../shared/math_util.c Kmean_omp.c 
2 | 
3 | #For debugging with gdb
4 | #gcc -g -O0 -std=c99 -lnetcdf Kmean_seq.c -o Kmean_seq.out
5 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/OpenMP/pseudo.c:
--------------------------------------------------------------------------------
 1 | do{
 2 |     reset_old_variable_values(...)
 3 |     // E-Step: assign points to the nearest cluster center
 4 |     for (i = 0; i < N_samples; i++) {
 5 |         k_best = 0;
 6 |         dist_min = distance(N_features, X[i], old_cluster_centers[k_best]); 
 7 |         for (k = 1; k < N_clusters; k++){
 8 |             dist = distance(N_features, X[i], old_cluster_centers[k]); 
 9 |             if (dist < dist_min){
10 |                 dist_min = dist;
11 |                 k_best = k;
12 |             }
13 |         }
14 |        labels[i] = k_best;
15 |        dist_sum_new += dist_min;
16 |     }
17 | 
18 |     // M-Step: set the cluster centers to the mean
19 |     // M-Step first half: As the total number of samples in each cluster is not known yet,here we are just calculating the sum, not the mean.
20 |     for (i = 0; i < N_samples; i++) {
21 |         k_best = labels[i];
22 |         cluster_sizes[k_best]++; 
23 |         for (j=0; j<N_features; j++)
24 |             new_cluster_centers[k_best][j] += X[i][j]; 
25 |     } 
26 |     // M-Step second half: convert the sum to the mean
27 |     for (k=0; k<N_clusters; k++) {
28 |     for (j=0; j<N_features; j++) {
29 |         old_cluster_centers[k][j] = new_cluster_centers[k][j] / cluster_sizes[k];
30 |     }
31 |     }
32 | } while(dist_sum_old - dist_sum_new > TOL)
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/OpenMP/test_multithread.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | thread_list='1 2 4 8 16 32 64'
 4 | 
 5 | for thread in $thread_list
 6 | do
 7 |     echo " "
 8 |     echo =========================================
 9 |     echo =========================================
10 |     echo testing with $thread threads
11 |     export OMP_NUM_THREADS=$thread
12 |     ./Kmean_omp.out 
13 | done
14 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/README:
--------------------------------------------------------------------------------
 1 | C code for the parallel k-mean clustering algorithm.
 2 | Doesn't depend on to any specific data set.
 3 | 
 4 | Include:
 5 | 
 6 | 1) Homogeneous Parallel Environment
 7 | Pure OpenMP
 8 | Pure MPI
 9 | Pure CUDA (single GPU)
10 | 
11 | 2) Heterogenous Parallel Environment 
12 | Hybrid OpenMP+MPI
13 | CUDA with multi-GPU support
14 | Hybrid CUDA+MPI
15 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/Apply_Kmean.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import numpy as np
 5 | import xarray as xr
 6 | from netCDF4  import Dataset
 7 | from timeit import default_timer as timer
 8 | from sklearn.cluster import KMeans
 9 | 
10 | dirname = "../test_data/"
11 | filename = "Blobs_smp20000_fea30_cls8.nc"
12 | 
13 | # read data from nc file
14 | start1 = timer()
15 | with xr.open_dataset(dirname+filename) as ds: 
16 |     n_clusters = ds.dims["N_clusters"]
17 |     n_features = ds.dims["N_features"]
18 |     n_repeat = ds.dims["N_repeat"]
19 |     X = ds["X"].values
20 |     GUESS = ds["GUESS"].values
21 | del ds
22 | 
23 | elapse1 = timer()-start1
24 | 
25 | # apply Kmeans
26 | start2 = timer()
27 | inert_best = np.inf
28 | for i_repeat in range(n_repeat):
29 |     # manually guess initial clusters (to compare with C)
30 |     initial_idx = GUESS[i_repeat,:]
31 |     initial_position = X[initial_idx,:]
32 |     kmeans = KMeans(n_clusters=n_clusters,n_init=1,init=initial_position,
33 |                     algorithm='full',tol=1e-4) 
34 |     kmeans.fit(X)
35 |     
36 |     if kmeans.inertia_ < inert_best:
37 |         inert_best = kmeans.inertia_
38 |         y_kmeans = kmeans.labels_
39 | 
40 | elapse2 = timer()-start2
41 | 
42 | # write results  back
43 | with Dataset(dirname+filename,mode='r+') as dset:
44 |     dset["Y_Py"][:] = y_kmeans
45 |     dset["INERT_Py"][:] = inert_best
46 | 
47 | # summary        
48 | print("final inertia:",inert_best)
49 | print("Kmean time use (ms):",elapse2*1e3)
50 | print("I/O time use (ms):",elapse1*1e3)


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/IO_util.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import xarray as xr
 3 | 
 4 | def Raw_to_NetCDF(X,ind,filename,y_true=None,feature_names=None):
 5 |     
 6 |     N_samples,N_features = X.shape
 7 |     label_zero = np.zeros(N_samples,dtype=np.int32)
 8 |     if feature_names is None:
 9 |         feature_names = np.arange(N_features,dtype=np.int32)
10 |     if y_true is None:
11 |         y_true = label_zero
12 |     
13 |     ds = xr.Dataset()
14 |     ds['X'] = (['N_samples', 'N_features'], np.float32(X) )
15 |     ds['X'].attrs["long_name"]="data points"
16 |     
17 |     ds['GUESS'] = (['N_repeat', 'N_clusters'], ind)
18 |     ds['GUESS'].attrs["long_name"]="indices of data points as initial guess of cluster centers"
19 |     ds['GUESS'].attrs["purpose"]="make sure that C and python use the same initial starting points"
20 |     
21 |     ds['Y_TRUE']=(['N_samples'], np.int32(y_true) )
22 |     ds['Y_TRUE'].attrs["long_name"]="(optional) true label of each data point"
23 |     
24 |     ds['Y_Py']=(['N_samples'], label_zero)
25 |     ds['Y_Py'].attrs["long_name"]="labels predicted by python Kmean function"
26 |     
27 |     ds['Y_C']=(['N_samples'], label_zero)
28 |     ds['Y_C'].attrs["long_name"] = "labels predicted by C implementation"
29 |     ds['Y_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"
30 |     
31 |     ds['INERT_Py'] = np.float32(0.0)
32 |     ds['INERT_Py'].attrs["long_name"] = "kmeans.inertia_ in python code, "+\
33 |         "i.e. sum of distances between data points and cluster centers"
34 |     
35 |     ds['INERT_C'] = np.float32(0.0)
36 |     ds['INERT_C'].attrs["long_name"] = "the C version of kmeans.inertia_"
37 |     ds['INERT_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"
38 |     
39 |     ds['FEATURES']=(['N_features'], feature_names)
40 |     ds['FEATURES'].attrs["long_name"] = "(optional) the meaning of each feature"
41 |                      
42 |     ds.to_netcdf(filename)
43 |     ds.close()
44 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/Kmean_iris.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | Created on Tue Apr  4 17:11:36 2017
 5 | 
 6 | @author: desnow
 7 | """
 8 | 
 9 | import numpy as np
10 | import pandas as pd
11 | import xarray as xr
12 | 
13 | from sklearn.cluster import KMeans
14 | from sklearn.datasets import load_iris
15 | 
16 | # load iris data set from sklearn package
17 | iris = load_iris()
18 | 
19 | # extract the data to numpy array
20 | X =  np.float32(iris['data'])
21 | y_true = np.int32(iris['target'])
22 | 
23 | '''
24 | # convert to pandas and print
25 | df_iris = pd.DataFrame(data= np.c_[iris['target'],iris['data']],
26 |                      columns= ['target']+iris['feature_names'] )
27 | print(df_iris.head())
28 | '''
29 | 
30 | 
31 | # apply K-mean
32 | kmeans = KMeans(n_clusters=3,n_init=10,init='random',
33 |                 algorithm='full',tol=1e-2) 
34 | kmeans.fit(X)
35 | y_kmeans = kmeans.labels_
36 | #y_kmeans = kmeans.predict(X) # this can predict new points
37 | print("final inertia:",kmeans.inertia_)
38 | 
39 | '''
40 | # print results
41 | for i in range(len(y_kmeans)): 
42 |     print(i+1,y_kmeans[i],y_true[i])
43 | '''
44 | 
45 | # convert to xarray Dataset and write into nc files.
46 | ds = xr.Dataset({'X': (['N_samples', 'N_features'], X),
47 |                  'y_true': (['N_samples'],        y_true),
48 |                  'y_kmeans_python': (['N_samples'],        y_kmeans),
49 |                  'inertia_python': np.float32(kmeans.inertia_) ,
50 |                  'y_kmeans_C': (['N_samples'], np.zeros_like(y_kmeans)),
51 |                  'inertia_C': np.float32(0.0)
52 |                  },
53 |                   coords={'samples': (['N_samples'],np.arange(y_true.size,dtype=np.float32)+1),
54 |                           'features': (['N_features'], iris['feature_names'])}
55 |               )
56 | ds.to_netcdf('../test_data/iris_data_Kmean.nc')
57 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/check_SSWdata.py:
--------------------------------------------------------------------------------
 1 | import sys 
 2 | import numpy as np
 3 | from IO_util import Raw_to_NetCDF
 4 | import xarray as xr
 5 | 
 6 | dirname = '../../Data_Analysis/data/'
 7 | filename='SSWdata.nc'
 8 | 
 9 | ds = xr.open_dataset(dirname+filename)
10 | 
11 | print('total data size',ds["Y_TRUE"].size)
12 | print('size of 2nd cluster by MATLAB',ds["Y_TRUE"].sum())
13 | print('size of 2nd cluster by C',ds["Y_C"].sum())
14 | 
15 | mismatch = (ds["Y_TRUE"].values != ds["Y_C"].values) 
16 | print("inconsistent labels: ",mismatch.sum())
17 | 
18 | #ds.close()
19 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/check_results.py:
--------------------------------------------------------------------------------
 1 | import xarray as xr
 2 | 
 3 | dirname = "../test_data/"
 4 | filename = "Blobs_smp20000_fea30_cls8.nc"
 5 | 
 6 | with xr.open_dataset(dirname+filename) as ds:
 7 |     mismatch = (ds["Y_Py"].values != ds["Y_C"].values) 
 8 |    
 9 | print("total number of samples: ",mismatch.size)
10 | print("inconsistent labels: ",mismatch.sum())
11 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/convert_SSWdata.py:
--------------------------------------------------------------------------------
 1 | import sys 
 2 | import numpy as np
 3 | from IO_util import Raw_to_NetCDF
 4 | 
 5 | ndata = 17878
 6 | nfeatures = 252 
 7 | 
 8 | dirname = '../../Data_Analysis/data/'
 9 | # Read data points
10 | file1=open(dirname+'SSWdata.bin','rb')
11 | X=np.fromfile(file1)
12 | if sys.byteorder=='little':
13 |     X.byteswap(True)
14 | X=X.reshape(ndata,nfeatures)
15 | 
16 | # Read python label
17 | file1=open(dirname+'Label_py.bin','rb')
18 | Y_py=np.fromfile(file1,np.int32)
19 | if sys.byteorder=='little':
20 |     Y_py.byteswap(True)
21 | 
22 | # Read matlab label
23 | file1=open(dirname+'Label_matlab.bin','rb')
24 | Y_matlab=np.fromfile(file1,np.int32)
25 | Y_matlab -= 1 # 1~2 to 0~1
26 | 
27 | # ========================
28 | # convert the NetCDF format
29 | # ========================
30 | N_clusters = 2
31 | N_samples = ndata 
32 | N_features = nfeatures 
33 | N_repeat = 20
34 | 
35 | initial_ind = np.zeros([N_repeat,N_clusters],dtype=np.int32)
36 | for i in range(N_repeat):
37 |     initial_ind[i,:] = np.random.choice(np.arange(N_samples),
38 |                                    N_clusters,replace=False)
39 | 
40 | filename='SSWdata.nc'
41 | Raw_to_NetCDF(X,initial_ind,dirname+filename,y_true=Y_matlab)
42 | 
43 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/python_reference/make_fake_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import numpy as np
 5 | from sklearn.datasets.samples_generator import make_blobs
 6 | from IO_util import Raw_to_NetCDF
 7 | 
 8 | N_clusters = 8
 9 | N_samples = 20000
10 | N_features = 30
11 | N_repeat = 20
12 | 
13 | X, y = make_blobs(n_samples=N_samples, centers=N_clusters,
14 |                   n_features=N_features,random_state=0,
15 |                   cluster_std=1.0)
16 | 
17 | initial_ind = np.zeros([N_repeat,N_clusters],dtype=np.int32)
18 | 
19 | for i in range(N_repeat):
20 |     initial_ind[i,:] = np.random.choice(np.arange(N_samples),
21 |                                    N_clusters,replace=False)
22 | 
23 | dirname = "../test_data/"
24 | filename = "Blobs_smp{0}_fea{1}_cls{2}.nc".format(N_samples,N_features,N_clusters)
25 | 
26 | Raw_to_NetCDF(X,initial_ind,dirname+filename,y_true=y)


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/make_2D_array.c:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h> //for malloc
 2 | #include "make_2D_array.h"
 3 | 
 4 | /* For dynamically allocating 2D array in pure-C environment.
 5 | Unlke in HW2, the array here is contagious!
 6 | See:
 7 | http://stackoverflow.com/questions/33794657/how-to-pass-a-2d-array-to-a-function-in-c-when-the-array-is-formatted-like-this
 8 | http://stackoverflow.com/questions/5901476/sending-and-receiving-2d-array-over-mpi
 9 | */
10 | float** Make2DFloatArray(int rows, int cols) {
11 |     float *data = (float *)malloc(rows*cols*sizeof(float));
12 |     float **array= (float **)malloc(rows*sizeof(float*));
13 |     for (int i=0; i<rows; i++)
14 |         array[i] = &(data[cols*i]);
15 | 
16 |     return array;
17 | }
18 | int** Make2DIntArray(int rows, int cols) {
19 |     int *data = (int *)malloc(rows*cols*sizeof(int));
20 |     int **array= (int **)malloc(rows*sizeof(int*));
21 |     for (int i=0; i<rows; i++)
22 |         array[i] = &(data[cols*i]);
23 | 
24 |     return array;
25 | }
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/make_2D_array.h:
--------------------------------------------------------------------------------
1 | #ifndef MAKE_2D_ARRAY_H 
2 | #define MAKE_2D_ARRAY_H
3 | 
4 | float** Make2DFloatArray(int rows, int cols);
5 | int** Make2DIntArray(int rows, int cols);
6 | 
7 | #endif // MAKE_2D_ARRAY_H 
8 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/math_util.c:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include "math_util.h"
 3 | 
 4 | // square of the distance between x1[N_features] and x2[N_features]
 5 | float distance(int N_features,float *x1,float *x2){
 6 |     float dist=0.0;
 7 |     for (int j=0; j<N_features; j++)
 8 |         dist += (x1[j]-x2[j])*(x1[j]-x2[j]);
 9 |     return(dist);
10 | }
11 | 
12 | float correlation(int N_features,float *x,float *y){
13 |    float xsum=0.0,ysum=0.0,xysum=0.0,xsqr_sum=0.0,ysqr_sum=0.0;
14 |    for (int j = 0; j<N_features; j++) {
15 |             xsum = xsum + x[j];
16 |             ysum = ysum + y[j];
17 |             xysum = xysum + x[j] * y[j];
18 |             xsqr_sum = xsqr_sum + x[j] * x[j];
19 |             ysqr_sum = ysqr_sum + y[j] * y[j];
20 |     }
21 | 
22 |     float num = ((N_features * xysum) - (xsum * ysum));
23 |     float deno = ((N_features * xsqr_sum - xsum * xsum)* (N_features * ysqr_sum - ysum * ysum));
24 |     float coeff = num / sqrt(deno);
25 |     return(coeff);
26 | }
27 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/math_util.h:
--------------------------------------------------------------------------------
1 | #ifndef MATH_UTIL_H 
2 | #define MATH_UTIL_H
3 | 
4 | float distance(int N_features,float *x1,float *x2);
5 | 
6 | float correlation(int N_features,float *x,float *y);
7 | 
8 | #endif // MATH_UTIL_H
9 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/ncdf_util.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <netcdf.h>
  4 | #include "make_2D_array.h"
  5 | #include "ncdf_util.h"
  6 | // including <stdlib.h> at last leads to "error: unknown type name ‘size_t’"
  7 | // no idea why?
  8 | 
  9 | /* Handle errors by printing an error message and exiting with a
 10 |  * non-zero status. */
 11 | #define ERRCODE 2
 12 | #define ERR(e) {printf("Error: %s\n", nc_strerror(e)); exit(ERRCODE);}
 13 | 
 14 | /* Read the input data from NetCDF file. 
 15 |  * Dynamically allocate the array based on the data size.
 16 |  * 
 17 |  * Why need 3-levels of pointers:
 18 |  * The first two levels are for 2D dynamic array, 
 19 |  * the last level is for modifying function arguments in place.
 20 |  * (need to pass the address)
 21 |  */
 22 | int readX(char* FILE_NAME, float*** p_X,int*** p_GUESS,
 23 |           int* p_N_samples,int* p_N_features,
 24 |           int* p_N_clusters,int* p_N_repeat ) {
 25 |    int ncid, varid,dimid;
 26 |    int retval;
 27 |    size_t N_temp;
 28 | 
 29 |     printf("reading data \n");
 30 | 
 31 |    /* Open the file. NC_NOWRITE tells netCDF we want read-only access
 32 |     * to the file.*/
 33 |    if ((retval = nc_open(FILE_NAME, NC_NOWRITE, &ncid)))
 34 |       ERR(retval);
 35 | 
 36 |    /* Get the size of the data for dynamical allocation*/
 37 |    nc_inq_dimid(ncid,"N_samples",&dimid);
 38 |    nc_inq_dimlen(ncid,dimid,&N_temp);
 39 |    *p_N_samples = (int)N_temp;
 40 |    printf("Number of samples: %d \n",*p_N_samples);
 41 | 
 42 |    nc_inq_dimid(ncid,"N_features",&dimid);
 43 |    nc_inq_dimlen(ncid,dimid,&N_temp);
 44 |    *p_N_features = (int)N_temp;
 45 |    printf("Number of features: %d \n",*p_N_features);
 46 | 
 47 |    nc_inq_dimid(ncid,"N_clusters",&dimid);
 48 |    nc_inq_dimlen(ncid,dimid,&N_temp);
 49 |    *p_N_clusters = (int)N_temp;
 50 |    printf("Number of clusters: %d \n",*p_N_clusters);
 51 | 
 52 |    nc_inq_dimid(ncid,"N_repeat",&dimid);
 53 |    nc_inq_dimlen(ncid,dimid,&N_temp);
 54 |    *p_N_repeat = (int)N_temp;
 55 |    printf("Number of repeated runs: %d \n",*p_N_repeat);
 56 | 
 57 |     /* Get the varid of the data variable, based on its name. */
 58 |    if ((retval = nc_inq_varid(ncid, "X", &varid)))
 59 |       ERR(retval);
 60 |    /* Read the data. */
 61 |    *p_X = Make2DFloatArray(*p_N_samples,*p_N_features);
 62 |    if ((retval = nc_get_var_float(ncid, varid, (*p_X)[0])))
 63 |       ERR(retval);
 64 | 
 65 |     /* Initial Guess*/
 66 |    if ((retval = nc_inq_varid(ncid, "GUESS", &varid)))
 67 |       ERR(retval);
 68 |    *p_GUESS = Make2DIntArray(*p_N_repeat,*p_N_clusters);
 69 |    if ((retval = nc_get_var_int(ncid, varid, (*p_GUESS)[0])))
 70 |       ERR(retval);
 71 | 
 72 |    /*close the netcdf file*/
 73 |    if ((retval = nc_close(ncid) ))
 74 |       ERR(retval);
 75 | 
 76 |     printf("=====reading data finished======\n");
 77 | 
 78 |    return 0;
 79 | }
 80 | 
 81 | int writeY(char* FILE_NAME, int* labels, float inert) {
 82 |    int ncid, varid;
 83 |    int retval;
 84 | 
 85 |    if ((retval = nc_open(FILE_NAME, NC_WRITE, &ncid)))
 86 |       ERR(retval);
 87 | 
 88 |    if ((retval = nc_inq_varid(ncid, "INERT_C", &varid)))
 89 |       ERR(retval)
 90 |    if ((retval = nc_put_var_float(ncid, varid, &inert )))
 91 |       ERR(retval);
 92 | 
 93 |    if ((retval = nc_inq_varid(ncid, "Y_C", &varid)))
 94 |       ERR(retval)
 95 |    if ((retval = nc_put_var_int(ncid, varid, labels )))
 96 |       ERR(retval);
 97 | 
 98 |    /*close the netcdf file*/
 99 |    if ((retval = nc_close(ncid) ))
100 |       ERR(retval);
101 | 
102 |     printf("=====writting data finished======\n");
103 | 
104 |    return 0;
105 | }
106 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/ncdf_util.h:
--------------------------------------------------------------------------------
 1 | #ifndef NCDF_UTIL_H 
 2 | #define NCDF_UTIL_H
 3 | 
 4 | int readX(char* FILE_NAME, float*** p_X,int*** p_GUESS,
 5 |           int* p_N_samples,int* p_N_features,
 6 |           int* p_N_clusters,int* p_N_repeat );
 7 | 
 8 | int writeY(char* FILE_NAME, int* labels, float inert);
 9 | 
10 | #endif // NCDF_UTIL_H 
11 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/shared/timing.h:
--------------------------------------------------------------------------------
 1 | #include <sys/time.h>
 2 | #include <stdlib.h>
 3 | 
 4 | inline double seconds()
 5 | {
 6 |     struct timeval tp;
 7 |     int i = gettimeofday(&tp, NULL);
 8 |     return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
 9 | }
10 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/test_data/.gitignore:
--------------------------------------------------------------------------------
1 | *.nc
2 | 


--------------------------------------------------------------------------------
/Parallel_Algorithm/test_data/README:
--------------------------------------------------------------------------------
1 | test data in NetCDF format
2 | 
3 | Data can be generated by python scripts
4 | This Git repo contains no actually data (to keep its size small).
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Table of Contents
  2 |   * [Introduction](#introduction)
  3 |   * [Parallel Kmeans Algorithms](#parallel-kmeans-algorithms)
  4 |       * [OpenMP, MPI and hybrid MPI-OpenMP parallelization](#openmp-mpi-and-hybrid-mpi-openmp-parallelization)
  5 |       * [Advanced Feature: CUDA](#advanced-feature-cuda)
  6 |   * [Applications](#applications)
  7 |       * [Forest Cover Type Classification](#forest-cover-type-classification)
  8 |       * [Advanced Feature: Abnormal Climate Events Identification](#advanced-feature-abnormal-climate-events-identification)
  9 |   * [Discussion](#discussion)
 10 |   * [Computational Platforms and Software Libraries](#computational-platforms-and-software-libraries)
 11 |   * [References](#references)
 12 |   
 13 | ---  
 14 | # Introduction
 15 | K-means clustering is a simple and scalable clustering method, which partitions observations into k clusters in an objective manner. It has very broad applications, such as image segmentation, retail product classification (Kusrini, 2015), environmental problems like greenhouse gas emissions (Kijewska and Bluszcz, 2015). K-means clustering could be used in combination with other advanced methodologies. For example, it was used with support vector machine (SVM) to perform automatic text classification (Perrone and Connell, 2000). It could also be used as a preprocessing method, such as initialization in a hidden Markov model (HMM) (Hu and Zanibbi, 2011). Its extensive applications and its simple computational complexity make k-means clustering one of the popular methods today.
 16 | 
 17 | Finding the minimum of a k-means cost function is a NP-hard problem when the dimension d>1 and the number of clusters k>1. Scientists came up with several heuristic methods to find the local minimum, but the process is still computationally-intensive, especially for large datasets with high dimensional features. Therefore, we want to implement a parallel version of a k-means heuristic method on a cluster of machines, to significantly speed up the algorithm, without sacrificing its accuracy. 
 18 | 
 19 | A typical approach for k-mean clustering is Expectation–Maximization (E–M). E-step assigns points to the nearest cluster center, while M-step sets the cluster centers to the mean. Below is an animation demonstating the Kmean algorithm, based on a wonderful [K-means visualization made by Naftali Harris ](https://www.naftaliharris.com/blog/visualizing-k-means-clustering/).
 20 | 
 21 | <p align="center">
 22 | <img src="Other_Image/Kmean_illustration/Kmeans.gif" width="640">
 23 | </p>
 24 | 
 25 | The pseudo C-code for this algorithm is shown below, which is an abbreviated version of our real code. 
 26 | 
 27 | <p align="center">
 28 | <img src="Other_Image/pseudo.png" width="480">
 29 | </p>
 30 | 
 31 | X[N_samples][N_features] is the data points. We always use i as the looping index for samples, j as the index for feature dimensions, and k as the index for clusters. This notation is consistent throught out the real code. Other variables should be self-explanatory.
 32 | 
 33 | ---
 34 | # Parallel Kmeans Algorithms
 35 | 
 36 | ## OpenMP, MPI and hybrid MPI-OpenMP parallelization
 37 | 
 38 | ### OpenMP
 39 | 
 40 | With OpenMP parallelization, only E-step can be directly parallelized. If M-step is directly parallelized with OpenMP pragmas, different data points might be added to one cluster at the same time, leading to Write-After-Write (WAW) harzard. Although it is possible to make drastic modifications to parallelize the M-step, it contradicts the basic idea of OpenMP that the serial code shoud be almost untouched. Therefore, we only focus on the E-step. 
 41 | [(View our OpenMP code)](Parallel_Algorithm/OpenMP/Kmean_omp.c)
 42 | 
 43 | Unsurprisingly, while the E-step scales well, the M-step even gets slower because of thread overheads. Although the M-step is not time-consuming in the serial case, it finally becomes the bottleneck when the number of cores gets large:
 44 | <p align="center">
 45 | <img src="Timing_Results/plots/OpenMP_scaling.jpg" width="720">
 46 | </p>
 47 | 
 48 | [(View the raw timing log)](Timing_Results/log/Blobs_OpenMP.log)
 49 | 
 50 | Because the compute node we are testing has only [32 CPUs](#computational-platforms-and-software-libraries), the performance gets lower with 64 threads due to the implicit context-switching and increased overheads. Same for the MPI and the hybrid tests below.
 51 | 
 52 | ### MPI
 53 | 
 54 | With MPI, we can distribute data points to different processes using MPI_Bcast, and use MPI_Allreduce to exchange information whenever needed. Thus, both the E-step and the M-step can be parallelized. [(View our MPI code)](Parallel_Algorithm/MPI/Kmean_mpi.c)
 55 | 
 56 | This time, we get speed-up in both steps, so the overall scaling is better than OpenMP.
 57 | <p align="center">
 58 | <img src="Timing_Results/plots/MPI_scaling.jpg" width="720">
 59 | </p>
 60 | 
 61 | [(View the raw timing log)](Timing_Results/log/Blobs_MPI.log)
 62 | 
 63 | ### Hybrid MPI-OpenMP
 64 | 
 65 | We simply add OpenMP pragmas to the MPI code, to get the hybrid version. This time we have many combinations of OpenMP threads and MPI processes to test. In general, we find that the speed-up depends on the product of the number of OpenMP threads (n_omp hereinafter) and the number of MPI processes (N_MPI hereinafter):
 66 | 
 67 | <p align="center">
 68 | <img src="Timing_Results/plots/hybrid_scaling.jpg" width="480">
 69 | </p>
 70 | 
 71 | [(View the raw timing log)](Timing_Results/log/Blobs_hybrid.log)
 72 | 
 73 | Interestingly, for N_MPI*n_omp=32, we have tested 4 cases (N_MPI,n_omp) = (32,1), (16,2), (8,4) or (4,8), and all of them have almost the same speed. 
 74 | [(see the exact time use in the last cell)](https://github.com/JiaweiZhuang/CS205_final_project/blob/master/Timing_Results/plot_timing.ipynb)
 75 | 
 76 | ### Improvements over Previous Works
 77 | 
 78 | Our main reference for the OpenMP&MPI Kmean algorithm is Bisgin 2008, along with their [public code](http://www.ece.northwestern.edu/~wkliao/Kmeans/index.html). We have made a couple of improvements over their original algorithm:
 79 | * Their algorithm always use the first N_cluster data points as the initial cluster center, which can be inefficient and might not find the global minimum. This is understandable because generating random indices in C is not straightforward. To facilitate random initialization and to make a fair comparison with Python's sklearn.cluster.KMeans, we build a seamless interface between C and Python using the [NetCDF library](#the-netcdf4-library-for-data-io). Random initial centers are generated by Python and written into file. Then, both the Python and the C versions use the same starting points from that file. This ensures the same amount of computation for both Python and C, along with the parallel C version at different number of cores. By using the same initial condition, we have confirmed that our serial C version has essential the same speed as Python's sklearn.cluster.KMeans, and all our parallel versions show significant speed up.
 80 | * In the OpenMP version, they use "atomic" operations in the M-step to avoid data-racing. However, in our test, atomic operation significantly slows down the M-step, which more than compensates the speed-up of E-step. To cope with this issue, we refactor the M-step to move it out of the OpenMP parallel region, making the overall scalability much better.
 81 | * In MPI the version, their orginal code is unnecessarily redundant. By making use of MPI_IN_PLACE, we avoid duplicating variables for MPI function calls such as MPI_Allreduce. We also use MPI_Scatterv, which is a more flexible version of MPI_Scatter, to allow different CPUs to hold different numbers of data points. This allows us to use any number of CPUs for any number of data points.
 82 | * We also implemented an option to use correlation as the measure of "distance". It improves the clustering results in the [SSW study](#advanced-feature-abnormal-climate-events-identification) later in this page. 
 83 | 
 84 | ## Advanced Feature: CUDA
 85 | 
 86 | Given the massive potential of parallelism on GPU, we implemented a parallel version of k-means algorithm using Nvidia CUDA library. In our implementation, we parallelize the E-step by distributing the computations of the nearest distance over blocks on "device". Also, 
 87 | we use reduction to help check the convergence of clustering (see the "reduce" function). For M-step, we decide not to parallelize (parallelize means using reduction in this case), because by including the time for data to tranfer between device and host, which is a huge burden, the parallel version has no outstanding advantages over the serial version of M-step. Similar to the OpenMP version, our focus is also on the E-step. [(View our CUDA code)](Parallel_Algorithm/Cuda/kmeans_cdf.cu)
 88 | 
 89 | Generally, we see that the timing and scaling is quite promising when the number of threads per block is less than 32, which is also the wrap size. The "other" portion is no doubt the data tranfer between device and host, and it's even a more severe bottleneck than the serial M-step. By the way, we can definitely improve this by using better I/O hardware, i.e. using SSD instead of EBS volume for the ec2 instance, and optimizing memory access, e.g. using shared memory as possible and coalesce memory operations. Also, note that compared to OpenMP/MPI version, the time of E-step using CUDA is sinigicantly shorter. 
 90 | 
 91 | <p align="center">
 92 | <img src="Timing_Results/plots/Cuda_scaling.jpg" width="720">
 93 | </p>
 94 | 
 95 | The weird bump up as the number of threads goes up to 64 is because we run out of shared memory, but we're not sure why it affects the M-step so much. 
 96 | 
 97 | For optimization, currently we've used parallel reduction to speedup the checking of convergence, and matrix transpose to improve memory access locality as the number of points is significantely larger than the number of features. We haven't tried deploying this version on multiple GPU, because the documentation is rare online and a single Tesla K80 GPU already has enough capacity (4992 cores, 26 SMs, 2048 threads per SM) to parallelize our computation.
 98 | 
 99 | ---
100 | # Applications
101 | ## Forest Cover Type Classification
102 | In this section, we utilize k-means to perform forest covertype classification with cartographic variables only. Our dataset is obtained from the UCI KDD archive, and was original derived from US Forest Service (USFS) Region 2 Resource Information System data and US Geological Survey (USGS) data. The study area is in Roosevelt National Forest of northern Colorado, which is a forest region with the minimum human disturbance and therefore mostly goes through natural process.
103 | 
104 | <p align="center">
105 | <img src="Data_Analysis/covertype_cluster/figures/study_area_map.png" width="300" height="200">
106 | <img src="Data_Analysis/covertype_cluster/figures/covertype_distribution.png" width="700" height="200">
107 | </p>
108 | 
109 | It is a fairly large dataset that contains 581012 observations and 54 features including numerical and categorical features. The attributes include elevation, slope, horizontal distance to hydrology, vertical distance to hydrology, and etc. This dataset is already labeled with 1-7 which represents 7 different forest cover types: Spruce/Fir, Lodgepole Pine, Ponderosa Pine, Cottonwood/Willow, Aspen, Douglas-fir, and Krummholz. Our goal is to implement a k-means based classification method, and to show that besides basic clustering problems, k-means has a broad usage in various data science problems. 
110 | 
111 | Our first step is to normalize the feature values since some of them are in single digits whereas some are in thousands. Then we split the whole dataset into a training set and a testing set with the ratio 7:3. Since the dataset is quite unbalanced, our stragegy is to randomly pick the desired percentage of entries out of each category, and then join them to a final sampling dataset. We always use this method each time we need a dataset split in this problem.
112 | 
113 | We then split the training set into a sub-training set and a validation set with the ratio 8:2. The testing set we obtained above would be untouched until the final test. Our selection process of the best K is as follows:
114 | 1. cluster the sub-training set using k-means with a given K.
115 | 2. pick the 7 purest clusters corresponding to the 7 labels respectively to be the standard clusters. 
116 | Compute the 7 cluster centers for use.
117 | 3. For each data point in the validation set, assign this point to the cluster with the closest cluster center.
118 | 4. Calculate the average classification accuracy of the validation set.
119 | 
120 | <p align="center">
121 | <img src="Data_Analysis/covertype_cluster/figures/accuracy_graph.png" width="450" height="280">
122 | </p>
123 | 
124 | We choose K from 7 to 30, repeat the above steps and find that 23 is the best cluster number. Finally we perform k-means on the whole training set to get the 7 centers and test on the testing set. The final classification accuracy is around 30%.
125 | 
126 | <p align="center">
127 | <img src="Data_Analysis/covertype_cluster/figures/vis_label.png" width="600" height="350">
128 | <img src="Data_Analysis/covertype_cluster/figures/vis_pred.png" width="600" height="350">
129 | </p>
130 | 
131 | The classification accuracy is not very high, so we would like to take a further look at the dataset. It is hard to directly visualize the dataset due to its high feature dimension, so we apply PCA to perform dimension reduction first and then plot the scatter graph based on the first two principle components. We choose 10 percent out of the testing samples, and color code the points using the true labels(the first graph) and the predicted labels(the second graph). Now we could see that, the original data points are acutally mixed on the 2D projection. Our k-means algorithm actually does a good clustering job since the lumps are seperated well in color. Therefore, for this problem, more complicated algorithms such as artifical neural network would do a better job (Dean, 1999) with classification accuracy at around 70%. But our result is already much better than the randomly classification which only holds an accuracy at around 14%.
132 | 
133 | ## Advanced Feature: Abnormal Climate Events Identification
134 | In this section, we will explore the application of k-means clustering technique on identifying abnormal climate events. Abnormal climate events are usually identified if a highly simplified index exceeds an arbitrary threshold. For example, El Nino events are identified if the Nino 3.4 index exceeds the threshold of 0.5&deg;C. This simple criteria works in some cases, however, there are two caveats associated with this methodology. First, the highly simplified index may not well capture all the main dynamic aspects. Second, setting an arbitrary threshold makes it a subjective way of identifying abnormal events.  
135 | 
136 | K-means clustering serves as a powerful technique in dealing with those caveats. First, instead of using a highly simplified index, a high dimensional feature vector characterizing the event from multiple dynamical aspects can be utilized. In addition, k-means clustering is highly scalable to cluster large datasets, such as those from simulations. Second, k-means clustering is able to identify different states in a completely objective manner with no preconceived notion of the groups and no preselection on the basis of known influencing factors (Coughlin and Gray, 2009). Third, k-means clustering technique is especially useful for detecting abnormal events, because k-means clustering easily allows for unevenly distributed clusters, whereas some other techniques, such as hierachical clustering, tend to determine clusters of similar sizes.
137 | 
138 | The abnormal climate events we would like to explore is called sudden stratospheric warming (SSW), which happens sometime in the stratosphere near the North pole during winters. It is important to understand them because they usually proceed extreme weathers in the troposphere by about a month, and thus have the potential to serve as a forecasting tool. 
139 | 
140 | ---
141 | > ### What is Sudden Stratospheric Warming (SSW)?
142 | > During Northern hemisphere winter, because of the very cold temperature at the pole, the climatological zonal winds in the stratosphere are generally westerly and their strength increases with height. These winds can form very persistent "polar night jet” vortex, as shown in Fig(a). However, at times this zonal-mean configuration can be dramatically disturbed, as shown in Fig(b) and Fig(c), with the vortex being displaced or split. At the same time, the stratosphere near the pole experiences sudden warming, with latitudinal temperature gradient and zonal-mean winds at the pole being reversed.
143 | <p align="center">
144 | <img src="Data_Analysis/figures/intro1.png" width="480">
145 | </p>
146 | 
147 | ---
148 | 
149 | ### Data and Measure of Distance
150 | Our analysis is based on the daily output from a 49-year all-winter simulation, which gives us more than 17,000 samples. Daily data is pre-processed to get averaged temperatures at three latitudes, and their tendencies over time, latitudinal temperature gradient, and its tendency, averaged zonal winds at two latitudes, and their tendencies, and wave-number one and two components of geo-potential height. Temperatures are averaged over 60&deg;N to 70&deg;N, 70&deg;N to 80&deg;N, and 80&deg;N to 90&deg;N, while zonal winds are averaged over 60&deg;N to 70&deg;N and 70&deg;N to 80&deg;N. Tendencies are calculated as the differences between its current value and its value 7 days before. Altogether, there are 252 features for a sample, including 14 features each vertical level, and 18 levels in total across the stratosphere. 
151 | 
152 | Because a sample includes different types of features, such as temperature, velocity and length scale, we need to carefully choose the measure of distance.  Here, we choose 1-corr(x1,x2) as the measure of distance, because we consider two patterns to be close to each other if they are highly correlated. 
153 | 
154 | ### Results
155 | We have tested the number of clusters from 2 to 4, and use Silhouette score to evaluate the result of clustering. Two clusters give the highest averaged score of 0.65, as shown in Fig 1, while the averaged score for three clusters is 0.47, and for four clusters is 0.42. Therefore, we think two clusters are separated well by k-means clustering. 
156 | <p align="center">
157 | <img src="Data_Analysis/figures/svalue.png" width="480">
158 | </p>
159 | <p align="center">
160 | Figure 1: Silhouette score
161 | </p>
162 | 
163 | ---
164 | > #### Silhouette Score
165 | > The Silhouette coefficient for a sample is defined as (*b*-*a*)/max(*a*,*b*), where *a* is the mean intra-cluster distance, and *b* is the distance between a sample and the nearest cluster that the sample is not a part of. The Silhouette coefficient is a measure of the ratio between averaged intra-cluster similarity and cross-cluster similarity. Silhouette coefficients near +1 indicate that the sample is far away from the neighboring clusters. A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters and negative values indicate that those samples might have been assigned to the wrong cluster. Therefore Silhouette coefficient can be a useful metric to evaluate the clustering results. 
166 | 
167 | ---
168 | 
169 | Fig 2 shows the temperature anomaly over the stratosphere for both clusters. The second cluster shows a substantial warming (more than 10K) compared to the first cluster. 
170 | <p align="center">
171 | <img src="Data_Analysis/figures/T.png" width="480">
172 | </p>
173 | <p align="center">
174 | Figure 2: Averaged temperature anomaly for each cluster
175 | </p>
176 | 
177 | The vortex structure shown in Fig. 3 is consistent with the temperature anomaly. For normal events, the polar vortex centers at the pole, while the vortex is displaced during abnormal warming period. These results are consistent with the findings from Coughlin and Gray, 2009, in which the analysis is based on observational data. 
178 | <p align="center">
179 | <img src="Data_Analysis/figures/PV.png" width="480">
180 | </p>
181 | <p align="center">
182 | Figure 3: Averaged potential vorticity for each cluster
183 | </p>
184 | 
185 | Furthermore, we are also interested in different types of abnormal events, because they may be caused by different mechanisms, and may have different effects on future weather patterns. Therefore, we further cluster the abnormal events based on the amplitude and phase angle of the wave-number one and two geo-potential height. We find that the vortex can be displaced toward different longitude, or even be split into two vortices.
186 | <p align="center">
187 | <img src="Data_Analysis/figures/SSWsubset.png" width="480">
188 | </p>
189 | <p align="center">
190 | Figure 4: Averaged potential vorticity for each sub-cluster
191 | </p>
192 | 
193 | ---
194 | # Discussion
195 | 
196 | Although the algorithm of k-means clustering is fast and simple, it has its own limitations compared to other more complicated algorithms. First of all, the clustering procedure and the final clusters highly depend on the number of clusters *k*, and extra effort needs to be made to find an optimal *k*. Hierarchical clustering could easily overcome this obstacle. Its computation is independent of the number of clusters *k*, and its hierarchical structure can provide more insight to determine the number of clusters *k*. Secondly, k-means clustering assumes spherically distributed clusters and equal probabilities for each clusters. To overcome these caveats, Gaussian mixture models include a covariance matrix and weights in its optimization. In addition, k-means clustering works poorly on non-convex clusters, while spectral clustering does a better job in this situation. 
197 | 
198 | Admittedly, there are more complicated algorithms that work better than k-means clustering in some cases, k-means clustering is still a powerful algorithm dealing with large datasets with high dimensional features. For a simple clustering algorithm, like k-means to have a better performance, more effort needs to be made to pre-process data and to map them onto a space where they are more spherically distributed. There is no the best algorithm, but the most suitable situation to apply an algorithm.
199 | 
200 | 
201 | ---
202 | # Computational Platforms and Software Libraries
203 | 
204 | ## Amazon EC2 cloud computing environment (OpenMP & MPI)
205 | 
206 | Although MPI programs typically run on local HPC facilities like Harvard's Odyssey, we found that MPI jobs at small-to-medium-scales (e.g. < 64 cores) can also run very efficiently on cloud platforms like Amazon EC2. This gives us great flexibility in requesting computational resources, so that we can finish simulations very quickly without worrying about job pending on Odyssey.
207 | 
208 | The instance we use for the timing tests is cc2.8xlarge [(see detailed cpuinfo)](Timing_Results/info/cpu). In the Amazon console, it is said to have 64 "virtual" CPUs. However, it actually only contains 32 physical CPUs as shown by the "lscpu" command.
209 | 
210 | We have installed various software libraries to facilitate our K-mean application. An EC2 AMI is made public the so that others can also run our codes directly without installing those libraries on their own. Search for "ami-3f79ef29" or  "GCC_NetCDF_MPI_Conda_04162017" in the N. Virginia region.
211 | 
212 | ## Amazon EC2 cloud computing environment (CUDA)
213 | 
214 | The instance we use for timing tests is p2.xlarge, with 1 Tesla K80 GPU, 4 "virtual" CPUs. For the K80 GPU, it has 4992 CUDA cores, 26 SMs, and 2048 threads per SM. 
215 | 
216 | ## The OpenMPI library
217 | 
218 | We built OpenMPI 2.1.0 upon the gcc4.8.3 compiler, to get the wrapped "mpicc" compiler. The script for building this library is available [here](Build_Library/openmpi_build/install_openmpi.sh).
219 | 
220 | ## The Cuda library
221 | 
222 | We use the CUDA 7.5 and the nvcc compiler included in the toolkit. For convenience, there is a pre-built AMI: search "ami-52f7b345" in the N. Virginia region.
223 | 
224 | ## The NetCDF4 library for data I/O
225 | 
226 | While high-level languages like Python and Matlab can read and write data in any formats very conveniently, data I/O in low-level languages such as C and Fortran can be a pain. Therefore, we make use of the [NetCDF4 library](https://www.unidata.ucar.edu/software/netcdf/) to facilitate data I/O. It can be viewed as a structured combination of numerical (like binary) and text (like ASCII) data. The numerical part makes it suited for storing large data arrays in our application, and the text description part makes it self-descriptive, which is a significant advantage over plain binary files. All commonly used languages have NetCDF4 APIs and are able to operate on this data format.
227 | 
228 | In Python, the [xarray package](http://xarray.pydata.org/en/stable/) is a good way to handle NetCDF data. It is a higher-dimension extension of the well-known Pandas package. While Pandas is great for data science, xarray also suits various physical sciences.
229 | 
230 | In C, we've provided a [script](Build_Library/netCDF_build/install_netCDF.sh) to install that library. A single build can work for various compilers including the basic gcc compiler, the pgcc compiler for OpenACC, and the nvcc compiler for CUDA. With the NetCDF-C library, we can read all the data we need and dynamically allocate memories for them in a single function [readX()](Parallel_Algorithm/shared/ncdf_util.c)
231 | 
232 | It is also worth mentioning that, NetCDF is the standard data format used for the Intergovernmental Panel on Climate Change (IPCC) report :)
233 | 
234 | # References
235 | Bisgin H, Dalfes H N. Parallel clustering algorithms with application to climatology[C] *Geophysical Research Abstracts* 2008, 10.
236 | 
237 | Blackard, J.A., Dean, D.J., 1999. Comparative accuracies of artificial neural networks and discriminant analysis in predicting forest cover types from cartographic variables. *Computers and Electronics in Agriculture* 24 (1999): 131–151.
238 | 
239 | Coughlin, K., and Lesley Janet Gray. "A continuum of sudden stratospheric warmings." *Journal of the Atmospheric Sciences* 66.2 (2009): 531-540.
240 | 
241 | Li, M., Cheng, Y., Zhao, H., 2004. Unlabeled Data Classification via Support Vector Machines and k-means Clustering.* Proceedings of the International Conference on Computer Graphics*, Imaging and Visualization (CGIV’04).
242 | 
243 | Kijewska, A., Bluszcz, A., 2015. Research of varying levels of greenhouse gas emissions in European countries using the k-means method. *Atmospheric Pollution Research* 7 (2016):935-944
244 | 
245 | Kusrini, K. 2015. Grouping of Retail Items by Using K-Means Clustering.  Procedia Computer Science 72 ( 2015 ): 495–502.
246 | Perrone, M.P., Connell, S.D., 2000. K-Means Clustering for Hidden Markov Models. *Proceedings of the Seventh International Workshop on Frontiers in Handwriting Recognition*. 
247 | 
248 | 
249 | 
250 | 


--------------------------------------------------------------------------------
/Slides/CUDA_part.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/CUDA_part.key


--------------------------------------------------------------------------------
/Slides/FinalPre.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/FinalPre.key


--------------------------------------------------------------------------------
/Slides/SSW part.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/SSW part.key


--------------------------------------------------------------------------------
/Slides/covertype part.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/covertype part.key


--------------------------------------------------------------------------------
/Timing_Results/Blobs_smp20000_fea30_cls8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/Blobs_smp20000_fea30_cls8.xlsx


--------------------------------------------------------------------------------
/Timing_Results/info/compiler:
--------------------------------------------------------------------------------
1 | [ami-3f79ef29]
2 | 
3 | gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-9)
4 | Copyright (C) 2013 Free Software Foundation, Inc.
5 | This is free software; see the source for copying conditions.  There is NO
6 | warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
7 | 
8 | Open MPI 2.1.0
9 | 


--------------------------------------------------------------------------------
/Timing_Results/info/cpu:
--------------------------------------------------------------------------------
 1 | [cc2.8xlarge]
 2 | Architecture:          x86_64
 3 | CPU op-mode(s):        32-bit, 64-bit
 4 | Byte Order:            Little Endian
 5 | CPU(s):                32
 6 | On-line CPU(s) list:   0-31
 7 | Thread(s) per core:    2
 8 | Core(s) per socket:    8
 9 | Socket(s):             2
10 | NUMA node(s):          1
11 | Vendor ID:             GenuineIntel
12 | CPU family:            6
13 | Model:                 45
14 | Model name:            Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz
15 | Stepping:              7
16 | CPU MHz:               2593.961
17 | BogoMIPS:              5257.93
18 | Hypervisor vendor:     Xen
19 | Virtualization type:   full
20 | L1d cache:             32K
21 | L1i cache:             32K
22 | L2 cache:              256K
23 | L3 cache:              20480K
24 | NUMA node0 CPU(s):     0-31
25 | 


--------------------------------------------------------------------------------
/Timing_Results/log/Blobs_Cuda.log:
--------------------------------------------------------------------------------
 1 |  
 2 | =========================================
 3 | =========================================
 4 | testing with 1 threads per block on device
 5 | Number of samples: 20000 
 6 | Number of features: 30 
 7 | Number of clusters: 8 
 8 | Number of repeated runs: 20 
 9 | Total time: 783.192
10 | E-step time use (ms): 356.685
11 | M-step-1st-half time use (ms): 119.512
12 | M-step-2nd-half time use (ms): 0.067234
13 | Cuda Data IO (ms): 302.046
14 | Check Convergence (ms): 0.521421
15 |  
16 | =========================================
17 | =========================================
18 | testing with 2 threads per block on device
19 | Number of samples: 20000 
20 | Number of features: 30 
21 | Number of clusters: 8 
22 | Number of repeated runs: 20 
23 | Total time: 636.066
24 | E-step time use (ms): 202.417
25 | M-step-1st-half time use (ms): 132.744
26 | M-step-2nd-half time use (ms): 0.0743866
27 | Cuda Data IO (ms): 294.903
28 | Check Convergence (ms): 0.592709
29 |  
30 | =========================================
31 | =========================================
32 | testing with 4 threads per block on device
33 | Number of samples: 20000 
34 | Number of features: 30 
35 | Number of clusters: 8 
36 | Number of repeated runs: 20 
37 | Total time: 547.731
38 | E-step time use (ms): 108.101
39 | M-step-1st-half time use (ms): 138.242
40 | M-step-2nd-half time use (ms): 0.0698566
41 | Cuda Data IO (ms): 295.276
42 | Check Convergence (ms): 0.645161
43 |  
44 | =========================================
45 | =========================================
46 | testing with 8 threads per block on device
47 | Number of samples: 20000 
48 | Number of features: 30 
49 | Number of clusters: 8 
50 | Number of repeated runs: 20 
51 | Total time: 541.216
52 | E-step time use (ms): 69.1309
53 | M-step-1st-half time use (ms): 169.611
54 | M-step-2nd-half time use (ms): 0.089407
55 | Cuda Data IO (ms): 296.227
56 | Check Convergence (ms): 0.853777
57 |  
58 | =========================================
59 | =========================================
60 | testing with 16 threads per block on device
61 | Number of samples: 20000 
62 | Number of features: 30 
63 | Number of clusters: 8 
64 | Number of repeated runs: 20 
65 | Total time: 511.788
66 | E-step time use (ms): 38.1258
67 | M-step-1st-half time use (ms): 171.403
68 | M-step-2nd-half time use (ms): 0.0870228
69 | Cuda Data IO (ms): 295.944
70 | Check Convergence (ms): 0.88644
71 |  
72 | =========================================
73 | =========================================
74 | testing with 32 threads per block on device
75 | Number of samples: 20000 
76 | Number of features: 30 
77 | Number of clusters: 8 
78 | Number of repeated runs: 20 
79 | Total time: 2496.99
80 | E-step time use (ms): 245.165
81 | M-step-1st-half time use (ms): 1893.42
82 | M-step-2nd-half time use (ms): 0.981092
83 | Cuda Data IO (ms): 322.414
84 | Check Convergence (ms): 27.8046
85 |  
86 | =========================================
87 | =========================================
88 | testing with 64 threads per block on device
89 | Number of samples: 20000 
90 | Number of features: 30 
91 | Number of clusters: 8 
92 | Number of repeated runs: 20 
93 | Total time: 2437.49
94 | E-step time use (ms): 162.151
95 | M-step-1st-half time use (ms): 1918.99
96 | M-step-2nd-half time use (ms): 0.959396
97 | Cuda Data IO (ms): 321.023
98 | Check Convergence (ms): 27.1459
99 | 


--------------------------------------------------------------------------------
/Timing_Results/log/Blobs_MPI.log:
--------------------------------------------------------------------------------
  1 |  
  2 | =========================================
  3 | =========================================
  4 | testing with 1 processes
  5 | reading data 
  6 | Number of samples: 20000 
  7 | Number of features: 30 
  8 | Number of clusters: 8 
  9 | Number of repeated runs: 20 
 10 | =====reading data finished======
 11 | Last element in global array: -5.464932 
 12 | Last element after scattering 0: -5.464932 
 13 | =====Applying K-mean======
 14 | Best inertia: 2308841.500000 
 15 | I/O time use (ms): 3.746011 
 16 | Kmean total time use (ms): 2327.527416 
 17 | 
 18 | (sub-component timing not accurate) 
 19 | E-step time use (ms): 2080.869320 
 20 | M-step-1st-half time use (ms): 245.803846 
 21 | M-step-2nd-half time use (ms): 0.497344 
 22 |  
 23 | =========================================
 24 | =========================================
 25 | testing with 2 processes
 26 | reading data 
 27 | Number of samples: 20000 
 28 | Number of features: 30 
 29 | Number of clusters: 8 
 30 | Number of repeated runs: 20 
 31 | =====reading data finished======
 32 | Last element in global array: -5.464932 
 33 | Last element after scattering 1: -5.464932 
 34 | =====Applying K-mean======
 35 | Best inertia: 2308842.000000 
 36 | I/O time use (ms): 3.857213 
 37 | Kmean total time use (ms): 1370.546596 
 38 | 
 39 | (sub-component timing not accurate) 
 40 | E-step time use (ms): 1208.027011 
 41 | M-step-1st-half time use (ms): 161.206449 
 42 | M-step-2nd-half time use (ms): 0.596461 
 43 |  
 44 | =========================================
 45 | =========================================
 46 | testing with 4 processes
 47 | reading data 
 48 | Number of samples: 20000 
 49 | Number of features: 30 
 50 | Number of clusters: 8 
 51 | Number of repeated runs: 20 
 52 | =====reading data finished======
 53 | Last element in global array: -5.464932 
 54 | Last element after scattering 3: -5.464932 
 55 | =====Applying K-mean======
 56 | Best inertia: 2308840.750000 
 57 | I/O time use (ms): 4.113028 
 58 | Kmean total time use (ms): 889.274717 
 59 | 
 60 | (sub-component timing not accurate) 
 61 | E-step time use (ms): 783.437196 
 62 | M-step-1st-half time use (ms): 103.108949 
 63 | M-step-2nd-half time use (ms): 0.884013 
 64 |  
 65 | =========================================
 66 | =========================================
 67 | testing with 8 processes
 68 | reading data 
 69 | Number of samples: 20000 
 70 | Number of features: 30 
 71 | Number of clusters: 8 
 72 | Number of repeated runs: 20 
 73 | =====reading data finished======
 74 | Last element in global array: -5.464932 
 75 | Last element after scattering 7: -5.464932 
 76 | =====Applying K-mean======
 77 | Best inertia: 2308833.500000 
 78 | I/O time use (ms): 4.099924 
 79 | Kmean total time use (ms): 490.620752 
 80 | 
 81 | (sub-component timing not accurate) 
 82 | E-step time use (ms): 421.552152 
 83 | M-step-1st-half time use (ms): 63.779867 
 84 | M-step-2nd-half time use (ms): 1.227340 
 85 |  
 86 | =========================================
 87 | =========================================
 88 | testing with 16 processes
 89 | reading data 
 90 | Number of samples: 20000 
 91 | Number of features: 30 
 92 | Number of clusters: 8 
 93 | Number of repeated runs: 20 
 94 | =====reading data finished======
 95 | Last element in global array: -5.464932 
 96 | Last element after scattering 15: -5.464932 
 97 | =====Applying K-mean======
 98 | Best inertia: 2308833.500000 
 99 | I/O time use (ms): 4.935520 
100 | Kmean total time use (ms): 298.665461 
101 | 
102 | (sub-component timing not accurate) 
103 | E-step time use (ms): 221.786738 
104 | M-step-1st-half time use (ms): 67.226194 
105 | M-step-2nd-half time use (ms): 1.788349 
106 |  
107 | =========================================
108 | =========================================
109 | testing with 32 processes
110 | reading data 
111 | Number of samples: 20000 
112 | Number of features: 30 
113 | Number of clusters: 8 
114 | Number of repeated runs: 20 
115 | =====reading data finished======
116 | Last element in global array: -5.464932 
117 | Last element after scattering 31: -5.464932 
118 | =====Applying K-mean======
119 | Best inertia: 2308833.000000 
120 | I/O time use (ms): 9.008225 
121 | Kmean total time use (ms): 280.026070 
122 | 
123 | (sub-component timing not accurate) 
124 | E-step time use (ms): 183.410053 
125 | M-step-1st-half time use (ms): 76.902479 
126 | M-step-2nd-half time use (ms): 1.928765 
127 |  
128 | =========================================
129 | =========================================
130 | testing with 64 processes
131 | reading data 
132 | Number of samples: 20000 
133 | Number of features: 30 
134 | Number of clusters: 8 
135 | Number of repeated runs: 20 
136 | =====reading data finished======
137 | Last element in global array: -5.464932 
138 | Last element after scattering 63: -5.464932 
139 | =====Applying K-mean======
140 | Best inertia: 2308827.750000 
141 | I/O time use (ms): 19.194771 
142 | Kmean total time use (ms): 985.778767 
143 | 
144 | (sub-component timing not accurate) 
145 | E-step time use (ms): 106.266378 
146 | M-step-1st-half time use (ms): 605.020996 
147 | M-step-2nd-half time use (ms): 2.200050 
148 | 


--------------------------------------------------------------------------------
/Timing_Results/log/Blobs_OpenMP.log:
--------------------------------------------------------------------------------
  1 |  
  2 | =========================================
  3 | =========================================
  4 | testing with 1 threads
  5 | reading data 
  6 | Number of samples: 20000 
  7 | Number of features: 30 
  8 | Number of clusters: 8 
  9 | Number of repeated runs: 20 
 10 | =====reading data finished======
 11 | =====Applying K-mean======
 12 | =====writting data finished======
 13 | Best inertia: 2308841.500000 
 14 | Kmean total time use (ms): 2539.128065 
 15 | E-step time use (ms): 2293.212175 
 16 | M-step-1st-half time use (ms): 245.102882 
 17 | M-step-2nd-half time use (ms): 0.499487 
 18 | I/O time use (ms): 2.501011 
 19 |  
 20 | =========================================
 21 | =========================================
 22 | testing with 2 threads
 23 | reading data 
 24 | Number of samples: 20000 
 25 | Number of features: 30 
 26 | Number of clusters: 8 
 27 | Number of repeated runs: 20 
 28 | =====reading data finished======
 29 | =====Applying K-mean======
 30 | =====writting data finished======
 31 | Best inertia: 2308842.000000 
 32 | Kmean total time use (ms): 1731.789112 
 33 | E-step time use (ms): 1426.280260 
 34 | M-step-1st-half time use (ms): 304.410219 
 35 | M-step-2nd-half time use (ms): 0.734806 
 36 | I/O time use (ms): 1.926899 
 37 |  
 38 | =========================================
 39 | =========================================
 40 | testing with 4 threads
 41 | reading data 
 42 | Number of samples: 20000 
 43 | Number of features: 30 
 44 | Number of clusters: 8 
 45 | Number of repeated runs: 20 
 46 | =====reading data finished======
 47 | =====Applying K-mean======
 48 | =====writting data finished======
 49 | Best inertia: 2308840.500000 
 50 | Kmean total time use (ms): 1336.784840 
 51 | E-step time use (ms): 934.716463 
 52 | M-step-1st-half time use (ms): 399.653912 
 53 | M-step-2nd-half time use (ms): 1.964569 
 54 | I/O time use (ms): 2.602816 
 55 |  
 56 | =========================================
 57 | =========================================
 58 | testing with 8 threads
 59 | reading data 
 60 | Number of samples: 20000 
 61 | Number of features: 30 
 62 | Number of clusters: 8 
 63 | Number of repeated runs: 20 
 64 | =====reading data finished======
 65 | =====Applying K-mean======
 66 | =====writting data finished======
 67 | Best inertia: 2308833.500000 
 68 | Kmean total time use (ms): 900.095940 
 69 | E-step time use (ms): 490.777731 
 70 | M-step-1st-half time use (ms): 407.839060 
 71 | M-step-2nd-half time use (ms): 1.050949 
 72 | I/O time use (ms): 2.654076 
 73 |  
 74 | =========================================
 75 | =========================================
 76 | testing with 16 threads
 77 | reading data 
 78 | Number of samples: 20000 
 79 | Number of features: 30 
 80 | Number of clusters: 8 
 81 | Number of repeated runs: 20 
 82 | =====reading data finished======
 83 | =====Applying K-mean======
 84 | =====writting data finished======
 85 | Best inertia: 2308838.000000 
 86 | Kmean total time use (ms): 741.780996 
 87 | E-step time use (ms): 309.553146 
 88 | M-step-1st-half time use (ms): 430.645466 
 89 | M-step-2nd-half time use (ms): 1.129389 
 90 | I/O time use (ms): 2.640963 
 91 |  
 92 | =========================================
 93 | =========================================
 94 | testing with 32 threads
 95 | reading data 
 96 | Number of samples: 20000 
 97 | Number of features: 30 
 98 | Number of clusters: 8 
 99 | Number of repeated runs: 20 
100 | =====reading data finished======
101 | =====Applying K-mean======
102 | =====writting data finished======
103 | Best inertia: 2308832.750000 
104 | Kmean total time use (ms): 782.614946 
105 | E-step time use (ms): 210.109472 
106 | M-step-1st-half time use (ms): 570.574045 
107 | M-step-2nd-half time use (ms): 1.244783 
108 | I/O time use (ms): 2.243996 
109 |  
110 | =========================================
111 | =========================================
112 | testing with 64 threads
113 | reading data 
114 | Number of samples: 20000 
115 | Number of features: 30 
116 | Number of clusters: 8 
117 | Number of repeated runs: 20 
118 | =====reading data finished======
119 | =====Applying K-mean======
120 | =====writting data finished======
121 | Best inertia: 2308833.750000 
122 | Kmean total time use (ms): 1440.789938 
123 | E-step time use (ms): 1023.807049 
124 | M-step-1st-half time use (ms): 415.407658 
125 | M-step-2nd-half time use (ms): 1.029730 
126 | I/O time use (ms): 2.650023 
127 | 


--------------------------------------------------------------------------------
/Timing_Results/log/Blobs_hybrid.log:
--------------------------------------------------------------------------------
  1 |  
  2 | =========================================
  3 | =========================================
  4 | testing with 1 processes, 1 threads
  5 | reading data 
  6 | Number of samples: 20000 
  7 | Number of features: 30 
  8 | Number of clusters: 8 
  9 | Number of repeated runs: 20 
 10 | =====reading data finished======
 11 | Last element in global array: -5.464932 
 12 | Last element after scattering 0: -5.464932 
 13 | =====Applying K-mean======
 14 | Best inertia: 2308841.500000 
 15 | I/O time use (ms): 4.006589 
 16 | Kmean total time use (ms): 2363.860598 
 17 | 
 18 | (sub-component timing not accurate) 
 19 | E-step time use (ms): 2112.615470 
 20 | M-step-1st-half time use (ms): 250.408402 
 21 | M-step-2nd-half time use (ms): 0.497923 
 22 |  
 23 | =========================================
 24 | =========================================
 25 | testing with 1 processes, 2 threads
 26 | reading data 
 27 | Number of samples: 20000 
 28 | Number of features: 30 
 29 | Number of clusters: 8 
 30 | Number of repeated runs: 20 
 31 | =====reading data finished======
 32 | Last element in global array: -5.464932 
 33 | Last element after scattering 0: -5.464932 
 34 | =====Applying K-mean======
 35 | Best inertia: 2308842.000000 
 36 | I/O time use (ms): 3.732907 
 37 | Kmean total time use (ms): 1662.239609 
 38 | 
 39 | (sub-component timing not accurate) 
 40 | E-step time use (ms): 1356.238149 
 41 | M-step-1st-half time use (ms): 304.833094 
 42 | M-step-2nd-half time use (ms): 0.622449 
 43 |  
 44 | =========================================
 45 | =========================================
 46 | testing with 1 processes, 4 threads
 47 | reading data 
 48 | Number of samples: 20000 
 49 | Number of features: 30 
 50 | Number of clusters: 8 
 51 | Number of repeated runs: 20 
 52 | =====reading data finished======
 53 | Last element in global array: -5.464932 
 54 | Last element after scattering 0: -5.464932 
 55 | =====Applying K-mean======
 56 | Best inertia: 2308840.500000 
 57 | I/O time use (ms): 3.693638 
 58 | Kmean total time use (ms): 1189.350777 
 59 | 
 60 | (sub-component timing not accurate) 
 61 | E-step time use (ms): 813.090273 
 62 | M-step-1st-half time use (ms): 375.009360 
 63 | M-step-2nd-half time use (ms): 0.758050 
 64 |  
 65 | =========================================
 66 | =========================================
 67 | testing with 1 processes, 8 threads
 68 | reading data 
 69 | Number of samples: 20000 
 70 | Number of features: 30 
 71 | Number of clusters: 8 
 72 | Number of repeated runs: 20 
 73 | =====reading data finished======
 74 | Last element in global array: -5.464932 
 75 | Last element after scattering 0: -5.464932 
 76 | =====Applying K-mean======
 77 | Best inertia: 2308833.500000 
 78 | I/O time use (ms): 3.934515 
 79 | Kmean total time use (ms): 931.707351 
 80 | 
 81 | (sub-component timing not accurate) 
 82 | E-step time use (ms): 500.118240 
 83 | M-step-1st-half time use (ms): 429.142173 
 84 | M-step-2nd-half time use (ms): 0.863691 
 85 |  
 86 | =========================================
 87 | =========================================
 88 | testing with 2 processes, 1 threads
 89 | reading data 
 90 | Number of samples: 20000 
 91 | Number of features: 30 
 92 | Number of clusters: 8 
 93 | Number of repeated runs: 20 
 94 | =====reading data finished======
 95 | Last element in global array: -5.464932 
 96 | Last element after scattering 1: -5.464932 
 97 | =====Applying K-mean======
 98 | Best inertia: 2308842.000000 
 99 | I/O time use (ms): 3.800784 
100 | Kmean total time use (ms): 1358.942563 
101 | 
102 | (sub-component timing not accurate) 
103 | E-step time use (ms): 1212.020400 
104 | M-step-1st-half time use (ms): 145.609492 
105 | M-step-2nd-half time use (ms): 0.594695 
106 |  
107 | =========================================
108 | =========================================
109 | testing with 2 processes, 2 threads
110 | reading data 
111 | Number of samples: 20000 
112 | Number of features: 30 
113 | Number of clusters: 8 
114 | Number of repeated runs: 20 
115 | =====reading data finished======
116 | Last element in global array: -5.464932 
117 | Last element after scattering 1: -5.464932 
118 | =====Applying K-mean======
119 | Best inertia: 2308840.500000 
120 | I/O time use (ms): 3.766548 
121 | Kmean total time use (ms): 943.486173 
122 | 
123 | (sub-component timing not accurate) 
124 | E-step time use (ms): 760.652990 
125 | M-step-1st-half time use (ms): 181.228196 
126 | M-step-2nd-half time use (ms): 0.741900 
127 |  
128 | =========================================
129 | =========================================
130 | testing with 2 processes, 4 threads
131 | reading data 
132 | Number of samples: 20000 
133 | Number of features: 30 
134 | Number of clusters: 8 
135 | Number of repeated runs: 20 
136 | =====reading data finished======
137 | Last element in global array: -5.464932 
138 | Last element after scattering 1: -5.464932 
139 | =====Applying K-mean======
140 | Best inertia: 2308833.500000 
141 | I/O time use (ms): 3.823837 
142 | Kmean total time use (ms): 680.486505 
143 | 
144 | (sub-component timing not accurate) 
145 | E-step time use (ms): 452.964565 
146 | M-step-1st-half time use (ms): 225.771176 
147 | M-step-2nd-half time use (ms): 0.831982 
148 |  
149 | =========================================
150 | =========================================
151 | testing with 2 processes, 8 threads
152 | reading data 
153 | Number of samples: 20000 
154 | Number of features: 30 
155 | Number of clusters: 8 
156 | Number of repeated runs: 20 
157 | =====reading data finished======
158 | Last element in global array: -5.464932 
159 | Last element after scattering 1: -5.464932 
160 | =====Applying K-mean======
161 | Best inertia: 2308830.000000 
162 | I/O time use (ms): 3.852878 
163 | Kmean total time use (ms): 663.490676 
164 | 
165 | (sub-component timing not accurate) 
166 | E-step time use (ms): 353.790135 
167 | M-step-1st-half time use (ms): 307.547264 
168 | M-step-2nd-half time use (ms): 0.888228 
169 |  
170 | =========================================
171 | =========================================
172 | testing with 4 processes, 1 threads
173 | reading data 
174 | Number of samples: 20000 
175 | Number of features: 30 
176 | Number of clusters: 8 
177 | Number of repeated runs: 20 
178 | =====reading data finished======
179 | Last element in global array: -5.464932 
180 | Last element after scattering 3: -5.464932 
181 | =====Applying K-mean======
182 | Best inertia: 2308840.750000 
183 | I/O time use (ms): 4.334087 
184 | Kmean total time use (ms): 909.529512 
185 | 
186 | (sub-component timing not accurate) 
187 | E-step time use (ms): 783.812606 
188 | M-step-1st-half time use (ms): 122.883010 
189 | M-step-2nd-half time use (ms): 0.911755 
190 |  
191 | =========================================
192 | =========================================
193 | testing with 4 processes, 2 threads
194 | reading data 
195 | Number of samples: 20000 
196 | Number of features: 30 
197 | Number of clusters: 8 
198 | Number of repeated runs: 20 
199 | =====reading data finished======
200 | Last element in global array: -5.464932 
201 | Last element after scattering 3: -5.464932 
202 | =====Applying K-mean======
203 | Best inertia: 2308833.500000 
204 | I/O time use (ms): 4.096406 
205 | Kmean total time use (ms): 557.608783 
206 | 
207 | (sub-component timing not accurate) 
208 | E-step time use (ms): 445.375796 
209 | M-step-1st-half time use (ms): 109.179241 
210 | M-step-2nd-half time use (ms): 1.016673 
211 |  
212 | =========================================
213 | =========================================
214 | testing with 4 processes, 4 threads
215 | reading data 
216 | Number of samples: 20000 
217 | Number of features: 30 
218 | Number of clusters: 8 
219 | Number of repeated runs: 20 
220 | =====reading data finished======
221 | Last element in global array: -5.464932 
222 | Last element after scattering 3: -5.464932 
223 | =====Applying K-mean======
224 | Best inertia: 2308830.000000 
225 | I/O time use (ms): 4.249204 
226 | Kmean total time use (ms): 382.619795 
227 | 
228 | (sub-component timing not accurate) 
229 | E-step time use (ms): 230.821690 
230 | M-step-1st-half time use (ms): 148.596137 
231 | M-step-2nd-half time use (ms): 1.035195 
232 |  
233 | =========================================
234 | =========================================
235 | testing with 4 processes, 8 threads
236 | reading data 
237 | Number of samples: 20000 
238 | Number of features: 30 
239 | Number of clusters: 8 
240 | Number of repeated runs: 20 
241 | =====reading data finished======
242 | Last element in global array: -5.464932 
243 | Last element after scattering 3: -5.464932 
244 | =====Applying K-mean======
245 | Best inertia: 2308833.000000 
246 | I/O time use (ms): 4.232157 
247 | Kmean total time use (ms): 402.007910 
248 | 
249 | (sub-component timing not accurate) 
250 | E-step time use (ms): 210.121067 
251 | M-step-1st-half time use (ms): 187.990701 
252 | M-step-2nd-half time use (ms): 1.202342 
253 |  
254 | =========================================
255 | =========================================
256 | testing with 8 processes, 1 threads
257 | reading data 
258 | Number of samples: 20000 
259 | Number of features: 30 
260 | Number of clusters: 8 
261 | Number of repeated runs: 20 
262 | =====reading data finished======
263 | Last element in global array: -5.464932 
264 | Last element after scattering 7: -5.464932 
265 | =====Applying K-mean======
266 | Best inertia: 2308833.500000 
267 | I/O time use (ms): 4.592522 
268 | Kmean total time use (ms): 505.915032 
269 | 
270 | (sub-component timing not accurate) 
271 | E-step time use (ms): 420.972104 
272 | M-step-1st-half time use (ms): 79.467921 
273 | M-step-2nd-half time use (ms): 1.211620 
274 |  
275 | =========================================
276 | =========================================
277 | testing with 8 processes, 2 threads
278 | reading data 
279 | Number of samples: 20000 
280 | Number of features: 30 
281 | Number of clusters: 8 
282 | Number of repeated runs: 20 
283 | =====reading data finished======
284 | Last element in global array: -5.464932 
285 | Last element after scattering 7: -5.464932 
286 | =====Applying K-mean======
287 | Best inertia: 2308833.500000 
288 | I/O time use (ms): 4.143904 
289 | Kmean total time use (ms): 301.355469 
290 | 
291 | (sub-component timing not accurate) 
292 | E-step time use (ms): 227.431369 
293 | M-step-1st-half time use (ms): 68.241809 
294 | M-step-2nd-half time use (ms): 1.332544 
295 |  
296 | =========================================
297 | =========================================
298 | testing with 8 processes, 4 threads
299 | reading data 
300 | Number of samples: 20000 
301 | Number of features: 30 
302 | Number of clusters: 8 
303 | Number of repeated runs: 20 
304 | =====reading data finished======
305 | Last element in global array: -5.464932 
306 | Last element after scattering 7: -5.464932 
307 | =====Applying K-mean======
308 | Best inertia: 2308833.000000 
309 | I/O time use (ms): 4.099906 
310 | Kmean total time use (ms): 311.526036 
311 | 
312 | (sub-component timing not accurate) 
313 | E-step time use (ms): 193.247065 
314 | M-step-1st-half time use (ms): 111.617091 
315 | M-step-2nd-half time use (ms): 1.467510 
316 |  
317 | =========================================
318 | =========================================
319 | testing with 8 processes, 8 threads
320 | reading data 
321 | Number of samples: 20000 
322 | Number of features: 30 
323 | Number of clusters: 8 
324 | Number of repeated runs: 20 
325 | =====reading data finished======
326 | Last element in global array: -5.464932 
327 | Last element after scattering 7: -5.464932 
328 | =====Applying K-mean======
329 | Best inertia: 2308833.500000 
330 | I/O time use (ms): 4.120600 
331 | Kmean total time use (ms): 4898.684051 
332 | 
333 | (sub-component timing not accurate) 
334 | E-step time use (ms): 3469.320709 
335 | M-step-1st-half time use (ms): 1420.115374 
336 | M-step-2nd-half time use (ms): 1.485388 
337 |  
338 | =========================================
339 | =========================================
340 | testing with 16 processes, 1 threads
341 | reading data 
342 | Number of samples: 20000 
343 | Number of features: 30 
344 | Number of clusters: 8 
345 | Number of repeated runs: 20 
346 | =====reading data finished======
347 | Last element in global array: -5.464932 
348 | Last element after scattering 15: -5.464932 
349 | =====Applying K-mean======
350 | Best inertia: 2308833.500000 
351 | I/O time use (ms): 7.041087 
352 | Kmean total time use (ms): 316.260453 
353 | 
354 | (sub-component timing not accurate) 
355 | E-step time use (ms): 251.028026 
356 | M-step-1st-half time use (ms): 54.075352 
357 | M-step-2nd-half time use (ms): 1.848250 
358 |  
359 | =========================================
360 | =========================================
361 | testing with 16 processes, 2 threads
362 | reading data 
363 | Number of samples: 20000 
364 | Number of features: 30 
365 | Number of clusters: 8 
366 | Number of repeated runs: 20 
367 | =====reading data finished======
368 | Last element in global array: -5.464932 
369 | Last element after scattering 15: -5.464932 
370 | =====Applying K-mean======
371 | Best inertia: 2308833.000000 
372 | I/O time use (ms): 4.883557 
373 | Kmean total time use (ms): 313.827766 
374 | 
375 | (sub-component timing not accurate) 
376 | E-step time use (ms): 188.408545 
377 | M-step-1st-half time use (ms): 115.188138 
378 | M-step-2nd-half time use (ms): 1.672651 
379 |  
380 | =========================================
381 | =========================================
382 | testing with 16 processes, 4 threads
383 | reading data 
384 | Number of samples: 20000 
385 | Number of features: 30 
386 | Number of clusters: 8 
387 | Number of repeated runs: 20 
388 | =====reading data finished======
389 | Last element in global array: -5.464932 
390 | Last element after scattering 15: -5.464932 
391 | =====Applying K-mean======
392 | Best inertia: 2308827.750000 
393 | I/O time use (ms): 5.108052 
394 | Kmean total time use (ms): 5194.099651 
395 | 
396 | (sub-component timing not accurate) 
397 | E-step time use (ms): 3129.300050 
398 | M-step-1st-half time use (ms): 2012.502378 
399 | M-step-2nd-half time use (ms): 1.927712 
400 |  
401 | =========================================
402 | =========================================
403 | testing with 16 processes, 8 threads
404 | reading data 
405 | Number of samples: 20000 
406 | Number of features: 30 
407 | Number of clusters: 8 
408 | Number of repeated runs: 20 
409 | =====reading data finished======
410 | Last element in global array: -5.464932 
411 | Last element after scattering 15: -5.464932 
412 | =====Applying K-mean======
413 | Best inertia: 2308827.750000 
414 | I/O time use (ms): 6.952672 
415 | Kmean total time use (ms): 12382.280717 
416 | 
417 | (sub-component timing not accurate) 
418 | E-step time use (ms): 8528.571661 
419 | M-step-1st-half time use (ms): 3817.337787 
420 | M-step-2nd-half time use (ms): 1.997874 
421 |  
422 | =========================================
423 | =========================================
424 | testing with 32 processes, 1 threads
425 | reading data 
426 | Number of samples: 20000 
427 | Number of features: 30 
428 | Number of clusters: 8 
429 | Number of repeated runs: 20 
430 | =====reading data finished======
431 | Last element in global array: -5.464932 
432 | Last element after scattering 31: -5.464932 
433 | =====Applying K-mean======
434 | Best inertia: 2308833.000000 
435 | I/O time use (ms): 6.334857 
436 | Kmean total time use (ms): 297.959041 
437 | 
438 | (sub-component timing not accurate) 
439 | E-step time use (ms): 176.053357 
440 | M-step-1st-half time use (ms): 97.873692 
441 | M-step-2nd-half time use (ms): 1.992122 
442 |  
443 | =========================================
444 | =========================================
445 | testing with 32 processes, 2 threads
446 | reading data 
447 | Number of samples: 20000 
448 | Number of features: 30 
449 | Number of clusters: 8 
450 | Number of repeated runs: 20 
451 | =====reading data finished======
452 | Last element in global array: -5.464932 
453 | Last element after scattering 31: -5.464932 
454 | =====Applying K-mean======
455 | Best inertia: 2308827.750000 
456 | I/O time use (ms): 7.848291 
457 | Kmean total time use (ms): 4021.197152 
458 | 
459 | (sub-component timing not accurate) 
460 | E-step time use (ms): 2844.393182 
461 | M-step-1st-half time use (ms): 1116.176527 
462 | M-step-2nd-half time use (ms): 2.084090 
463 |  
464 | =========================================
465 | =========================================
466 | testing with 32 processes, 4 threads
467 | reading data 
468 | Number of samples: 20000 
469 | Number of features: 30 
470 | Number of clusters: 8 
471 | Number of repeated runs: 20 
472 | =====reading data finished======
473 | Last element in global array: -5.464932 
474 | Last element after scattering 31: -5.464932 
475 | =====Applying K-mean======
476 | Best inertia: 2308828.000000 
477 | I/O time use (ms): 8.923698 
478 | Kmean total time use (ms): 13332.237565 
479 | 
480 | (sub-component timing not accurate) 
481 | E-step time use (ms): 8839.237808 
482 | M-step-1st-half time use (ms): 4086.888532 
483 | M-step-2nd-half time use (ms): 2.295686 
484 |  
485 | =========================================
486 | =========================================
487 | testing with 32 processes, 8 threads
488 | reading data 
489 | Number of samples: 20000 
490 | Number of features: 30 
491 | Number of clusters: 8 
492 | Number of repeated runs: 20 
493 | =====reading data finished======
494 | Last element in global array: -5.464932 
495 | Last element after scattering 31: -5.464932 
496 | =====Applying K-mean======
497 | Best inertia: 2308827.500000 
498 | I/O time use (ms): 9.748165 
499 | Kmean total time use (ms): 33550.686761 
500 | 
501 | (sub-component timing not accurate) 
502 | E-step time use (ms): 21187.669129 
503 | M-step-1st-half time use (ms): 10737.786934 
504 | M-step-2nd-half time use (ms): 2.450273 
505 |  
506 | =========================================
507 | =========================================
508 | testing with 64 processes, 1 threads
509 | reading data 
510 | Number of samples: 20000 
511 | Number of features: 30 
512 | Number of clusters: 8 
513 | Number of repeated runs: 20 
514 | =====reading data finished======
515 | Last element in global array: -5.464932 
516 | Last element after scattering 63: -5.464932 
517 | =====Applying K-mean======
518 | Best inertia: 2308827.750000 
519 | I/O time use (ms): 10.888008 
520 | Kmean total time use (ms): 783.003317 
521 | 
522 | (sub-component timing not accurate) 
523 | E-step time use (ms): 102.731732 
524 | M-step-1st-half time use (ms): 491.794025 
525 | M-step-2nd-half time use (ms): 2.122726 
526 |  
527 | =========================================
528 | =========================================
529 | testing with 64 processes, 2 threads
530 | reading data 
531 | Number of samples: 20000 
532 | Number of features: 30 
533 | Number of clusters: 8 
534 | Number of repeated runs: 20 
535 | =====reading data finished======
536 | Last element in global array: -5.464932 
537 | Last element after scattering 63: -5.464932 
538 | =====Applying K-mean======
539 | Best inertia: 2308828.000000 
540 | I/O time use (ms): 16.582364 
541 | Kmean total time use (ms): 10385.388325 
542 | 
543 | (sub-component timing not accurate) 
544 | E-step time use (ms): 2861.456678 
545 | M-step-1st-half time use (ms): 5335.016671 
546 | M-step-2nd-half time use (ms): 2.135070 
547 |  
548 | =========================================
549 | =========================================
550 | testing with 64 processes, 4 threads
551 | reading data 
552 | Number of samples: 20000 
553 | Number of features: 30 
554 | Number of clusters: 8 
555 | Number of repeated runs: 20 
556 | =====reading data finished======
557 | Last element in global array: -5.464932 
558 | Last element after scattering 63: -5.464932 
559 | =====Applying K-mean======
560 | Best inertia: 2308827.500000 
561 | I/O time use (ms): 19.903018 
562 | Kmean total time use (ms): 30551.306969 
563 | 
564 | (sub-component timing not accurate) 
565 | E-step time use (ms): 11010.092788 
566 | M-step-1st-half time use (ms): 13064.622681 
567 | M-step-2nd-half time use (ms): 2.318505 
568 |  
569 | =========================================
570 | =========================================
571 | testing with 64 processes, 8 threads
572 | reading data 
573 | Number of samples: 20000 
574 | Number of features: 30 
575 | Number of clusters: 8 
576 | Number of repeated runs: 20 
577 | =====reading data finished======
578 | Last element in global array: -5.464932 
579 | Last element after scattering 63: -5.464932 
580 | =====Applying K-mean======
581 | Best inertia: 2308827.500000 
582 | I/O time use (ms): 11.120870 
583 | Kmean total time use (ms): 69710.298024 
584 | 
585 | (sub-component timing not accurate) 
586 | E-step time use (ms): 24806.787879 
587 | M-step-1st-half time use (ms): 30334.412542 
588 | M-step-2nd-half time use (ms): 1.942891 
589 | 


--------------------------------------------------------------------------------
/Timing_Results/plots/Cuda_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/Cuda_scaling.jpg


--------------------------------------------------------------------------------
/Timing_Results/plots/MPI_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/MPI_scaling.jpg


--------------------------------------------------------------------------------
/Timing_Results/plots/OpenMP_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/OpenMP_scaling.jpg


--------------------------------------------------------------------------------
/Timing_Results/plots/hybrid_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/hybrid_scaling.jpg


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | title: Paralleled Kmeans Clustering
3 | description: 
4 |   -CS205 Final Project, 2017 Spring- 
5 |   -Jiahua Guo, Jiachen Song, Xinyuan Wang, Jiawei Zhuang-
6 | 


--------------------------------------------------------------------------------