├── .gitignore
├── Build_Library
│   ├── README
│   ├── bashrc_update
│   ├── netCDF_build
│   │   ├── install_netCDF.sh
│   │   ├── netCDF_test.c
│   │   └── run_netCDF_test.sh
│   └── openmpi_build
│       ├── MPI_test.c
│       ├── install_openmpi.sh
│       └── run_MPI_test.sh
├── Data_Analysis
│   ├── README
│   ├── SSW-preprocessing.ipynb
│   ├── SSW.ipynb
│   ├── SSWkmeans.m
│   ├── covertype_cluster
│   │   ├── analysis.ipynb
│   │   ├── cluster_sample.m
│   │   ├── cluster_sample.py
│   │   ├── cluster_train_full.m
│   │   ├── covertype.ipynb
│   │   ├── figures
│   │   │   ├── accuracy_graph.png
│   │   │   ├── covertype_distribution.png
│   │   │   ├── study_area_map.png
│   │   │   ├── vis_label.png
│   │   │   └── vis_pred.png
│   │   └── preprocess.py
│   ├── data
│   │   ├── Label_matlab.bin
│   │   ├── Label_py.bin
│   │   └── SSWdata.bin
│   ├── figures
│   │   ├── PV.png
│   │   ├── SSW.png
│   │   ├── SSWsubset.png
│   │   ├── T.png
│   │   ├── intro1.png
│   │   ├── k3_svalue.png
│   │   ├── silScoreSubset.png
│   │   └── svalue.png
│   └── readData.py
├── Other_Image
│   ├── Kmean_illustration
│   │   └── Kmeans.gif
│   └── pseudo.png
├── Parallel_Algorithm
│   ├── Cuda
│   │   ├── compile.sh
│   │   ├── kmeans_cdf.cu
│   │   ├── kmeans_txt.cu
│   │   ├── test.txt
│   │   └── test_multithreadPerBlock.sh
│   ├── MPI
│   │   ├── Kmean_mpi.c
│   │   ├── compile.sh
│   │   ├── test_hybrid.sh
│   │   └── test_multiprocess.sh
│   ├── OpenMP
│   │   ├── Kmean_omp.c
│   │   ├── compile.sh
│   │   ├── pseudo.c
│   │   └── test_multithread.sh
│   ├── README
│   ├── python_reference
│   │   ├── Apply_Kmean.py
│   │   ├── IO_util.py
│   │   ├── Kmean_iris.py
│   │   ├── check_SSWdata.py
│   │   ├── check_results.py
│   │   ├── convert_SSWdata.py
│   │   └── make_fake_data.py
│   ├── shared
│   │   ├── make_2D_array.c
│   │   ├── make_2D_array.h
│   │   ├── math_util.c
│   │   ├── math_util.h
│   │   ├── ncdf_util.c
│   │   ├── ncdf_util.h
│   │   └── timing.h
│   └── test_data
│       ├── .gitignore
│       └── README
├── README.md
├── Slides
│   ├── CUDA_part.key
│   ├── FinalPre.key
│   ├── SSW part.key
│   └── covertype part.key
├── Timing_Results
│   ├── Blobs_smp20000_fea30_cls8.xlsx
│   ├── info
│   │   ├── compiler
│   │   └── cpu
│   ├── log
│   │   ├── Blobs_Cuda.log
│   │   ├── Blobs_MPI.log
│   │   ├── Blobs_OpenMP.log
│   │   └── Blobs_hybrid.log
│   ├── plot_timing.ipynb
│   └── plots
│       ├── Cuda_scaling.jpg
│       ├── MPI_scaling.jpg
│       ├── OpenMP_scaling.jpg
│       └── hybrid_scaling.jpg
└── _config.yml

/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.out
3 | *.nc
4 | __pycache__
--------------------------------------------------------------------------------

/Build_Library/README:
--------------------------------------------------------------------------------
1 | Install OpenMPI and netCDF4 libraries
2 |
3 | Tested on Amazon Linux AMI. Should also work on other machines with minor modifications.
4 |
--------------------------------------------------------------------------------

/Build_Library/bashrc_update:
--------------------------------------------------------------------------------
1 | # new environment settings that need to be added to bashrc after the libraries are built
2 |
3 | # Add netCDF lib to the search path
4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
5 |
6 | # Add mpicc to PATH
7 | export PATH=$PATH:/usr/local/openmpi/bin
8 |
--------------------------------------------------------------------------------

/Build_Library/netCDF_build/install_netCDF.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # ==================
4 | # Install netCDF-C library
5 | # Tested successfully on Amazon-Linux AMI
6 | #
7 | # Jiawei Zhuang 2017/4
8 | # ==================
9 |
10 | # ==================
11 | # Note:
12 | # Use the zlib, HDF5, NetCDF4 versions specified in
13 | # https://github.com/amznlabs/amazon-dsstne/blob/master/docs/getting_started/setup.md#openmpi-setup
14 | # but added more --prefix and include options according to
15 | # http://www.unidata.ucar.edu/software/netcdf/docs/getting_and_building_netcdf.html#build_default
16 | #
17 | # The older version (netCDF 4.1.3) seems much easier to install than the latest version (netCDF 4.4.1)
18 | # ==================
19 |
20 | # ==================
21 | # for C compiler if not installed yet
22 | # ==================
23 | #sudo yum install gcc
24 | #sudo yum install gcc-c++
25 | #CC=gcc
26 | #CXX=g++
27 |
28 | # ==================
29 | # make a new directory if it does not exist
30 | # ==================
31 | mkdir -p $HOME/lib
32 | cd $HOME/lib
33 |
34 | # ==================
35 | # for zlib
36 | # ==================
37 | wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4/zlib-1.2.8.tar.gz
38 | tar xvf zlib-1.2.8.tar.gz
39 | cd zlib-1.2.8
40 |
41 | # Build and install zlib
42 | ZDIR=/usr/local
43 | ./configure --prefix=${ZDIR}
44 | make check
45 | sudo make install
46 |
47 | cd ..
48 |
49 | # ==================
50 | # for HDF5
51 | # The "make check" step takes 10~20 minutes
52 | # Some of the tests might fail, but this doesn't affect netCDF functionality
53 | # ==================
54 | wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4/hdf5-1.8.12.tar.gz
55 | tar xvfz hdf5-1.8.12.tar.gz
56 | cd hdf5-1.8.12
57 |
58 | # Build and install HDF5
59 | H5DIR=/usr/local
60 | ./configure --with-zlib=${ZDIR} --prefix=${H5DIR}
61 | make check
62 | sudo make install
63 |
64 | cd ..
65 |
66 | # ==================
67 | # for m4 if necessary
68 | # (https://geeksww.com/tutorials/libraries/m4/installation/installing_m4_macro_processor_ubuntu_linux.php)
69 | # ==================
70 |
71 | # ==================
72 | # for netCDF4
73 | # The "make check" step takes 5~10 minutes
74 | # ==================
75 |
76 | wget ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4.1.3.tar.gz
77 | tar xvf netcdf-4.1.3.tar.gz
78 | cd netcdf-4.1.3
79 |
80 | #export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${H5DIR}/lib
81 |
82 | # Build and install netCDF-4. We don't need Fortran support (no gfortran installed)
83 | NCDIR=/usr/local
84 | CPPFLAGS=-I${H5DIR}/include LDFLAGS=-L${H5DIR}/lib ./configure --prefix=${NCDIR} --disable-fortran
85 | make check # will fail the fortran check without "--disable-fortran" in the configure step
86 | sudo make install
87 |
88 | # show the configure details
89 | nc-config --all
--------------------------------------------------------------------------------

/Build_Library/netCDF_build/netCDF_test.c:
--------------------------------------------------------------------------------
1 | #include <netcdf.h>
2 | int main()
3 | {
4 | int ncid;
5 | if (nc_create("tmp.nc", NC_NETCDF4, &ncid))
6 | return 1;
7 | if (nc_close(ncid))
8 | return 2;
9 | return 0;
10 | }
11 |
12 |
--------------------------------------------------------------------------------

/Build_Library/netCDF_build/run_netCDF_test.sh:
--------------------------------------------------------------------------------
1 | # all of them should pass (a single build should work for different compilers)
2 | gcc -lnetcdf netCDF_test.c -o netCDF_test_by_gcc.out
3 | ./netCDF_test_by_gcc.out
4 |
5 | nvcc -lnetcdf netCDF_test.c -o netCDF_test_by_nvcc.out
6 | ./netCDF_test_by_nvcc.out
7 |
8 | pgcc -lnetcdf netCDF_test.c -o netCDF_test_by_pgcc.out
9 | ./netCDF_test_by_pgcc.out
--------------------------------------------------------------------------------

/Build_Library/openmpi_build/MPI_test.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h> /* printf and BUFSIZ defined there */
2 | #include <stdlib.h> /* exit defined there */
3 | #include <mpi.h> /* all MPI-2 functions defined there */
4 |
5 | int main(argc, argv)
6 | int argc;
7 | char *argv[];
8 | {
9 | int rank, size, length;
10 | char name[BUFSIZ];
11 |
12 | MPI_Init(&argc, &argv);
13 | MPI_Comm_rank(MPI_COMM_WORLD, &rank);
14 | MPI_Comm_size(MPI_COMM_WORLD, &size);
15 | MPI_Get_processor_name(name, &length);
16 |
17 | printf("%s: hello world from process %d of %d\n", name, rank, size);
18 |
19 | MPI_Finalize();
20 |
21 | exit(0);
22 | }
--------------------------------------------------------------------------------

/Build_Library/openmpi_build/install_openmpi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # ==================
4 | # Install openmpi library
5 | # Tested successfully on Amazon-Linux AMI
6 | #
7 | # Jiawei Zhuang 2017/4
8 | # ==================
9 |
10 | # ==================
11 | # make a new directory if it does not exist
12 | # ==================
13 | mkdir -p $HOME/lib
14 | cd $HOME/lib
15 |
16 | # ==================
17 | # openmpi build (make install takes many minutes)
18 | # Some of the "make check" tests might fail,
19 | # but it doesn't affect basic MPI functionality
20 | # ==================
21 |
22 | wget https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.0.tar.gz
23 | tar xvf openmpi-2.1.0.tar.gz
24 | cd openmpi-2.1.0
25 | ./configure --prefix=/usr/local/openmpi
26 | make check
27 | sudo make install
--------------------------------------------------------------------------------

/Build_Library/openmpi_build/run_MPI_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mpicc --version
4 | mpicc MPI_test.c -o MPI_test.out
5 | mpirun -np 4 ./MPI_test.out
--------------------------------------------------------------------------------

/Data_Analysis/README:
--------------------------------------------------------------------------------
1 | Apply the k-means algorithm to real data
--------------------------------------------------------------------------------

/Data_Analysis/SSW-preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "metadata": {
3 | "name": ""
4 | },
5 | "nbformat": 3,
6 | "nbformat_minor": 0,
7 | "worksheets": [
8 | {
9 | "cells": [
10 | {
11 | "cell_type": "code",
12 | "collapsed": false,
13 | "input": [
14 | "%matplotlib inline\n",
15 | "from netCDF4 import Dataset\n",
16 | "from sklearn import preprocessing\n",
17 | "import numpy as np\n",
18 | "import math\n",
19 | "import scipy\n",
20 | "import sys\n",
21 | "from sklearn.cluster import KMeans\n",
22 | "from sklearn.metrics import silhouette_samples, silhouette_score\n",
23 | "import matplotlib.cm as cm\n",
24 | "import matplotlib.pyplot as plt"
25 | ],
26 | "language": "python",
27 | "metadata": {},
28 | "outputs": [],
29 | "prompt_number": 1
30 | },
31 | {
32 | "cell_type": "code",
33 | "collapsed": false,
34 | "input": [
35 | "top_zn = 0\n",
36 | "bottom_zn = 18\n",
37 | "levels = bottom_zn-top_zn\n",
38 | "Z_lat = 79"
39 | ],
40 | "language": "python",
41 | "metadata": {},
42 | "outputs": [],
43 | "prompt_number": 224
44 | },
45 | {
46 | "cell_type": "code",
47 | "collapsed": false,
48 | "input": [
49 | "file_nm = '../data/T_11_1.nc'\n",
50 | "T = Dataset(file_nm,mode='r')\n",
51 | "print T.dimensions.keys()\n",
52 | "print T.variables.keys()\n",
53 | "lat = T.variables['lat'][:]\n",
54 | "lon = T.variables['lon'][:]\n",
55 | "lev = T.variables['lev'][:]\n",
56 | "T = T.variables['T'][:]\n",
57 | "T.shape\n",
58 | "print lev[top_zn:bottom_zn]\n",
59 | "lat = np.expand_dims(lat, axis=0)\n",
60 | "lat = np.expand_dims(lat, axis=0)\n",
61 | "lat = np.expand_dims(lat, axis=3)\n",
62 | "idx = np.where((lat>60)&(lat<=70))\n",
63 | "lat.shape"
64 | ],
65 | "language": "python",
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "output_type": "stream",
70 | "stream": "stdout",
71 | "text": [
72 | "[u'time', u'lev', u'lat', u'lon']\n",
73 | "[u'T', u'lat', u'lev', u'lon', u'time']\n",
74 | "[ 1.244795 1.61285 2.079325 2.667425 3.404875 4.324575\n",
75 | " 5.4654 6.87285 8.599725 10.70705 13.26475 16.35175\n",
76 | " 20.05675 24.479 29.728 35.92325 43.19375 51.6775\n",
77 | " 61.5205 72.8745 85.65715 100.514695]"
78 | ]
79 | },
80 | {
81 | "output_type": "stream",
82 | "stream": "stdout",
83 | "text": [
84 | "\n"
85 | ]
86 | },
87 | {
88 | "metadata": {},
89 | "output_type": "pyout",
90 | "prompt_number": 225,
91 | "text": [
92 | "(1, 1, 96, 1)"
93 | ]
94 | }
95 | ],
96 | "prompt_number": 225
97 | },
98 | {
99 | "cell_type": "code",
100 | "collapsed": false,
101 | "input": [
102 | "Tmean1save = np.empty([1,levels])\n",
103 | "Tmean2save = np.empty([1,levels])\n",
104 | "Tmean3save = np.empty([1,levels])\n",
105 | "Tgradsave = np.empty([1,levels])\n",
106 | "Umean1save = np.empty([1,levels])\n",
107 | "Umean2save = np.empty([1,levels])\n",
108 | "ZAsave = np.empty([1,levels,3])\n",
109 | "Zphasesave = np.empty([1,levels,2])"
110 | ],
111 | "language": "python",
112 | "metadata": {},
113 | "outputs": [],
114 | "prompt_number": 226
115 | },
116 | {
117 | "cell_type": "code",
118 | "collapsed": false,
119 | "input": [
120 | "for year in range(11,60):\n",
121 | " for mon in range(1,13):\n",
122 | " \n",
123 | " # Read in Temperature\n",
124 | " file_nm = '../data/T_'+str(year)+'_'+str(mon)+'.nc'\n",
125 | " T = Dataset(file_nm,mode='r')\n",
126 | " T = T.variables['T'][:]\n",
127 | " \n",
128 | " # Calculate zonally averaged temperature for different latitudes and heights\n",
129 | " idx = np.where((lat>60)&(lat<=70))\n",
130 | " Tmean1 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
131 | " idx = np.where((lat>70)&(lat<=80))\n",
132 | " Tmean2 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
133 | " idx = np.where((lat>80)&(lat<=90))\n",
134 | " Tmean3 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
135 | " print Tmean3.shape\n",
136 | " \n",
137 | " # Calculate latitudinal temperature gradients\n",
138 | " idx = np.where((lat>50)&(lat<=60))\n",
139 | " Tmean4 = np.mean(np.sum(T[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
140 | " Tgrad = Tmean3-Tmean4\n",
141 | " \n",
142 | " # Calculate Zonal winds, zonally averaged for different latitudes and height\n",
143 | " file_nm = '../data/U_'+str(year)+'_'+str(mon)+'.nc'\n",
144 | " U = Dataset(file_nm,mode='r')\n",
145 | " U = U.variables['U'][:]\n",
146 | " idx = np.where((lat>60)&(lat<=70))\n",
147 | " Umean1 = np.mean(np.sum(U[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
148 | " idx = np.where((lat>70)&(lat<=80))\n",
149 | " Umean2 = np.mean(np.sum(U[:,top_zn:bottom_zn,idx[2],:]*np.cos(lat[:,:,idx[2]]/180*math.pi),axis=2)/np.sum(np.cos(lat[:,:,idx[2]]/180*math.pi)),axis=2)\n",
150 | " \n",
151 | " # Calculate Geopotential height\n",
152 | " file_nm = '../data/Z3_'+str(year)+'_'+str(mon)+'.nc'\n",
153 | " Z = Dataset(file_nm,mode='r')\n",
154 | " Z = Z.variables['Z3'][:]\n",
155 | " Z = Z[:,top_zn:bottom_zn,Z_lat,:]\n",
156 | " # FFT to get wave number 1 and wave number 2 component\n",
157 | " ZA = np.fft.fft(Z)\n",
158 | " ZA = np.abs(ZA[:,:,0:3])\n",
159 | " Zphase = np.angle(ZA[:,:,1:3])\n",
160 | " \n",
161 | " # Store features to save arrays\n",
162 | " Tmean1save = np.append(Tmean1save,Tmean1,axis=0)\n",
163 | " Tmean2save = np.append(Tmean2save,Tmean2,axis=0)\n",
164 | " Tmean3save = np.append(Tmean3save,Tmean3,axis=0)\n",
165 | " Tgradsave = np.append(Tgradsave,Tgrad,axis=0)\n",
166 | " Umean1save = np.append(Umean1save,Umean1,axis=0)\n",
167 | " Umean2save = np.append(Umean2save,Umean2,axis=0)\n",
168 | " ZAsave = np.append(ZAsave,ZA,axis=0)\n",
169 | " Zphasesave = np.append(Zphasesave,Zphase,axis=0)"
170 | ],
171 | "language": "python",
172 | "metadata": {},
173 | "outputs": [
"text": [ 218 | "\n", 219 | "(31, 22)" 220 | ] 221 | }, 222 | { 223 | "output_type": "stream", 224 | "stream": "stdout", 225 | "text": [ 226 | "\n", 227 | "(28, 22)" 228 | ] 229 | }, 230 | { 231 | "output_type": "stream", 232 | "stream": "stdout", 233 | "text": [ 234 | "\n", 235 | "(31, 22)" 236 | ] 237 | }, 238 | { 239 | "output_type": "stream", 240 | "stream": "stdout", 241 | "text": [ 242 | "\n", 243 | "(31, 22)" 244 | ] 245 | }, 246 | { 247 | "output_type": "stream", 248 | "stream": "stdout", 249 | "text": [ 250 | "\n", 251 | "(28, 22)" 252 | ] 253 | }, 254 | { 255 | "output_type": "stream", 256 | "stream": "stdout", 257 | "text": [ 258 | "\n", 259 | "(31, 22)" 260 | ] 261 | }, 262 | { 263 | "output_type": "stream", 264 | "stream": "stdout", 265 | "text": [ 266 | "\n", 267 | "(31, 22)" 268 | ] 269 | }, 270 | { 271 | "output_type": "stream", 272 | "stream": "stdout", 273 | "text": [ 274 | "\n", 275 | "(28, 22)" 276 | ] 277 | }, 278 | { 279 | "output_type": "stream", 280 | "stream": "stdout", 281 | "text": [ 282 | "\n", 283 | "(31, 22)" 284 | ] 285 | }, 286 | { 287 | "output_type": "stream", 288 | "stream": "stdout", 289 | "text": [ 290 | "\n", 291 | "(31, 22)" 292 | ] 293 | }, 294 | { 295 | "output_type": "stream", 296 | "stream": "stdout", 297 | "text": [ 298 | "\n", 299 | "(28, 22)" 300 | ] 301 | }, 302 | { 303 | "output_type": "stream", 304 | "stream": "stdout", 305 | "text": [ 306 | "\n", 307 | "(31, 22)" 308 | ] 309 | }, 310 | { 311 | "output_type": "stream", 312 | "stream": "stdout", 313 | "text": [ 314 | "\n", 315 | "(31, 22)" 316 | ] 317 | }, 318 | { 319 | "output_type": "stream", 320 | "stream": "stdout", 321 | "text": [ 322 | "\n", 323 | "(28, 22)" 324 | ] 325 | }, 326 | { 327 | "output_type": "stream", 328 | "stream": "stdout", 329 | "text": [ 330 | "\n", 331 | "(31, 22)" 332 | ] 333 | }, 334 | { 335 | "output_type": "stream", 336 | "stream": "stdout", 337 | "text": [ 338 | "\n", 339 | "(31, 22)" 340 | ] 341 | }, 342 | { 343 | "output_type": "stream", 344 | "stream": "stdout", 345 | "text": [ 346 | "\n", 347 | "(28, 22)" 348 | ] 349 | }, 350 | { 351 | "output_type": "stream", 352 | "stream": "stdout", 353 | "text": [ 354 | "\n", 355 | "(31, 22)" 356 | ] 357 | }, 358 | { 359 | "output_type": "stream", 360 | "stream": "stdout", 361 | "text": [ 362 | "\n", 363 | "(31, 22)" 364 | ] 365 | }, 366 | { 367 | "output_type": "stream", 368 | "stream": "stdout", 369 | "text": [ 370 | "\n", 371 | "(28, 22)" 372 | ] 373 | }, 374 | { 375 | "output_type": "stream", 376 | "stream": "stdout", 377 | "text": [ 378 | "\n", 379 | "(31, 22)" 380 | ] 381 | }, 382 | { 383 | "output_type": "stream", 384 | "stream": "stdout", 385 | "text": [ 386 | "\n", 387 | "(31, 22)" 388 | ] 389 | }, 390 | { 391 | "output_type": "stream", 392 | "stream": "stdout", 393 | "text": [ 394 | "\n", 395 | "(28, 22)" 396 | ] 397 | }, 398 | { 399 | "output_type": "stream", 400 | "stream": "stdout", 401 | "text": [ 402 | "\n", 403 | "(31, 22)" 404 | ] 405 | }, 406 | { 407 | "output_type": "stream", 408 | "stream": "stdout", 409 | "text": [ 410 | "\n", 411 | "(31, 22)" 412 | ] 413 | }, 414 | { 415 | "output_type": "stream", 416 | "stream": "stdout", 417 | "text": [ 418 | "\n", 419 | "(28, 22)" 420 | ] 421 | }, 422 | { 423 | "output_type": "stream", 424 | "stream": "stdout", 425 | "text": [ 426 | "\n", 427 | "(31, 22)" 428 | ] 429 | }, 430 | { 431 | "output_type": "stream", 432 | "stream": "stdout", 433 | "text": [ 434 | "\n", 435 | "(31, 22)" 436 | ] 437 | }, 438 | { 439 | "output_type": "stream", 440 | "stream": "stdout", 441 
| "text": [ 442 | "\n", 443 | "(28, 22)" 444 | ] 445 | }, 446 | { 447 | "output_type": "stream", 448 | "stream": "stdout", 449 | "text": [ 450 | "\n", 451 | "(31, 22)" 452 | ] 453 | }, 454 | { 455 | "output_type": "stream", 456 | "stream": "stdout", 457 | "text": [ 458 | "\n", 459 | "(31, 22)" 460 | ] 461 | }, 462 | { 463 | "output_type": "stream", 464 | "stream": "stdout", 465 | "text": [ 466 | "\n", 467 | "(28, 22)" 468 | ] 469 | }, 470 | { 471 | "output_type": "stream", 472 | "stream": "stdout", 473 | "text": [ 474 | "\n", 475 | "(31, 22)" 476 | ] 477 | }, 478 | { 479 | "output_type": "stream", 480 | "stream": "stdout", 481 | "text": [ 482 | "\n", 483 | "(31, 22)" 484 | ] 485 | }, 486 | { 487 | "output_type": "stream", 488 | "stream": "stdout", 489 | "text": [ 490 | "\n", 491 | "(28, 22)" 492 | ] 493 | }, 494 | { 495 | "output_type": "stream", 496 | "stream": "stdout", 497 | "text": [ 498 | "\n", 499 | "(31, 22)" 500 | ] 501 | }, 502 | { 503 | "output_type": "stream", 504 | "stream": "stdout", 505 | "text": [ 506 | "\n", 507 | "(31, 22)" 508 | ] 509 | }, 510 | { 511 | "output_type": "stream", 512 | "stream": "stdout", 513 | "text": [ 514 | "\n", 515 | "(28, 22)" 516 | ] 517 | }, 518 | { 519 | "output_type": "stream", 520 | "stream": "stdout", 521 | "text": [ 522 | "\n", 523 | "(31, 22)" 524 | ] 525 | }, 526 | { 527 | "output_type": "stream", 528 | "stream": "stdout", 529 | "text": [ 530 | "\n", 531 | "(31, 22)" 532 | ] 533 | }, 534 | { 535 | "output_type": "stream", 536 | "stream": "stdout", 537 | "text": [ 538 | "\n", 539 | "(28, 22)" 540 | ] 541 | }, 542 | { 543 | "output_type": "stream", 544 | "stream": "stdout", 545 | "text": [ 546 | "\n", 547 | "(31, 22)" 548 | ] 549 | }, 550 | { 551 | "output_type": "stream", 552 | "stream": "stdout", 553 | "text": [ 554 | "\n", 555 | "(31, 22)" 556 | ] 557 | }, 558 | { 559 | "output_type": "stream", 560 | "stream": "stdout", 561 | "text": [ 562 | "\n", 563 | "(28, 22)" 564 | ] 565 | }, 566 | { 567 | "output_type": "stream", 568 | "stream": "stdout", 569 | "text": [ 570 | "\n", 571 | "(31, 22)" 572 | ] 573 | }, 574 | { 575 | "output_type": "stream", 576 | "stream": "stdout", 577 | "text": [ 578 | "\n", 579 | "(31, 22)" 580 | ] 581 | }, 582 | { 583 | "output_type": "stream", 584 | "stream": "stdout", 585 | "text": [ 586 | "\n", 587 | "(28, 22)" 588 | ] 589 | }, 590 | { 591 | "output_type": "stream", 592 | "stream": "stdout", 593 | "text": [ 594 | "\n", 595 | "(31, 22)" 596 | ] 597 | }, 598 | { 599 | "output_type": "stream", 600 | "stream": "stdout", 601 | "text": [ 602 | "\n", 603 | "(31, 22)" 604 | ] 605 | }, 606 | { 607 | "output_type": "stream", 608 | "stream": "stdout", 609 | "text": [ 610 | "\n", 611 | "(28, 22)" 612 | ] 613 | }, 614 | { 615 | "output_type": "stream", 616 | "stream": "stdout", 617 | "text": [ 618 | "\n", 619 | "(31, 22)" 620 | ] 621 | }, 622 | { 623 | "output_type": "stream", 624 | "stream": "stdout", 625 | "text": [ 626 | "\n", 627 | "(31, 22)" 628 | ] 629 | }, 630 | { 631 | "output_type": "stream", 632 | "stream": "stdout", 633 | "text": [ 634 | "\n", 635 | "(28, 22)" 636 | ] 637 | }, 638 | { 639 | "output_type": "stream", 640 | "stream": "stdout", 641 | "text": [ 642 | "\n", 643 | "(31, 22)" 644 | ] 645 | }, 646 | { 647 | "output_type": "stream", 648 | "stream": "stdout", 649 | "text": [ 650 | "\n", 651 | "(31, 22)" 652 | ] 653 | }, 654 | { 655 | "output_type": "stream", 656 | "stream": "stdout", 657 | "text": [ 658 | "\n", 659 | "(28, 22)" 660 | ] 661 | }, 662 | { 663 | "output_type": "stream", 664 | "stream": "stdout", 
665 | "text": [ 666 | "\n", 667 | "(31, 22)" 668 | ] 669 | }, 670 | { 671 | "output_type": "stream", 672 | "stream": "stdout", 673 | "text": [ 674 | "\n", 675 | "(31, 22)" 676 | ] 677 | }, 678 | { 679 | "output_type": "stream", 680 | "stream": "stdout", 681 | "text": [ 682 | "\n", 683 | "(28, 22)" 684 | ] 685 | }, 686 | { 687 | "output_type": "stream", 688 | "stream": "stdout", 689 | "text": [ 690 | "\n", 691 | "(31, 22)" 692 | ] 693 | }, 694 | { 695 | "output_type": "stream", 696 | "stream": "stdout", 697 | "text": [ 698 | "\n", 699 | "(31, 22)" 700 | ] 701 | }, 702 | { 703 | "output_type": "stream", 704 | "stream": "stdout", 705 | "text": [ 706 | "\n", 707 | "(28, 22)" 708 | ] 709 | }, 710 | { 711 | "output_type": "stream", 712 | "stream": "stdout", 713 | "text": [ 714 | "\n", 715 | "(31, 22)" 716 | ] 717 | }, 718 | { 719 | "output_type": "stream", 720 | "stream": "stdout", 721 | "text": [ 722 | "\n", 723 | "(31, 22)" 724 | ] 725 | }, 726 | { 727 | "output_type": "stream", 728 | "stream": "stdout", 729 | "text": [ 730 | "\n", 731 | "(28, 22)" 732 | ] 733 | }, 734 | { 735 | "output_type": "stream", 736 | "stream": "stdout", 737 | "text": [ 738 | "\n", 739 | "(31, 22)" 740 | ] 741 | }, 742 | { 743 | "output_type": "stream", 744 | "stream": "stdout", 745 | "text": [ 746 | "\n", 747 | "(31, 22)" 748 | ] 749 | }, 750 | { 751 | "output_type": "stream", 752 | "stream": "stdout", 753 | "text": [ 754 | "\n", 755 | "(28, 22)" 756 | ] 757 | }, 758 | { 759 | "output_type": "stream", 760 | "stream": "stdout", 761 | "text": [ 762 | "\n", 763 | "(31, 22)" 764 | ] 765 | }, 766 | { 767 | "output_type": "stream", 768 | "stream": "stdout", 769 | "text": [ 770 | "\n", 771 | "(31, 22)" 772 | ] 773 | }, 774 | { 775 | "output_type": "stream", 776 | "stream": "stdout", 777 | "text": [ 778 | "\n", 779 | "(28, 22)" 780 | ] 781 | }, 782 | { 783 | "output_type": "stream", 784 | "stream": "stdout", 785 | "text": [ 786 | "\n", 787 | "(31, 22)" 788 | ] 789 | }, 790 | { 791 | "output_type": "stream", 792 | "stream": "stdout", 793 | "text": [ 794 | "\n", 795 | "(31, 22)" 796 | ] 797 | }, 798 | { 799 | "output_type": "stream", 800 | "stream": "stdout", 801 | "text": [ 802 | "\n", 803 | "(28, 22)" 804 | ] 805 | }, 806 | { 807 | "output_type": "stream", 808 | "stream": "stdout", 809 | "text": [ 810 | "\n", 811 | "(31, 22)" 812 | ] 813 | }, 814 | { 815 | "output_type": "stream", 816 | "stream": "stdout", 817 | "text": [ 818 | "\n", 819 | "(31, 22)" 820 | ] 821 | }, 822 | { 823 | "output_type": "stream", 824 | "stream": "stdout", 825 | "text": [ 826 | "\n", 827 | "(28, 22)" 828 | ] 829 | }, 830 | { 831 | "output_type": "stream", 832 | "stream": "stdout", 833 | "text": [ 834 | "\n", 835 | "(31, 22)" 836 | ] 837 | }, 838 | { 839 | "output_type": "stream", 840 | "stream": "stdout", 841 | "text": [ 842 | "\n", 843 | "(31, 22)" 844 | ] 845 | }, 846 | { 847 | "output_type": "stream", 848 | "stream": "stdout", 849 | "text": [ 850 | "\n", 851 | "(28, 22)" 852 | ] 853 | }, 854 | { 855 | "output_type": "stream", 856 | "stream": "stdout", 857 | "text": [ 858 | "\n", 859 | "(31, 22)" 860 | ] 861 | }, 862 | { 863 | "output_type": "stream", 864 | "stream": "stdout", 865 | "text": [ 866 | "\n" 867 | ] 868 | } 869 | ], 870 | "prompt_number": 227 871 | }, 872 | { 873 | "cell_type": "code", 874 | "collapsed": false, 875 | "input": [ 876 | "if sys.byteorder=='little':\n", 877 | " Tmean1save.byteswap(True)\n", 878 | " Tmean2save.byteswap(True)\n", 879 | " Tmean3save.byteswap(True)\n", 880 | " Tgradsave.byteswap(True)\n", 881 | " 
Umean1save.byteswap(True)\n", 882 | " Umean2save.byteswap(True)\n", 883 | " ZAsave.byteswap(True)\n", 884 | " Zphasesave.byteswap(True)\n", 885 | "#Create the binary files of the input files\n", 886 | "filename=open(r'./Tmean1.bin','wb')\n", 887 | "Tmean1save.ravel().tofile(filename)\n", 888 | "filename.close()\n", 889 | "filename=open(r'./Tmean2.bin','wb')\n", 890 | "Tmean2save.ravel().tofile(filename)\n", 891 | "filename.close()\n", 892 | "filename=open(r'./Tmean3.bin','wb')\n", 893 | "Tmean3save.ravel().tofile(filename)\n", 894 | "filename.close()\n", 895 | "filename=open(r'./Tgrad.bin','wb')\n", 896 | "Tgradsave.ravel().tofile(filename)\n", 897 | "filename.close()\n", 898 | "filename=open(r'./Umean1.bin','wb')\n", 899 | "Umean1save.ravel().tofile(filename)\n", 900 | "filename.close()\n", 901 | "filename=open(r'./Umean2.bin','wb')\n", 902 | "Umean2save.ravel().tofile(filename)\n", 903 | "filename.close()\n", 904 | "filename=open(r'./ZA.bin','wb')\n", 905 | "ZAsave.ravel().tofile(filename)\n", 906 | "filename.close()\n", 907 | "filename=open(r'./Zphase.bin','wb')\n", 908 | "Zphasesave.ravel().tofile(filename)\n", 909 | "filename.close()" 910 | ], 911 | "language": "python", 912 | "metadata": {}, 913 | "outputs": [], 914 | "prompt_number": 228 915 | }, 916 | { 917 | "cell_type": "code", 918 | "collapsed": false, 919 | "input": [ 920 | "nt = 365*49+1\n", 921 | "levels = 18\n", 922 | "file1=open('./data/Tmean1.bin','rb')\n", 923 | "Tmean1read=np.fromfile(file1)\n", 924 | "if sys.byteorder=='little':\n", 925 | " Tmean1read.byteswap(True)\n", 926 | "Tmean1read=Tmean1read.reshape(nt,levels)\n", 927 | "\n", 928 | "file1=open('./data/Tmean2.bin','rb')\n", 929 | "Tmean2read=np.fromfile(file1)\n", 930 | "if sys.byteorder=='little':\n", 931 | " Tmean2read.byteswap(True)\n", 932 | "Tmean2read=Tmean2read.reshape(nt,levels)\n", 933 | "\n", 934 | "file1=open('./data/Tmean3.bin','rb')\n", 935 | "Tmean3read=np.fromfile(file1)\n", 936 | "if sys.byteorder=='little':\n", 937 | " Tmean3read.byteswap(True)\n", 938 | "Tmean3read=Tmean3read.reshape(nt,levels)\n", 939 | "\n", 940 | "file1=open('./data/Tgrad.bin','rb')\n", 941 | "Tgradread=np.fromfile(file1)\n", 942 | "if sys.byteorder=='little':\n", 943 | " Tgradread.byteswap(True)\n", 944 | "Tgradread=Tgradread.reshape(nt,levels)\n", 945 | "\n", 946 | "file1=open('./data/Umean1.bin','rb')\n", 947 | "Umean1read=np.fromfile(file1)\n", 948 | "if sys.byteorder=='little':\n", 949 | " Umean1read.byteswap(True)\n", 950 | "Umean1read=Umean1read.reshape(nt,levels)\n", 951 | "\n", 952 | "file1=open('./data/Umean2.bin','rb')\n", 953 | "Umean2read=np.fromfile(file1)\n", 954 | "if sys.byteorder=='little':\n", 955 | " Umean2read.byteswap(True)\n", 956 | "Umean2read=Umean2read.reshape(nt,levels)\n", 957 | "\n", 958 | "file1=open('./data/ZA.bin','rb')\n", 959 | "ZAread=np.fromfile(file1)\n", 960 | "if sys.byteorder=='little':\n", 961 | " ZAread.byteswap(True)\n", 962 | "ZAread=ZAread.reshape(nt,levels,3)\n", 963 | "\n", 964 | "file1=open('./data/Zphase.bin','rb')\n", 965 | "Zphaseread=np.fromfile(file1)\n", 966 | "if sys.byteorder=='little':\n", 967 | " Zphaseread.byteswap(True)\n", 968 | "Zphaseread=Zphaseread.reshape(nt,levels,2)" 969 | ], 970 | "language": "python", 971 | "metadata": {}, 972 | "outputs": [], 973 | "prompt_number": 2 974 | }, 975 | { 976 | "cell_type": "code", 977 | "collapsed": false, 978 | "input": [ 979 | "# Calculating Data Metric\n", 980 | "Tmean1read = Tmean1read[1:,:]-np.mean(Tmean1read[1:,:],axis=0)\n", 981 | "Tmean2read = 
Tmean2read[1:,:]-np.mean(Tmean2read[1:,:],axis=0)\n", 982 | "Tmean3read = Tmean3read[1:,:]-np.mean(Tmean3read[1:,:],axis=0)\n", 983 | "Tmean1tend = Tmean1read[7:,:]-Tmean1read[0:nt-8,:]\n", 984 | "Tmean2tend = Tmean2read[7:,:]-Tmean2read[0:nt-8,:]\n", 985 | "Tmean3tend = Tmean3read[7:,:]-Tmean3read[0:nt-8,:]\n", 986 | "Tgradread = Tgradread[1:,:]\n", 987 | "Tgradtend = Tgradread[7:,:]-Tgradread[0:nt-8,:]\n", 988 | "Umean1read = Umean1read[1:,:]\n", 989 | "Umean2read = Umean2read[1:,:]\n", 990 | "Umean1tend = Umean1read[7:,:]-Umean1read[0:nt-8,:]\n", 991 | "Umean2tend = Umean2read[7:,:]-Umean2read[0:nt-8,:]\n", 992 | "ZAread = ZAread[1:,:]\n", 993 | "ZA1read = ZAread[:,:,1]/ZAread[:,:,0]\n", 994 | "ZA2read = ZAread[:,:,2]/ZAread[:,:,0]\n", 995 | "Zphase1read = Zphaseread[1:,:,0]\n", 996 | "Zphase2read = Zphaseread[1:,:,1]" 997 | ], 998 | "language": "python", 999 | "metadata": {}, 1000 | "outputs": [], 1001 | "prompt_number": 3 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "collapsed": false, 1006 | "input": [ 1007 | "X = np.concatenate((Tmean1read[7:,:],Tmean2read[7:,:],Tmean3read[7:,:],Tmean1tend,Tmean2tend,Tmean3tend,\n", 1008 | " Tgradread[7:,:],Tgradtend,Umean1read[7:,:],Umean2read[7:,:],Umean1tend,Umean2tend,ZA1read[7:,:],ZA2read[7:,:]),axis=1)" 1009 | ], 1010 | "language": "python", 1011 | "metadata": {}, 1012 | "outputs": [], 1013 | "prompt_number": 4 1014 | }, 1015 | { 1016 | "cell_type": "code", 1017 | "collapsed": false, 1018 | "input": [ 1019 | "# Save data metric\n", 1020 | "if sys.byteorder=='little':\n", 1021 | " X.byteswap(True)\n", 1022 | "#Create the binary files of the input files\n", 1023 | "filename=open(r'./SSWdata.bin','wb')\n", 1024 | "X.ravel().tofile(filename)\n", 1025 | "filename.close()" 1026 | ], 1027 | "language": "python", 1028 | "metadata": {}, 1029 | "outputs": [] 1030 | } 1031 | ], 1032 | "metadata": {} 1033 | } 1034 | ] 1035 | } -------------------------------------------------------------------------------- /Data_Analysis/SSWkmeans.m: -------------------------------------------------------------------------------- 1 | nt = 365*49-7; 2 | levels = 18; 3 | fileID = fopen('data/SSWdata.bin'); 4 | X=fread(fileID,[nt*levels*14 1],'double'); 5 | X=reshape(swapbytes(X),[levels*14 nt]); 6 | size(X) 7 | tic 8 | idx = kmeans(X',3,'Distance','correlation'); 9 | toc 10 | tic 11 | [s,h]=silhouette(X',idx,'correlation'); 12 | toc 13 | -------------------------------------------------------------------------------- /Data_Analysis/covertype_cluster/cluster_sample.m: -------------------------------------------------------------------------------- 1 | M = csvread('data/observation_sample.csv'); 2 | s = size(M); 3 | M1= M(2:s,:); 4 | score = zeros(30,1); 5 | for i = 5:30 6 | disp(i) 7 | tic 8 | idx = kmeans(M1,i,'Distance','sqEuclidean'); 9 | toc 10 | tic 11 | [s,h]=silhouette(M1,idx,'sqEuclidean'); 12 | toc 13 | score(i)=mean(s); 14 | end 15 | disp(score); -------------------------------------------------------------------------------- /Data_Analysis/covertype_cluster/cluster_sample.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import math 4 | import random 5 | 6 | from sklearn.cluster import KMeans 7 | from sklearn.metrics import silhouette_samples, silhouette_score 8 | from sklearn import mixture 9 | 10 | X_normed=pd.read_csv("data/observation_sample.csv") 11 | Y=pd.read_csv("data/label_sample.csv") 12 | 13 | #use silhouette_values to choose best K 14 | sil=[] 15 | start = 5 
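# NOTE: range(start, end) excludes 'end', so this sweep only tries K = 5 and 6;
# the MATLAB counterpart above (cluster_sample.m) sweeps K = 5..30.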
16 | end = 7
17 | for n_clusters in range(start,end):
18 | kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X_normed)
19 | cluster_labels=kmeans.labels_
20 | silhouette_values = silhouette_samples(X_normed,cluster_labels)
21 |
22 | silhouette_avg = silhouette_score(X_normed, cluster_labels)
23 | print("For n_clusters =", n_clusters,
24 | "The average silhouette_score is :", silhouette_avg)
25 | sil.append(silhouette_avg)
26 |
27 |
28 |
29 | K = start+np.array(sil).argmax()
30 |
31 | #Now that we've found the best K, do the clustering again
32 | kmeans = KMeans(n_clusters=K, random_state=0).fit(X_normed)
33 | cluster_labels=kmeans.labels_
34 | silhouette_values = silhouette_samples(X_normed,cluster_labels)
35 |
36 | silhouette_avg = silhouette_score(X_normed, cluster_labels)
37 | print("For n_clusters =", K,
38 | "The average silhouette_score is :", silhouette_avg)
39 |
40 |
41 | #find the purest cluster (the one with the highest percentage of a single covertype) for each of the 7 labels,
42 | #and use those cluster centers as the final centers so that future data points can be classified
43 |
44 | #build a table so that all the percentages can be recorded:
45 | #cluster Number label_1 label_2 ... label_7
46 | #0
47 | #1
48 | #...
49 | #K-1
50 |
51 | table = pd.DataFrame(np.zeros((K,8)),columns=['label_1', 'label_2', 'label_3', 'label_4','label_5','label_6','label_7','size'])
52 |
53 |
54 | pred = pd.DataFrame(data = kmeans.labels_, columns=['cluster'])
55 | for k in range(K):
56 | for i in range(1,8):
57 | c = Y.iloc[pred[pred['cluster']==k].index]
58 | table.loc[k,'label_'+str(i)] = (c.values == i).sum() # count the label-i points in cluster k
59 |
60 |
61 | # print(table)
62 |
63 |
64 | target = open("data/output.txt", 'w')
65 | target.write(str(K))
66 | target.write("\n")
67 | target.write(str(sil))
68 | target.write("\n")
69 |
70 |
71 |
72 | pred.to_csv("data/pred_labels.csv")
73 | #select 7 clusters, set their centers for use
74 | #1: 7
75 | #2: 4
76 | #3: 22
77 | #4: 23
78 | #5: 0
79 | #6: 3
80 | #7: 24
81 | # selected_cluster = [7,4,22,23,0,3,24]
82 | # centers = kmeans.cluster_centers_[selected_cluster]
83 |
84 |
85 |
86 | #test our classification accuracy (see the sketch after this file's figure entries below)
--------------------------------------------------------------------------------

/Data_Analysis/covertype_cluster/cluster_train_full.m:
--------------------------------------------------------------------------------
1 | M = csvread('data/observation_train_full.csv');
2 | s = size(M);
3 | M1= M(2:s,:);
4 | tic
5 | idx = kmeans(M1,23);
6 | toc
--------------------------------------------------------------------------------

/Data_Analysis/covertype_cluster/figures/accuracy_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/accuracy_graph.png
--------------------------------------------------------------------------------

/Data_Analysis/covertype_cluster/figures/covertype_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/covertype_distribution.png
--------------------------------------------------------------------------------
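A minimal sketch of the accuracy test that cluster_sample.py above leaves as a trailing comment. It assumes the 'table', 'pred' and 'Y' objects built in that script, maps each cluster to its majority covertype label from the purity table, and scores the mapped predictions against the true labels; the helper name cluster_accuracy is illustrative, not part of the repo.

import numpy as np

def cluster_accuracy(table, pred, Y):
    label_cols = ['label_' + str(i) for i in range(1, 8)]
    # majority covertype label (1..7) for each of the K clusters
    majority = table[label_cols].values.argmax(axis=1) + 1
    # replace each point's cluster id with that cluster's majority label
    y_pred = majority[pred['cluster'].values]
    y_true = Y.values.ravel()
    return np.mean(y_pred == y_true)

# print("classification accuracy:", cluster_accuracy(table, pred, Y))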
/Data_Analysis/covertype_cluster/figures/study_area_map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/study_area_map.png
--------------------------------------------------------------------------------

/Data_Analysis/covertype_cluster/figures/vis_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/vis_label.png
--------------------------------------------------------------------------------

/Data_Analysis/covertype_cluster/figures/vis_pred.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/covertype_cluster/figures/vis_pred.png
--------------------------------------------------------------------------------

/Data_Analysis/covertype_cluster/preprocess.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import math
4 | import random
5 |
6 | df= pd.read_csv("data/covtype_data.csv", header=None)
7 |
8 | #normalize
9 | # X = df.iloc[:, :-1]
10 | # X_normed = X / X.max(axis=0)
11 | # Y = df.iloc[: ,-1]
12 | # X_normed = X_normed.fillna(0)
13 |
14 | # X_normed.to_csv("data/observation_full.csv", index=False)
15 | # Y.to_csv("data/label_full.csv", index=False, header=True)
16 |
17 |
18 |
19 |
20 | sample = pd.DataFrame(columns = df.columns)
21 | #choose 5% of each covertype group (column 54 is the label)
22 | for i in range(1,8):
23 | subset = df[df[54]==i]
24 | rows = random.sample(subset.index, int(subset.shape[0]*0.05))
25 | sample = sample.append(df.ix[rows],ignore_index=True)
26 |
27 |
28 | #normalize
29 | X = sample.iloc[:, :-1]
30 | X_normed = X / X.max(axis=0)
31 | Y = sample.iloc[: ,-1]
32 | X_normed = X_normed.fillna(0)
33 |
34 | X_normed.to_csv("data/observation_sample.csv", index=False)
35 |
36 | Y.to_csv("data/label_sample.csv", index=False, header=True)
--------------------------------------------------------------------------------

/Data_Analysis/data/SSWdata.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/data/SSWdata.bin
--------------------------------------------------------------------------------

/Data_Analysis/figures/PV.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/PV.png
--------------------------------------------------------------------------------

/Data_Analysis/figures/SSW.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/SSW.png
--------------------------------------------------------------------------------

/Data_Analysis/figures/SSWsubset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/SSWsubset.png -------------------------------------------------------------------------------- /Data_Analysis/figures/T.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/T.png -------------------------------------------------------------------------------- /Data_Analysis/figures/intro1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/intro1.png -------------------------------------------------------------------------------- /Data_Analysis/figures/k3_svalue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/k3_svalue.png -------------------------------------------------------------------------------- /Data_Analysis/figures/silScoreSubset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/silScoreSubset.png -------------------------------------------------------------------------------- /Data_Analysis/figures/svalue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Data_Analysis/figures/svalue.png -------------------------------------------------------------------------------- /Data_Analysis/readData.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | ndata = 17878 4 | nfeatures = 252 5 | # Read data points 6 | file1=open('data/SSWdata.bin','rb') 7 | X=np.fromfile(file1) 8 | if sys.byteorder=='little': 9 | X.byteswap(True) 10 | X=X.reshape(ndata,nfeatures) 11 | 12 | # Read python label 13 | file1=open('data/Label_py.bin','rb') 14 | Y_py=np.fromfile(file1,np.int32) 15 | if sys.byteorder=='little': 16 | Y_py.byteswap(True) 17 | 18 | # Read matlab label 19 | file1=open('data/Label_matlab.bin','rb') 20 | Y_matlab=np.fromfile(file1,np.int32) 21 | 22 | -------------------------------------------------------------------------------- /Other_Image/Kmean_illustration/Kmeans.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Other_Image/Kmean_illustration/Kmeans.gif -------------------------------------------------------------------------------- /Other_Image/pseudo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Other_Image/pseudo.png -------------------------------------------------------------------------------- /Parallel_Algorithm/Cuda/compile.sh: -------------------------------------------------------------------------------- 1 | export PATH=/usr/local/cuda-7.5/bin${PATH:+:${PATH}} 2 | export 
LD_LIBRARY_PATH=/usr/local/cuda-7.5/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
3 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
4 |
5 | nvcc -lnetcdf kmeans_cdf.cu
--------------------------------------------------------------------------------

/Parallel_Algorithm/Cuda/kmeans_cdf.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <fstream>
3 | #include <sstream>
4 | #include <string>
5 | #include <cmath>
6 | #include <ctime>
7 | #include <cstdio>
8 | #include <cstdlib>
9 | #include <sys/time.h>
10 | extern "C" {
11 | #include <netcdf.h>
12 | }
13 |
14 | using namespace std;
15 |
16 | // #define FAKE_DATA "../test_data/Blobs_smp20000_fea30_cls8.nc"
17 | #define ERRCODE 2
18 | #define ERR(e) {printf("Error: %s\n", nc_strerror(e)); exit(ERRCODE);}
19 |
20 | double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart4d, iStart5;
21 | double iElaps1=0, iElaps2=0, iElaps3a=0, iElaps3b=0, iElaps4=0, iElaps5=0;
22 | // Hold configurations for Kmeans
23 | struct Info {
24 | int numPoints;
25 | int dim;
26 | int numCentroids;
27 | int numRepeats;
28 | int *belongs;
29 | float **points;
30 | float **centroids;
31 | float **guess;
32 | int thresholdLoops;
33 | float thresholdFraction;
34 | int threadPerBlock;
35 | };
36 |
37 | // ************************** Utils ************************** //
38 |
39 | float** Make2DFloatArray(int rows, int cols) {
40 | float *data = (float *)malloc(rows*cols*sizeof(float));
41 | float **array= (float **)malloc(rows*sizeof(float*));
42 | for (int i=0; i<rows; i++) array[i] = data + i*cols;
43 | return array;
44 | }
113 | double cpuSecond() {
114 | struct timeval tp;
115 | gettimeofday(&tp,NULL);
116 | return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
117 | }
118 |
119 |
120 | static inline int nextPowerOfTwo(int v) {
121 | int res = v;
122 | for (int i = 1; i <= 16; i *= 2) {
123 | res |= res >> i;
124 | }
125 | return res + 1;
126 | }
127 |
128 | float** make2DArray(int x, int y) {
129 | float **res = (float **)malloc(x * sizeof(float *));
130 |
131 | // for (int i = 0; i < x; i++) {
132 | // res[i] = (float *)malloc(y * sizeof(float));
133 | // }
134 | res[0] = (float *)malloc(x * y * sizeof(float));
135 | for (size_t i = 1; i < x; i++) res[i] = res[i-1] + y;
136 | for (size_t i = 0; i < x; i++) {
137 | for (size_t j = 0; j < y; j++) {
138 | res[i][j] = 0.0;
139 | }
140 | }
141 | return res;
142 | }
143 |
144 | void invert2DArray(float **A, float **B, int x, int y) {
145 | for (int i = 0; i < x; i++) {
146 | for (int j = 0; j < y; j++) {
147 | A[i][j] = B[j][i];
148 | }
149 | }
150 | }
151 |
152 | void copy2DArray(float **A, float **B, int x, int y) {
153 | for (int i = 0; i < x; i++) {
154 | for (int j = 0; j < y; j++) {
155 | A[i][j] = B[i][j];
156 | }
157 | }
158 | }
159 |
160 | // ************************** Utils ************************** //
161 |
162 | __host__ __device__ inline static float
163 | computeDist(Info* info, int pointId, int centroidId, int distType, float *gPoints, float *gCentroids) {
164 | float res = 0;
165 | if (distType == 0) {
166 | for (int i = 0; i < info->dim; i++) {
167 | res +=
168 | (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]) *
169 | (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]);
170 | }
171 | }
172 | return res;
173 | }
174 |
175 | // Use reduction to compute the sum of an array
176 | // Refer to
177 | // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
178 | __global__ static void reduce(int *g_idata, int l1, int l2) {
179 | extern __shared__ unsigned int sdata[];
180 | unsigned int tid = threadIdx.x;
181 |
182 | if (tid < l1) {
183 | sdata[tid] = g_idata[tid];
184 | } else {
185 | sdata[tid] = 0;
186 | }
187 | __syncthreads();
188 |
189 | // Parallel Reduction (l2 must be a power of 2)
190 | for (unsigned int s = l2 / 2; s > 0; s >>= 1) {
191 | if (tid < s) {
192 | sdata[tid] += sdata[tid + s];
193 | }
194 | __syncthreads();
195 | }
196 |
197 | if (tid == 0) {
198 | g_idata[0] = sdata[0];
199 | }
200 | }
201 |
202 | __global__ static void nearestCentroid(int *blockResult, int *gBelongs, float *gPoints, float *gCentroids, Info *gInfo) {
203 |
204 | int pointId = blockDim.x * blockIdx.x + threadIdx.x;
205 | if (pointId >= (gInfo->numPoints)) return;
206 |
207 | // For test on test.txt
208 | // printf("Thread: %d - %.2f, %.2f, %.2f, %.2f \n", pointId, gCentroids[0], gCentroids[1], gCentroids[2], gCentroids[3]);
209 | // printf("Thread: %d - %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f \n",
210 | // pointId, gPoints[0], gPoints[1], gPoints[2], gPoints[3], gPoints[4], gPoints[5], gPoints[6], gPoints[7]);
211 |
212 | // Get the minimum distance
213 | float mDist = computeDist(gInfo, pointId, 0, 0, gPoints, gCentroids);
214 |
215 | int tmpIdx = 0;
216 | int numCentroids = gInfo->numCentroids;
217 | for (int i = 0; i < numCentroids; i++) {
218 | float tmpDist = computeDist(gInfo, pointId, i, 0, gPoints, gCentroids);
219 | if (tmpDist < mDist) {
220 | mDist = tmpDist;
221 | tmpIdx = i;
222 | }
223 | }
224 |
225 | // use reduction to add the total number of changes (change from one centroid to another) in this block
226 | extern __shared__ int sdata2[];
227 | sdata2[threadIdx.x] = 0;
228 | if (gBelongs[pointId] != tmpIdx) {
229 | sdata2[threadIdx.x] = 1;
230 | }
231 | gBelongs[pointId] = tmpIdx;
232 | __syncthreads();
233 |
234 | // Reduction
235 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
236 | if (threadIdx.x < s) {
237 | sdata2[threadIdx.x] += sdata2[threadIdx.x + s];
238 | }
239 | __syncthreads();
240 | }
241 |
242 | // Put the sum to the location corresponding to the current block
243 | if (threadIdx.x == 0) {
244 | blockResult[blockIdx.x] = sdata2[0];
245 | }
246 | }
247 |
248 | void processData(char *fileName, Info *info, int i_repeat) {
249 | float **X;
250 | int **GUESS;
251 |
252 | int N_samples, N_features, N_clusters, N_repeat;
253 |
254 | readX(fileName,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat);
255 |
256 | // cout << N_samples << "," << N_features << "," << N_clusters << "," << N_repeat << '\n';
257 |
258 | // Test purpose
259 | // N_samples = 4;
260 | // N_features = 2;
261 | // N_clusters = 2;
262 | // N_repeat = 1;
263 |
264 | info->numPoints = N_samples;
265 | info->dim = N_features;
266 | info->numCentroids = N_clusters;
267 | info->numRepeats = N_repeat;
268 | info->thresholdFraction = 0.001;
269 | info->thresholdLoops = 200;
270 | info->points = X;
271 |
272 | float **guess = make2DArray(N_clusters, N_features);
273 | for (int k=0; k<N_clusters; k++) {
274 | for (int j=0; j<N_features; j++) {
275 | guess[k][j] = X[GUESS[i_repeat][k]][j];
276 | }
277 | }
278 |
279 | info->guess = guess;
280 |
281 | /* belongs: the cluster id for each data object */
282 | int *belongs = new int[N_samples];
283 | for (int i = 0; i < N_samples; i++) belongs[i] = -1;
284 | info->belongs = belongs;
285 | }
286 |
287 |
288 |
289 | void cudaKmeans(Info *info) {
290 | // Initialization
291 | int numPoints = info->numPoints;
292 | int dim = info->dim;
293 | int numCentroids = info->numCentroids;
294 | int thresholdLoops = info->thresholdLoops;
295 | float thresholdFraction = info->thresholdFraction;
296 | int* belongs = info->belongs;
297 | float **points = info->points;
298 | float **centroids = info->centroids;
299 | float **guess = info->guess;
300 | int threadPerBlock = info->threadPerBlock;
301 |
302 | iStart4d = cpuSecond();
303 |
304 | // invert (transpose matrix)
305 | float **iPoints = make2DArray(dim, numPoints);
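// Layout note: points[] is sample-major (numPoints x dim), but the device
// arrays are filled feature-major (dim x numPoints). With that layout the
// E-step access gPoints[i * numPoints + pointId] in computeDist() makes
// consecutive threads (consecutive pointId) read consecutive addresses,
// i.e. coalesced global-memory loads.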
306 | invert2DArray(iPoints, points, dim, numPoints);
307 |
308 | // initial guess
309 | float **iCentroids = make2DArray(dim, numCentroids);
310 | // copy2DArray(iCentroids, iPoints, dim, numCentroids);
311 | invert2DArray(iCentroids, guess, dim, numCentroids);
312 |
313 | // centroid -> number of points
314 | int *pointsCount = new int[numCentroids];
315 | float **iNewCentroids = make2DArray(dim, numCentroids);
316 |
317 | iElaps4 += cpuSecond() - iStart4d;
318 |
319 | // Some cuda constants
320 | const unsigned int bthreads = threadPerBlock;
321 | const unsigned int l1 = (numPoints + bthreads - 1) / bthreads;
322 | const unsigned int l2 = nextPowerOfTwo(l1);
323 | const unsigned int sdsize2 = bthreads * sizeof(unsigned int); // shared memory size for sdata2
324 | const unsigned int sdsize1 = l2 * sizeof(unsigned int); // shared memory size for sdata1
325 |
326 | // Cuda device Initialization
327 | float *gPoints;
328 | float *gCentroids;
329 | int *gBelongs;
330 | Info *gInfo;
331 | int *tmp;
332 |
333 | // Data transfer
334 | iStart4a = cpuSecond();
335 | cudaMalloc(&gPoints, numPoints * dim * sizeof(float));
336 | cudaMalloc(&gCentroids, numCentroids * dim * sizeof(float));
337 | cudaMalloc(&gBelongs, numPoints * sizeof(int));
338 | cudaMalloc((void**)&gInfo, sizeof(Info));
339 | cudaMalloc(&tmp, l2 * sizeof(unsigned int)); // For reduction
340 | cudaMemcpy(gBelongs,
341 | belongs,
342 | numPoints * sizeof(int),
343 | cudaMemcpyHostToDevice);
344 | cudaMemcpy(gPoints,
345 | iPoints[0],
346 | numPoints * dim * sizeof(float),
347 | cudaMemcpyHostToDevice);
348 | cudaMemcpy(gInfo,info,sizeof(Info),cudaMemcpyHostToDevice);
349 |
350 | iElaps4 += cpuSecond() - iStart4a;
351 |
352 | int count = 0;
353 | float frac = 1.0;
354 |
355 | while (count < thresholdLoops) {
356 | iStart4b = cpuSecond();
357 | cudaMemcpy(gCentroids, iCentroids[0], dim * numCentroids * sizeof(float), cudaMemcpyHostToDevice);
358 | iElaps4 += cpuSecond() - iStart4b;
359 |
360 | // E-Step: assign points to the nearest cluster center
361 | iStart2 = cpuSecond();
362 | // nearestCentroid<<<l1, bthreads, sdsize2>>>(dim, numPoints, numCentroids, gPoints, gCentroids, gBelongs, tmp);
363 | nearestCentroid<<<l1, bthreads, sdsize2>>>(tmp, gBelongs, gPoints, gCentroids, gInfo);
364 | cudaDeviceSynchronize();
365 | iElaps2 += (cpuSecond() - iStart2);
366 |
367 | // Update belongs
368 | iStart4c = cpuSecond();
369 | cudaMemcpy(belongs, gBelongs, numPoints * sizeof(int), cudaMemcpyDeviceToHost);
370 | iElaps4 += cpuSecond() - iStart4c;
371 |
372 | // M-Step first half: set the cluster centers to the mean
373 | iStart3a = cpuSecond();
374 |
375 | // Clear the two temp variables
376 | for (int i = 0; i < numCentroids; i++) {
377 | pointsCount[i] = 0;
378 | for (int j = 0; j < dim; j++) {
379 | iNewCentroids[j][i] = 0.0;
380 | }
381 | }
382 |
383 | // Add up points in each centroid
384 | for (int i = 0; i < numPoints; i++) {
385 | int idx = belongs[i];
386 | pointsCount[idx] += 1;
387 | for (int j = 0; j < dim; j++) {
388 | iNewCentroids[j][idx] += points[i][j];
389 | }
390 | }
391 | iElaps3a += cpuSecond() - iStart3a;
392 |
393 | // M-Step second half: convert the sum to the mean
394 | // Update to new centroids
395 | iStart3b = cpuSecond();
396 | for (int i = 0; i < numCentroids; i++) {
397 | for (int j = 0; j < dim; j++) {
398 | if (pointsCount[i] > 0) {
399 | iCentroids[j][i] = iNewCentroids[j][i] / pointsCount[i];
400 | }
401 | }
402 | }
403 | iElaps3b += cpuSecond() - iStart3b;
404 |
405 | // Check convergence
406 | iStart5 = cpuSecond();
407 |
408 | // Check whether too few points changed their centroids
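// Each entry of 'tmp' holds one block's count of points whose nearest
// centroid changed in this E-step; the single-block reduce() call below sums
// those per-block counts on the GPU. The loop ends once the changed fraction
// 'frac' drops to thresholdFraction (0.1% of the points) or after
// thresholdLoops iterations.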
409 | reduce <<<1, l2, sdsize1>>>(tmp, l1, l2);
410 | cudaDeviceSynchronize();
411 | int tmpFloat;
412 | cudaMemcpy(&tmpFloat, tmp, sizeof(int), cudaMemcpyDeviceToHost);
413 | frac = (float)tmpFloat / numPoints;
414 | // cout << "Iteration: " << count << "," << frac << "," << tmpFloat << "\n";
415 | count++;
416 | if (frac <= thresholdFraction) break;
417 |
418 | iElaps5 += cpuSecond() - iStart5;
419 |
420 | }
421 |
422 | iStart4d = cpuSecond();
423 | centroids = make2DArray(numCentroids, dim);
424 | invert2DArray(centroids, iCentroids, numCentroids, dim);
425 | info->centroids = centroids;
426 | iElaps4 += cpuSecond() - iStart4d;
427 |
428 | // Free device memory
429 | cudaFree(gPoints);
430 | cudaFree(gCentroids);
431 | cudaFree(gBelongs);
432 | cudaFree(tmp);
433 |
434 | }
435 |
436 | int main(int argc, char *argv[]) {
437 | Info *info = new Info;
438 | info->threadPerBlock = atoi(argv[1]);
439 | char *fileName = argv[2];
440 | processData(fileName, info, 0);
441 |
442 | printf("Number of samples: %d \n",info->numPoints);
443 | printf("Number of features: %d \n", info->dim);
444 | printf("Number of clusters: %d \n", info->numCentroids);
445 | printf("Number of repeated runs: %d \n", info->numRepeats);
446 | for (int i = 0; i < info->numRepeats; i++) {
447 | // cout << "====== Begin Loop " << i << " ======\n";
448 | iStart1 = cpuSecond();
449 | cudaKmeans(info);
450 | iElaps1 += cpuSecond() - iStart1;
451 |
452 | // cout << "Ref: " << info->centroids[0][0] << "\n";
453 | // cout << "====== End of Loop " << i << " ======\n";
454 | // break;
455 |
456 | // Reload info (check for the last run first, so we never read freed memory)
457 | if (i + 1 == info->numRepeats) break;
458 | delete(info);
459 | info = new Info;
460 | info->threadPerBlock = atoi(argv[1]);
461 | processData(fileName, info, i+1);
462 | }
463 |
464 |
465 | cout << "Total time (ms): " << iElaps1*1000 << "\n";
466 | cout << "E-step time use (ms): " << iElaps2*1000 << "\n";
467 | cout << "M-step-1st-half time use (ms): " << iElaps3a*1000 << "\n";
468 | cout << "M-step-2nd-half time use (ms): " << iElaps3b*1000 << "\n";
469 | cout << "Cuda Data IO (ms): " << iElaps4*1000 << "\n";
470 | cout << "Check Convergence (ms): " << iElaps5*1000 << "\n";
471 | }
--------------------------------------------------------------------------------

/Parallel_Algorithm/Cuda/kmeans_txt.cu:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | #include <fstream>
3 | #include <sstream>
4 | #include <string>
5 | #include <cmath>
6 | #include <ctime>
7 | #include <cstdio>
8 | #include <cstdlib>
9 | #include <sys/time.h>
10 |
11 | using namespace std;
12 | double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart5;
13 | double iElaps1=0, iElaps2=0, iElaps3a=0, iElaps3b=0, iElaps4=0, iElaps5=0;
14 | // Hold configurations for Kmeans
15 | struct Info {
16 | int numPoints;
17 | int dim;
18 | int numCentroids;
19 | int numRepeats;
20 | int *belongs;
21 | float **points;
22 | float **centroids;
23 | int thresholdLoops;
24 | float thresholdFraction;
25 | int threadPerBlock;
26 | };
27 |
28 | // ************* Utils ************* //
29 |
30 | double cpuSecond() {
31 | struct timeval tp;
32 | gettimeofday(&tp,NULL);
33 | return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6);
34 | }
35 |
36 | static inline int nextPowerOfTwo(int v) {
37 | int res = v;
38 | for (int i = 1; i <= 16; i *= 2) {
39 | res |= res >> i;
40 | }
41 | return res + 1;
42 | }
43 |
44 | float** make2DArray(int x, int y) {
45 | float **res = (float **)malloc(x * sizeof(float *));
46 |
47 | // for (int i = 0; i < x; i++)
{ 48 | // res[i] = (float *)malloc(y * sizeof(float)); 49 | // } 50 | res[0] = (float *)malloc(x * y * sizeof(float)); 51 | for (size_t i = 1; i < x; i++) res[i] = res[i-1] + y; 52 | for (size_t i = 0; i < x; i++) { 53 | for (size_t j = 0; j < y; j++) { 54 | res[i][j] = 0.0; 55 | } 56 | } 57 | return res; 58 | } 59 | 60 | void invert2DArray(float **A, float **B, int x, int y) { 61 | for (int i = 0; i < x; i++) { 62 | for (int j = 0; j < y; j++) { 63 | A[i][j] = B[j][i]; 64 | } 65 | } 66 | } 67 | 68 | void copy2DArray(float **A, float **B, int x, int y) { 69 | for (int i = 0; i < x; i++) { 70 | for (int j = 0; j < y; j++) { 71 | A[i][j] = B[i][j]; 72 | } 73 | } 74 | } 75 | 76 | // ************* Utils ************* // 77 | 78 | __host__ __device__ inline static float 79 | computeDist(Info* info, int pointId, int centroidId, int distType, float *gPoints, float *gCentroids) { 80 | float res = 0; 81 | if (distType == 0) { 82 | for (int i = 0; i < info->dim; i++) { 83 | res += 84 | (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]) * 85 | (gPoints[i * (info->numPoints) + pointId] - gCentroids[i * (info->numCentroids) + centroidId]); 86 | } 87 | } 88 | return res; 89 | } 90 | 91 | // Use reduction to compute the sum of an array 92 | // Refer to 93 | // http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf 94 | __global__ static void reduce(int *g_idata, int l1, int l2) { 95 | extern __shared__ unsigned int sdata[]; 96 | unsigned int tid = threadIdx.x; 97 | 98 | if (tid < l1) { 99 | sdata[tid] = g_idata[tid]; 100 | } else { 101 | sdata[tid] = 0; 102 | } 103 | __syncthreads(); 104 | 105 | // Parallel Reduction (l2 must be power of 2) 106 | for (unsigned int s = l2 / 2; s > 0; s >>= 1) { 107 | if (tid < s) { 108 | sdata[tid] += sdata[tid + s]; 109 | } 110 | __syncthreads(); 111 | } 112 | 113 | if (tid == 0) { 114 | g_idata[0] = sdata[0]; 115 | } 116 | } 117 | 118 | __global__ static void nearestCentroid(int *blockResult, int *gBelongs, float *gPoints, float *gCentroids, Info *gInfo) { 119 | 120 | int pointId = blockDim.x * blockIdx.x + threadIdx.x; 121 | if (pointId >= (gInfo->numPoints)) return; 122 | 123 | // For test on test.txt 124 | // printf("Thread: %d - %.2f, %.2f, %.2f, %.2f \n", pointId, gCentroids[0], gCentroids[1], gCentroids[2], gCentroids[3]); 125 | // printf("Thread: %d - %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f, %.2f \n", 126 | // pointId, gPoints[0], gPoints[1], gPoints[2], gPoints[3], gPoints[4], gPoints[5], gPoints[6], gPoints[7]); 127 | 128 | // Get the minimum distance 129 | float mDist = computeDist(gInfo, pointId, 0, 0, gPoints, gCentroids); 130 | 131 | int tmpIdx = 0; 132 | int numCentroids = gInfo->numCentroids; 133 | for (int i = 0; i < numCentroids; i++) { 134 | float tmpDist = computeDist(gInfo, pointId, i, 0, gPoints, gCentroids); 135 | if (tmpDist < mDist) { 136 | mDist = tmpDist; 137 | tmpIdx = i; 138 | } 139 | } 140 | 141 | // use reduction to add the total number of changes (change from one centroid to another) in this block 142 | extern __shared__ int sdata2[]; 143 | sdata2[threadIdx.x] = 0; 144 | if (gBelongs[pointId] != tmpIdx) { 145 | sdata2[threadIdx.x] = 1; 146 | } 147 | gBelongs[pointId] = tmpIdx; 148 | __syncthreads(); 149 | 150 | // Reduction 151 | for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { 152 | if (threadIdx.x < s) { 153 | sdata2[threadIdx.x] += sdata2[threadIdx.x + s]; 154 | } 155 | __syncthreads(); 156 | } 157 | 158 | // Put the sum to the 
location corresbonding to current block 159 | if (threadIdx.x == 0) { 160 | blockResult[blockIdx.x] = sdata2[0]; 161 | } 162 | } 163 | 164 | void processData(char *fileName, Info *info) { 165 | float **X; 166 | int **GUESS; 167 | 168 | int N_samples, N_features, N_clusters, N_repeat; 169 | 170 | // readX(FILE_NAME,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat); 171 | 172 | // Test purpose 173 | N_samples = 100; 174 | N_features = 9; 175 | N_clusters = 4; 176 | N_repeat = 10; 177 | 178 | info->numPoints = N_samples; 179 | info->dim = N_features; 180 | info->numCentroids = N_clusters; 181 | info->numRepeats = N_repeat; 182 | info->thresholdFraction = 0.005; 183 | info->thresholdLoops = 200; 184 | 185 | // Process data point 186 | X = make2DArray(N_samples, N_features); 187 | 188 | string str(fileName); 189 | ifstream file(str); 190 | string line1; 191 | int i = 0; 192 | while (getline(file, line1)) { 193 | std::istringstream iss(line1); 194 | int j = -1; 195 | for(string s; iss >> s;) { 196 | if (j == -1) { 197 | j++; 198 | continue; 199 | } 200 | // cout << s << " "; 201 | X[i][j] = stof(s); 202 | j++; 203 | } 204 | i++; 205 | } 206 | info->points = X; 207 | 208 | /* belongs: the cluster id for each data object */ 209 | int *belongs = new int[N_samples]; 210 | for (i = 0; i < N_samples; i++) belongs[i] = -1; 211 | info->belongs = belongs; 212 | } 213 | 214 | 215 | 216 | void cudaKmeans(Info *info) { 217 | // Initialization 218 | int numPoints = info->numPoints; 219 | int dim = info->dim; 220 | int numCentroids = info->numCentroids; 221 | int thresholdLoops = info->thresholdLoops; 222 | int thresholdFraction = info->thresholdFraction; 223 | int* belongs = info->belongs; 224 | float **points = info->points; 225 | float **centroids = info->centroids; 226 | int threadPerBlock = info->threadPerBlock; 227 | 228 | // invert (transpose matrix) 229 | float **iPoints = make2DArray(dim, numPoints); 230 | invert2DArray(iPoints, points, dim, numPoints); 231 | 232 | // initial guess 233 | float **iCentroids = make2DArray(dim, numCentroids); 234 | copy2DArray(iCentroids, iPoints, dim, numCentroids); 235 | // invert2DArray(iCentroids, points, dim, numCentroids); 236 | 237 | // centroid -> number of points 238 | int *pointsCount = new int[numCentroids]; 239 | float **iNewCentroids = make2DArray(dim, numCentroids); 240 | 241 | // Some cuda constants 242 | const unsigned int bthreads = threadPerBlock; 243 | const unsigned int l1 = (numPoints + bthreads - 1) / bthreads; 244 | const unsigned int l2 = nextPowerOfTwo(l1); 245 | const unsigned int sdsize2 = bthreads * sizeof(unsigned int); // shared memory size for sdata2 246 | const unsigned int sdsize1 = l2 * sizeof(unsigned int); // shared memory size for sdata1 247 | 248 | // Cuda device Initialization 249 | float *gPoints; 250 | float *gCentroids; 251 | int *gBelongs; 252 | Info *gInfo; 253 | int *tmp; 254 | 255 | // Data transfer 256 | iStart4a = cpuSecond(); 257 | cudaMalloc(&gPoints, numPoints * dim * sizeof(float)); 258 | cudaMalloc(&gCentroids, numCentroids * dim * sizeof(float)); 259 | cudaMalloc(&gBelongs, numPoints * sizeof(int)); 260 | cudaMalloc((void**)&gInfo, sizeof(Info)); 261 | cudaMalloc(&tmp, l2 * sizeof(unsigned int)); // For reduction 262 | cudaMemcpy(gBelongs, 263 | belongs, 264 | numPoints * sizeof(int), 265 | cudaMemcpyHostToDevice); 266 | cudaMemcpy(gPoints, 267 | iPoints[0], 268 | numPoints * dim * sizeof(float), 269 | cudaMemcpyHostToDevice); 270 | cudaMemcpy(gInfo,info,sizeof(Info),cudaMemcpyHostToDevice); 271 | 272 | iElaps4 
+= cpuSecond() - iStart4a; 273 | 274 | int count = 0; 275 | float frac = 1.0; 276 | 277 | while (count < thresholdLoops) { 278 | iStart4b = cpuSecond(); 279 | cudaMemcpy(gCentroids, iCentroids[0], dim * numCentroids * sizeof(float), cudaMemcpyHostToDevice); 280 | iElaps4 += cpuSecond() - iStart4b; 281 | 282 | // E-Step: assign points to the nearest cluster center 283 | iStart2 = cpuSecond(); 284 | // nearestCentroid<<>>(dim, numPoints, numCentroids, gPoints, gCentroids, gBelongs, tmp); 285 | nearestCentroid<<>>(tmp, gBelongs, gPoints, gCentroids, gInfo); 286 | cudaDeviceSynchronize(); 287 | iElaps2 += (cpuSecond() - iStart2); 288 | 289 | // Update belongs 290 | iStart4c = cpuSecond(); 291 | cudaMemcpy(belongs, gBelongs, numPoints * sizeof(int), cudaMemcpyDeviceToHost); 292 | iElaps4 += cpuSecond() - iStart4c; 293 | 294 | // M-Step first half: set the cluster centers to the mean 295 | iStart3a = cpuSecond(); 296 | 297 | // Clear the two temp variables 298 | for (int i = 0; i < numCentroids; i++) { 299 | pointsCount[i] = 0; 300 | for (int j = 0; j < dim; j++) { 301 | iNewCentroids[j][i] = 0.0; 302 | } 303 | } 304 | 305 | // Add up points in each centroid 306 | for (int i = 0; i < numPoints; i++) { 307 | int idx = belongs[i]; 308 | pointsCount[idx] += 1; 309 | for (int j = 0; j < dim; j++) { 310 | iNewCentroids[j][idx] += points[i][j]; 311 | } 312 | } 313 | iElaps3a += cpuSecond() - iStart3a; 314 | 315 | // M-Step second half: convert the sum to the mean 316 | // Update to new centroids 317 | iStart3b = cpuSecond(); 318 | for (int i = 0; i < numCentroids; i++) { 319 | for (int j = 0; j < dim; j++) { 320 | if (pointsCount[i] > 0) { 321 | iCentroids[j][i] = iNewCentroids[j][i] / pointsCount[i]; 322 | } 323 | } 324 | } 325 | iElaps3b += cpuSecond() - iStart3b; 326 | 327 | // Check convergence 328 | iStart5 = cpuSecond(); 329 | 330 | // Check if too few number of points change their centroids 331 | reduce <<<1, l2, sdsize1>>>(tmp, l1, l2); 332 | cudaDeviceSynchronize(); 333 | int tmpFloat; 334 | cudaMemcpy(&tmpFloat, tmp, sizeof(int), cudaMemcpyDeviceToHost); 335 | frac = (float)tmpFloat / numPoints; 336 | cout << "Iteration: " << count << "," << frac << "," << tmpFloat << "\n"; 337 | count++; 338 | if (frac <= thresholdFraction) break; 339 | 340 | iElaps5 += cpuSecond() - iStart5; 341 | 342 | } 343 | 344 | centroids = make2DArray(numCentroids, dim); 345 | invert2DArray(centroids, iCentroids, numCentroids, dim); 346 | info->centroids = centroids; 347 | 348 | // Free device memory 349 | cudaFree(gPoints); 350 | cudaFree(gCentroids); 351 | cudaFree(gBelongs); 352 | cudaFree(tmp); 353 | 354 | } 355 | 356 | int main(int argc, char *argv[]) { 357 | Info *info = new Info; 358 | info->threadPerBlock = atoi(argv[1]); 359 | char *fileName = argv[2]; 360 | processData(fileName, info); 361 | 362 | for (int i = 0; i < info->numRepeats; i++) { 363 | iStart1 = cpuSecond(); 364 | cudaKmeans(info); 365 | iElaps1 += cpuSecond() - iStart1; 366 | 367 | // cout << info->centroids[0][0] << "," << info->centroids[0][1] << "," 368 | // << info->centroids[1][0] << "," << info->centroids[1][1] << "\n"; 369 | 370 | // Reload info 371 | delete(info); 372 | info = new Info; 373 | info->threadPerBlock = atoi(argv[1]); 374 | processData(fileName, info); 375 | } 376 | 377 | 378 | cout << "Total time: " << iElaps1*1000 << "\n"; 379 | cout << "E-step time use (ms): " << iElaps2*1000 << "\n"; 380 | cout << "M-step-1st-half time use (ms): " << iElaps3a*1000 << "\n"; 381 | cout << "M-step-2nd-half time use (ms): " << iElaps3b*1000 
<< "\n"; 382 | cout << "Cuda Data IO (ms): " << iElaps4*1000 << "\n"; 383 | cout << "Other (ms): " << iElaps5*1000 << "\n"; 384 | } 385 | -------------------------------------------------------------------------------- /Parallel_Algorithm/Cuda/test.txt: -------------------------------------------------------------------------------- 1 | 1 2.0 2.0 2 | 2 2.0 -2.0 3 | 3 -3.0 -2.0 4 | 4 -3.0 2.0 -------------------------------------------------------------------------------- /Parallel_Algorithm/Cuda/test_multithreadPerBlock.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export PATH=/usr/local/cuda-7.5/bin${PATH:+:${PATH}} 3 | export LD_LIBRARY_PATH=/usr/local/cuda-7.5/lib64\${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} 4 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib 5 | 6 | thread_list='1 2 4 8 16 32 64' 7 | 8 | for thread in $thread_list 9 | do 10 | echo " " 11 | echo ========================================= 12 | echo ========================================= 13 | echo testing with $thread threads per block on device 14 | ./a.out $thread ../test_data/Blobs_smp20000_fea30_cls8.nc 15 | done 16 | -------------------------------------------------------------------------------- /Parallel_Algorithm/MPI/Kmean_mpi.c: -------------------------------------------------------------------------------- 1 | //#include "../shared/timing.h" //for timer seconds() 2 | #include 3 | #include 4 | #include //for FLT_MAX 5 | #include 6 | #include "../shared/make_2D_array.h" 7 | #include "../shared/ncdf_util.h" 8 | #include "../shared/math_util.h" 9 | 10 | /* This is the name of the data file we will read. */ 11 | //#define FILE_NAME "../test_data/Blobs_smp20000_fea30_cls8.nc" 12 | #define FILE_NAME "../../Data_Analysis/data/SSWdata.nc" 13 | #define TOL 0.0001 14 | #define MAX_ITER 100 15 | 16 | int main() { 17 | 18 | /* 19 | ====================================================== 20 | ---------------- Initialization --------------------- 21 | ====================================================== 22 | */ 23 | 24 | int rank, size; 25 | MPI_Init(NULL,NULL); 26 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 27 | MPI_Comm_size(MPI_COMM_WORLD, &size); 28 | //printf("hello world from process %d of %d\n", rank, size); 29 | 30 | int N_samples_all,N_samples,N_features,N_clusters,N_repeat; 31 | //i for samples; j for features; k for clusters (typically) 32 | int i,j,k; 33 | int k_best,initial_idx; 34 | float** X; //unlike in serial/OpenMP versions, here X is local data 35 | float** X_all; //only master node holds the full data 36 | int** GUESS; 37 | float dist,dist_min,dist_sum_old,dist_sum_new,inert_best=FLT_MAX; 38 | 39 | /* 40 | ====================================================== 41 | -- Read data by master node and distribute over processes -- 42 | ====================================================== 43 | */ 44 | 45 | double iStart1 = MPI_Wtime(); 46 | // let master core read data and broadcast to other cores 47 | 48 | if (rank == 0){ 49 | // get input data and its size 50 | readX(FILE_NAME,&X_all,&GUESS,&N_samples_all,&N_features,&N_clusters,&N_repeat); 51 | } 52 | else{ 53 | /* 54 | MPI_Scatter needs to access *X_all in all processes 55 | For non-root, we need to assign NULL to prevent memory error 56 | */ 57 | float* dummy_for_X_all=NULL; 58 | X_all = &dummy_for_X_all; 59 | } 60 | 61 | MPI_Bcast(&N_samples_all,1,MPI_INT,0,MPI_COMM_WORLD); 62 | MPI_Bcast(&N_features,1,MPI_INT,0,MPI_COMM_WORLD); 63 | MPI_Bcast(&N_clusters,1,MPI_INT,0,MPI_COMM_WORLD); 64 | 
MPI_Bcast(&N_repeat,1,MPI_INT,0,MPI_COMM_WORLD); 65 | //printf("%d: %d,%d,%d,%d\n",rank,N_samples_all,N_features,N_clusters,N_repeat); 66 | 67 | if (rank==0){ 68 | printf("Last element in global array: %f \n",X_all[N_samples_all-1][N_features-1]); 69 | } 70 | 71 | 72 | // Naive Scatter: Assume N_sample_all is divisible by size 73 | /* 74 | N_samples = N_samples_all / size; 75 | X = Make2DFloatArray(N_samples,N_features); 76 | MPI_Scatter(*X_all, N_samples*N_features, MPI_FLOAT, *X, 77 | N_samples*N_features, MPI_FLOAT, 0, MPI_COMM_WORLD); 78 | */ 79 | 80 | // Correct sactter: works for any numbers 81 | 82 | int *sendcounts,*displs; 83 | if (rank == 0){ 84 | int N_samples_slave = N_samples_all/size; //master node needs to know the data size for other nodes 85 | N_samples = N_samples_all - N_samples_slave*(size-1);// the remaining data 86 | 87 | sendcounts = (int *)malloc(size*sizeof(int)); // the number of elements to send to each processor 88 | displs = (int *)malloc(size*sizeof(int)); //displacement relative to sendbuf for data sent to process i 89 | 90 | sendcounts[0]=N_samples*N_features; 91 | displs[0]=0; 92 | for (i=1; i 0) //avoid divide-by-zero error 257 | // sum -> mean 258 | old_cluster_centers[k][j] = new_cluster_centers[k][j] / cluster_sizes[k]; 259 | 260 | new_cluster_centers[k][j] = 0.0;//for the next iteration 261 | } 262 | cluster_sizes[k] = 0;//for the next iteration 263 | } // end of M-Step second half 264 | 265 | iElaps3c += (MPI_Wtime()-iStart3c); 266 | 267 | // To test convergence, we need the global sum of distances 268 | MPI_Allreduce(MPI_IN_PLACE,&dist_sum_new, 1, MPI_FLOAT, 269 | MPI_SUM, MPI_COMM_WORLD); 270 | 271 | } while( i_iter==1 || ((dist_sum_old - dist_sum_new > TOL)&&i_iter 2 | #include 3 | #include //for FLT_MAX 4 | #include "../shared/timing.h" //for timer seconds() 5 | #include "../shared/make_2D_array.h" 6 | #include "../shared/ncdf_util.h" 7 | #include "../shared/math_util.h" 8 | 9 | /* This is the name of the data file we will read. */ 10 | //#define FILE_NAME "../test_data/Blobs_smp20000_fea30_cls8.nc" 11 | #define FILE_NAME "../../Data_Analysis/data/SSWdata.nc" 12 | #define TOL 0.0001 13 | #define MAX_ITER 100 14 | 15 | int main() { 16 | 17 | /* 18 | ====================================================== 19 | ---------------- Initialization --------------------- 20 | ====================================================== 21 | */ 22 | int N_samples,N_features,N_clusters,N_repeat; 23 | //i for samples; j for features; k for clusters (typically) 24 | int i,j,k; 25 | int k_best,initial_idx; 26 | float** X; 27 | int** GUESS; 28 | float dist,dist_min,dist_sum_old,dist_sum_new,inert_best=FLT_MAX; 29 | 30 | // get input data and its size 31 | double iStart1 = seconds(); 32 | readX(FILE_NAME,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat); 33 | double iElaps1 = seconds() - iStart1; 34 | 35 | // each data point belongs to which cluster 36 | // values range from 0 to N_cluster-1 37 | int* labels = (int *)malloc(N_samples*sizeof(int)); 38 | int* labels_best = (int *)malloc(N_samples*sizeof(int)); 39 | 40 | // The position of each cluster center. 41 | // Two arrays are needed as we are calculating the distance to the 42 | // old centers and accumulating the new centers in the same iteration. 
43 | float** old_cluster_centers = Make2DFloatArray(N_clusters,N_features); 44 | float** new_cluster_centers = Make2DFloatArray(N_clusters,N_features); 45 | 46 | // how many data points in the cluster 47 | // needed by calculating the average position of data points in each cluster 48 | int* cluster_sizes = (int *)malloc(N_clusters*sizeof(int)); 49 | 50 | /* 51 | ====================================================== 52 | ---------------- Kmean stepping --------------------- 53 | ====================================================== 54 | */ 55 | printf("=====Applying K-mean======\n"); 56 | 57 | // record timing results 58 | double iStart2,iElaps2; 59 | double iStart3a,iStart3b,iStart3c; 60 | double iElaps3a=0,iElaps3b=0,iElaps3c=0; 61 | 62 | /* Run the K-mean algorithm for N_repeat times with 63 | * different starting points 64 | */ 65 | iStart2 = seconds(); 66 | for (int i_repeat=0; i_repeat < N_repeat; i_repeat++){ 67 | 68 | // guess initial centers 69 | for (k=0; k 0) //avoid divide-by-zero error 131 | // sum -> mean 132 | old_cluster_centers[k][j] = new_cluster_centers[k][j] / cluster_sizes[k]; 133 | 134 | new_cluster_centers[k][j] = 0.0;//for the next iteration 135 | } 136 | cluster_sizes[k] = 0;//for the next iteration 137 | } // end of M-Step second half 138 | iElaps3c += (seconds()-iStart3c); 139 | 140 | } while( i_iter==1 || ((dist_sum_old - dist_sum_new > TOL)&&i_iter TOL) 33 | 34 | 35 | -------------------------------------------------------------------------------- /Parallel_Algorithm/OpenMP/test_multithread.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | thread_list='1 2 4 8 16 32 64' 4 | 5 | for thread in $thread_list 6 | do 7 | echo " " 8 | echo ========================================= 9 | echo ========================================= 10 | echo testing with $thread threads 11 | export OMP_NUM_THREADS=$thread 12 | ./Kmean_omp.out 13 | done 14 | -------------------------------------------------------------------------------- /Parallel_Algorithm/README: -------------------------------------------------------------------------------- 1 | C code for the parallel k-mean clustering algorithm. 2 | Doesn't depend on to any specific data set. 
3 | 4 | Include: 5 | 6 | 1) Homogeneous Parallel Environment 7 | Pure OpenMP 8 | Pure MPI 9 | Pure CUDA (single GPU) 10 | 11 | 2) Heterogenous Parallel Environment 12 | Hybrid OpenMP+MPI 13 | CUDA with multi-GPU support 14 | Hybrid CUDA+MPI 15 | -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/Apply_Kmean.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | import xarray as xr 6 | from netCDF4 import Dataset 7 | from timeit import default_timer as timer 8 | from sklearn.cluster import KMeans 9 | 10 | dirname = "../test_data/" 11 | filename = "Blobs_smp20000_fea30_cls8.nc" 12 | 13 | # read data from nc file 14 | start1 = timer() 15 | with xr.open_dataset(dirname+filename) as ds: 16 | n_clusters = ds.dims["N_clusters"] 17 | n_features = ds.dims["N_features"] 18 | n_repeat = ds.dims["N_repeat"] 19 | X = ds["X"].values 20 | GUESS = ds["GUESS"].values 21 | del ds 22 | 23 | elapse1 = timer()-start1 24 | 25 | # apply Kmeans 26 | start2 = timer() 27 | inert_best = np.inf 28 | for i_repeat in range(n_repeat): 29 | # manually guess initial clusters (to compare with C) 30 | initial_idx = GUESS[i_repeat,:] 31 | initial_position = X[initial_idx,:] 32 | kmeans = KMeans(n_clusters=n_clusters,n_init=1,init=initial_position, 33 | algorithm='full',tol=1e-4) 34 | kmeans.fit(X) 35 | 36 | if kmeans.inertia_ < inert_best: 37 | inert_best = kmeans.inertia_ 38 | y_kmeans = kmeans.labels_ 39 | 40 | elapse2 = timer()-start2 41 | 42 | # write results back 43 | with Dataset(dirname+filename,mode='r+') as dset: 44 | dset["Y_Py"][:] = y_kmeans 45 | dset["INERT_Py"][:] = inert_best 46 | 47 | # summary 48 | print("final inertia:",inert_best) 49 | print("Kmean time use (ms):",elapse2*1e3) 50 | print("I/O time use (ms):",elapse1*1e3) -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/IO_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xarray as xr 3 | 4 | def Raw_to_NetCDF(X,ind,filename,y_true=None,feature_names=None): 5 | 6 | N_samples,N_features = X.shape 7 | label_zero = np.zeros(N_samples,dtype=np.int32) 8 | if feature_names is None: 9 | feature_names = np.arange(N_features,dtype=np.int32) 10 | if y_true is None: 11 | y_true = label_zero 12 | 13 | ds = xr.Dataset() 14 | ds['X'] = (['N_samples', 'N_features'], np.float32(X) ) 15 | ds['X'].attrs["long_name"]="data points" 16 | 17 | ds['GUESS'] = (['N_repeat', 'N_clusters'], ind) 18 | ds['GUESS'].attrs["long_name"]="indices of data points as initial guess of cluster centers" 19 | ds['GUESS'].attrs["purpose"]="make sure that C and python use the same initial starting points" 20 | 21 | ds['Y_TRUE']=(['N_samples'], np.int32(y_true) ) 22 | ds['Y_TRUE'].attrs["long_name"]="(optional) true label of each data point" 23 | 24 | ds['Y_Py']=(['N_samples'], label_zero) 25 | ds['Y_Py'].attrs["long_name"]="labels predicted by python Kmean function" 26 | 27 | ds['Y_C']=(['N_samples'], label_zero) 28 | ds['Y_C'].attrs["long_name"] = "labels predicted by C implementation" 29 | ds['Y_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python" 30 | 31 | ds['INERT_Py'] = np.float32(0.0) 32 | ds['INERT_Py'].attrs["long_name"] = "kmeans.inertia_ in python code, "+\ 33 | "i.e. 
sum of distances between data points and cluster centers" 34 | 35 | ds['INERT_C'] = np.float32(0.0) 36 | ds['INERT_C'].attrs["long_name"] = "the C version of kmeans.inertia_" 37 | ds['INERT_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python" 38 | 39 | ds['FEATURES']=(['N_features'], feature_names) 40 | ds['FEATURES'].attrs["long_name"] = "(optional) the meaning of each feature" 41 | 42 | ds.to_netcdf(filename) 43 | ds.close() 44 | -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/Kmean_iris.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Apr 4 17:11:36 2017 5 | 6 | @author: desnow 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | import xarray as xr 12 | 13 | from sklearn.cluster import KMeans 14 | from sklearn.datasets import load_iris 15 | 16 | # load iris data set from sklearn package 17 | iris = load_iris() 18 | 19 | # extract the data to numpy array 20 | X = np.float32(iris['data']) 21 | y_true = np.int32(iris['target']) 22 | 23 | ''' 24 | # convert to pandas and print 25 | df_iris = pd.DataFrame(data= np.c_[iris['target'],iris['data']], 26 | columns= ['target']+iris['feature_names'] ) 27 | print(df_iris.head()) 28 | ''' 29 | 30 | 31 | # apply K-mean 32 | kmeans = KMeans(n_clusters=3,n_init=10,init='random', 33 | algorithm='full',tol=1e-2) 34 | kmeans.fit(X) 35 | y_kmeans = kmeans.labels_ 36 | #y_kmeans = kmeans.predict(X) # this can predict new points 37 | print("final inertia:",kmeans.inertia_) 38 | 39 | ''' 40 | # print results 41 | for i in range(len(y_kmeans)): 42 | print(i+1,y_kmeans[i],y_true[i]) 43 | ''' 44 | 45 | # convert to xarray Dataset and write into nc files. 
46 | ds = xr.Dataset({'X': (['N_samples', 'N_features'], X), 47 | 'y_true': (['N_samples'], y_true), 48 | 'y_kmeans_python': (['N_samples'], y_kmeans), 49 | 'inertia_python': np.float32(kmeans.inertia_) , 50 | 'y_kmeans_C': (['N_samples'], np.zeros_like(y_kmeans)), 51 | 'inertia_C': np.float32(0.0) 52 | }, 53 | coords={'samples': (['N_samples'],np.arange(y_true.size,dtype=np.float32)+1), 54 | 'features': (['N_features'], iris['feature_names'])} 55 | ) 56 | ds.to_netcdf('../test_data/iris_data_Kmean.nc') 57 | -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/check_SSWdata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from IO_util import Raw_to_NetCDF 4 | import xarray as xr 5 | 6 | dirname = '../../Data_Analysis/data/' 7 | filename='SSWdata.nc' 8 | 9 | ds = xr.open_dataset(dirname+filename) 10 | 11 | print('total data size',ds["Y_TRUE"].size) 12 | print('size of 2nd cluster by MATLAB',ds["Y_TRUE"].sum()) 13 | print('size of 2nd cluster by C',ds["Y_C"].sum()) 14 | 15 | mismatch = (ds["Y_TRUE"].values != ds["Y_C"].values) 16 | print("inconsistent labels: ",mismatch.sum()) 17 | 18 | #ds.close() 19 | -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/check_results.py: -------------------------------------------------------------------------------- 1 | import xarray as xr 2 | 3 | dirname = "../test_data/" 4 | filename = "Blobs_smp20000_fea30_cls8.nc" 5 | 6 | with xr.open_dataset(dirname+filename) as ds: 7 | mismatch = (ds["Y_Py"].values != ds["Y_C"].values) 8 | 9 | print("total number of samples: ",mismatch.size) 10 | print("inconsistent labels: ",mismatch.sum()) 11 | -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/convert_SSWdata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | from IO_util import Raw_to_NetCDF 4 | 5 | ndata = 17878 6 | nfeatures = 252 7 | 8 | dirname = '../../Data_Analysis/data/' 9 | # Read data points 10 | file1=open(dirname+'SSWdata.bin','rb') 11 | X=np.fromfile(file1) 12 | if sys.byteorder=='little': 13 | X.byteswap(True) 14 | X=X.reshape(ndata,nfeatures) 15 | 16 | # Read python label 17 | file1=open(dirname+'Label_py.bin','rb') 18 | Y_py=np.fromfile(file1,np.int32) 19 | if sys.byteorder=='little': 20 | Y_py.byteswap(True) 21 | 22 | # Read matlab label 23 | file1=open(dirname+'Label_matlab.bin','rb') 24 | Y_matlab=np.fromfile(file1,np.int32) 25 | Y_matlab -= 1 # 1~2 to 0~1 26 | 27 | # ======================== 28 | # convert the NetCDF format 29 | # ======================== 30 | N_clusters = 2 31 | N_samples = ndata 32 | N_features = nfeatures 33 | N_repeat = 20 34 | 35 | initial_ind = np.zeros([N_repeat,N_clusters],dtype=np.int32) 36 | for i in range(N_repeat): 37 | initial_ind[i,:] = np.random.choice(np.arange(N_samples), 38 | N_clusters,replace=False) 39 | 40 | filename='SSWdata.nc' 41 | Raw_to_NetCDF(X,initial_ind,dirname+filename,y_true=Y_matlab) 42 | 43 | -------------------------------------------------------------------------------- /Parallel_Algorithm/python_reference/make_fake_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | from sklearn.datasets.samples_generator import make_blobs 6 
| from IO_util import Raw_to_NetCDF 7 | 8 | N_clusters = 8 9 | N_samples = 20000 10 | N_features = 30 11 | N_repeat = 20 12 | 13 | X, y = make_blobs(n_samples=N_samples, centers=N_clusters, 14 | n_features=N_features,random_state=0, 15 | cluster_std=1.0) 16 | 17 | initial_ind = np.zeros([N_repeat,N_clusters],dtype=np.int32) 18 | 19 | for i in range(N_repeat): 20 | initial_ind[i,:] = np.random.choice(np.arange(N_samples), 21 | N_clusters,replace=False) 22 | 23 | dirname = "../test_data/" 24 | filename = "Blobs_smp{0}_fea{1}_cls{2}.nc".format(N_samples,N_features,N_clusters) 25 | 26 | Raw_to_NetCDF(X,initial_ind,dirname+filename,y_true=y) -------------------------------------------------------------------------------- /Parallel_Algorithm/shared/make_2D_array.c: -------------------------------------------------------------------------------- 1 | #include //for malloc 2 | #include "make_2D_array.h" 3 | 4 | /* For dynamically allocating 2D array in pure-C environment. 5 | Unlke in HW2, the array here is contagious! 6 | See: 7 | http://stackoverflow.com/questions/33794657/how-to-pass-a-2d-array-to-a-function-in-c-when-the-array-is-formatted-like-this 8 | http://stackoverflow.com/questions/5901476/sending-and-receiving-2d-array-over-mpi 9 | */ 10 | float** Make2DFloatArray(int rows, int cols) { 11 | float *data = (float *)malloc(rows*cols*sizeof(float)); 12 | float **array= (float **)malloc(rows*sizeof(float*)); 13 | for (int i=0; i 2 | #include "math_util.h" 3 | 4 | // square of the distance between x1[N_features] and x2[N_features] 5 | float distance(int N_features,float *x1,float *x2){ 6 | float dist=0.0; 7 | for (int j=0; j 2 | #include 3 | #include 4 | #include "make_2D_array.h" 5 | #include "ncdf_util.h" 6 | // including at last leads to "error: unknown type name ‘size_t’" 7 | // no idea why? 8 | 9 | /* Handle errors by printing an error message and exiting with a 10 | * non-zero status. */ 11 | #define ERRCODE 2 12 | #define ERR(e) {printf("Error: %s\n", nc_strerror(e)); exit(ERRCODE);} 13 | 14 | /* Read the input data from NetCDF file. 15 | * Dynamically allocate the array based on the data size. 16 | * 17 | * Why need 3-levels of pointers: 18 | * The first two levels are for 2D dynamic array, 19 | * the last level is for modifying function arguments in place. 20 | * (need to pass the address) 21 | */ 22 | int readX(char* FILE_NAME, float*** p_X,int*** p_GUESS, 23 | int* p_N_samples,int* p_N_features, 24 | int* p_N_clusters,int* p_N_repeat ) { 25 | int ncid, varid,dimid; 26 | int retval; 27 | size_t N_temp; 28 | 29 | printf("reading data \n"); 30 | 31 | /* Open the file. 
NC_NOWRITE tells netCDF we want read-only access 32 | * to the file.*/ 33 | if ((retval = nc_open(FILE_NAME, NC_NOWRITE, &ncid))) 34 | ERR(retval); 35 | 36 | /* Get the size of the data for dynamical allocation*/ 37 | nc_inq_dimid(ncid,"N_samples",&dimid); 38 | nc_inq_dimlen(ncid,dimid,&N_temp); 39 | *p_N_samples = (int)N_temp; 40 | printf("Number of samples: %d \n",*p_N_samples); 41 | 42 | nc_inq_dimid(ncid,"N_features",&dimid); 43 | nc_inq_dimlen(ncid,dimid,&N_temp); 44 | *p_N_features = (int)N_temp; 45 | printf("Number of features: %d \n",*p_N_features); 46 | 47 | nc_inq_dimid(ncid,"N_clusters",&dimid); 48 | nc_inq_dimlen(ncid,dimid,&N_temp); 49 | *p_N_clusters = (int)N_temp; 50 | printf("Number of clusters: %d \n",*p_N_clusters); 51 | 52 | nc_inq_dimid(ncid,"N_repeat",&dimid); 53 | nc_inq_dimlen(ncid,dimid,&N_temp); 54 | *p_N_repeat = (int)N_temp; 55 | printf("Number of repeated runs: %d \n",*p_N_repeat); 56 | 57 | /* Get the varid of the data variable, based on its name. */ 58 | if ((retval = nc_inq_varid(ncid, "X", &varid))) 59 | ERR(retval); 60 | /* Read the data. */ 61 | *p_X = Make2DFloatArray(*p_N_samples,*p_N_features); 62 | if ((retval = nc_get_var_float(ncid, varid, (*p_X)[0]))) 63 | ERR(retval); 64 | 65 | /* Initial Guess*/ 66 | if ((retval = nc_inq_varid(ncid, "GUESS", &varid))) 67 | ERR(retval); 68 | *p_GUESS = Make2DIntArray(*p_N_repeat,*p_N_clusters); 69 | if ((retval = nc_get_var_int(ncid, varid, (*p_GUESS)[0]))) 70 | ERR(retval); 71 | 72 | /*close the netcdf file*/ 73 | if ((retval = nc_close(ncid) )) 74 | ERR(retval); 75 | 76 | printf("=====reading data finished======\n"); 77 | 78 | return 0; 79 | } 80 | 81 | int writeY(char* FILE_NAME, int* labels, float inert) { 82 | int ncid, varid; 83 | int retval; 84 | 85 | if ((retval = nc_open(FILE_NAME, NC_WRITE, &ncid))) 86 | ERR(retval); 87 | 88 | if ((retval = nc_inq_varid(ncid, "INERT_C", &varid))) 89 | ERR(retval) 90 | if ((retval = nc_put_var_float(ncid, varid, &inert ))) 91 | ERR(retval); 92 | 93 | if ((retval = nc_inq_varid(ncid, "Y_C", &varid))) 94 | ERR(retval) 95 | if ((retval = nc_put_var_int(ncid, varid, labels ))) 96 | ERR(retval); 97 | 98 | /*close the netcdf file*/ 99 | if ((retval = nc_close(ncid) )) 100 | ERR(retval); 101 | 102 | printf("=====writting data finished======\n"); 103 | 104 | return 0; 105 | } 106 | -------------------------------------------------------------------------------- /Parallel_Algorithm/shared/ncdf_util.h: -------------------------------------------------------------------------------- 1 | #ifndef NCDF_UTIL_H 2 | #define NCDF_UTIL_H 3 | 4 | int readX(char* FILE_NAME, float*** p_X,int*** p_GUESS, 5 | int* p_N_samples,int* p_N_features, 6 | int* p_N_clusters,int* p_N_repeat ); 7 | 8 | int writeY(char* FILE_NAME, int* labels, float inert); 9 | 10 | #endif // NCDF_UTIL_H 11 | -------------------------------------------------------------------------------- /Parallel_Algorithm/shared/timing.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | inline double seconds() 5 | { 6 | struct timeval tp; 7 | int i = gettimeofday(&tp, NULL); 8 | return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6); 9 | } 10 | -------------------------------------------------------------------------------- /Parallel_Algorithm/test_data/.gitignore: -------------------------------------------------------------------------------- 1 | *.nc 2 | -------------------------------------------------------------------------------- 
/Parallel_Algorithm/test_data/README: --------------------------------------------------------------------------------
test data in NetCDF format

Data can be generated by the python scripts.
This Git repo contains no actual data (to keep its size small).
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# Table of Contents
* [Introduction](#introduction)
* [Parallel Kmeans Algorithms](#parallel-kmeans-algorithms)
  * [OpenMP, MPI and hybrid MPI-OpenMP parallelization](#openmp-mpi-and-hybrid-mpi-openmp-parallelization)
  * [Advanced Feature: CUDA](#advanced-feature-cuda)
* [Applications](#applications)
  * [Forest Cover Type Classification](#forest-cover-type-classification)
  * [Advanced Feature: Abnormal Climate Events Identification](#advanced-feature-abnormal-climate-events-identification)
* [Discussion](#discussion)
* [Computational Platforms and Software Libraries](#computational-platforms-and-software-libraries)
* [References](#references)

---
# Introduction
K-means clustering is a simple and scalable clustering method, which partitions observations into k clusters in an objective manner. It has very broad applications, such as image segmentation, retail product classification (Kusrini, 2015), and environmental problems like greenhouse gas emissions (Kijewska and Bluszcz, 2015). K-means clustering can also be used in combination with other advanced methodologies. For example, it has been used with support vector machines (SVM) to perform automatic classification of unlabeled data (Li et al., 2004). It can also serve as a preprocessing step, such as the initialization of a hidden Markov model (HMM) (Perrone and Connell, 2000; Hu and Zanibbi, 2011). Its extensive applications and low computational complexity make k-means clustering one of the most popular methods today.

Finding the minimum of a k-means cost function is an NP-hard problem when the dimension d>1 and the number of clusters k>1. Scientists have come up with several heuristic methods to find a local minimum, but the process is still computationally intensive, especially for large datasets with high-dimensional features. Therefore, we want to implement a parallel version of a k-means heuristic method on a cluster of machines, to significantly speed up the algorithm without sacrificing its accuracy.

A typical approach for k-means clustering is Expectation-Maximization (E-M). The E-step assigns points to the nearest cluster center, while the M-step sets the cluster centers to the mean. Below is an animation demonstrating the K-means algorithm, based on a wonderful [K-means visualization made by Naftali Harris](https://www.naftaliharris.com/blog/visualizing-k-means-clustering/).

*[animation: K-means E-M iterations (Other_Image/Kmean_illustration/Kmeans.gif)]*

The pseudo C-code for this algorithm is shown below, which is an abbreviated version of our real code.

*[image: pseudo C-code of the E-M loop (Other_Image/pseudo.png)]*
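In text form, the same E-M loop reads as follows (condensed from [pseudo.c](Parallel_Algorithm/OpenMP/pseudo.c); initialization, the repeated-runs loop, and timing are omitted):

```c
// E-M iterations until the decrease of the total distance falls below TOL
dist_sum_new = FLT_MAX;  // so the loop always runs at least twice
do {
    dist_sum_old = dist_sum_new;
    dist_sum_new = 0.0;

    // E-Step: assign each point to the nearest cluster center
    for (i = 0; i < N_samples; i++) {
        dist_min = FLT_MAX;
        for (k = 0; k < N_clusters; k++) {
            dist = distance(N_features, X[i], old_cluster_centers[k]);
            if (dist < dist_min) { dist_min = dist; k_best = k; }
        }
        labels[i] = k_best;
        dist_sum_new += dist_min;

        // M-Step first half: accumulate the sum of points in each cluster
        cluster_sizes[k_best]++;
        for (j = 0; j < N_features; j++)
            new_cluster_centers[k_best][j] += X[i][j];
    }

    // M-Step second half: convert the sums to the means
    for (k = 0; k < N_clusters; k++) {
        for (j = 0; j < N_features; j++) {
            if (cluster_sizes[k] > 0)  // avoid divide-by-zero
                old_cluster_centers[k][j] = new_cluster_centers[k][j] / cluster_sizes[k];
            new_cluster_centers[k][j] = 0.0;  // reset for the next iteration
        }
        cluster_sizes[k] = 0;  // reset for the next iteration
    }
} while (dist_sum_old - dist_sum_new > TOL);
```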

X[N_samples][N_features] holds the data points. We always use i as the looping index for samples, j as the index for feature dimensions, and k as the index for clusters. This notation is consistent throughout the real code. Other variables should be self-explanatory.

---
# Parallel Kmeans Algorithms

## OpenMP, MPI and hybrid MPI-OpenMP parallelization

### OpenMP

With OpenMP parallelization, only the E-step can be directly parallelized. If the M-step is directly parallelized with OpenMP pragmas, different data points might be added to one cluster at the same time, leading to a Write-After-Write (WAW) hazard. Although it is possible to make drastic modifications to parallelize the M-step, that contradicts the basic idea of OpenMP that the serial code should stay almost untouched. Therefore, we only focus on the E-step.
[(View our OpenMP code)](Parallel_Algorithm/OpenMP/Kmean_omp.c)

Unsurprisingly, while the E-step scales well, the M-step even gets slower because of thread overheads. Although the M-step is not time-consuming in the serial case, it finally becomes the bottleneck when the number of cores gets large, as the scaling figure below shows.
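Concretely, the E-step parallelization amounts to one `parallel for` over the sample loop from the pseudo-code above (a minimal sketch, not the full [Kmean_omp.c](Parallel_Algorithm/OpenMP/Kmean_omp.c), which also times each step):

```c
// E-Step: samples are independent; each thread writes to disjoint
// entries of labels[], and the total distance is a reduction.
#pragma omp parallel for private(k, dist, dist_min, k_best) \
                         reduction(+:dist_sum_new)
for (i = 0; i < N_samples; i++) {
    dist_min = FLT_MAX;
    for (k = 0; k < N_clusters; k++) {
        dist = distance(N_features, X[i], old_cluster_centers[k]);
        if (dist < dist_min) { dist_min = dist; k_best = k; }
    }
    labels[i] = k_best;
    dist_sum_new += dist_min;
}
// The M-step accumulation into the shared new_cluster_centers array is
// kept outside the parallel region, avoiding the WAW hazard above.
```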

*[figure: OpenMP scaling (Timing_Results/plots/OpenMP_scaling.jpg)]*

[(View the raw timing log)](Timing_Results/log/Blobs_OpenMP.log)

Because the compute node we are testing on has only [32 physical CPUs](#computational-platforms-and-software-libraries), the performance gets lower with 64 threads due to the implicit context switching and increased overheads. The same holds for the MPI and the hybrid tests below.

### MPI

With MPI, we distribute the data points over processes (with MPI_Scatterv; MPI_Bcast shares the problem sizes), and use MPI_Allreduce to exchange information whenever needed. Thus, both the E-step and the M-step can be parallelized. [(View our MPI code)](Parallel_Algorithm/MPI/Kmean_mpi.c)

This time, we get speed-up in both steps, so the overall scaling is better than OpenMP.
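The communication pattern is one scatter at start-up plus a few global sums per iteration; a sketch (cf. [Kmean_mpi.c](Parallel_Algorithm/MPI/Kmean_mpi.c); `sendcounts`/`displs` are computed on the master so that N_samples need not be divisible by the number of processes):

```c
/* Start-up: distribute the rows of X over processes */
MPI_Scatterv(*X_all, sendcounts, displs, MPI_FLOAT,
             *X, N_samples*N_features, MPI_FLOAT,
             0, MPI_COMM_WORLD);

/* Every iteration: each process runs the E-step and the M-step first
   half on its local points; the partial sums are combined in place */
MPI_Allreduce(MPI_IN_PLACE, *new_cluster_centers, N_clusters*N_features,
              MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(MPI_IN_PLACE, cluster_sizes, N_clusters,
              MPI_INT, MPI_SUM, MPI_COMM_WORLD);
MPI_Allreduce(MPI_IN_PLACE, &dist_sum_new, 1,
              MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
```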

*[figure: MPI scaling (Timing_Results/plots/MPI_scaling.jpg)]*

[(View the raw timing log)](Timing_Results/log/Blobs_MPI.log)

### Hybrid MPI-OpenMP

We simply add OpenMP pragmas to the MPI code to get the hybrid version. This time we have many combinations of OpenMP threads and MPI processes to test. In general, we find that the speed-up depends on the product of the number of OpenMP threads (n_omp hereinafter) and the number of MPI processes (N_MPI hereinafter):

*[figure: hybrid MPI-OpenMP scaling (Timing_Results/plots/hybrid_scaling.jpg)]*
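For reference, each (N_MPI, n_omp) combination is launched along these lines (a sketch in the spirit of [test_hybrid.sh](Parallel_Algorithm/MPI/test_hybrid.sh); the binary name is illustrative):

```bash
# e.g. 8 MPI processes x 4 OpenMP threads = 32 cores
export OMP_NUM_THREADS=4
mpirun -np 8 ./Kmean_mpi.out
```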

[(View the raw timing log)](Timing_Results/log/Blobs_hybrid.log)

Interestingly, for N_MPI*n_omp=32 we have tested 4 cases, (N_MPI,n_omp) = (32,1), (16,2), (8,4) and (4,8), and all of them have almost the same speed.
[(see the exact time use in the last cell)](https://github.com/JiaweiZhuang/CS205_final_project/blob/master/Timing_Results/plot_timing.ipynb)

### Improvements over Previous Works

Our main reference for the OpenMP & MPI Kmean algorithm is Bisgin 2008, along with their [public code](http://www.ece.northwestern.edu/~wkliao/Kmeans/index.html). We have made a couple of improvements over their original algorithm:
* Their algorithm always uses the first N_cluster data points as the initial cluster centers, which can be inefficient and might not find the global minimum. This is understandable because generating random indices in C is not straightforward. To facilitate random initialization and to make a fair comparison with Python's sklearn.cluster.KMeans, we built a seamless interface between C and Python using the [NetCDF library](#the-netcdf4-library-for-data-io). Random initial centers are generated by Python and written into a file; then both the Python and the C versions use the same starting points from that file. This ensures the same amount of computation for Python and C, as well as for the parallel C version at different numbers of cores. By using the same initial condition, we have confirmed that our serial C version has essentially the same speed as Python's sklearn.cluster.KMeans, and all our parallel versions show significant speed-up.
* In the OpenMP version, they use "atomic" operations in the M-step to avoid data races. However, in our tests, atomic operations significantly slow down the M-step, which more than offsets the speed-up of the E-step. To cope with this issue, we refactored the M-step and moved it out of the OpenMP parallel region, making the overall scalability much better.
* In the MPI version, their original code is unnecessarily redundant. By making use of MPI_IN_PLACE, we avoid duplicating variables for MPI function calls such as MPI_Allreduce. We also use MPI_Scatterv, a more flexible version of MPI_Scatter, to allow different CPUs to hold different numbers of data points. This allows us to use any number of CPUs for any number of data points.
* We also implemented an option to use correlation as the measure of "distance". It improves the clustering results in the [SSW study](#advanced-feature-abnormal-climate-events-identification) later in this page.

## Advanced Feature: CUDA

Given the massive potential of parallelism on GPUs, we implemented a parallel version of the k-means algorithm using the Nvidia CUDA library. In our implementation, we parallelize the E-step by distributing the nearest-distance computations over blocks on the device. We also use reduction to help check the convergence of clustering (see the "reduce" function). We decided not to parallelize the M-step (which here would also mean using reduction), because once the time for data transfer between device and host, a huge burden, is included, the parallel version has no outstanding advantage over the serial M-step. Similar to the OpenMP version, our focus is also on the E-step.
[(View our CUDA code)](Parallel_Algorithm/Cuda/kmeans_cdf.cu)

Generally, we see that the timing and scaling are quite promising when the number of threads per block is up to 32, which is also the warp size. The "other" portion is no doubt the data transfer between device and host, and it is an even more severe bottleneck than the serial M-step. We could definitely improve this with better I/O hardware, i.e. using an SSD instead of an EBS volume for the EC2 instance, and by optimizing memory access, e.g. using shared memory where possible and coalescing memory operations. Also, note that compared to the OpenMP/MPI versions, the E-step time using CUDA is significantly shorter.
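The heart of the E-step is one CUDA thread per data point; abridged from our `nearestCentroid` kernel (the shared-memory bookkeeping that counts label changes for the convergence check is elided):

```c
__global__ static void nearestCentroid(int *blockResult, int *gBelongs,
                                       float *gPoints, float *gCentroids,
                                       Info *gInfo) {
    int pointId = blockDim.x * blockIdx.x + threadIdx.x;
    if (pointId >= gInfo->numPoints) return;

    // distance to centroid 0 as the starting minimum
    float mDist = computeDist(gInfo, pointId, 0, 0, gPoints, gCentroids);
    int tmpIdx = 0;
    for (int i = 1; i < gInfo->numCentroids; i++) {
        float tmpDist = computeDist(gInfo, pointId, i, 0, gPoints, gCentroids);
        if (tmpDist < mDist) { mDist = tmpDist; tmpIdx = i; }
    }
    // ...shared-memory reduction over this block counts changed labels...
    gBelongs[pointId] = tmpIdx;
}

// launch: l1 blocks of bthreads threads, sdsize2 bytes of shared memory
nearestCentroid<<<l1, bthreads, sdsize2>>>(tmp, gBelongs, gPoints, gCentroids, gInfo);
cudaDeviceSynchronize();
```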

*[figure: CUDA scaling for different threads-per-block (Timing_Results/plots/Cuda_scaling.jpg)]*

The weird bump as the number of threads per block goes up to 64 is because we run out of shared memory, but we are not sure why it affects the M-step so much.

For optimization, we currently use parallel reduction to speed up the convergence check, and a matrix transpose to improve memory-access locality, since the number of points is significantly larger than the number of features. We have not tried deploying this version on multiple GPUs, because online documentation is scarce and a single Tesla K80 GPU already has enough capacity (4992 cores, 26 SMs, 2048 threads per SM) to parallelize our computation.

---
# Applications
## Forest Cover Type Classification
In this section, we utilize k-means to perform forest cover type classification with cartographic variables only. Our dataset is obtained from the UCI KDD archive, and was originally derived from US Forest Service (USFS) Region 2 Resource Information System data and US Geological Survey (USGS) data. The study area is in the Roosevelt National Forest of northern Colorado, a forest region with minimal human disturbance that therefore mostly evolves through natural processes.

*[figures: study area map and forest cover type distribution (Data_Analysis/covertype_cluster/figures/)]*

It is a fairly large dataset that contains 581012 observations and 54 features, including both numerical and categorical features. The attributes include elevation, slope, horizontal distance to hydrology, vertical distance to hydrology, etc. The dataset is already labeled 1-7, representing 7 different forest cover types: Spruce/Fir, Lodgepole Pine, Ponderosa Pine, Cottonwood/Willow, Aspen, Douglas-fir, and Krummholz. Our goal is to implement a k-means based classification method, and to show that besides basic clustering problems, k-means has broad usage in various data science problems.

Our first step is to normalize the feature values, since some of them are in single digits whereas some are in the thousands. Then we split the whole dataset into a training set and a testing set with the ratio 7:3. Since the dataset is quite unbalanced, our strategy is to randomly pick the desired percentage of entries out of each category, and then join them into a final sampled dataset. We use this method every time we need a dataset split in this problem.

We then split the training set into a sub-training set and a validation set with the ratio 8:2. The testing set obtained above stays untouched until the final test. Our selection process for the best K is as follows (a code sketch follows the list):
1. Cluster the sub-training set using k-means with a given K.
2. Pick the 7 purest clusters corresponding to the 7 labels respectively to be the standard clusters. Compute the 7 cluster centers for use.
3. For each data point in the validation set, assign the point to the cluster with the closest cluster center.
4. Calculate the average classification accuracy of the validation set.
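In scikit-learn terms, the selection loop looks roughly like this (a sketch; `purest_centers` is a hypothetical helper standing in for step 2, and our actual scripts live under Data_Analysis/covertype_cluster):

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X: (581012, 54) feature matrix, y: labels 1-7 (assumed already loaded)
X = StandardScaler().fit_transform(X)  # normalize the features

# stratified splits keep the unbalanced classes in proportion
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
X_sub, X_val, y_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

def accuracy_for_K(K):
    km = KMeans(n_clusters=K).fit(X_sub)
    # step 2: keep the purest cluster per class (hypothetical helper)
    centers, center_labels = purest_centers(km, y_sub)
    # steps 3-4: nearest-center assignment on the validation set
    dists = ((X_val[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1)
    pred = center_labels[np.argmin(dists, axis=1)]
    return (pred == y_val).mean()

best_K = max(range(7, 31), key=accuracy_for_K)  # 23 in our runs
```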

*[figure: validation accuracy for different K (Data_Analysis/covertype_cluster/figures/accuracy_graph.png)]*

We choose K from 7 to 30, repeat the above steps, and find that 23 is the best number of clusters. Finally we perform k-means on the whole training set to get the 7 centers, and test on the testing set. The final classification accuracy is around 30%.

*[figures: 2-D PCA projection colored by true labels and by predicted labels (Data_Analysis/covertype_cluster/figures/vis_label.png, vis_pred.png)]*

The classification accuracy is not very high, so we take a further look at the dataset. It is hard to visualize the dataset directly due to its high feature dimension, so we first apply PCA for dimension reduction and then plot the scatter graph of the first two principal components. We choose 10 percent of the testing samples, and color-code the points using the true labels (the first graph) and the predicted labels (the second graph). Now we can see that the original data points are actually mixed on the 2D projection. Our k-means algorithm does a good clustering job, since the lumps are separated well in color. Therefore, for this problem, more complicated algorithms such as artificial neural networks would do a better job (Blackard and Dean, 1999), with classification accuracy around 70%. But our result is already much better than random classification, which only achieves an accuracy of around 14%.

## Advanced Feature: Abnormal Climate Events Identification
In this section, we explore the application of the k-means clustering technique to identifying abnormal climate events. Abnormal climate events are usually identified when a highly simplified index exceeds an arbitrary threshold. For example, El Nino events are identified if the Nino 3.4 index exceeds the threshold of 0.5°C. This simple criterion works in some cases; however, there are two caveats associated with this methodology. First, the highly simplified index may not capture all the main dynamic aspects well. Second, setting an arbitrary threshold makes it a subjective way of identifying abnormal events.

K-means clustering serves as a powerful technique for dealing with those caveats. First, instead of using a highly simplified index, a high-dimensional feature vector characterizing the event from multiple dynamical aspects can be utilized; in addition, k-means clustering is highly scalable to large datasets, such as those from simulations. Second, k-means clustering is able to identify different states in a completely objective manner, with no preconceived notion of the groups and no preselection on the basis of known influencing factors (Coughlin and Gray, 2009). Third, the k-means clustering technique is especially useful for detecting abnormal events, because it easily allows for unevenly distributed clusters, whereas some other techniques, such as hierarchical clustering, tend to produce clusters of similar sizes.

The abnormal climate event we would like to explore is called sudden stratospheric warming (SSW), which happens occasionally in the stratosphere near the North Pole during winter. It is important to understand these events because they usually precede extreme weather in the troposphere by about a month, and thus have the potential to serve as a forecasting tool.

---
> ### What is Sudden Stratospheric Warming (SSW)?
> During Northern hemisphere winter, because of the very cold temperature at the pole, the climatological zonal winds in the stratosphere are generally westerly and their strength increases with height. These winds can form a very persistent "polar night jet" vortex, as shown in Fig (a). However, at times this zonal-mean configuration can be dramatically disturbed, as shown in Fig (b) and Fig (c), with the vortex being displaced or split. At the same time, the stratosphere near the pole experiences sudden warming, with the latitudinal temperature gradient and the zonal-mean winds at the pole being reversed.

*[figure: the polar vortex in its (a) normal, (b) displaced, and (c) split states]*

---

### Data and Measure of Distance
Our analysis is based on the daily output from a 49-year all-winter simulation, which gives us more than 17,000 samples. The daily data is pre-processed to get: averaged temperatures at three latitudes and their tendencies over time; the latitudinal temperature gradient and its tendency; averaged zonal winds at two latitudes and their tendencies; and the wave-number one and two components of geopotential height. Temperatures are averaged over 60°N to 70°N, 70°N to 80°N, and 80°N to 90°N, while zonal winds are averaged over 60°N to 70°N and 70°N to 80°N. Tendencies are calculated as the differences between the current value and the value 7 days before. Altogether, there are 252 features per sample: 14 features per vertical level, with 18 levels in total across the stratosphere.

Because a sample includes different types of features, such as temperature, velocity and length scale, we need to choose the measure of distance carefully. Here, we choose 1-corr(x1,x2) as the measure of distance, because we consider two patterns to be close to each other if they are highly correlated (a sketch of this distance is given below).

### Results
We have tested the number of clusters from 2 to 4, and use the Silhouette score to evaluate the result of clustering. Two clusters give the highest averaged score of 0.65, as shown in Fig 1, while the averaged score for three clusters is 0.47, and for four clusters 0.42. Therefore, we think two clusters are separated well by k-means clustering.
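A minimal sketch of that correlation distance, as a drop-in alternative to the squared-Euclidean `distance()` in [math_util.c](Parallel_Algorithm/shared/math_util.c) (the function name here is illustrative):

```c
#include <math.h>

// 1 - Pearson correlation between two feature vectors
float corr_distance(int N_features, const float *x1, const float *x2) {
    float m1 = 0.0f, m2 = 0.0f;
    for (int j = 0; j < N_features; j++) { m1 += x1[j]; m2 += x2[j]; }
    m1 /= N_features; m2 /= N_features;

    float cov = 0.0f, var1 = 0.0f, var2 = 0.0f;
    for (int j = 0; j < N_features; j++) {
        cov  += (x1[j] - m1) * (x2[j] - m2);
        var1 += (x1[j] - m1) * (x1[j] - m1);
        var2 += (x2[j] - m2) * (x2[j] - m2);
    }
    return 1.0f - cov / sqrtf(var1 * var2);  // in [0,2]; 0 = perfectly correlated
}
```

The Silhouette score used here is a single call in scikit-learn (`X` and `labels` being the feature matrix and the cluster assignments; `metric="correlation"` matches the distance above):

```python
from sklearn.metrics import silhouette_score

avg_score = silhouette_score(X, labels, metric="correlation")  # what Fig 1 reports
```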

*[figure]*

Figure 1: Silhouette score

---
> #### Silhouette Score
> The Silhouette coefficient for a sample is defined as (*b*-*a*)/max(*a*,*b*), where *a* is the mean intra-cluster distance, and *b* is the mean distance between the sample and the points of the nearest cluster that the sample is not a part of. The Silhouette coefficient measures the ratio between averaged intra-cluster similarity and cross-cluster similarity. Coefficients near +1 indicate that the sample is far away from the neighboring clusters. A value of 0 indicates that the sample is on or very close to the decision boundary between two neighboring clusters, and negative values indicate that the sample might have been assigned to the wrong cluster. Therefore the Silhouette coefficient can be a useful metric to evaluate clustering results.

---

Fig 2 shows the temperature anomaly over the stratosphere for both clusters. The second cluster shows a substantial warming (more than 10K) compared to the first cluster.

*[figure]*

Figure 2: Averaged temperature anomaly for each cluster

The vortex structure shown in Fig 3 is consistent with the temperature anomaly. For normal events, the polar vortex is centered at the pole, while the vortex is displaced during the abnormal warming periods. These results are consistent with the findings of Coughlin and Gray (2009), in which the analysis is based on observational data.

*[figure]*

Figure 3: Averaged potential vorticity for each cluster

Furthermore, we are also interested in distinguishing different types of abnormal events, because they may be caused by different mechanisms and may have different effects on future weather patterns. Therefore, we further cluster the abnormal events based on the amplitude and phase angle of the wave-number one and two geopotential height. We find that the vortex can be displaced toward different longitudes, or even be split into two vortices.

*[figure]*

Figure 4: Averaged potential vorticity for each sub-cluster

---
# Discussion

Although the k-means clustering algorithm is fast and simple, it has its own limitations compared to other, more complicated algorithms. First of all, the clustering procedure and the final clusters highly depend on the number of clusters *k*, and extra effort is needed to find an optimal *k*. Hierarchical clustering easily overcomes this obstacle: its computation is independent of the number of clusters *k*, and its hierarchical structure can provide more insight for determining *k*. Secondly, k-means clustering assumes spherically distributed clusters and equal probabilities for each cluster. To overcome these caveats, Gaussian mixture models include a covariance matrix and weights in their optimization. In addition, k-means clustering works poorly on non-convex clusters, while spectral clustering does a better job in this situation.

Admittedly, while there are more complicated algorithms that work better than k-means clustering in some cases, k-means clustering is still a powerful algorithm for dealing with large datasets with high-dimensional features. For a simple clustering algorithm like k-means to perform better, more effort needs to be made to pre-process the data and to map it onto a space where the clusters are more spherically distributed. There is no single best algorithm, only the most suitable situation in which to apply an algorithm.

---
# Computational Platforms and Software Libraries

## Amazon EC2 cloud computing environment (OpenMP & MPI)

Although MPI programs typically run on local HPC facilities like Harvard's Odyssey, we found that MPI jobs at small-to-medium scales (e.g. < 64 cores) can also run very efficiently on cloud platforms like Amazon EC2. This gives us great flexibility in requesting computational resources, so that we can finish simulations very quickly without worrying about jobs pending on Odyssey.

The instance we use for the timing tests is cc2.8xlarge [(see detailed cpuinfo)](Timing_Results/info/cpu). In the Amazon console, it is said to have 64 "virtual" CPUs. However, it actually contains only 32 physical CPUs, as shown by the "lscpu" command.

We have installed various software libraries to facilitate our K-mean application. An EC2 AMI is made public so that others can also run our codes directly without installing those libraries on their own. Search for "ami-3f79ef29" or "GCC_NetCDF_MPI_Conda_04162017" in the N. Virginia region.

## Amazon EC2 cloud computing environment (CUDA)

The instance we use for the timing tests is p2.xlarge, with 1 Tesla K80 GPU and 4 "virtual" CPUs. The K80 GPU has 4992 CUDA cores, 26 SMs, and 2048 threads per SM.

## The OpenMPI library

We built OpenMPI 2.1.0 on top of the gcc 4.8.3 compiler, to get the wrapped "mpicc" compiler. The script for building this library is available [here](Build_Library/openmpi_build/install_openmpi.sh).

## The Cuda library

We use CUDA 7.5 and the nvcc compiler included in the toolkit. For convenience, there is a pre-built AMI: search for "ami-52f7b345" in the N. Virginia region.

## The NetCDF4 library for data I/O

While high-level languages like Python and Matlab can read and write data in any format very conveniently, data I/O in low-level languages such as C and Fortran can be a pain.
---
# Computational Platforms and Software Libraries

## Amazon EC2 cloud computing environment (OpenMP & MPI)

Although MPI programs typically run on local HPC facilities like Harvard's Odyssey cluster, we found that small-to-medium-scale MPI jobs (e.g. < 64 cores) can also run very efficiently on cloud platforms like Amazon EC2. This gives us great flexibility in requesting computational resources, so we can finish simulations quickly without waiting for jobs to clear the Odyssey queue.

The instance we use for the timing tests is cc2.8xlarge [(see detailed cpuinfo)](Timing_Results/info/cpu). The Amazon console reports it as having 64 "virtual" CPUs, but the "lscpu" command shows 32 logical CPUs: 16 physical cores, each with two hardware threads.

We have installed various software libraries to support our K-means application and have made the resulting EC2 AMI public, so that others can run our code directly without building those libraries themselves. Search for "ami-3f79ef29" or "GCC_NetCDF_MPI_Conda_04162017" in the N. Virginia region.

## Amazon EC2 cloud computing environment (CUDA)

The instance we use for the CUDA timing tests is p2.xlarge, with one Tesla K80 GPU and 4 "virtual" CPUs. The K80 has 4992 CUDA cores and 26 SMs in total (split across its two GPU chips), with up to 2048 resident threads per SM.

## The OpenMPI library

We built OpenMPI 2.1.0 with the gcc 4.8.3 compiler to obtain the wrapped "mpicc" compiler. The script for building this library is available [here](Build_Library/openmpi_build/install_openmpi.sh).

## The Cuda library

We use the CUDA 7.5 toolkit and the nvcc compiler it provides. For convenience, there is a pre-built AMI: search for "ami-52f7b345" in the N. Virginia region.

## The NetCDF4 library for data I/O

While high-level languages like Python and Matlab can read and write data in almost any format conveniently, data I/O in low-level languages such as C and Fortran can be a pain. We therefore use the [NetCDF4 library](https://www.unidata.ucar.edu/software/netcdf/) to facilitate data I/O. A NetCDF file can be viewed as a structured combination of numerical (binary-like) and text (ASCII-like) data: the numerical part suits the large data arrays in our application, while the text part makes the file self-describing, a significant advantage over plain binary files. All commonly used languages have NetCDF4 APIs for operating on this format.

In Python, the [xarray package](http://xarray.pydata.org/en/stable/) is a good way to handle NetCDF data. It is a higher-dimensional extension of the well-known Pandas package: while Pandas is aimed at general data science, xarray also suits many physical sciences.
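As a minimal illustration of the Python side, the sketch below writes and reads a small NetCDF file with xarray. The file name "blobs_example.nc" and the variable and dimension names are invented for the example; they are not the exact layout our C programs expect.

```python
# Minimal sketch of NetCDF I/O with xarray. "blobs_example.nc" and the
# variable name "X" are hypothetical, not our actual dataset layout.
import numpy as np
import xarray as xr

# Write a small dataset: 20000 samples x 30 features, with named dimensions.
ds = xr.Dataset({"X": (("sample", "feature"), np.random.rand(20000, 30))})
ds.to_netcdf("blobs_example.nc")

# Read it back: the file is self-describing, so no shape bookkeeping needed.
with xr.open_dataset("blobs_example.nc") as ds_in:
    X = ds_in["X"].values  # plain NumPy array, ready for scikit-learn
    print(ds_in)           # prints dimensions, variables, and attributes
```

The named dimensions and attributes are exactly the "text part" mentioned above: they travel with the data instead of living in a separate README. The C side of the same workflow is described next.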
In C, we provide a [script](Build_Library/netCDF_build/install_netCDF.sh) to install the NetCDF-C library. A single build works for several compilers, including the basic gcc compiler, the pgcc compiler for OpenACC, and the nvcc compiler for CUDA. With the NetCDF-C library, we read all the data we need, dynamically allocating memory for them, in a single function, [readX()](Parallel_Algorithm/shared/ncdf_util.c).

It is also worth mentioning that NetCDF is the standard data format for the Intergovernmental Panel on Climate Change (IPCC) reports :)

# References

Bisgin, H., Dalfes, H.N., 2008. Parallel clustering algorithms with application to climatology. *Geophysical Research Abstracts* 10.

Blackard, J.A., Dean, D.J., 1999. Comparative accuracies of artificial neural networks and discriminant analysis in predicting forest cover types from cartographic variables. *Computers and Electronics in Agriculture* 24 (1999): 131–151.

Coughlin, K., Gray, L.J., 2009. A continuum of sudden stratospheric warmings. *Journal of the Atmospheric Sciences* 66.2 (2009): 531–540.

Li, M., Cheng, Y., Zhao, H., 2004. Unlabeled data classification via support vector machines and k-means clustering. *Proceedings of the International Conference on Computer Graphics, Imaging and Visualization (CGIV'04)*.

Kijewska, A., Bluszcz, A., 2015. Research of varying levels of greenhouse gas emissions in European countries using the k-means method. *Atmospheric Pollution Research* 7 (2016): 935–944.

Kusrini, K., 2015. Grouping of retail items by using k-means clustering. *Procedia Computer Science* 72 (2015): 495–502.

Perrone, M.P., Connell, S.D., 2000. K-means clustering for hidden Markov models. *Proceedings of the Seventh International Workshop on Frontiers in Handwriting Recognition*.
--------------------------------------------------------------------------------
/Slides/CUDA_part.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/CUDA_part.key
--------------------------------------------------------------------------------
/Slides/FinalPre.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/FinalPre.key
--------------------------------------------------------------------------------
/Slides/SSW part.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/SSW part.key
--------------------------------------------------------------------------------
/Slides/covertype part.key:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Slides/covertype part.key
--------------------------------------------------------------------------------
/Timing_Results/Blobs_smp20000_fea30_cls8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/Blobs_smp20000_fea30_cls8.xlsx
--------------------------------------------------------------------------------
/Timing_Results/info/compiler:
--------------------------------------------------------------------------------
1 | [ami-3f79ef29]
2 | 
3 | gcc (GCC) 4.8.3 20140911 (Red Hat 4.8.3-9)
4 | Copyright (C) 2013 Free Software Foundation, Inc.
5 | This is free software; see the source for copying conditions. There is NO
6 | warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
7 | 8 | Open MPI 2.1.0 9 | -------------------------------------------------------------------------------- /Timing_Results/info/cpu: -------------------------------------------------------------------------------- 1 | [cc2.8xlarge] 2 | Architecture: x86_64 3 | CPU op-mode(s): 32-bit, 64-bit 4 | Byte Order: Little Endian 5 | CPU(s): 32 6 | On-line CPU(s) list: 0-31 7 | Thread(s) per core: 2 8 | Core(s) per socket: 8 9 | Socket(s): 2 10 | NUMA node(s): 1 11 | Vendor ID: GenuineIntel 12 | CPU family: 6 13 | Model: 45 14 | Model name: Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz 15 | Stepping: 7 16 | CPU MHz: 2593.961 17 | BogoMIPS: 5257.93 18 | Hypervisor vendor: Xen 19 | Virtualization type: full 20 | L1d cache: 32K 21 | L1i cache: 32K 22 | L2 cache: 256K 23 | L3 cache: 20480K 24 | NUMA node0 CPU(s): 0-31 25 | -------------------------------------------------------------------------------- /Timing_Results/log/Blobs_Cuda.log: -------------------------------------------------------------------------------- 1 | 2 | ========================================= 3 | ========================================= 4 | testing with 1 threads per block on device 5 | Number of samples: 20000 6 | Number of features: 30 7 | Number of clusters: 8 8 | Number of repeated runs: 20 9 | Total time: 783.192 10 | E-step time use (ms): 356.685 11 | M-step-1st-half time use (ms): 119.512 12 | M-step-2nd-half time use (ms): 0.067234 13 | Cuda Data IO (ms): 302.046 14 | Check Convergence (ms): 0.521421 15 | 16 | ========================================= 17 | ========================================= 18 | testing with 2 threads per block on device 19 | Number of samples: 20000 20 | Number of features: 30 21 | Number of clusters: 8 22 | Number of repeated runs: 20 23 | Total time: 636.066 24 | E-step time use (ms): 202.417 25 | M-step-1st-half time use (ms): 132.744 26 | M-step-2nd-half time use (ms): 0.0743866 27 | Cuda Data IO (ms): 294.903 28 | Check Convergence (ms): 0.592709 29 | 30 | ========================================= 31 | ========================================= 32 | testing with 4 threads per block on device 33 | Number of samples: 20000 34 | Number of features: 30 35 | Number of clusters: 8 36 | Number of repeated runs: 20 37 | Total time: 547.731 38 | E-step time use (ms): 108.101 39 | M-step-1st-half time use (ms): 138.242 40 | M-step-2nd-half time use (ms): 0.0698566 41 | Cuda Data IO (ms): 295.276 42 | Check Convergence (ms): 0.645161 43 | 44 | ========================================= 45 | ========================================= 46 | testing with 8 threads per block on device 47 | Number of samples: 20000 48 | Number of features: 30 49 | Number of clusters: 8 50 | Number of repeated runs: 20 51 | Total time: 541.216 52 | E-step time use (ms): 69.1309 53 | M-step-1st-half time use (ms): 169.611 54 | M-step-2nd-half time use (ms): 0.089407 55 | Cuda Data IO (ms): 296.227 56 | Check Convergence (ms): 0.853777 57 | 58 | ========================================= 59 | ========================================= 60 | testing with 16 threads per block on device 61 | Number of samples: 20000 62 | Number of features: 30 63 | Number of clusters: 8 64 | Number of repeated runs: 20 65 | Total time: 511.788 66 | E-step time use (ms): 38.1258 67 | M-step-1st-half time use (ms): 171.403 68 | M-step-2nd-half time use (ms): 0.0870228 69 | Cuda Data IO (ms): 295.944 70 | Check Convergence (ms): 0.88644 71 | 72 | ========================================= 73 | ========================================= 74 | testing with 32 
threads per block on device 75 | Number of samples: 20000 76 | Number of features: 30 77 | Number of clusters: 8 78 | Number of repeated runs: 20 79 | Total time: 2496.99 80 | E-step time use (ms): 245.165 81 | M-step-1st-half time use (ms): 1893.42 82 | M-step-2nd-half time use (ms): 0.981092 83 | Cuda Data IO (ms): 322.414 84 | Check Convergence (ms): 27.8046 85 | 86 | ========================================= 87 | ========================================= 88 | testing with 64 threads per block on device 89 | Number of samples: 20000 90 | Number of features: 30 91 | Number of clusters: 8 92 | Number of repeated runs: 20 93 | Total time: 2437.49 94 | E-step time use (ms): 162.151 95 | M-step-1st-half time use (ms): 1918.99 96 | M-step-2nd-half time use (ms): 0.959396 97 | Cuda Data IO (ms): 321.023 98 | Check Convergence (ms): 27.1459 99 | -------------------------------------------------------------------------------- /Timing_Results/log/Blobs_MPI.log: -------------------------------------------------------------------------------- 1 | 2 | ========================================= 3 | ========================================= 4 | testing with 1 processes 5 | reading data 6 | Number of samples: 20000 7 | Number of features: 30 8 | Number of clusters: 8 9 | Number of repeated runs: 20 10 | =====reading data finished====== 11 | Last element in global array: -5.464932 12 | Last element after scattering 0: -5.464932 13 | =====Applying K-mean====== 14 | Best inertia: 2308841.500000 15 | I/O time use (ms): 3.746011 16 | Kmean total time use (ms): 2327.527416 17 | 18 | (sub-component timing not accurate) 19 | E-step time use (ms): 2080.869320 20 | M-step-1st-half time use (ms): 245.803846 21 | M-step-2nd-half time use (ms): 0.497344 22 | 23 | ========================================= 24 | ========================================= 25 | testing with 2 processes 26 | reading data 27 | Number of samples: 20000 28 | Number of features: 30 29 | Number of clusters: 8 30 | Number of repeated runs: 20 31 | =====reading data finished====== 32 | Last element in global array: -5.464932 33 | Last element after scattering 1: -5.464932 34 | =====Applying K-mean====== 35 | Best inertia: 2308842.000000 36 | I/O time use (ms): 3.857213 37 | Kmean total time use (ms): 1370.546596 38 | 39 | (sub-component timing not accurate) 40 | E-step time use (ms): 1208.027011 41 | M-step-1st-half time use (ms): 161.206449 42 | M-step-2nd-half time use (ms): 0.596461 43 | 44 | ========================================= 45 | ========================================= 46 | testing with 4 processes 47 | reading data 48 | Number of samples: 20000 49 | Number of features: 30 50 | Number of clusters: 8 51 | Number of repeated runs: 20 52 | =====reading data finished====== 53 | Last element in global array: -5.464932 54 | Last element after scattering 3: -5.464932 55 | =====Applying K-mean====== 56 | Best inertia: 2308840.750000 57 | I/O time use (ms): 4.113028 58 | Kmean total time use (ms): 889.274717 59 | 60 | (sub-component timing not accurate) 61 | E-step time use (ms): 783.437196 62 | M-step-1st-half time use (ms): 103.108949 63 | M-step-2nd-half time use (ms): 0.884013 64 | 65 | ========================================= 66 | ========================================= 67 | testing with 8 processes 68 | reading data 69 | Number of samples: 20000 70 | Number of features: 30 71 | Number of clusters: 8 72 | Number of repeated runs: 20 73 | =====reading data finished====== 74 | Last element in global array: -5.464932 75 | Last element 
after scattering 7: -5.464932 76 | =====Applying K-mean====== 77 | Best inertia: 2308833.500000 78 | I/O time use (ms): 4.099924 79 | Kmean total time use (ms): 490.620752 80 | 81 | (sub-component timing not accurate) 82 | E-step time use (ms): 421.552152 83 | M-step-1st-half time use (ms): 63.779867 84 | M-step-2nd-half time use (ms): 1.227340 85 | 86 | ========================================= 87 | ========================================= 88 | testing with 16 processes 89 | reading data 90 | Number of samples: 20000 91 | Number of features: 30 92 | Number of clusters: 8 93 | Number of repeated runs: 20 94 | =====reading data finished====== 95 | Last element in global array: -5.464932 96 | Last element after scattering 15: -5.464932 97 | =====Applying K-mean====== 98 | Best inertia: 2308833.500000 99 | I/O time use (ms): 4.935520 100 | Kmean total time use (ms): 298.665461 101 | 102 | (sub-component timing not accurate) 103 | E-step time use (ms): 221.786738 104 | M-step-1st-half time use (ms): 67.226194 105 | M-step-2nd-half time use (ms): 1.788349 106 | 107 | ========================================= 108 | ========================================= 109 | testing with 32 processes 110 | reading data 111 | Number of samples: 20000 112 | Number of features: 30 113 | Number of clusters: 8 114 | Number of repeated runs: 20 115 | =====reading data finished====== 116 | Last element in global array: -5.464932 117 | Last element after scattering 31: -5.464932 118 | =====Applying K-mean====== 119 | Best inertia: 2308833.000000 120 | I/O time use (ms): 9.008225 121 | Kmean total time use (ms): 280.026070 122 | 123 | (sub-component timing not accurate) 124 | E-step time use (ms): 183.410053 125 | M-step-1st-half time use (ms): 76.902479 126 | M-step-2nd-half time use (ms): 1.928765 127 | 128 | ========================================= 129 | ========================================= 130 | testing with 64 processes 131 | reading data 132 | Number of samples: 20000 133 | Number of features: 30 134 | Number of clusters: 8 135 | Number of repeated runs: 20 136 | =====reading data finished====== 137 | Last element in global array: -5.464932 138 | Last element after scattering 63: -5.464932 139 | =====Applying K-mean====== 140 | Best inertia: 2308827.750000 141 | I/O time use (ms): 19.194771 142 | Kmean total time use (ms): 985.778767 143 | 144 | (sub-component timing not accurate) 145 | E-step time use (ms): 106.266378 146 | M-step-1st-half time use (ms): 605.020996 147 | M-step-2nd-half time use (ms): 2.200050 148 | -------------------------------------------------------------------------------- /Timing_Results/log/Blobs_OpenMP.log: -------------------------------------------------------------------------------- 1 | 2 | ========================================= 3 | ========================================= 4 | testing with 1 threads 5 | reading data 6 | Number of samples: 20000 7 | Number of features: 30 8 | Number of clusters: 8 9 | Number of repeated runs: 20 10 | =====reading data finished====== 11 | =====Applying K-mean====== 12 | =====writting data finished====== 13 | Best inertia: 2308841.500000 14 | Kmean total time use (ms): 2539.128065 15 | E-step time use (ms): 2293.212175 16 | M-step-1st-half time use (ms): 245.102882 17 | M-step-2nd-half time use (ms): 0.499487 18 | I/O time use (ms): 2.501011 19 | 20 | ========================================= 21 | ========================================= 22 | testing with 2 threads 23 | reading data 24 | Number of samples: 20000 25 | Number of features: 
30 26 | Number of clusters: 8 27 | Number of repeated runs: 20 28 | =====reading data finished====== 29 | =====Applying K-mean====== 30 | =====writting data finished====== 31 | Best inertia: 2308842.000000 32 | Kmean total time use (ms): 1731.789112 33 | E-step time use (ms): 1426.280260 34 | M-step-1st-half time use (ms): 304.410219 35 | M-step-2nd-half time use (ms): 0.734806 36 | I/O time use (ms): 1.926899 37 | 38 | ========================================= 39 | ========================================= 40 | testing with 4 threads 41 | reading data 42 | Number of samples: 20000 43 | Number of features: 30 44 | Number of clusters: 8 45 | Number of repeated runs: 20 46 | =====reading data finished====== 47 | =====Applying K-mean====== 48 | =====writting data finished====== 49 | Best inertia: 2308840.500000 50 | Kmean total time use (ms): 1336.784840 51 | E-step time use (ms): 934.716463 52 | M-step-1st-half time use (ms): 399.653912 53 | M-step-2nd-half time use (ms): 1.964569 54 | I/O time use (ms): 2.602816 55 | 56 | ========================================= 57 | ========================================= 58 | testing with 8 threads 59 | reading data 60 | Number of samples: 20000 61 | Number of features: 30 62 | Number of clusters: 8 63 | Number of repeated runs: 20 64 | =====reading data finished====== 65 | =====Applying K-mean====== 66 | =====writting data finished====== 67 | Best inertia: 2308833.500000 68 | Kmean total time use (ms): 900.095940 69 | E-step time use (ms): 490.777731 70 | M-step-1st-half time use (ms): 407.839060 71 | M-step-2nd-half time use (ms): 1.050949 72 | I/O time use (ms): 2.654076 73 | 74 | ========================================= 75 | ========================================= 76 | testing with 16 threads 77 | reading data 78 | Number of samples: 20000 79 | Number of features: 30 80 | Number of clusters: 8 81 | Number of repeated runs: 20 82 | =====reading data finished====== 83 | =====Applying K-mean====== 84 | =====writting data finished====== 85 | Best inertia: 2308838.000000 86 | Kmean total time use (ms): 741.780996 87 | E-step time use (ms): 309.553146 88 | M-step-1st-half time use (ms): 430.645466 89 | M-step-2nd-half time use (ms): 1.129389 90 | I/O time use (ms): 2.640963 91 | 92 | ========================================= 93 | ========================================= 94 | testing with 32 threads 95 | reading data 96 | Number of samples: 20000 97 | Number of features: 30 98 | Number of clusters: 8 99 | Number of repeated runs: 20 100 | =====reading data finished====== 101 | =====Applying K-mean====== 102 | =====writting data finished====== 103 | Best inertia: 2308832.750000 104 | Kmean total time use (ms): 782.614946 105 | E-step time use (ms): 210.109472 106 | M-step-1st-half time use (ms): 570.574045 107 | M-step-2nd-half time use (ms): 1.244783 108 | I/O time use (ms): 2.243996 109 | 110 | ========================================= 111 | ========================================= 112 | testing with 64 threads 113 | reading data 114 | Number of samples: 20000 115 | Number of features: 30 116 | Number of clusters: 8 117 | Number of repeated runs: 20 118 | =====reading data finished====== 119 | =====Applying K-mean====== 120 | =====writting data finished====== 121 | Best inertia: 2308833.750000 122 | Kmean total time use (ms): 1440.789938 123 | E-step time use (ms): 1023.807049 124 | M-step-1st-half time use (ms): 415.407658 125 | M-step-2nd-half time use (ms): 1.029730 126 | I/O time use (ms): 2.650023 127 | 
-------------------------------------------------------------------------------- /Timing_Results/log/Blobs_hybrid.log: -------------------------------------------------------------------------------- 1 | 2 | ========================================= 3 | ========================================= 4 | testing with 1 processes, 1 threads 5 | reading data 6 | Number of samples: 20000 7 | Number of features: 30 8 | Number of clusters: 8 9 | Number of repeated runs: 20 10 | =====reading data finished====== 11 | Last element in global array: -5.464932 12 | Last element after scattering 0: -5.464932 13 | =====Applying K-mean====== 14 | Best inertia: 2308841.500000 15 | I/O time use (ms): 4.006589 16 | Kmean total time use (ms): 2363.860598 17 | 18 | (sub-component timing not accurate) 19 | E-step time use (ms): 2112.615470 20 | M-step-1st-half time use (ms): 250.408402 21 | M-step-2nd-half time use (ms): 0.497923 22 | 23 | ========================================= 24 | ========================================= 25 | testing with 1 processes, 2 threads 26 | reading data 27 | Number of samples: 20000 28 | Number of features: 30 29 | Number of clusters: 8 30 | Number of repeated runs: 20 31 | =====reading data finished====== 32 | Last element in global array: -5.464932 33 | Last element after scattering 0: -5.464932 34 | =====Applying K-mean====== 35 | Best inertia: 2308842.000000 36 | I/O time use (ms): 3.732907 37 | Kmean total time use (ms): 1662.239609 38 | 39 | (sub-component timing not accurate) 40 | E-step time use (ms): 1356.238149 41 | M-step-1st-half time use (ms): 304.833094 42 | M-step-2nd-half time use (ms): 0.622449 43 | 44 | ========================================= 45 | ========================================= 46 | testing with 1 processes, 4 threads 47 | reading data 48 | Number of samples: 20000 49 | Number of features: 30 50 | Number of clusters: 8 51 | Number of repeated runs: 20 52 | =====reading data finished====== 53 | Last element in global array: -5.464932 54 | Last element after scattering 0: -5.464932 55 | =====Applying K-mean====== 56 | Best inertia: 2308840.500000 57 | I/O time use (ms): 3.693638 58 | Kmean total time use (ms): 1189.350777 59 | 60 | (sub-component timing not accurate) 61 | E-step time use (ms): 813.090273 62 | M-step-1st-half time use (ms): 375.009360 63 | M-step-2nd-half time use (ms): 0.758050 64 | 65 | ========================================= 66 | ========================================= 67 | testing with 1 processes, 8 threads 68 | reading data 69 | Number of samples: 20000 70 | Number of features: 30 71 | Number of clusters: 8 72 | Number of repeated runs: 20 73 | =====reading data finished====== 74 | Last element in global array: -5.464932 75 | Last element after scattering 0: -5.464932 76 | =====Applying K-mean====== 77 | Best inertia: 2308833.500000 78 | I/O time use (ms): 3.934515 79 | Kmean total time use (ms): 931.707351 80 | 81 | (sub-component timing not accurate) 82 | E-step time use (ms): 500.118240 83 | M-step-1st-half time use (ms): 429.142173 84 | M-step-2nd-half time use (ms): 0.863691 85 | 86 | ========================================= 87 | ========================================= 88 | testing with 2 processes, 1 threads 89 | reading data 90 | Number of samples: 20000 91 | Number of features: 30 92 | Number of clusters: 8 93 | Number of repeated runs: 20 94 | =====reading data finished====== 95 | Last element in global array: -5.464932 96 | Last element after scattering 1: -5.464932 97 | =====Applying K-mean====== 98 | Best inertia: 
2308842.000000 99 | I/O time use (ms): 3.800784 100 | Kmean total time use (ms): 1358.942563 101 | 102 | (sub-component timing not accurate) 103 | E-step time use (ms): 1212.020400 104 | M-step-1st-half time use (ms): 145.609492 105 | M-step-2nd-half time use (ms): 0.594695 106 | 107 | ========================================= 108 | ========================================= 109 | testing with 2 processes, 2 threads 110 | reading data 111 | Number of samples: 20000 112 | Number of features: 30 113 | Number of clusters: 8 114 | Number of repeated runs: 20 115 | =====reading data finished====== 116 | Last element in global array: -5.464932 117 | Last element after scattering 1: -5.464932 118 | =====Applying K-mean====== 119 | Best inertia: 2308840.500000 120 | I/O time use (ms): 3.766548 121 | Kmean total time use (ms): 943.486173 122 | 123 | (sub-component timing not accurate) 124 | E-step time use (ms): 760.652990 125 | M-step-1st-half time use (ms): 181.228196 126 | M-step-2nd-half time use (ms): 0.741900 127 | 128 | ========================================= 129 | ========================================= 130 | testing with 2 processes, 4 threads 131 | reading data 132 | Number of samples: 20000 133 | Number of features: 30 134 | Number of clusters: 8 135 | Number of repeated runs: 20 136 | =====reading data finished====== 137 | Last element in global array: -5.464932 138 | Last element after scattering 1: -5.464932 139 | =====Applying K-mean====== 140 | Best inertia: 2308833.500000 141 | I/O time use (ms): 3.823837 142 | Kmean total time use (ms): 680.486505 143 | 144 | (sub-component timing not accurate) 145 | E-step time use (ms): 452.964565 146 | M-step-1st-half time use (ms): 225.771176 147 | M-step-2nd-half time use (ms): 0.831982 148 | 149 | ========================================= 150 | ========================================= 151 | testing with 2 processes, 8 threads 152 | reading data 153 | Number of samples: 20000 154 | Number of features: 30 155 | Number of clusters: 8 156 | Number of repeated runs: 20 157 | =====reading data finished====== 158 | Last element in global array: -5.464932 159 | Last element after scattering 1: -5.464932 160 | =====Applying K-mean====== 161 | Best inertia: 2308830.000000 162 | I/O time use (ms): 3.852878 163 | Kmean total time use (ms): 663.490676 164 | 165 | (sub-component timing not accurate) 166 | E-step time use (ms): 353.790135 167 | M-step-1st-half time use (ms): 307.547264 168 | M-step-2nd-half time use (ms): 0.888228 169 | 170 | ========================================= 171 | ========================================= 172 | testing with 4 processes, 1 threads 173 | reading data 174 | Number of samples: 20000 175 | Number of features: 30 176 | Number of clusters: 8 177 | Number of repeated runs: 20 178 | =====reading data finished====== 179 | Last element in global array: -5.464932 180 | Last element after scattering 3: -5.464932 181 | =====Applying K-mean====== 182 | Best inertia: 2308840.750000 183 | I/O time use (ms): 4.334087 184 | Kmean total time use (ms): 909.529512 185 | 186 | (sub-component timing not accurate) 187 | E-step time use (ms): 783.812606 188 | M-step-1st-half time use (ms): 122.883010 189 | M-step-2nd-half time use (ms): 0.911755 190 | 191 | ========================================= 192 | ========================================= 193 | testing with 4 processes, 2 threads 194 | reading data 195 | Number of samples: 20000 196 | Number of features: 30 197 | Number of clusters: 8 198 | Number of repeated runs: 20 199 | 
=====reading data finished====== 200 | Last element in global array: -5.464932 201 | Last element after scattering 3: -5.464932 202 | =====Applying K-mean====== 203 | Best inertia: 2308833.500000 204 | I/O time use (ms): 4.096406 205 | Kmean total time use (ms): 557.608783 206 | 207 | (sub-component timing not accurate) 208 | E-step time use (ms): 445.375796 209 | M-step-1st-half time use (ms): 109.179241 210 | M-step-2nd-half time use (ms): 1.016673 211 | 212 | ========================================= 213 | ========================================= 214 | testing with 4 processes, 4 threads 215 | reading data 216 | Number of samples: 20000 217 | Number of features: 30 218 | Number of clusters: 8 219 | Number of repeated runs: 20 220 | =====reading data finished====== 221 | Last element in global array: -5.464932 222 | Last element after scattering 3: -5.464932 223 | =====Applying K-mean====== 224 | Best inertia: 2308830.000000 225 | I/O time use (ms): 4.249204 226 | Kmean total time use (ms): 382.619795 227 | 228 | (sub-component timing not accurate) 229 | E-step time use (ms): 230.821690 230 | M-step-1st-half time use (ms): 148.596137 231 | M-step-2nd-half time use (ms): 1.035195 232 | 233 | ========================================= 234 | ========================================= 235 | testing with 4 processes, 8 threads 236 | reading data 237 | Number of samples: 20000 238 | Number of features: 30 239 | Number of clusters: 8 240 | Number of repeated runs: 20 241 | =====reading data finished====== 242 | Last element in global array: -5.464932 243 | Last element after scattering 3: -5.464932 244 | =====Applying K-mean====== 245 | Best inertia: 2308833.000000 246 | I/O time use (ms): 4.232157 247 | Kmean total time use (ms): 402.007910 248 | 249 | (sub-component timing not accurate) 250 | E-step time use (ms): 210.121067 251 | M-step-1st-half time use (ms): 187.990701 252 | M-step-2nd-half time use (ms): 1.202342 253 | 254 | ========================================= 255 | ========================================= 256 | testing with 8 processes, 1 threads 257 | reading data 258 | Number of samples: 20000 259 | Number of features: 30 260 | Number of clusters: 8 261 | Number of repeated runs: 20 262 | =====reading data finished====== 263 | Last element in global array: -5.464932 264 | Last element after scattering 7: -5.464932 265 | =====Applying K-mean====== 266 | Best inertia: 2308833.500000 267 | I/O time use (ms): 4.592522 268 | Kmean total time use (ms): 505.915032 269 | 270 | (sub-component timing not accurate) 271 | E-step time use (ms): 420.972104 272 | M-step-1st-half time use (ms): 79.467921 273 | M-step-2nd-half time use (ms): 1.211620 274 | 275 | ========================================= 276 | ========================================= 277 | testing with 8 processes, 2 threads 278 | reading data 279 | Number of samples: 20000 280 | Number of features: 30 281 | Number of clusters: 8 282 | Number of repeated runs: 20 283 | =====reading data finished====== 284 | Last element in global array: -5.464932 285 | Last element after scattering 7: -5.464932 286 | =====Applying K-mean====== 287 | Best inertia: 2308833.500000 288 | I/O time use (ms): 4.143904 289 | Kmean total time use (ms): 301.355469 290 | 291 | (sub-component timing not accurate) 292 | E-step time use (ms): 227.431369 293 | M-step-1st-half time use (ms): 68.241809 294 | M-step-2nd-half time use (ms): 1.332544 295 | 296 | ========================================= 297 | ========================================= 298 | testing 
with 8 processes, 4 threads 299 | reading data 300 | Number of samples: 20000 301 | Number of features: 30 302 | Number of clusters: 8 303 | Number of repeated runs: 20 304 | =====reading data finished====== 305 | Last element in global array: -5.464932 306 | Last element after scattering 7: -5.464932 307 | =====Applying K-mean====== 308 | Best inertia: 2308833.000000 309 | I/O time use (ms): 4.099906 310 | Kmean total time use (ms): 311.526036 311 | 312 | (sub-component timing not accurate) 313 | E-step time use (ms): 193.247065 314 | M-step-1st-half time use (ms): 111.617091 315 | M-step-2nd-half time use (ms): 1.467510 316 | 317 | ========================================= 318 | ========================================= 319 | testing with 8 processes, 8 threads 320 | reading data 321 | Number of samples: 20000 322 | Number of features: 30 323 | Number of clusters: 8 324 | Number of repeated runs: 20 325 | =====reading data finished====== 326 | Last element in global array: -5.464932 327 | Last element after scattering 7: -5.464932 328 | =====Applying K-mean====== 329 | Best inertia: 2308833.500000 330 | I/O time use (ms): 4.120600 331 | Kmean total time use (ms): 4898.684051 332 | 333 | (sub-component timing not accurate) 334 | E-step time use (ms): 3469.320709 335 | M-step-1st-half time use (ms): 1420.115374 336 | M-step-2nd-half time use (ms): 1.485388 337 | 338 | ========================================= 339 | ========================================= 340 | testing with 16 processes, 1 threads 341 | reading data 342 | Number of samples: 20000 343 | Number of features: 30 344 | Number of clusters: 8 345 | Number of repeated runs: 20 346 | =====reading data finished====== 347 | Last element in global array: -5.464932 348 | Last element after scattering 15: -5.464932 349 | =====Applying K-mean====== 350 | Best inertia: 2308833.500000 351 | I/O time use (ms): 7.041087 352 | Kmean total time use (ms): 316.260453 353 | 354 | (sub-component timing not accurate) 355 | E-step time use (ms): 251.028026 356 | M-step-1st-half time use (ms): 54.075352 357 | M-step-2nd-half time use (ms): 1.848250 358 | 359 | ========================================= 360 | ========================================= 361 | testing with 16 processes, 2 threads 362 | reading data 363 | Number of samples: 20000 364 | Number of features: 30 365 | Number of clusters: 8 366 | Number of repeated runs: 20 367 | =====reading data finished====== 368 | Last element in global array: -5.464932 369 | Last element after scattering 15: -5.464932 370 | =====Applying K-mean====== 371 | Best inertia: 2308833.000000 372 | I/O time use (ms): 4.883557 373 | Kmean total time use (ms): 313.827766 374 | 375 | (sub-component timing not accurate) 376 | E-step time use (ms): 188.408545 377 | M-step-1st-half time use (ms): 115.188138 378 | M-step-2nd-half time use (ms): 1.672651 379 | 380 | ========================================= 381 | ========================================= 382 | testing with 16 processes, 4 threads 383 | reading data 384 | Number of samples: 20000 385 | Number of features: 30 386 | Number of clusters: 8 387 | Number of repeated runs: 20 388 | =====reading data finished====== 389 | Last element in global array: -5.464932 390 | Last element after scattering 15: -5.464932 391 | =====Applying K-mean====== 392 | Best inertia: 2308827.750000 393 | I/O time use (ms): 5.108052 394 | Kmean total time use (ms): 5194.099651 395 | 396 | (sub-component timing not accurate) 397 | E-step time use (ms): 3129.300050 398 | M-step-1st-half 
time use (ms): 2012.502378 399 | M-step-2nd-half time use (ms): 1.927712 400 | 401 | ========================================= 402 | ========================================= 403 | testing with 16 processes, 8 threads 404 | reading data 405 | Number of samples: 20000 406 | Number of features: 30 407 | Number of clusters: 8 408 | Number of repeated runs: 20 409 | =====reading data finished====== 410 | Last element in global array: -5.464932 411 | Last element after scattering 15: -5.464932 412 | =====Applying K-mean====== 413 | Best inertia: 2308827.750000 414 | I/O time use (ms): 6.952672 415 | Kmean total time use (ms): 12382.280717 416 | 417 | (sub-component timing not accurate) 418 | E-step time use (ms): 8528.571661 419 | M-step-1st-half time use (ms): 3817.337787 420 | M-step-2nd-half time use (ms): 1.997874 421 | 422 | ========================================= 423 | ========================================= 424 | testing with 32 processes, 1 threads 425 | reading data 426 | Number of samples: 20000 427 | Number of features: 30 428 | Number of clusters: 8 429 | Number of repeated runs: 20 430 | =====reading data finished====== 431 | Last element in global array: -5.464932 432 | Last element after scattering 31: -5.464932 433 | =====Applying K-mean====== 434 | Best inertia: 2308833.000000 435 | I/O time use (ms): 6.334857 436 | Kmean total time use (ms): 297.959041 437 | 438 | (sub-component timing not accurate) 439 | E-step time use (ms): 176.053357 440 | M-step-1st-half time use (ms): 97.873692 441 | M-step-2nd-half time use (ms): 1.992122 442 | 443 | ========================================= 444 | ========================================= 445 | testing with 32 processes, 2 threads 446 | reading data 447 | Number of samples: 20000 448 | Number of features: 30 449 | Number of clusters: 8 450 | Number of repeated runs: 20 451 | =====reading data finished====== 452 | Last element in global array: -5.464932 453 | Last element after scattering 31: -5.464932 454 | =====Applying K-mean====== 455 | Best inertia: 2308827.750000 456 | I/O time use (ms): 7.848291 457 | Kmean total time use (ms): 4021.197152 458 | 459 | (sub-component timing not accurate) 460 | E-step time use (ms): 2844.393182 461 | M-step-1st-half time use (ms): 1116.176527 462 | M-step-2nd-half time use (ms): 2.084090 463 | 464 | ========================================= 465 | ========================================= 466 | testing with 32 processes, 4 threads 467 | reading data 468 | Number of samples: 20000 469 | Number of features: 30 470 | Number of clusters: 8 471 | Number of repeated runs: 20 472 | =====reading data finished====== 473 | Last element in global array: -5.464932 474 | Last element after scattering 31: -5.464932 475 | =====Applying K-mean====== 476 | Best inertia: 2308828.000000 477 | I/O time use (ms): 8.923698 478 | Kmean total time use (ms): 13332.237565 479 | 480 | (sub-component timing not accurate) 481 | E-step time use (ms): 8839.237808 482 | M-step-1st-half time use (ms): 4086.888532 483 | M-step-2nd-half time use (ms): 2.295686 484 | 485 | ========================================= 486 | ========================================= 487 | testing with 32 processes, 8 threads 488 | reading data 489 | Number of samples: 20000 490 | Number of features: 30 491 | Number of clusters: 8 492 | Number of repeated runs: 20 493 | =====reading data finished====== 494 | Last element in global array: -5.464932 495 | Last element after scattering 31: -5.464932 496 | =====Applying K-mean====== 497 | Best inertia: 
2308827.500000 498 | I/O time use (ms): 9.748165 499 | Kmean total time use (ms): 33550.686761 500 | 501 | (sub-component timing not accurate) 502 | E-step time use (ms): 21187.669129 503 | M-step-1st-half time use (ms): 10737.786934 504 | M-step-2nd-half time use (ms): 2.450273 505 | 506 | ========================================= 507 | ========================================= 508 | testing with 64 processes, 1 threads 509 | reading data 510 | Number of samples: 20000 511 | Number of features: 30 512 | Number of clusters: 8 513 | Number of repeated runs: 20 514 | =====reading data finished====== 515 | Last element in global array: -5.464932 516 | Last element after scattering 63: -5.464932 517 | =====Applying K-mean====== 518 | Best inertia: 2308827.750000 519 | I/O time use (ms): 10.888008 520 | Kmean total time use (ms): 783.003317 521 | 522 | (sub-component timing not accurate) 523 | E-step time use (ms): 102.731732 524 | M-step-1st-half time use (ms): 491.794025 525 | M-step-2nd-half time use (ms): 2.122726 526 | 527 | ========================================= 528 | ========================================= 529 | testing with 64 processes, 2 threads 530 | reading data 531 | Number of samples: 20000 532 | Number of features: 30 533 | Number of clusters: 8 534 | Number of repeated runs: 20 535 | =====reading data finished====== 536 | Last element in global array: -5.464932 537 | Last element after scattering 63: -5.464932 538 | =====Applying K-mean====== 539 | Best inertia: 2308828.000000 540 | I/O time use (ms): 16.582364 541 | Kmean total time use (ms): 10385.388325 542 | 543 | (sub-component timing not accurate) 544 | E-step time use (ms): 2861.456678 545 | M-step-1st-half time use (ms): 5335.016671 546 | M-step-2nd-half time use (ms): 2.135070 547 | 548 | ========================================= 549 | ========================================= 550 | testing with 64 processes, 4 threads 551 | reading data 552 | Number of samples: 20000 553 | Number of features: 30 554 | Number of clusters: 8 555 | Number of repeated runs: 20 556 | =====reading data finished====== 557 | Last element in global array: -5.464932 558 | Last element after scattering 63: -5.464932 559 | =====Applying K-mean====== 560 | Best inertia: 2308827.500000 561 | I/O time use (ms): 19.903018 562 | Kmean total time use (ms): 30551.306969 563 | 564 | (sub-component timing not accurate) 565 | E-step time use (ms): 11010.092788 566 | M-step-1st-half time use (ms): 13064.622681 567 | M-step-2nd-half time use (ms): 2.318505 568 | 569 | ========================================= 570 | ========================================= 571 | testing with 64 processes, 8 threads 572 | reading data 573 | Number of samples: 20000 574 | Number of features: 30 575 | Number of clusters: 8 576 | Number of repeated runs: 20 577 | =====reading data finished====== 578 | Last element in global array: -5.464932 579 | Last element after scattering 63: -5.464932 580 | =====Applying K-mean====== 581 | Best inertia: 2308827.500000 582 | I/O time use (ms): 11.120870 583 | Kmean total time use (ms): 69710.298024 584 | 585 | (sub-component timing not accurate) 586 | E-step time use (ms): 24806.787879 587 | M-step-1st-half time use (ms): 30334.412542 588 | M-step-2nd-half time use (ms): 1.942891 589 | -------------------------------------------------------------------------------- /Timing_Results/plots/Cuda_scaling.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/Cuda_scaling.jpg
--------------------------------------------------------------------------------
/Timing_Results/plots/MPI_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/MPI_scaling.jpg
--------------------------------------------------------------------------------
/Timing_Results/plots/OpenMP_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/OpenMP_scaling.jpg
--------------------------------------------------------------------------------
/Timing_Results/plots/hybrid_scaling.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JiaweiZhuang/CS205_final_project/0751060adec3f6e4ca9676baae1bfd4bf4ba542a/Timing_Results/plots/hybrid_scaling.jpg
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
2 | title: Parallel Kmeans Clustering
3 | description:
4 |   -CS205 Final Project, 2017 Spring-
5 |   -Jiahua Guo, Jiachen Song, Xinyuan Wang, Jiawei Zhuang-
--------------------------------------------------------------------------------