├── linpack
│   ├── .gitignore
│   ├── xlinpack_xeon64
│   ├── lininput_xeon64
│   ├── run_cont_perf.sh
│   ├── run_cont.sh
│   └── runme_xeon64
├── quicksort
│   ├── .gitignore
│   ├── quicksort.sh
│   ├── Makefile
│   └── quicksort.cpp
├── spark
│   └── pagerank
│       ├── .gitignore
│       ├── build.sbt
│       └── src
│           └── main
│               └── scala
│                   └── PageRankExample.scala
├── stream
│   ├── stream_c.exe
│   ├── Makefile
│   ├── mysecond.c
│   ├── READ.ME
│   ├── stream.f
│   └── stream.c
├── setup
│   ├── destroy_cgroups.sh
│   └── init_bench_cgroups.sh
├── protocol
│   ├── gen_protocol.sh
│   └── protocol.proto
├── memaslap
│   ├── memaslap_fill
│   └── memaslap_etc
├── .gitmodules
├── lib
│   ├── constants.py
│   ├── utils.py
│   ├── container.py
│   ├── ftracer.py
│   └── workloads.py
├── kmeans
│   └── kmeans.py
├── tensorflow
│   ├── tf-resnet.sh
│   └── tf-inception.sh
├── benchmark.py
├── README.md
├── scheduler.py
└── server.py

--------------------------------------------------------------------------------
/linpack/.gitignore:
--------------------------------------------------------------------------------
1 | lin_xeon64.txt
2 | 
--------------------------------------------------------------------------------
/quicksort/.gitignore:
--------------------------------------------------------------------------------
1 | quicksort
2 | 
--------------------------------------------------------------------------------
/spark/pagerank/.gitignore:
--------------------------------------------------------------------------------
1 | project
2 | target
3 | 
--------------------------------------------------------------------------------
/quicksort/quicksort.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | /usr/bin/time -v ./quicksort 2047
4 | 
--------------------------------------------------------------------------------
/stream/stream_c.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clusterfarmem/cfm/HEAD/stream/stream_c.exe
--------------------------------------------------------------------------------
/linpack/xlinpack_xeon64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clusterfarmem/cfm/HEAD/linpack/xlinpack_xeon64
--------------------------------------------------------------------------------
/setup/destroy_cgroups.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | sudo rmdir /cgroup2/benchmarks
4 | sudo umount /cgroup2
5 | 
--------------------------------------------------------------------------------
/protocol/gen_protocol.sh:
--------------------------------------------------------------------------------
1 | cd ..
2 | python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=.
protocol/protocol.proto 3 | -------------------------------------------------------------------------------- /quicksort/Makefile: -------------------------------------------------------------------------------- 1 | quicksort: quicksort.cpp 2 | g++ -std=c++11 -O3 -g quicksort.cpp -o quicksort 3 | 4 | clean: 5 | rm quicksort 6 | -------------------------------------------------------------------------------- /memaslap/memaslap_fill: -------------------------------------------------------------------------------- 1 | generated keys 2 | key 3 | 16 16 1 4 | total generated values 5 | value 6 | 16 512 0.9 7 | 1 15 0.1 8 | cmd 9 | 0 1 10 | -------------------------------------------------------------------------------- /memaslap/memaslap_etc: -------------------------------------------------------------------------------- 1 | generated keys 2 | key 3 | 16 16 1 4 | total generated values 5 | value 6 | 16 512 0.9 7 | 1 15 0.1 8 | cmd 9 | 0 0.05 10 | 1 0.95 11 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tensorflow/benchmarks"] 2 | path = tensorflow/benchmarks 3 | url = https://github.com/tensorflow/benchmarks.git 4 | branch = cnn_tf_v1.14_compatible 5 | -------------------------------------------------------------------------------- /lib/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | CGROUP_PATH = "/cgroup2/benchmarks" 4 | PROCS = CGROUP_PATH + '/' + "{}/cgroup.procs" 5 | WORK_DIR = os.getcwd() 6 | TRACING_DIR = '/sys/kernel/debug/tracing/' 7 | SPARK_HOME = '~/spark-2.4.0-bin-hadoop2.7/' 8 | -------------------------------------------------------------------------------- /spark/pagerank/build.sbt: -------------------------------------------------------------------------------- 1 | name := "pagerank" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-sql" % "2.4.0", 9 | "org.apache.spark" %% "spark-graphx" % "2.4.0", 10 | ) 11 | -------------------------------------------------------------------------------- /linpack/lininput_xeon64: -------------------------------------------------------------------------------- 1 | Sample Intel(R) Optimized LINPACK Benchmark data file (lininput_xeon64) 2 | Intel(R) Optimized LINPACK Benchmark data 3 | 1 # number of tests 4 | 14000 # problem sizes 5 | 14000 # leading dimensions 6 | 1 # times to run a test 7 | 4 # alignment values (in KBytes) 8 | -------------------------------------------------------------------------------- /kmeans/kmeans.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import KMeans 3 | from sklearn.datasets.samples_generator import make_blobs 4 | 5 | np.random.seed(42) 6 | 7 | samples, labels = make_blobs(n_samples=15000000, centers=10, random_state=0) 8 | k_means = KMeans(10, precompute_distances=True) 9 | k_means.fit(samples) 10 | -------------------------------------------------------------------------------- /tensorflow/tf-resnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | pushd $DIR/benchmarks/scripts/tf_cnn_benchmarks 6 | /usr/bin/time -v python3 tf_cnn_benchmarks.py --forward_only=True --data_format=NHWC --device=cpu 
--batch_size=64 --num_inter_threads=0 --num_intra_threads=2 --nodistortions --model=resnet50 --kmp_blocktime=0 --num_batches=20 --num_warmup_batches 0 7 | -------------------------------------------------------------------------------- /tensorflow/tf-inception.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | pushd $DIR/benchmarks/scripts/tf_cnn_benchmarks 6 | /usr/bin/time -v python3 tf_cnn_benchmarks.py --forward_only=True --data_format=NHWC --device=cpu --batch_size=64 --num_inter_threads=0 --num_intra_threads=2 --nodistortions --model=inception3 --kmp_blocktime=0 --num_batches=20 --num_warmup_batches 0 7 | -------------------------------------------------------------------------------- /setup/init_bench_cgroups.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CGROUP_ROOT=/cgroup2 4 | CGROUP_BENCH=$CGROUP_ROOT/benchmarks 5 | USER=$(whoami) 6 | 7 | echo "will setup cgroups at $CGROUP_ROOT for user $USER" 8 | sudo mount -t cgroup2 nodev $CGROUP_ROOT 9 | sudo sh -c "echo '+memory' > $CGROUP_ROOT/cgroup.subtree_control" 10 | 11 | sudo mkdir $CGROUP_BENCH 12 | sudo sh -c "echo '+memory' > $CGROUP_BENCH/cgroup.subtree_control" 13 | 14 | sudo chown $USER -R $CGROUP_ROOT 15 | 16 | echo "enabling readahead" 17 | sudo sh -c "echo 3 > /proc/sys/vm/page-cluster" 18 | 19 | echo "done" 20 | -------------------------------------------------------------------------------- /linpack/run_cont_perf.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # the workload uses 1.5G of memory 4 | set -e 5 | PERF="perf stat -d" 6 | 7 | #echo "running linpack without memory limit" 8 | #sudo $PERF ./runme_xeon64 9 | # 10 | #echo "running linpack 1152M" 11 | #../scripts/changemem_cgroup2.sh 1152M 12 | #sudo ../scripts/exec_cgroupv2.sh $PERF ./runme_xeon64 13 | # 14 | #echo "running linpack 768M" 15 | #../scripts/changemem_cgroup2.sh 768M 16 | #sudo ../scripts/exec_cgroupv2.sh $PERF ./runme_xeon64 17 | 18 | echo "running linpack 384M" 19 | ../scripts/changemem_cgroup2.sh 384M 20 | sudo ../scripts/exec_cgroupv2.sh ./runme_xeon64 21 | -------------------------------------------------------------------------------- /linpack/run_cont.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | # the workload uses 1.5G of memory 4 | set -e 5 | 6 | echo "running linpack without memory limit" 7 | sudo /usr/bin/time -v ./runme_xeon64 8 | 9 | echo "running linpack 1152M" 10 | ../scripts/changemem_cgroup2.sh 1152M 11 | sudo /usr/bin/time -v ../scripts/exec_cgroupv2.sh ./runme_xeon64 12 | 13 | echo "running linpack 768M" 14 | ../scripts/changemem_cgroup2.sh 768M 15 | sudo /usr/bin/time -v ../scripts/exec_cgroupv2.sh ./runme_xeon64 16 | 17 | echo "running linpack 384M" 18 | ../scripts/changemem_cgroup2.sh 384M 19 | sudo /usr/bin/time -v ../scripts/exec_cgroupv2.sh ./runme_xeon64 20 | -------------------------------------------------------------------------------- /stream/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -O2 -DSTREAM_ARRAY_SIZE=178955000 -DNTIMES=21 -mcmodel=medium -fopenmp 3 | 4 | FF = gfortran -std=legacy 5 | FFLAGS = -O2 6 | 7 | all: stream_c.exe 8 | 9 | stream_f.exe: stream.f mysecond.o 10 | $(CC) $(CFLAGS) -c mysecond.c 11 | $(FF) $(FFLAGS) -c stream.f 12 | $(FF) $(FFLAGS) stream.o mysecond.o -o stream_f.exe 13 | 14 | stream_c.exe: stream.c 15 | $(CC) $(CFLAGS) stream.c -o stream_c.exe 16 | 17 | clean: 18 | rm -f stream_f.exe stream_c.exe *.o 19 | 20 | # an example of a more complex build line for the Intel icc compiler 21 | stream.icc: stream.c 22 | icc -O3 -xCORE-AVX2 -ffreestanding -qopenmp -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream.omp.AVX2.80M.20x.icc 23 | -------------------------------------------------------------------------------- /stream/mysecond.c: -------------------------------------------------------------------------------- 1 | /* A gettimeofday routine to give access to the wall 2 | clock timer on most UNIX-like systems. 
 3 | 
 4 |    This version defines two entry points -- with
 5 |    and without appended underscores, so it *should*
 6 |    automagically link with FORTRAN */
 7 | 
 8 | #include <sys/time.h>
 9 | 
10 | double mysecond()
11 | {
12 | /* struct timeval { long tv_sec;
13 |         long tv_usec; };
14 | 
15 | struct timezone { int tz_minuteswest;
16 |         int tz_dsttime; }; */
17 | 
18 |         struct timeval tp;
19 |         struct timezone tzp;
20 |         int i;
21 | 
22 |         i = gettimeofday(&tp,&tzp);
23 |         return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
24 | }
25 | 
26 | double mysecond_() {return mysecond();}
27 | 
28 | 
--------------------------------------------------------------------------------
/quicksort/quicksort.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <chrono>
 3 | #include <cstdlib>
 4 | #include <ctime>
 5 | #include <functional>
 6 | #include <iostream>
 7 | #include <string>
 8 | #include <vector>
 9 | 
10 | const size_t MB = 1024 * 1024;
11 | using namespace std::chrono;
12 | 
13 | void die(const char *msg, bool printErrno) {
14 |     std::cerr << msg << "\n";
15 |     exit(1);
16 | }
17 | 
18 | void print_time_diff(time_point<high_resolution_clock> start,
19 |                      time_point<high_resolution_clock> end) {
20 |     auto diff = end - start;
21 |     std::cout << "time " << duration<double>(diff).count() << "\n";
22 | }
23 | 
24 | void print_time_diff_ms(time_point<high_resolution_clock> start,
25 |                         time_point<high_resolution_clock> end) {
26 |     auto diff = end - start;
27 |     std::cout << "time " << duration<double, std::milli>(diff).count() << " ms\n";
28 | }
29 | 
30 | int main(int argc, char *argv[]) {
31 |     if (argc != 2)
32 |         die("need MB of integers to sort", false);
33 | 
34 |     long size = std::stoi(argv[1]) * MB;
35 |     long numInts = size / sizeof(int);
36 | 
37 |     std::cout << "will sort " << numInts << " integers (" << size / MB << " MB)\n";
38 |     std::vector<int> v(numInts);
39 | 
40 |     std::srand(std::time(0));
41 |     time_point<high_resolution_clock> start, end;
42 | 
43 |     std::generate(v.begin(), v.end(), std::rand);
44 |     start = high_resolution_clock::now();
45 |     std::sort(v.begin(), v.end(), std::greater<int>());
46 | 
47 |     end = high_resolution_clock::now();
48 |     print_time_diff_ms(start, end);
49 | 
50 |     return 0;
51 | }
--------------------------------------------------------------------------------
/protocol/protocol.proto:
--------------------------------------------------------------------------------
 1 | syntax = "proto3";
 2 | 
 3 | //option java_multiple_files = true;
 4 | //option java_package = "io.grpc.examples.helloworld";
 5 | //option java_outer_classname = "HelloWorldProto";
 6 | //option objc_class_prefix = "HLW";
 7 | 
 8 | package scheduler;
 9 | 
10 | service Scheduler {
11 |     rpc checkin (CheckinReq) returns (CheckinReply) {}
12 |     rpc execute (ExecuteReq) returns (ExecuteReply) {}
13 |     rpc get_resources (GetResourcesReq) returns (GetResourcesReply) {}
14 |     rpc get_finished (GetFinishedReq) returns (GetFinishedReply) {}
15 |     rpc shutdown (ShutdownReq) returns (ShutdownReply) {}
16 |     rpc get_samples (GetSamplesReq) returns (GetSamplesReply) {}
17 | }
18 | 
19 | message CheckinReq {
20 |     bool use_remote_mem = 1;
21 |     uint32 max_cpus = 2;
22 |     uint32 max_mem = 3;
23 |     float uniform_ratio = 5;
24 |     map<string, float> variable_ratios = 6;
25 |     bool limit_remote_mem = 7;
26 |     bool optimal = 8;
27 | }
28 | 
29 | message CheckinReply {
30 |     string server_name = 1;
31 |     bool success = 2;
32 | }
33 | 
34 | message ExecuteReq {
35 |     string wname = 1;
36 |     uint32 idd = 2;
37 | }
38 | 
39 | message ExecuteReply {
40 |     bool success = 1;
41 | }
42 | 
43 | message GetResourcesReq { }
44 | 
45 | message GetResourcesReply {
46 |     float free_cpus = 1;
47 |     float alloc_mem = 2;
48 |     float min_mem_sum = 3;
49 | }
50 | 
51 | message GetFinishedReq { }
52 | 
53 | message GetFinishedReply {
54 |     map<string, float> start_times = 1;     // key/value types inferred; angle brackets lost in extraction
55 |     map<string, float> finished_times = 2;  // key/value types inferred; angle brackets lost in extraction
56 | }
57 | 
58 | message ShutdownReq { }
59 | 
60 | message ShutdownReply {
61 |     bool success = 1;
62 | }
63 | 
64 | message GetSamplesReq { }
65 | 
66 | message GetSamplesReply {
67 |     repeated float cpu_util = 1;
68 |     repeated float mem_util = 2;
69 |     repeated float swap_util = 3;
70 |     repeated float bw_in = 4;
71 |     repeated float bw_out = 5;
72 |     repeated uint32 curr_pages = 6;
73 |     float bytes_in = 7;
74 |     float bytes_out = 8;
75 | }
76 | 
--------------------------------------------------------------------------------
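These messages are exchanged over gRPC between the scheduler (client) and each `server.py` instance, which serves the `Scheduler` service. A minimal client-side sketch of the `checkin` handshake, assuming the stubs generated by `gen_protocol.sh` are importable from the repository root; the address and resource budget below are illustrative, not defaults of the toolset:

    import grpc
    from protocol import protocol_pb2, protocol_pb2_grpc

    channel = grpc.insecure_channel('localhost:50051')  # illustrative ip:port
    stub = protocol_pb2_grpc.SchedulerStub(channel)     # stub name follows the service above

    # Hand the server its resource budget and shrinking policy
    # (fields mirror scheduler.py's command-line flags)
    req = protocol_pb2.CheckinReq(use_remote_mem=True, max_cpus=8,
                                  max_mem=8192, uniform_ratio=0.5)
    reply = stub.checkin(req)
    print(reply.server_name, reply.success)
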
No part of the Material may be used, copied, reproduced, 11 | # modified, published, uploaded, posted, transmitted, distributed or disclosed 12 | # in any way without Intel's prior express written permission. No license under 13 | # any patent, copyright or other intellectual property rights in the Material 14 | # is granted to or conferred upon you, either expressly, by implication, 15 | # inducement, estoppel or otherwise. Any license under such intellectual 16 | # property rights must be express and approved by Intel in writing. 17 | # 18 | # Unless otherwise agreed by Intel in writing, you may not remove or alter this 19 | # notice or any other notice embedded in Materials by Intel or Intel's 20 | # suppliers or licensors in any way. 21 | #=============================================================================== 22 | 23 | echo "This is a SAMPLE run script for SMP LINPACK. Change it to reflect" 24 | echo "the correct number of CPUs/threads, problem input files, etc.." 25 | 26 | # Setting up affinity for better threading performance 27 | export KMP_AFFINITY=nowarnings,compact,1,0,granularity=fine 28 | #export MKL_NUM_THREADS=4 29 | #export OMP_NUM_THREADS=4 30 | #export MKL_DOMAIN_NUM_THREADS=4 31 | 32 | # Get Directory of Script 33 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 34 | 35 | # Use numactl for better performance on multi-socket machines. 36 | nnodes=`numactl -H 2>&1 | awk '/available:/ {print $2}'` 37 | cpucores=`cat /proc/cpuinfo | awk '/cpu cores/ {print $4; exit}'` 38 | 39 | if [ $nnodes -gt 1 -a $cpucores -gt 8 ] 40 | then 41 | numacmd="numactl --interleave=all" 42 | else 43 | numacmd= 44 | fi 45 | 46 | arch=xeon64 47 | { 48 | date 49 | /usr/bin/time -v $numacmd $DIR/xlinpack_$arch $DIR/lininput_$arch 50 | echo -n "Done: " 51 | date 52 | } | tee $DIR/lin_$arch.txt 53 | 54 | -------------------------------------------------------------------------------- /spark/pagerank/src/main/scala/PageRankExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | // scalastyle:off println 19 | //package org.apache.spark.examples.graphx 20 | 21 | // $example on$ 22 | import org.apache.spark.graphx.GraphLoader 23 | // $example off$ 24 | import org.apache.spark.sql.SparkSession 25 | import org.apache.log4j.{Level, Logger} 26 | 27 | /** 28 | * A PageRank example on social network dataset 29 | * Run with 30 | * {{{ 31 | * bin/run-example graphx.PageRankExample 32 | * }}} 33 | */ 34 | object pagerank { 35 | def main(args: Array[String]): Unit = { 36 | // Creates a SparkSession. 
37 | val spark = SparkSession 38 | .builder 39 | .appName(s"${this.getClass.getSimpleName}") 40 | .getOrCreate() 41 | val sc = spark.sparkContext 42 | 43 | val rootLogger = Logger.getRootLogger() 44 | rootLogger.setLevel(Level.ERROR) 45 | 46 | val home_dir = System.getProperty("user.home") 47 | val spark_home = home_dir + "/spark-2.4.0-bin-hadoop2.7/" 48 | val data_file = spark_home + "data/sosp/web-BerkStan.txt" 49 | // $example on$ 50 | // Load the edges as a graph 51 | //val graph = GraphLoader.edgeListFile(sc, "data/berkeley_stanford/web-BerkStan.txt") 52 | val graph = GraphLoader.edgeListFile(sc, data_file) 53 | // Run PageRank 54 | val ranks = graph.pageRank(0.0001).vertices 55 | // Join the ranks with the usernames 56 | //val users = sc.textFile("data/graphx/users.txt").map { line => 57 | /* val users = sc.textFile("users.txt").map { line => 58 | val fields = line.split(",") 59 | (fields(0).toLong, fields(1)) 60 | } 61 | val ranksByUsername = users.join(ranks).map { 62 | case (id, (username, rank)) => (username, rank) 63 | } 64 | // Print the result 65 | println(ranksByUsername.collect().mkString("\n")) 66 | */ 67 | // $example off$ 68 | spark.stop() 69 | } 70 | } 71 | // scalastyle:on println 72 | -------------------------------------------------------------------------------- /lib/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import subprocess 3 | import re 4 | import os 5 | import argparse 6 | 7 | g_sim_start = 0 8 | 9 | 10 | def get_current_ts(): 11 | global g_sim_start 12 | curr_ts = int(round(time.time() * 1000)) 13 | 14 | if g_sim_start == 0: 15 | g_sim_start = curr_ts 16 | return 0 17 | 18 | return curr_ts - g_sim_start 19 | 20 | 21 | def shell_exec(cmdline): 22 | p = subprocess.Popen(cmdline, stdout=subprocess.PIPE, 23 | stderr=subprocess.PIPE, shell=True) 24 | out, err = p.communicate() 25 | return (p.returncode, out.decode('utf-8'), err.decode('utf-8')) 26 | 27 | 28 | def check_sudo(): 29 | if os.geteuid() != 0: 30 | raise RuntimeError("Run with sudo.") 31 | 32 | 33 | def check_ratio(arg): 34 | ''' Check the validity of the argument passed for ratio. 35 | This function is passed to the argument parser. 
36 |     '''
37 |     if arg == 'max':
38 |         return 'max'
39 |     else:
40 |         try:
41 |             value = float(arg)
42 |         except ValueError:
43 |             msg = "Value provided for ratio is neither a number nor 'max'"
44 |             raise argparse.ArgumentTypeError(msg)
45 |         if (0 < value):
46 |             return value
47 |         else:
48 |             raise argparse.ArgumentTypeError("Ratio value must be > 0")
49 | 
50 | 
51 | class BinTimeParser:
52 |     def __init__(self):
53 |         pass
54 | 
55 |     def parse(self, string):
56 |         header = ','.join(('User Time', 'System Time',
57 |                            'Wall Time', 'Major Page Faults'))
58 |         values = {'User Time': self.get_user_time(string),
59 |                   'System Time': self.get_sys_time(string),
60 |                   'Wall Time': self.get_wall_time(string),
61 |                   'Major Page Faults': self.get_page_faults(string)}
62 |         return values
63 | 
64 |     def get_user_time(self, string):
65 |         regex = re.compile(r"User time \(seconds\): (\d+\.\d+)")
66 |         return float(regex.search(string).groups()[0])
67 | 
68 |     def get_sys_time(self, string):
69 |         regex = re.compile(r"System time \(seconds\): (\d+\.\d+)")
70 |         return float(regex.search(string).groups()[0])
71 | 
72 |     def get_wall_time(self, string):
73 |         regex = re.compile(r"\(h:mm:ss or m:ss\): (\d*?):*(\d+):(\d+\.\d+)")
74 |         hours, minutes, seconds = regex.search(string).groups()
75 |         hours = float(hours) if hours else 0  # hours may be None
76 |         minutes, seconds = float(minutes), float(seconds)
77 |         return round(hours * 3600 + minutes * 60 + seconds, 3)
78 | 
79 |     def get_page_faults(self, string):
80 |         regex = re.compile(r"Major \(requiring I/O\) page faults: (\d+)")
81 |         return int(regex.search(string).groups()[0])
82 | 
--------------------------------------------------------------------------------
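`BinTimeParser` scrapes GNU time's `-v` report with the regexes above. A quick self-contained check of the expected shape, run from the repository root; the sample text is an illustrative excerpt of `/usr/bin/time -v` output, not captured from a real run:

    from lib.utils import BinTimeParser

    sample = """
            User time (seconds): 1.25
            System time (seconds): 0.40
            Elapsed (wall clock) time (h:mm:ss or m:ss): 1:05.32
            Major (requiring I/O) page faults: 17
    """
    print(BinTimeParser().parse(sample))
    # {'User Time': 1.25, 'System Time': 0.4, 'Wall Time': 65.32, 'Major Page Faults': 17}
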
/lib/container.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from lib import utils
 3 | from lib import constants
 4 | 
 5 | 
 6 | class Container:
 7 |     def __init__(self, name, mem_req, ratio):
 8 |         self.name = name
 9 |         self.mem_req = mem_req  # in MB
10 |         self.ratio = ratio
11 | 
12 |     def exists(self):
13 |         """ Returns whether this container still exists """
14 |         return os.path.isdir(self.get_cont_path())
15 | 
16 |     def delete(self):
17 |         path = self.get_cont_path()
18 |         ret = utils.shell_exec("rmdir {0}".format(path))[0]
19 |         if ret:
20 |             raise RuntimeError("Error deleting {}".format(path))
21 | 
22 |     def set_memory_limit(self):
23 |         # this is possible if the caller is multithreaded
24 |         # and hasn't realized the container has been deleted
25 |         if not self.exists():
26 |             return
27 | 
28 |         if self.ratio == 'max':
29 |             memory_limit = 'max'
30 |             print("Setting container memory limit to Max")
31 |         else:
32 |             memory_limit = str(round(self.ratio*self.mem_req)) + 'M'
33 |             print("Setting {} memory limit to "
34 |                   "{}% ({}) of max".format(self.name,
35 |                                            round(self.ratio*100),
36 |                                            memory_limit))
37 | 
38 |         mem_high_path = self.get_cont_path() + '/memory.high'
39 |         with open(mem_high_path, 'w') as f:
40 |             f.write(memory_limit)
41 | 
42 |     def set_new_size(self, local_ratio):
43 |         self.ratio = local_ratio
44 |         self.set_memory_limit()
45 | 
46 |     def get_cont_path(self):
47 |         return "{}/{}".format(constants.CGROUP_PATH, self.name)
48 | 
49 |     def get_procs_path(self):
50 |         return self.get_cont_path() + '/cgroup.procs'
51 | 
52 |     def create(self):
53 |         """creates new container as child of CGROUP_PATH"""
54 |         new_cont_path = self.get_cont_path()
55 |         try:
56 |             os.mkdir(new_cont_path)
57 |             assert self.exists()
58 |         except FileExistsError:
59 |             print("container {} already exists, trying to delete".format(self.name))
60 |             self.delete()
61 |             os.mkdir(new_cont_path)
62 |         self.set_memory_limit()
63 | 
64 |     def get_pids(self):
65 |         try:
66 |             with open(self.get_procs_path(), 'r') as f:
67 |                 pids = f.readlines()
68 |                 pids = map(lambda p: p.rstrip('\n'), pids)
69 |                 pids = tuple(map(int, pids))
70 |                 return pids
71 |         except Exception as e:
72 |             print("Exception of type: {}".format(type(e)))
73 |             print("Procs path: {}".format(self.get_procs_path()))
74 |             return ()
75 | 
76 | def check():
77 |     '''Check that the cgroup path exists and that the memory controller is enabled'''
78 |     if not os.path.isdir(constants.CGROUP_PATH):
79 |         raise RuntimeError("{} does not exist".format(constants.CGROUP_PATH))
80 | 
81 |     with open(constants.CGROUP_PATH + '/cgroup.subtree_control', 'r') as f:
82 |         content = f.read()
83 |     if 'memory' not in content:
84 |         raise RuntimeError('memory controller not enabled')
85 | 
--------------------------------------------------------------------------------
/lib/ftracer.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | from multiprocessing import cpu_count
 4 | from lib import constants
 5 | from lib import utils
 6 | 
 7 | BUFFER_SIZE_DEFAULT = 1408
 8 | BUFFER_SIZE_MAX = 500000
 9 | 
10 | 
11 | class FTracer:
12 |     def __init__(self, filter_functions):
13 |         self.filter_functions = filter_functions
14 | 
15 |     def read_trace_stats(self):
16 |         regex = re.compile(r'([^\s]+)\s+(\d+)\s+(\d+\.*\d*)'
17 |                            r'\s+us\s+(\d+\.*\d*)\s+us\s+(\d+\.*\d*)')
18 |         stats = dict()
19 | 
20 |         for cpu in range(cpu_count()):
21 |             filename = constants.TRACING_DIR + 'trace_stat/function' + str(cpu)
22 |             with open(filename, 'r') as f:
23 |                 for line in f:
24 |                     match = regex.search(line)
25 |                     if match:
26 |                         func_name, hit, time, avg, std_dev = match.groups()
27 |                         if func_name in stats:
28 |                             stats[func_name]['hits'] += int(hit)
29 |                             stats[func_name]['sum_time'] += float(time)
30 |                         else:
31 |                             stats[func_name] = {'hits': int(hit),
32 |                                                 'sum_time': float(time)}
33 |                     else:
34 |                         pass
35 |         for func, values in stats.items():
36 |             values['avg'] = values['sum_time']/values['hits']
37 |         return stats
38 | 
39 |     def set_ftrace_filter(self):
40 |         filter_file = constants.TRACING_DIR + 'set_ftrace_filter'
41 |         with open(filter_file, 'w') as f:
42 |             f.write('\n'.join(self.filter_functions))
43 | 
44 |     def enable_function_profile(self):
45 |         filename = constants.TRACING_DIR + 'function_profile_enabled'
46 |         with open(filename, 'w') as f:
47 |             f.write('1')
48 | 
49 |     def disable_function_profile(self):
50 |         filename = constants.TRACING_DIR + 'function_profile_enabled'
51 |         with open(filename, 'w') as f:
52 |             f.write('0')
53 | 
54 |     def set_buffer_size_kb(self, size):
55 |         with open(constants.TRACING_DIR + 'buffer_size_kb', 'w') as f:
56 |             f.write(str(size))
57 | 
58 |     def enable_tracing_on(self):
59 |         with open(constants.TRACING_DIR + 'tracing_on', 'w') as f:
60 |             f.write('1')
61 | 
62 |     def disable_tracing_on(self):
63 |         with open(constants.TRACING_DIR + 'tracing_on', 'w') as f:
64 |             f.write('0')
65 | 
66 |     def set_current_tracer(self, tracer):
67 |         with open(constants.TRACING_DIR + 'current_tracer', 'w') as f:
68 |             f.write(tracer)
69 | 
70 |     def copy_trace(self, name, mem_ratio):
71 |         print("Copying trace to current directory")
72 |         cp_trace = ' '.join(('sudo cp',
73 |                              constants.TRACING_DIR + 'trace',
74 |                              '{}_{}_{}')).format(name, mem_ratio,
75 |                                                  '_'.join(self.filter_functions))
76 |         utils.shell_exec(cp_trace)
77 | 
78 |     def setup_profile(self):
79 |         self.set_current_tracer('function')
80 | 
self.set_ftrace_filter() 81 | self.disable_function_profile() 82 | self.enable_function_profile() 83 | 84 | def teardown_profile(self): 85 | self.disable_function_profile() 86 | 87 | def setup_timestamp(self): 88 | self.set_current_tracer('function') 89 | self.set_ftrace_filter() 90 | self.set_buffer_size_kb(BUFFER_SIZE_MAX) 91 | self.enable_tracing_on() 92 | 93 | def teardown_timestamp(self): 94 | self.disable_tracing_on() 95 | self.set_buffer_size_kb(BUFFER_SIZE_DEFAULT) 96 | -------------------------------------------------------------------------------- /stream/READ.ME: -------------------------------------------------------------------------------- 1 | =============================================== 2 | 3 | STREAM is the de facto industry standard benchmark 4 | for measuring sustained memory bandwidth. 5 | 6 | Documentation for STREAM is on the web at: 7 | http://www.cs.virginia.edu/stream/ref.html 8 | 9 | =============================================== 10 | NEWS 11 | =============================================== 12 | UPDATE: October 28 2014: 13 | 14 | "stream_mpi.c" released in the Versions directory. 15 | 16 | Based on Version 5.10 of stream.c, stream_mpi.c 17 | brings the following new features: 18 | * MPI implementation that *distributes* the arrays 19 | across all MPI ranks. (The older Fortran version 20 | of STREAM in MPI *replicates* the arrays across 21 | all MPI ranks.) 22 | * Data is allocated using "posix_memalign" 23 | rather than using static arrays. Different 24 | compiler flags may be needed for both portability 25 | and optimization. 26 | See the READ.ME file in the Versions directory 27 | for more details. 28 | * Error checking and timing done by all ranks and 29 | gathered by rank 0 for processing and output. 30 | * Timing code uses barriers to ensure correct 31 | operation even when multiple MPI ranks run on 32 | shared memory systems. 33 | 34 | NOTE: MPI is not a preferred implementation for 35 | STREAM, which is intended to measure memory 36 | bandwidth in shared-memory systems. In stream_mpi, 37 | the MPI calls are only used to properly synchronize 38 | the timers (using MPI_Barrier) and to gather 39 | timing and error data, so the performance should 40 | scale linearly with the size of the cluster. 41 | But it may be useful, and was an interesting 42 | exercise to develop and debug. 43 | 44 | =============================================== 45 | UPDATE: January 17 2013: 46 | 47 | Version 5.10 of stream.c is finally available! 48 | 49 | There are no changes to what is being measured, but 50 | a number of long-awaited improvements have been made: 51 | 52 | * Updated validation code does not suffer from 53 | accumulated roundoff error for large arrays. 54 | * Defining the preprocessor variable "VERBOSE" 55 | when compiling will (1) cause the code to print the 56 | measured average relative absolute error (rather than 57 | simply printing "Solution Validates", and (2) print 58 | the first 10 array entries with relative error exceeding 59 | the error tolerance. 60 | * Array index variables have been upgraded from 61 | "int" to "ssize_t" to allow arrays with more 62 | than 2 billion elements on 64-bit systems. 63 | * Substantial improvements to the comments in 64 | the source on how to configure/compile/run the 65 | benchmark. 66 | * The proprocessor variable controlling the array 67 | size has been changed from "N" to "STREAM_ARRAY_SIZE". 
68 | * A new preprocessor variable "STREAM_TYPE" can be 69 | used to override the data type from the default 70 | "double" to "float". 71 | This mechanism could also be used to change to 72 | non-floating-point types, but several "printf" 73 | statements would need to have their formats changed 74 | to accomodate the modified data type. 75 | * Some small changes in output, including printing 76 | array sizes is GiB as well as MiB. 77 | * Change to the default output format to print fewer 78 | decimals for the bandwidth and more decimals for 79 | the min/max/avg execution times. 80 | 81 | 82 | =============================================== 83 | UPDATE: February 19 2009: 84 | 85 | The most recent "official" versions have been renamed 86 | "stream.f" and "stream.c" -- all other versions have 87 | been moved to the "Versions" subdirectory and should be 88 | considered obsolete. 89 | 90 | The "official" timer (was "second_wall.c") has been 91 | renamed "mysecond.c". This is embedded in the C version 92 | ("stream.c"), but still needs to be externally linked to 93 | the FORTRAN version ("stream.f"). The new version defines 94 | entry points both with and without trailing underscores, 95 | so it *should* link automagically with any Fortran compiler. 96 | 97 | =============================================== 98 | 99 | STREAM is a project of "Dr. Bandwidth": 100 | John D. McCalpin, Ph.D. 101 | john@mccalpin.com 102 | 103 | =============================================== 104 | 105 | The STREAM web and ftp sites are currently hosted at 106 | the Department of Computer Science at the University of 107 | Virginia under the generous sponsorship of Professor Bill 108 | Wulf and Professor Alan Batson. 109 | 110 | =============================================== 111 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Setup and pre-requisites 2 | 3 | On the client node the fastswap kernel and driver must be loaded. On the far memory node the server binary `rmserver` must be running. Please see https://github.com/clusterfarmem/fastswap for more details. 4 | 5 | ## General pre-requisites 6 | 7 | You'll need python3, grpcio, grpcio-tools, numpy and scipy to execute various parts of our framework. Please make sure your python environment can see these modules. 8 | 9 | ## Workload setup (for single and multi-workload benchmarks) 10 | 11 | * quicksort 12 | * Change directory to quicksort and type make 13 | * linpack 14 | * No setup required, but most likely you'll need an Intel CPU 15 | * tf-inception 16 | * tensorflow 1.14 is required 17 | * Init submodules `git submodule update --init` 18 | * spark 19 | * We assume the user has installed [spark 2.4](https://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz) at `~/spark-2.4.0-bin-hadoop2.7` 20 | * kmeans 21 | * Requires sklearn available in python3 22 | * memcached 23 | * Requires `memcached` and `memaslap` to be installed and available in your $PATH environment. 
24 | * stream
25 |   * Change directory to stream and type make
26 | 
27 | ## Setting up cgroups
28 | ### Disable cgroup v1
29 | * Open /boot/grub/grub.cfg in your editor of choice
30 | * Find the `menuentry` for the fastswap kernel
31 | * Add `cgroup_no_v1=memory` to the end of the line beginning with `linux /boot/vmlinuz-4.11.0-sswap`
32 | * Save and exit the file
33 | * Run: sudo update-grub
34 | * Reboot
35 | 
36 | ### Enable cgroup v2
37 | The framework and scripts rely on the cgroup system being mounted at /cgroup2. Perform the following actions:
38 | * Run `sudo mkdir /cgroup2` to create the root mount point
39 | * Execute `setup/init_bench_cgroups.sh`, which:
40 |   * Mounts the cgroup system
41 |   * Changes ownership of the mount point (and all nested files) to the current user
42 |   * Enables prefetching
43 | 
44 | ## Protocol Buffers
45 | We use [the grpc framework](https://grpc.io) and [protocol buffers](https://developers.google.com/protocol-buffers/docs/pythontutorial) to communicate between the scheduler and servers. The messages that we've defined are in `protocol/protocol.proto`. To generate the corresponding `.py` files, execute the following command in the `protocol` directory:
46 | 
47 |     source gen_protocol.sh
48 | 
49 | # Single Workload Benchmarks
50 | ## `benchmark.py`
51 | 
52 | `benchmark.py` is the command center from which you can run local, single benchmarks. It accepts numerous arguments, but only two, `workload` and `ratio`, are required. Its minimum invocation is the following:
53 | 
54 |     ./benchmark.py <workload> <ratio>
55 | 
56 | Where `workload` is an application that the toolset has been configured to benchmark (e.g. linpack) and `ratio` is the portion of its resident set size that you want to keep in local memory, expressed as a decimal.
57 | 
58 | Running the tool in this way will set the appropriate limits in the application's cgroup, run it to completion, then print statistics to stdout.
59 | 
60 | ## Arguments
61 | Argument | Description | Required
62 | --------------------------------|-----------------------|----------------------
63 | workload | An application that the toolset has been configured to benchmark (e.g. linpack) | Y
64 | ratio | The portion of the workload's resident set size that you want to keep in local memory, expressed as a decimal | Y
65 | --id | The workload ID that's appended to the workload's name to create its container name. If left unset, it defaults to 0 | N
66 | --cpus | A comma-separated list of CPUs to pin the workload to. If this is left unset, the workload will be pinned to CPUs `[0, N-1]` where `N` is the number of CPUs listed in the workload's class | N
67 | 
68 | ## Examples
69 | ### Linpack with 50% local memory on CPUs 4,5,6,7
70 |     ./benchmark.py linpack 0.5 --cpus 4,5,6,7
71 | 
72 | ### Quicksort with 30% local memory with an ID of 5
73 |     ./benchmark.py quicksort 0.3 --id 5
74 | 
75 | ## Adding Additional Workloads
76 | New workloads can be added by modifying the workload_choices variable in `benchmark.py` and creating a new class for it in `lib/workloads.py`.
77 | 
78 | # Multi-workload Benchmarks
79 | 
80 | ## `server.py`
81 | `server.py` runs on a separate (or even the same) machine from `scheduler.py`. Multiple `server.py` instances send execution-related data to a single `scheduler.py` instance, receiving workload execution directions in turn. `server.py` takes a single optional flag, `--log`, that directs it to save a timestamped account of events to a file named `log.txt` in the same directory.
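
The RPC surface between the two sides is the `Scheduler` service defined in `protocol/protocol.proto`, so a server's live state can also be inspected with a few lines of Python against the generated stubs. A minimal sketch, assuming the stubs from `gen_protocol.sh` are importable from the repo root and that a `server.py` instance is listening at the illustrative address below:

    import grpc
    from protocol import protocol_pb2, protocol_pb2_grpc

    channel = grpc.insecure_channel('192.168.0.1:50051')  # illustrative ip:port
    stub = protocol_pb2_grpc.SchedulerStub(channel)

    # utilization series sampled on the server (see GetSamplesReply)
    samples = stub.get_samples(protocol_pb2.GetSamplesReq())
    print('cpu samples:', list(samples.cpu_util))
    print('swap bytes in/out:', samples.bytes_in, samples.bytes_out)

    # start/finish timestamps of completed workloads (see GetFinishedReply)
    finished = stub.get_finished(protocol_pb2.GetFinishedReq())
    print(dict(finished.finished_times))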
82 | 83 | ## Potential Issues 84 | We made a lot of assumptions about system configuration. `server.py` expects several files to exist on your system, mostly for sampling purposes. If they don't exist, we insert zeroes instead of reading their values. 85 | 86 | ## `scheduler.py` 87 | This is the brains of the server-scheduler system. The scheduler is responsible for determining the arrival order of workloads, setting the shrinking policy, and aggregating all of the data from the server(s). 88 | 89 | ## Arguments 90 | Argument | Description | Required 91 | --------------------------------|-------------------------------|-------------- 92 | seed | The seed used to initialize the randomized operations that the scheduler performs | Y 93 | servers | A comma-separated list of ip:port combinations on which `server.py` instances are listening | Y 94 | cpus | The number of cpus that each server is allowed to use | Y 95 | mem | The amount of local memory that each server is allowed to use | Y 96 | --remotemem, -r | Enables remote memory on each of the `server.py` instances | N 97 | --max_far, -s | The maximum aggregate remote memory that servers are allowed to use. Enforced entirely in the scheduler. Default = Unlimited | N 98 | --size | The total number of workloads to run. Default = 200| N 99 | --workload | A comma-separated set of unique workloads to run. Default = quicksort,kmeans,memaslap | N 100 | --ratios | A colon-separated set of ratios that correspond to the arguments for --workload. This determines how well-represented a particular workload type is in the aggregate. Default = 2:1:1 | N 101 | --until | The maximum arrival time of a workload. Default = 20 | N 102 | --uniform_ratio | Smallest local memory ratio for the uniform shrinking policy | N 103 | --variable_ratios | A comma-separated list of minimum local memory ratios that correspond to the arguments for --workload | N 104 | --start_burst | The number of workloads that will have their arrival time set to 0 instead of randomized. Default = 0 | N 105 | --optimal | Use the optimal shrinking policy | N 106 | 107 | ## Examples 108 | 109 | ./scheduler.py 123 192.168.0.1:50051 8 8192 -r --max_far 4096 --size 100 \ 110 | --workload quicksort,kmeans,linpack --ratios 3:1:1 --until 30 --optimal 111 | 112 | Parameter | Value | Explanation 113 | --------------|-----------------|------ 114 | seed | 123 | Randomization seed. The same seed creates the same arrival pattern 115 | servers | 192.168.0.1:50051 | Connect to a `server.py` instance at IP 192.168.0.1 that's listening on port 50051 116 | cpus | 8 | The `server.py` instance can use a total of 8 CPUs 117 | mem | 8192 (8192 = 8GB) | The `server.py` instance can use a total of 8GB of local memory 118 | -r | Set | Enable the use of remote memory (for swapping) 119 | --max_far | 4096 | The `server.py` instance can use a total of 4GB of remote memory 120 | --size | 100 | A total of 100 workloads will be scheduled. The type/number are determined by `--workload` and `--ratios` 121 | --workload | quicksort,kmeans,linpack | The previously-specified 100 workloads will consist of quicksort, kmeans, and linpack. The mixture is determined by `--ratios` 122 | --ratios | 3:1:1 | The first, second, and third workloads in the comma-separated list passed to `--workload` constitute 60% (3/(3+1+1)), 20% (1/(3+1+1)), and 20% (1/(3+1+1)) of the 100 workloads respectively. In this example, there will be 60 quicksorts, 20 kmeans, and 20 linpacks scheduled. 
123 | --until | 30 | Each of the 100 workloads will have a random arrival time between 0 and 30 seconds
124 | --optimal | Set | `server.py` and `scheduler.py` will use the optimal shrinking policy. Setting this precludes using both `--uniform_ratio` and `--variable_ratios`
125 | 
126 |     ./scheduler.py 123 192.168.0.1:50051 8 8192 -r --size 100 --workload quicksort,kmeans,linpack \
127 |         --ratios 3:1:1 --until 30 --variable_ratios 0.5,0.6,0.7
128 | 
129 | Parameter | Value | Explanation
130 | --------------|-----------------|------
131 | seed | 123 | Randomization seed. The same seed creates the same arrival pattern
132 | servers | 192.168.0.1:50051 | Connect to a `server.py` instance at IP 192.168.0.1 that's listening on port 50051
133 | cpus | 8 | The `server.py` instance can use a total of 8 CPUs
134 | mem | 8192 (8192 MB = 8GB) | The `server.py` instance can use a total of 8GB of local memory
135 | -r | Set | Enable the use of remote memory (for swapping)
136 | --max_far | Unset | The `server.py` instance can use unlimited remote memory
137 | --size | 100 | A total of 100 workloads will be scheduled. The type/number are determined by `--workload` and `--ratios`
138 | --workload | quicksort,kmeans,linpack | The previously-specified 100 workloads will consist of quicksort, kmeans, and linpack. The mixture is determined by `--ratios`
139 | --ratios | 3:1:1 | The first, second, and third workloads in the comma-separated list passed to `--workload` constitute 60% (3/(3+1+1)), 20% (1/(3+1+1)), and 20% (1/(3+1+1)) of the 100 workloads respectively. In this example, there will be 60 quicksorts, 20 kmeans, and 20 linpacks scheduled.
140 | --until | 30 | Each of the 100 workloads will have a random arrival time between 0 and 30 seconds
141 | --variable_ratios | 0.5,0.6,0.7 | The three workloads (quicksort, kmeans, and linpack) will have their minimum ratios set to 0.5, 0.6, and 0.7 respectively. `server.py` and `scheduler.py` will use the variable shrinking policy. Setting this precludes using both `--uniform_ratio` and `--optimal`
142 | 
143 |     ./scheduler.py 123 192.168.0.1:50051,192.168.0.2:50051 8 8192 -r --size 250 \
144 |         --workload quicksort,kmeans,linpack --ratios 3:1:1 --uniform_ratio 0.5 \
145 |         --until 30 --start_burst 2
146 | 
147 | Parameter | Value | Explanation
148 | --------------|-----------------|------
149 | seed | 123 | Randomization seed. The same seed creates the same arrival pattern
150 | servers | 192.168.0.1:50051,192.168.0.2:50051 | Connect to `server.py` instances at IPs 192.168.0.1 and 192.168.0.2 that are both listening on port 50051
151 | cpus | 8 | Each `server.py` instance can use a total of 8 CPUs
152 | mem | 8192 (8192 MB = 8GB) | Each `server.py` instance can use a total of 8GB of local memory
153 | -r | Set | Enable the use of remote memory (for swapping)
154 | --max_far | Unset | Each `server.py` instance can use unlimited remote memory
155 | --size | 250 | A total of 250 workloads will be scheduled. The type/number are determined by `--workload` and `--ratios`
156 | --workload | quicksort,kmeans,linpack | The previously-specified 250 workloads will consist of quicksort, kmeans, and linpack. The mixture is determined by `--ratios`
157 | --ratios | 3:1:1 | The first, second, and third workloads in the comma-separated list passed to `--workload` constitute 60% (3/(3+1+1)), 20% (1/(3+1+1)), and 20% (1/(3+1+1)) of the 250 workloads respectively. In this example, there will be 150 quicksorts, 50 kmeans, and 50 linpacks scheduled.
158 | --uniform_ratio | 0.5 | The three workloads (quicksort, kmeans, and linpack) will have their minimum ratios set to 0.5. `server.py` and `scheduler.py` will use the uniform shrinking policy. Setting this precludes using both `--optimal` and `--variable_ratios`
159 | --until | 30 | Each of the 250 workloads will have a random arrival time between 0 and 30 seconds
160 | --start_burst | 2 | The first 2 workloads in the schedule will have their arrival times modified to be 0. This causes them to arrive immediately.
161 | 
162 | ## Further reading
163 | For more information, please refer to our [paper](https://dl.acm.org/doi/abs/10.1145/3342195.3387522) accepted at [EUROSYS 2020](https://www.eurosys2020.org/)
164 | 
165 | ## Questions
166 | For additional questions please contact us at cfm@lists.eecs.berkeley.edu
167 | 
--------------------------------------------------------------------------------
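The arrival-pattern knobs documented above (`seed`, `--size`, `--ratios`, `--until`, `--start_burst`) compose roughly as in the sketch below. `scheduler.py` itself is not included in this listing, so this is an illustration of the documented behavior rather than its actual code:

    import numpy as np

    def make_schedule(seed, size, workloads, ratios, until, start_burst=0):
        """Illustrative reconstruction of the documented arrival-pattern logic."""
        rng = np.random.RandomState(seed)               # same seed -> same schedule
        weights = np.array(ratios, dtype=float) / sum(ratios)
        counts = (weights * size).astype(int)           # 3:1:1 over 100 -> 60/20/20
        names = np.repeat(workloads, counts)
        rng.shuffle(names)
        arrivals = np.sort(rng.uniform(0, until, size=len(names)))
        arrivals[:start_burst] = 0                      # --start_burst arrivals at t=0
        return list(zip(arrivals, names))

    print(make_schedule(123, 10, ['quicksort', 'kmeans', 'linpack'], [3, 1, 1], 30, 2))
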
/stream/stream.f:
--------------------------------------------------------------------------------
 1 | *=======================================================================
 2 | * Program: STREAM
 3 | * Programmer: John D. McCalpin
 4 | * RCS Revision: $Id: stream.f,v 5.6 2005/10/04 00:20:48 mccalpin Exp mccalpin $
 5 | *-----------------------------------------------------------------------
 6 | * Copyright 1991-2003: John D. McCalpin
 7 | *-----------------------------------------------------------------------
 8 | * License:
 9 | *   1. You are free to use this program and/or to redistribute
10 | *      this program.
11 | *   2. You are free to modify this program for your own use,
12 | *      including commercial use, subject to the publication
13 | *      restrictions in item 3.
14 | *   3. You are free to publish results obtained from running this
15 | *      program, or from works that you derive from this program,
16 | *      with the following limitations:
17 | *      3a. In order to be referred to as "STREAM benchmark results",
18 | *          published results must be in conformance to the STREAM
19 | *          Run Rules, (briefly reviewed below) published at
20 | *          http://www.cs.virginia.edu/stream/ref.html
21 | *          and incorporated herein by reference.
22 | *          As the copyright holder, John McCalpin retains the
23 | *          right to determine conformity with the Run Rules.
24 | *      3b. Results based on modified source code or on runs not in
25 | *          accordance with the STREAM Run Rules must be clearly
26 | *          labelled whenever they are published. Examples of
27 | *          proper labelling include:
28 | *            "tuned STREAM benchmark results"
29 | *            "based on a variant of the STREAM benchmark code"
30 | *          Other comparable, clear and reasonable labelling is
31 | *          acceptable.
32 | *      3c. Submission of results to the STREAM benchmark web site
33 | *          is encouraged, but not required.
34 | *   4. Use of this program or creation of derived works based on this
35 | *      program constitutes acceptance of these licensing restrictions.
36 | *   5. Absolutely no warranty is expressed or implied.
37 | *-----------------------------------------------------------------------
38 | * This program measures sustained memory transfer rates in MB/s for
39 | * simple computational kernels coded in FORTRAN.
40 | *
41 | * The intent is to demonstrate the extent to which ordinary user
42 | * code can exploit the main memory bandwidth of the system under
43 | * test.
44 | *======================================================================= 45 | * The STREAM web page is at: 46 | * http://www.streambench.org 47 | * 48 | * Most of the content is currently hosted at: 49 | * http://www.cs.virginia.edu/stream/ 50 | * 51 | * BRIEF INSTRUCTIONS: 52 | * 0) See http://www.cs.virginia.edu/stream/ref.html for details 53 | * 1) STREAM requires a timing function called mysecond(). 54 | * Several examples are provided in this directory. 55 | * "CPU" timers are only allowed for uniprocessor runs. 56 | * "Wall-clock" timers are required for all multiprocessor runs. 57 | * 2) The STREAM array sizes must be set to size the test. 58 | * The value "N" must be chosen so that each of the three 59 | * arrays is at least 4x larger than the sum of all the last- 60 | * level caches used in the run, or 1 million elements, which- 61 | * ever is larger. 62 | * ------------------------------------------------------------ 63 | * Note that you are free to use any array length and offset 64 | * that makes each array 4x larger than the last-level cache. 65 | * The intent is to determine the *best* sustainable bandwidth 66 | * available with this simple coding. Of course, lower values 67 | * are usually fairly easy to obtain on cached machines, but 68 | * by keeping the test to the *best* results, the answers are 69 | * easier to interpret. 70 | * You may put the arrays in common or not, at your discretion. 71 | * There is a commented-out COMMON statement below. 72 | * Fortran90 "allocatable" arrays are fine, too. 73 | * ------------------------------------------------------------ 74 | * 3) Compile the code with full optimization. Many compilers 75 | * generate unreasonably bad code before the optimizer tightens 76 | * things up. If the results are unreasonably good, on the 77 | * other hand, the optimizer might be too smart for me 78 | * Please let me know if this happens. 79 | * 4) Mail the results to mccalpin@cs.virginia.edu 80 | * Be sure to include: 81 | * a) computer hardware model number and software revision 82 | * b) the compiler flags 83 | * c) all of the output from the test case. 84 | * Please let me know if you do not want your name posted along 85 | * with the submitted results. 86 | * 5) See the web page for more comments about the run rules and 87 | * about interpretation of the results. 88 | * 89 | * Thanks, 90 | * Dr. Bandwidth 91 | *========================================================================= 92 | * 93 | PROGRAM stream 94 | * IMPLICIT NONE 95 | C .. Parameters .. 96 | INTEGER n,offset,ndim,ntimes 97 | PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10) 98 | C .. 99 | C .. Local Scalars .. 100 | DOUBLE PRECISION scalar,t 101 | INTEGER j,k,nbpw,quantum 102 | C .. 103 | C .. Local Arrays .. 104 | DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4), 105 | $ times(4,ntimes) 106 | INTEGER bytes(4) 107 | CHARACTER label(4)*11 108 | C .. 109 | C .. External Functions .. 110 | DOUBLE PRECISION mysecond 111 | INTEGER checktick,realsize 112 | EXTERNAL mysecond,checktick,realsize 113 | !$ INTEGER omp_get_num_threads 114 | !$ EXTERNAL omp_get_num_threads 115 | C .. 116 | C .. Intrinsic Functions .. 117 | C 118 | INTRINSIC dble,max,min,nint,sqrt 119 | C .. 120 | C .. Arrays in Common .. 121 | DOUBLE PRECISION a(ndim),b(ndim),c(ndim) 122 | C .. 123 | C .. Common blocks .. 124 | * COMMON a,b,c 125 | C .. 126 | C .. Data statements .. 
127 | DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/ 128 | DATA label/'Copy: ','Scale: ','Add: ', 129 | $ 'Triad: '/ 130 | DATA bytes/2,2,3,3/ 131 | C .. 132 | 133 | * --- SETUP --- determine precision and check timing --- 134 | 135 | nbpw = realsize() 136 | 137 | PRINT *,'----------------------------------------------' 138 | PRINT *,'STREAM Version $Revision: 5.6 $' 139 | PRINT *,'----------------------------------------------' 140 | WRITE (*,FMT=9010) 'Array size = ',n 141 | WRITE (*,FMT=9010) 'Offset = ',offset 142 | WRITE (*,FMT=9020) 'The total memory requirement is ', 143 | $ 3*nbpw*n/ (1024*1024),' MB' 144 | WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times' 145 | WRITE (*,FMT=9030) '--' 146 | WRITE (*,FMT=9030) 'The *best* time for each test is used' 147 | WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations' 148 | 149 | !$OMP PARALLEL 150 | !$OMP MASTER 151 | PRINT *,'----------------------------------------------' 152 | !$ PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS() 153 | !$OMP END MASTER 154 | !$OMP END PARALLEL 155 | 156 | PRINT *,'----------------------------------------------' 157 | !$OMP PARALLEL 158 | PRINT *,'Printing one line per active thread....' 159 | !$OMP END PARALLEL 160 | 161 | !$OMP PARALLEL DO 162 | DO 10 j = 1,n 163 | a(j) = 2.0d0 164 | b(j) = 0.5D0 165 | c(j) = 0.0D0 166 | 10 CONTINUE 167 | t = mysecond() 168 | !$OMP PARALLEL DO 169 | DO 20 j = 1,n 170 | a(j) = 0.5d0*a(j) 171 | 20 CONTINUE 172 | t = mysecond() - t 173 | PRINT *,'----------------------------------------------------' 174 | quantum = checktick() 175 | WRITE (*,FMT=9000) 176 | $ 'Your clock granularity/precision appears to be ',quantum, 177 | $ ' microseconds' 178 | PRINT *,'----------------------------------------------------' 179 | 180 | * --- MAIN LOOP --- repeat test cases NTIMES times --- 181 | scalar = 0.5d0*a(1) 182 | DO 70 k = 1,ntimes 183 | 184 | t = mysecond() 185 | a(1) = a(1) + t 186 | !$OMP PARALLEL DO 187 | DO 30 j = 1,n 188 | c(j) = a(j) 189 | 30 CONTINUE 190 | t = mysecond() - t 191 | c(n) = c(n) + t 192 | times(1,k) = t 193 | 194 | t = mysecond() 195 | c(1) = c(1) + t 196 | !$OMP PARALLEL DO 197 | DO 40 j = 1,n 198 | b(j) = scalar*c(j) 199 | 40 CONTINUE 200 | t = mysecond() - t 201 | b(n) = b(n) + t 202 | times(2,k) = t 203 | 204 | t = mysecond() 205 | a(1) = a(1) + t 206 | !$OMP PARALLEL DO 207 | DO 50 j = 1,n 208 | c(j) = a(j) + b(j) 209 | 50 CONTINUE 210 | t = mysecond() - t 211 | c(n) = c(n) + t 212 | times(3,k) = t 213 | 214 | t = mysecond() 215 | b(1) = b(1) + t 216 | !$OMP PARALLEL DO 217 | DO 60 j = 1,n 218 | a(j) = b(j) + scalar*c(j) 219 | 60 CONTINUE 220 | t = mysecond() - t 221 | a(n) = a(n) + t 222 | times(4,k) = t 223 | 70 CONTINUE 224 | 225 | * --- SUMMARY --- 226 | DO 90 k = 2,ntimes 227 | DO 80 j = 1,4 228 | avgtime(j) = avgtime(j) + times(j,k) 229 | mintime(j) = min(mintime(j),times(j,k)) 230 | maxtime(j) = max(maxtime(j),times(j,k)) 231 | 80 CONTINUE 232 | 90 CONTINUE 233 | WRITE (*,FMT=9040) 234 | DO 100 j = 1,4 235 | avgtime(j) = avgtime(j)/dble(ntimes-1) 236 | WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6, 237 | $ avgtime(j),mintime(j),maxtime(j) 238 | 100 CONTINUE 239 | PRINT *,'----------------------------------------------------' 240 | CALL checksums (a,b,c,n,ntimes) 241 | PRINT *,'----------------------------------------------------' 242 | 243 | 9000 FORMAT (1x,a,i6,a) 244 | 9010 FORMAT (1x,a,i10) 245 | 9020 FORMAT (1x,a,i4,a) 246 | 9030 FORMAT (1x,a,i3,a,a) 247 | 9040 FORMAT ('Function',5x,'Rate 
(MB/s) Avg time Min time Max time' 248 | $ ) 249 | 9050 FORMAT (a,4 (f10.4,2x)) 250 | END 251 | 252 | *------------------------------------- 253 | * INTEGER FUNCTION dblesize() 254 | * 255 | * A semi-portable way to determine the precision of DOUBLE PRECISION 256 | * in Fortran. 257 | * Here used to guess how many bytes of storage a DOUBLE PRECISION 258 | * number occupies. 259 | * 260 | INTEGER FUNCTION realsize() 261 | * IMPLICIT NONE 262 | 263 | C .. Local Scalars .. 264 | DOUBLE PRECISION result,test 265 | INTEGER j,ndigits 266 | C .. 267 | C .. Local Arrays .. 268 | DOUBLE PRECISION ref(30) 269 | C .. 270 | C .. External Subroutines .. 271 | EXTERNAL confuse 272 | C .. 273 | C .. Intrinsic Functions .. 274 | INTRINSIC abs,acos,log10,sqrt 275 | C .. 276 | 277 | C Test #1 - compare single(1.0d0+delta) to 1.0d0 278 | 279 | 10 DO 20 j = 1,30 280 | ref(j) = 1.0d0 + 10.0d0** (-j) 281 | 20 CONTINUE 282 | 283 | DO 30 j = 1,30 284 | test = ref(j) 285 | ndigits = j 286 | CALL confuse(test,result) 287 | IF (test.EQ.1.0D0) THEN 288 | GO TO 40 289 | END IF 290 | 30 CONTINUE 291 | GO TO 50 292 | 293 | 40 WRITE (*,FMT='(a)') 294 | $ '----------------------------------------------' 295 | WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ', 296 | $ ndigits,' digits of accuracy' 297 | IF (ndigits.LE.8) THEN 298 | realsize = 4 299 | ELSE 300 | realsize = 8 301 | END IF 302 | WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize, 303 | $ ' bytes per DOUBLE PRECISION word' 304 | WRITE (*,FMT='(a)') 305 | $ '----------------------------------------------' 306 | RETURN 307 | 308 | 50 PRINT *,'Hmmmm. I am unable to determine the size.' 309 | PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION', 310 | $ ' number : ' 311 | READ (*,FMT=*) realsize 312 | IF (realsize.NE.4 .AND. realsize.NE.8) THEN 313 | PRINT *,'Your answer ',realsize,' does not make sense.' 314 | PRINT *,'Try again.' 315 | PRINT *,'Please enter the number of Bytes per ', 316 | $ 'DOUBLE PRECISION number : ' 317 | READ (*,FMT=*) realsize 318 | END IF 319 | PRINT *,'You have manually entered a size of ',realsize, 320 | $ ' bytes per DOUBLE PRECISION number' 321 | WRITE (*,FMT='(a)') 322 | $ '----------------------------------------------' 323 | END 324 | 325 | SUBROUTINE confuse(q,r) 326 | * IMPLICIT NONE 327 | C .. Scalar Arguments .. 328 | DOUBLE PRECISION q,r 329 | C .. 330 | C .. Intrinsic Functions .. 331 | INTRINSIC cos 332 | C .. 333 | r = cos(q) 334 | RETURN 335 | END 336 | 337 | * A semi-portable way to determine the clock granularity 338 | * Adapted from a code by John Henning of Digital Equipment Corporation 339 | * 340 | INTEGER FUNCTION checktick() 341 | * IMPLICIT NONE 342 | 343 | C .. Parameters .. 344 | INTEGER n 345 | PARAMETER (n=20) 346 | C .. 347 | C .. Local Scalars .. 348 | DOUBLE PRECISION t1,t2 349 | INTEGER i,j,jmin 350 | C .. 351 | C .. Local Arrays .. 352 | DOUBLE PRECISION timesfound(n) 353 | C .. 354 | C .. External Functions .. 355 | DOUBLE PRECISION mysecond 356 | EXTERNAL mysecond 357 | C .. 358 | C .. Intrinsic Functions .. 359 | INTRINSIC max,min,nint 360 | C .. 
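*     NOTE: checktick estimates the timer granularity by polling
*     mysecond() until n = 20 distinct readings have been collected,
*     then returning the smallest positive gap between consecutive
*     readings, rounded to microseconds.  If every gap rounds to zero,
*     the clock is finer than one microsecond and 1 is returned.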
361 | i = 0 362 | 363 | 10 t2 = mysecond() 364 | IF (t2.EQ.t1) GO TO 10 365 | 366 | t1 = t2 367 | i = i + 1 368 | timesfound(i) = t1 369 | IF (i.LT.n) GO TO 10 370 | 371 | jmin = 1000000 372 | DO 20 i = 2,n 373 | j = nint((timesfound(i)-timesfound(i-1))*1d6) 374 | jmin = min(jmin,max(j,0)) 375 | 20 CONTINUE 376 | 377 | IF (jmin.GT.0) THEN 378 | checktick = jmin 379 | ELSE 380 | PRINT *,'Your clock granularity appears to be less ', 381 | $ 'than one microsecond' 382 | checktick = 1 383 | END IF 384 | RETURN 385 | 386 | * PRINT 14, timesfound(1)*1d6 387 | * DO 20 i=2,n 388 | * PRINT 14, timesfound(i)*1d6, 389 | * & nint((timesfound(i)-timesfound(i-1))*1d6) 390 | * 14 FORMAT (1X, F18.4, 1X, i8) 391 | * 20 CONTINUE 392 | 393 | END 394 | 395 | 396 | 397 | 398 | SUBROUTINE checksums(a,b,c,n,ntimes) 399 | * IMPLICIT NONE 400 | C .. 401 | C .. Arguments .. 402 | DOUBLE PRECISION a(*),b(*),c(*) 403 | INTEGER n,ntimes 404 | C .. 405 | C .. Local Scalars .. 406 | DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon 407 | INTEGER k 408 | C .. 409 | 410 | C Repeat the main loop, but with scalars only. 411 | C This is done to check the sum & make sure all 412 | C iterations have been executed correctly. 413 | 414 | aa = 2.0D0 415 | bb = 0.5D0 416 | cc = 0.0D0 417 | aa = 0.5D0*aa 418 | scalar = 0.5d0*aa 419 | DO k = 1,ntimes 420 | cc = aa 421 | bb = scalar*cc 422 | cc = aa + bb 423 | aa = bb + scalar*cc 424 | END DO 425 | aa = aa*DBLE(n-2) 426 | bb = bb*DBLE(n-2) 427 | cc = cc*DBLE(n-2) 428 | 429 | C Now sum up the arrays, excluding the first and last 430 | C elements, which are modified using the timing results 431 | C to confuse aggressive optimizers. 432 | 433 | suma = 0.0d0 434 | sumb = 0.0d0 435 | sumc = 0.0d0 436 | !$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc) 437 | DO 110 j = 2,n-1 438 | suma = suma + a(j) 439 | sumb = sumb + b(j) 440 | sumc = sumc + c(j) 441 | 110 CONTINUE 442 | 443 | epsilon = 1.D-6 444 | 445 | IF (ABS(suma-aa)/suma .GT. epsilon) THEN 446 | PRINT *,'Failed Validation on array a()' 447 | PRINT *,'Target Sum of a is = ',aa 448 | PRINT *,'Computed Sum of a is = ',suma 449 | ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN 450 | PRINT *,'Failed Validation on array b()' 451 | PRINT *,'Target Sum of b is = ',bb 452 | PRINT *,'Computed Sum of b is = ',sumb 453 | ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN 454 | PRINT *,'Failed Validation on array c()' 455 | PRINT *,'Target Sum of c is = ',cc 456 | PRINT *,'Computed Sum of c is = ',sumc 457 | ELSE 458 | PRINT *,'Solution Validates!' 459 | ENDIF 460 | 461 | END 462 | 463 | -------------------------------------------------------------------------------- /lib/workloads.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import subprocess 3 | import os 4 | import signal 5 | import time 6 | import psutil 7 | import numpy as np 8 | import shlex 9 | 10 | from lib import utils 11 | from lib.container import Container 12 | from lib import constants 13 | 14 | class Workload: 15 | ''' This class is not meant to be used by itself. It's only purpose 16 | is to provide definitions that are common to all of its children. 17 | ''' 18 | # These variables are defined in child classes 19 | # that inherit from this class. Their definition here is 20 | # just done for clarity. 
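    # How the fields below are used (see get_cmdline()/profile() and
    # server.py):
    #   ideal_mem -- cgroup memory size in MB that keeps the workload
    #                fully in local memory.
    #   min_ratio -- smallest allowed fraction of ideal_mem;
    #                min_mem = min_ratio * ideal_mem.
    #   cpu_req   -- number of CPUs pinned with taskset.
    #   coeff     -- polynomial (highest power first, the layout
    #                np.polyder expects) fitting runtime in seconds as
    #                a function of the local memory ratio; profile()
    #                converts it to milliseconds.
    # Rough example: for Quicksort below, profile(1.0) evaluates the
    # polynomial at ratio 1, i.e. sum(coeff) = -1984.129 + 4548.033
    # - 3588.554 + 1048.644 + 252.997, about 277 seconds with all of
    # its memory local.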
21 | wname = None 22 | ideal_mem = None 23 | min_ratio = None 24 | cpu_req = None 25 | 26 | def __init__(self, idd, pinned_cpus, mem_ratio=1): 27 | 28 | self.idd = idd # a unique uint id for this workload 29 | 30 | # process handling 31 | self.thread = None 32 | self.popen = None 33 | self.stdout = None 34 | self.stderr = None 35 | 36 | # Container creation 37 | self.mem_ratio = mem_ratio 38 | self.container = Container(self.get_name(), self.ideal_mem, self.mem_ratio) 39 | self.container.create() 40 | 41 | # Pin CPUs 42 | self.pinned_cpus = pinned_cpus 43 | 44 | # Get shell command 45 | procs_path = self.container.get_procs_path() 46 | self.cmdline = self.get_cmdline(procs_path, pinned_cpus) 47 | 48 | # task timings 49 | self.ts_start = 0 50 | self.ts_finish = 0 51 | 52 | # Getting gradient coeffs ready 53 | self.percent = 0 54 | self.ratio = 1 55 | self.get_gradient() 56 | 57 | def __exec(self): 58 | " execute in self.thread " 59 | print(self.cmdline) 60 | 61 | self.ts_start = time.time() 62 | self.popen = subprocess.Popen(self.cmdline, stdout=subprocess.PIPE, 63 | stderr=subprocess.PIPE, shell=True) 64 | self.stdout, self.stderr = self.popen.communicate() # blocks process exit 65 | assert(self.popen.returncode == 0) 66 | self.ts_finish = time.time() 67 | 68 | self.container.delete() 69 | 70 | def start(self): 71 | self.thread = threading.Thread(target=self.__exec) 72 | self.thread.start() 73 | 74 | while not self.is_alive(): 75 | pass 76 | 77 | def modify_ratio(self, new_ratio): 78 | self.container.set_new_size(new_ratio) 79 | 80 | def get_name(self): 81 | return self.wname + str(self.idd) 82 | 83 | def get_retcode(self): 84 | return self.popen.returncode 85 | 86 | def is_alive(self): 87 | return self.thread.is_alive() and self.popen 88 | 89 | def get_process_duration(self): 90 | return self.ts_finish - self.ts_start 91 | 92 | def get_usr_bin_time(self): 93 | ''' Parse the output of /usr/bin/time from stderr''' 94 | parser = utils.BinTimeParser() 95 | return parser.parse(self.stderr.decode('utf-8')) 96 | 97 | def kill(self): 98 | pg_id = os.getpgid(self.popen.pid) 99 | os.killpg(pg_id, signal.SIGKILL) 100 | self.thread.join() 101 | 102 | def set_min_ratio(self, new_min_ratio): 103 | self.min_ratio = new_min_ratio 104 | self.min_mem = self.min_ratio * self.ideal_mem 105 | 106 | def update(self, el_time, new_ratio, new_idd=None): # ratio = 0 is no remote memory mode 107 | assert el_time >= 0 108 | 109 | if (new_idd is not None) and self.idd == new_idd: 110 | assert self.percent == 0 111 | else: 112 | self.update_percent(el_time) 113 | self.ratio = new_ratio 114 | self.modify_ratio(new_ratio) 115 | 116 | def update_percent(self, el_time): 117 | self.percent = self.percent + el_time/self.profile(self.ratio) 118 | 119 | def profile(self,ratio): 120 | return self.compute_ratio_from_coeff(self.coeff, ratio)*1000 # from second to millisecond 121 | 122 | def get_gradient(self): 123 | tmp_coeff = self.coeff + [0] 124 | self.gd_coeff = np.polyder(self.coeff) 125 | self.mem_gd_coeff = np.polyder(tmp_coeff) 126 | 127 | def gradient(self, ratio): 128 | return self.compute_ratio_from_coeff(self.gd_coeff, ratio) 129 | 130 | def mem_gradient(self,ratio): 131 | return self.compute_ratio_from_coeff(self.mem_gd_coeff, ratio) 132 | 133 | def compute_ratio_from_coeff(self, coeffs, ratio): 134 | p = 0 135 | order = len(coeffs) 136 | for i in range(order): 137 | p += coeffs[i] * ratio**(order-1-i) 138 | return p 139 | 140 | def get_pids(self): 141 | return self.container.get_pids() 142 | 143 | class 
Quicksort(Workload): 144 | wname = "quicksort" 145 | ideal_mem = 8250 146 | min_ratio = 0.65 147 | min_mem = int(min_ratio * ideal_mem) 148 | binary_name = "quicksort" 149 | cpu_req = 1 150 | coeff = [-1984.129, 4548.033, -3588.554, 1048.644, 252.997] 151 | 152 | def get_cmdline(self, procs_path, pinned_cpus): 153 | prefix = "echo $$ > {} &&".format(procs_path) 154 | arg = '8192' 155 | shell_cmd = '/usr/bin/time -v' + ' ' + constants.WORK_DIR + '/quicksort/quicksort {}'.format(arg) 156 | pinned_cpus_string = ','.join(map(str, pinned_cpus)) 157 | set_cpu = 'taskset -c {}'.format(pinned_cpus_string) 158 | full_command = ' '.join((prefix, 'exec', set_cpu, shell_cmd)) 159 | return full_command 160 | 161 | 162 | class Linpack(Workload): 163 | wname = "linpack" 164 | ideal_mem = 1600 165 | min_ratio = 0.9 166 | min_mem = int(min_ratio * ideal_mem) 167 | binary_name = "xlinpack_xeon64" 168 | cpu_req = 4 169 | coeff = [38.52, -77.88, 26.86, 36.70] 170 | 171 | def get_cmdline(self, procs_path, pinned_cpus): 172 | linpack_dir = constants.WORK_DIR + '/linpack' 173 | prefix = "echo $$ > {} &&".format(procs_path) 174 | set_vars = ' '.join(('MKL_NUM_THREADS=4', 175 | 'OMP_NUM_THREADS=4', 176 | 'MKL_DOMAIN_NUM_THREADS=4')) 177 | 178 | pinned_cpus_string = ','.join(map(str, pinned_cpus)) 179 | set_cpu = 'taskset -c {}'.format(pinned_cpus_string) 180 | 181 | set_vars = ' '.join(('KMP_AFFINITY=nowarnings,compact,1,0,granularity=fine', 182 | set_vars)) 183 | 184 | bin_path = '{}/xlinpack_xeon64'.format(linpack_dir) 185 | cmdline = '{}/lininput_xeon64'.format(linpack_dir) 186 | after_exec = ' '.join(('/usr/bin/time -v', bin_path, cmdline)) 187 | full_command = ' '.join((prefix, set_vars, 'exec', set_cpu, after_exec)) 188 | return full_command 189 | 190 | 191 | class Tfinception(Workload): 192 | wname = "tf-inception" 193 | ideal_mem = 2120 194 | min_ratio = 0.9 195 | min_mem = int(min_ratio * ideal_mem) 196 | binary_name = "python3" 197 | cpu_req = 2 198 | coeff = [-1617.416, 3789.953, -2993.734, 1225.477] 199 | 200 | def get_cmdline(self, procs_path, pinned_cpus): 201 | work_dir = ''.join((constants.WORK_DIR, 202 | '/tensorflow/benchmarks/scripts/tf_cnn_benchmarks')) 203 | 204 | pinned_cpus_string = ','.join(map(str, pinned_cpus)) 205 | set_cpu = 'taskset -c {}'.format(pinned_cpus_string) 206 | 207 | cd_dir = ' '.join(('cd', work_dir, '&&')) 208 | prefix = "echo $$ > {} &&".format(procs_path) 209 | set_vars = ' '.join(('KMP_BLOCK_TIME=0', 210 | 'KMP_SETTINGS=1 OMP_NUM_THREADS=2')) 211 | 212 | set_vars = ' '.join(('KMP_AFFINITY=granularity=fine,verbose,compact,1,0', 213 | set_vars)) 214 | 215 | shell_cmd = ' '.join(("/usr/bin/time -v python3 tf_cnn_benchmarks.py", 216 | "--forward_only=True --data_format=NHWC --device=cpu", 217 | "--batch_size=64 --num_inter_threads=1", 218 | "--num_intra_threads=2 --nodistortions", 219 | "--model=inception3", 220 | "--kmp_blocktime=0 --num_batches=20", 221 | "--num_warmup_batches 0")) 222 | full_command = ' '.join((cd_dir, prefix, set_vars, 'exec', set_cpu, shell_cmd)) 223 | return full_command 224 | 225 | 226 | class Tfresnet(Workload): 227 | wname = "tf-resnet" 228 | ideal_mem = 1268 229 | min_ratio = 0.9 230 | min_mem = int(min_ratio * ideal_mem) 231 | binary_name = "python3" 232 | cpu_req = 2 233 | coeff = [-1617.416, 3789.953, -2993.734, 1225.477] 234 | 235 | def get_cmdline(self, procs_path, pinned_cpus): 236 | work_dir = ''.join((constants.WORK_DIR, 237 | '/tensorflow/benchmarks/scripts/tf_cnn_benchmarks')) 238 | 239 | pinned_cpus_string = ','.join(map(str, 
pinned_cpus))
240 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
241 | 
242 |         cd_dir = ' '.join(('cd', work_dir, '&&'))
243 |         prefix = "echo $$ > {} &&".format(procs_path)
244 |         set_vars = ' '.join(('KMP_BLOCK_TIME=0',
245 |                              'KMP_SETTINGS=1 OMP_NUM_THREADS=2'))
246 | 
247 |         set_vars = ' '.join(('KMP_AFFINITY=granularity=fine,verbose,compact,1,0',
248 |                              set_vars))
249 | 
250 |         shell_cmd = ' '.join(("/usr/bin/time -v python3 tf_cnn_benchmarks.py",
251 |                               "--forward_only=True --data_format=NHWC --device=cpu",
252 |                               "--batch_size=64 --num_inter_threads=1",
253 |                               "--num_intra_threads=2 --nodistortions",
254 |                               "--model=resnet50",
255 |                               "--kmp_blocktime=0 --num_batches=20",
256 |                               "--num_warmup_batches 0"))
257 |         full_command = ' '.join((cd_dir, prefix, set_vars, 'exec', set_cpu, shell_cmd))
258 |         return full_command
259 | 
260 | 
261 | class Kmeans(Workload):
262 |     wname = "kmeans"
263 |     ideal_mem = 4847
264 |     binary_name = "python3"
265 |     min_ratio = 0.75
266 |     min_mem = int(min_ratio * ideal_mem)
267 |     cpu_req = 1
268 |     coeff = [-10341.875, 31554.403, -34346.894, 15214.428, -1730.533]
269 | 
270 |     def get_cmdline(self, procs_path, pinned_cpus):
271 |         prefix = "echo $$ > {} && OMP_NUM_THREADS={}".format(procs_path, self.cpu_req)
272 |         bin_path = constants.WORK_DIR + '/kmeans/kmeans.py'
273 |         shell_cmd = '/usr/bin/time -v python3' + ' ' + bin_path
274 | 
275 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
276 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
277 | 
278 |         full_command = ' '.join((prefix, 'exec', set_cpu, shell_cmd))
279 | 
280 |         return full_command
281 | 
282 | 
283 | class Spark(Workload):
284 |     wname = "spark"
285 |     ideal_mem = 4400
286 |     min_ratio = 0.75
287 |     min_mem = int(min_ratio * ideal_mem)
288 |     binary_name = "java"
289 |     cpu_req = 3
290 |     coeff = [4689.05, -10841.59, 7709.92, -1486.13]
291 | 
292 |     def get_cmdline(self, procs_path, pinned_cpus):
293 |         target_dir = ''.join((constants.WORK_DIR, '/spark/pagerank'))
294 |         cd_dir = ' '.join(('cd', target_dir, '&&'))
295 |         prefix = 'echo $$ > {} &&'.format(procs_path)
296 | 
297 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
298 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
299 | 
300 |         shell_cmd = ' '.join(('/usr/bin/time -v',
301 |                               constants.SPARK_HOME + 'bin/spark-submit',
302 |                               '--driver-memory 10g',
303 |                               '--class \"pagerank\"',
304 |                               '--master local[2]',
305 |                               'target/scala-2.11/pagerank_2.11-1.0.jar'))
306 |         full_command = ' '.join((cd_dir, prefix, 'exec', set_cpu, shell_cmd))
307 |         return full_command
308 | 
309 | class Memaslap(Workload):
310 |     wname = "memaslap"
311 |     ideal_mem = 12288
312 |     min_ratio = 0.5
313 |     min_mem = int(min_ratio * ideal_mem)
314 |     binary_name = "memcached"
315 |     port_number = 11211
316 |     cpu_req = 2
317 |     coeff = [-11626.894, 32733.914, -31797.375, 11484.578, 113.33]
318 | 
319 |     def __init__(self, idd, pinned_cpus, mem_ratio=1):
320 |         super().__init__(idd, pinned_cpus, mem_ratio)
321 |         self.port_number = Memaslap.port_number
322 |         self.memaslap_pids = set()
323 |         Memaslap.port_number += 1
324 | 
325 |     def get_cmdline(self, procs_path, pinned_cpus):
326 |         prefix = 'echo $$ > {} &&'
327 |         memcached_serv = "/usr/bin/time -v memcached -l localhost -p {} -m {} -t 1".format(self.port_number,
328 |                                                                                            self.ideal_mem)
329 |         cpu_list = list(pinned_cpus)
330 |         taskset_serv = 'taskset -c {}'.format(cpu_list[0])
331 |         memcached_serv = ' '.join((prefix, 'exec', taskset_serv, memcached_serv))
332 |         memcached_serv = memcached_serv.format(procs_path)
333 | 
334 |         taskset_memaslap = 'taskset -c {}'.format(cpu_list[1])
335 |         memaslap_fill = taskset_memaslap + ' ' + "memaslap -s localhost:{} -T 1 -F {} --execute_number 30000000"
336 |         memaslap_fill = memaslap_fill.format(self.port_number, "memaslap/memaslap_fill")
337 | 
338 |         memaslap_query = taskset_memaslap + ' ' + "memaslap -s localhost:{} -T 1 -F {} --execute_number 100000000"
339 |         memaslap_query = memaslap_query.format(self.port_number, "memaslap/memaslap_etc")
340 |         sleep = 'sleep 5'
341 |         memaslap_cmd = ' && '.join((memaslap_fill, sleep, memaslap_query))  # unused: __exec() runs the two phases itself
342 |         return (memcached_serv, memaslap_fill, memaslap_query)
343 | 
344 |     def start(self):
345 |         self.thread = threading.Thread(target=self.__exec)
346 |         self.thread.start()
347 | 
348 |         while not self.is_alive():
349 |             pass
350 | 
351 |     def __exec(self):
352 |         memcached, memaslap_fill, memaslap_query = self.cmdline
353 | 
354 |         " execute in self.thread "
355 |         print(self.cmdline)
356 | 
357 |         self.ts_start = time.time()
358 | 
359 |         self.popen = subprocess.Popen(memcached, stdout=subprocess.PIPE,
360 |                                       stderr=subprocess.PIPE, shell=True,
361 |                                       preexec_fn=os.setsid)
362 | 
363 |         time.sleep(3)  # Wait for memcached to boot
364 |         memaslap_proc = subprocess.Popen(shlex.split(memaslap_fill), stdout=subprocess.PIPE,
365 |                                          stderr=subprocess.PIPE, shell=False)
366 |         self.memaslap_pids.add(memaslap_proc.pid)
367 |         stdout, stderr = memaslap_proc.communicate()
368 |         self.memaslap_pids.remove(memaslap_proc.pid)
369 | 
370 |         time.sleep(5)
371 |         memaslap_proc = subprocess.Popen(shlex.split(memaslap_query), stdout=subprocess.PIPE,
372 |                                          stderr=subprocess.PIPE, shell=False)
373 |         self.memaslap_pids.add(memaslap_proc.pid)
374 |         stdout, stderr = memaslap_proc.communicate()
375 |         self.memaslap_pids.remove(memaslap_proc.pid)
376 | 
377 |         print(stdout.decode('utf-8'))
378 |         print(stderr.decode('utf-8'))
379 | 
380 |         os.killpg(os.getpgid(self.popen.pid), signal.SIGINT)
381 | 
382 |         self.stdout, self.stderr = self.popen.communicate()
383 |         self.ts_finish = time.time()
384 |         print(self.stdout.decode('utf-8'))
385 |         print(self.stderr.decode('utf-8'))
386 | 
387 |         self.container.delete()
388 | 
389 |     def get_pids(self):
390 |         pids = list(self.container.get_pids())
391 |         pids.extend(self.memaslap_pids)
392 |         return pids
393 | 
394 | class Stream(Workload):
395 |     wname = "stream"
396 |     ideal_mem = 4150
397 |     min_ratio = 0.50
398 |     min_mem = int(min_ratio * ideal_mem)
399 |     binary_name = "stream_c.exe"
400 |     cpu_req = 1
401 |     coeff = [0]
402 | 
403 |     def get_cmdline(self, procs_path, pinned_cpus):
404 |         target_dir = ''.join((constants.WORK_DIR, '/stream'))
405 |         cd_dir = ' '.join(('cd', target_dir, '&&'))
406 |         prefix = 'echo $$ > {} && OMP_NUM_THREADS={}'.format(procs_path, len(pinned_cpus))
407 | 
408 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
409 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
410 | 
411 |         shell_cmd = 'nice -n -2 /usr/bin/time -v ./stream_c.exe'
412 |         full_command = ' '.join((cd_dir, prefix, 'exec', set_cpu, shell_cmd))
413 |         return full_command
414 | 
415 | def get_workload_class(wname):
416 |     return {'quicksort': Quicksort,
417 |             'linpack': Linpack,
418 |             'tf-inception': Tfinception,
419 |             'tf-resnet': Tfresnet,
420 |             'spark': Spark,
421 |             'kmeans': Kmeans,
422 |             'memaslap': Memaslap,
423 |             'stream': Stream}[wname]
424 |
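# Sketch of how this module is driven (hypothetical standalone example;
# the real call sites are server.py's Machine.execute() and the cgroup
# setup in lib/container.py):
#
#     cls = get_workload_class('quicksort')   # -> Quicksort
#     w = cls(idd=0, pinned_cpus={0})         # creates cgroup + cmdline
#     w.start()                               # runs under /usr/bin/time -v
#     w.thread.join()                         # wait for completion
#     print(w.get_process_duration(), w.get_usr_bin_time())
#
# Shrinking a running workload to 80% of its ideal memory would then be
# w.update(el_time, 0.8), which resizes its cgroup through Container.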
--------------------------------------------------------------------------------
/stream/stream.c:
--------------------------------------------------------------------------------
1 | /*-----------------------------------------------------------------------*/
2 | /* Program: STREAM                                                        */
3 | /* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
4 | /* Original code developed by John D. McCalpin                            */
5 | /* Programmers: John D. McCalpin                                          */
6 | /*              Joe R. Zagar                                              */
7 | /*                                                                        */
8 | /* This program measures memory transfer rates in MB/s for simple         */
9 | /* computational kernels coded in C.                                      */
10 | /*-----------------------------------------------------------------------*/
11 | /* Copyright 1991-2013: John D. McCalpin                                 */
12 | /*-----------------------------------------------------------------------*/
13 | /* License:                                                              */
14 | /*  1. You are free to use this program and/or to redistribute           */
15 | /*     this program.                                                     */
16 | /*  2. You are free to modify this program for your own use,             */
17 | /*     including commercial use, subject to the publication              */
18 | /*     restrictions in item 3.                                           */
19 | /*  3. You are free to publish results obtained from running this        */
20 | /*     program, or from works that you derive from this program,         */
21 | /*     with the following limitations:                                   */
22 | /*     3a. In order to be referred to as "STREAM benchmark results",     */
23 | /*         published results must be in conformance to the STREAM        */
24 | /*         Run Rules, (briefly reviewed below) published at              */
25 | /*         http://www.cs.virginia.edu/stream/ref.html                    */
26 | /*         and incorporated herein by reference.                         */
27 | /*         As the copyright holder, John McCalpin retains the            */
28 | /*         right to determine conformity with the Run Rules.             */
29 | /*     3b. Results based on modified source code or on runs not in       */
30 | /*         accordance with the STREAM Run Rules must be clearly          */
31 | /*         labelled whenever they are published.  Examples of            */
32 | /*         proper labelling include:                                     */
33 | /*           "tuned STREAM benchmark results"                            */
34 | /*           "based on a variant of the STREAM benchmark code"           */
35 | /*         Other comparable, clear, and reasonable labelling is          */
36 | /*         acceptable.                                                   */
37 | /*     3c. Submission of results to the STREAM benchmark web site        */
38 | /*         is encouraged, but not required.                              */
39 | /*  4. Use of this program or creation of derived works based on this    */
40 | /*     program constitutes acceptance of these licensing restrictions.   */
41 | /*  5. Absolutely no warranty is expressed or implied.                   */
42 | /*-----------------------------------------------------------------------*/
43 | # include <stdio.h>
44 | # include <unistd.h>
45 | # include <math.h>
46 | # include <float.h>
47 | # include <limits.h>
48 | # include <sys/time.h>
49 | 
50 | /*-----------------------------------------------------------------------
51 |  * INSTRUCTIONS:
52 |  *
53 |  *	1) STREAM requires different amounts of memory to run on different
54 |  *           systems, depending on both the system cache size(s) and the
55 |  *           granularity of the system timer.
56 |  *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
57 |  *           to meet *both* of the following criteria:
58 |  *       (a) Each array must be at least 4 times the size of the
59 |  *           available cache memory. I don't worry about the difference
60 |  *           between 10^6 and 2^20, so in practice the minimum array size
61 |  *           is about 3.8 times the cache size.
62 |  *           Example 1: One Xeon E3 with 8 MB L3 cache
63 |  *               STREAM_ARRAY_SIZE should be >= 4 million, giving
64 |  *               an array size of 30.5 MB and a total memory requirement
65 |  *               of 91.5 MB.
66 |  *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
67 |  *               STREAM_ARRAY_SIZE should be >= 20 million, giving
68 |  *               an array size of 153 MB and a total memory requirement
69 |  *               of 458 MB.
70 | * (b) The size should be large enough so that the 'timing calibration' 71 | * output by the program is at least 20 clock-ticks. 72 | * Example: most versions of Windows have a 10 millisecond timer 73 | * granularity. 20 "ticks" at 10 ms/tic is 200 milliseconds. 74 | * If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec. 75 | * This means the each array must be at least 1 GB, or 128M elements. 76 | * 77 | * Version 5.10 increases the default array size from 2 million 78 | * elements to 10 million elements in response to the increasing 79 | * size of L3 caches. The new default size is large enough for caches 80 | * up to 20 MB. 81 | * Version 5.10 changes the loop index variables from "register int" 82 | * to "ssize_t", which allows array indices >2^32 (4 billion) 83 | * on properly configured 64-bit systems. Additional compiler options 84 | * (such as "-mcmodel=medium") may be required for large memory runs. 85 | * 86 | * Array size can be set at compile time without modifying the source 87 | * code for the (many) compilers that support preprocessor definitions 88 | * on the compile line. E.g., 89 | * gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M 90 | * will override the default size of 10M with a new size of 100M elements 91 | * per array. 92 | */ 93 | #ifndef STREAM_ARRAY_SIZE 94 | # define STREAM_ARRAY_SIZE 10000000 95 | #endif 96 | 97 | /* 2) STREAM runs each kernel "NTIMES" times and reports the *best* result 98 | * for any iteration after the first, therefore the minimum value 99 | * for NTIMES is 2. 100 | * There are no rules on maximum allowable values for NTIMES, but 101 | * values larger than the default are unlikely to noticeably 102 | * increase the reported performance. 103 | * NTIMES can also be set on the compile line without changing the source 104 | * code using, for example, "-DNTIMES=7". 105 | */ 106 | #ifdef NTIMES 107 | #if NTIMES<=1 108 | # define NTIMES 10 109 | #endif 110 | #endif 111 | #ifndef NTIMES 112 | # define NTIMES 10 113 | #endif 114 | 115 | /* Users are allowed to modify the "OFFSET" variable, which *may* change the 116 | * relative alignment of the arrays (though compilers may change the 117 | * effective offset by making the arrays non-contiguous on some systems). 118 | * Use of non-zero values for OFFSET can be especially helpful if the 119 | * STREAM_ARRAY_SIZE is set to a value close to a large power of 2. 120 | * OFFSET can also be set on the compile line without changing the source 121 | * code using, for example, "-DOFFSET=56". 122 | */ 123 | #ifndef OFFSET 124 | # define OFFSET 0 125 | #endif 126 | 127 | /* 128 | * 3) Compile the code with optimization. Many compilers generate 129 | * unreasonably bad code before the optimizer tightens things up. 130 | * If the results are unreasonably good, on the other hand, the 131 | * optimizer might be too smart for me! 132 | * 133 | * For a simple single-core version, try compiling with: 134 | * cc -O stream.c -o stream 135 | * This is known to work on many, many systems.... 136 | * 137 | * To use multiple cores, you need to tell the compiler to obey the OpenMP 138 | * directives in the code. This varies by compiler, but a common example is 139 | * gcc -O -fopenmp stream.c -o stream_omp 140 | * The environment variable OMP_NUM_THREADS allows runtime control of the 141 | * number of threads/cores used when the resulting "stream_omp" program 142 | * is executed. 
143 | * 144 | * To run with single-precision variables and arithmetic, simply add 145 | * -DSTREAM_TYPE=float 146 | * to the compile line. 147 | * Note that this changes the minimum array sizes required --- see (1) above. 148 | * 149 | * The preprocessor directive "TUNED" does not do much -- it simply causes the 150 | * code to call separate functions to execute each kernel. Trivial versions 151 | * of these functions are provided, but they are *not* tuned -- they just 152 | * provide predefined interfaces to be replaced with tuned code. 153 | * 154 | * 155 | * 4) Optional: Mail the results to mccalpin@cs.virginia.edu 156 | * Be sure to include info that will help me understand: 157 | * a) the computer hardware configuration (e.g., processor model, memory type) 158 | * b) the compiler name/version and compilation flags 159 | * c) any run-time information (such as OMP_NUM_THREADS) 160 | * d) all of the output from the test case. 161 | * 162 | * Thanks! 163 | * 164 | *-----------------------------------------------------------------------*/ 165 | 166 | # define HLINE "-------------------------------------------------------------\n" 167 | 168 | # ifndef MIN 169 | # define MIN(x,y) ((x)<(y)?(x):(y)) 170 | # endif 171 | # ifndef MAX 172 | # define MAX(x,y) ((x)>(y)?(x):(y)) 173 | # endif 174 | 175 | #ifndef STREAM_TYPE 176 | #define STREAM_TYPE double 177 | #endif 178 | 179 | static STREAM_TYPE a[STREAM_ARRAY_SIZE+OFFSET], 180 | b[STREAM_ARRAY_SIZE+OFFSET], 181 | c[STREAM_ARRAY_SIZE+OFFSET]; 182 | 183 | static double avgtime[4] = {0}, maxtime[4] = {0}, 184 | mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX}; 185 | 186 | static char *label[4] = {"Copy: ", "Scale: ", 187 | "Add: ", "Triad: "}; 188 | 189 | static double bytes[4] = { 190 | 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 191 | 2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 192 | 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE, 193 | 3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE 194 | }; 195 | 196 | extern double mysecond(); 197 | extern void checkSTREAMresults(); 198 | #ifdef TUNED 199 | extern void tuned_STREAM_Copy(); 200 | extern void tuned_STREAM_Scale(STREAM_TYPE scalar); 201 | extern void tuned_STREAM_Add(); 202 | extern void tuned_STREAM_Triad(STREAM_TYPE scalar); 203 | #endif 204 | #ifdef _OPENMP 205 | extern int omp_get_num_threads(); 206 | #endif 207 | int 208 | main() 209 | { 210 | int quantum, checktick(); 211 | int BytesPerWord; 212 | int k; 213 | ssize_t j; 214 | STREAM_TYPE scalar; 215 | double t, times[4][NTIMES]; 216 | 217 | /* --- SETUP --- determine precision and check timing --- */ 218 | 219 | printf(HLINE); 220 | printf("STREAM version $Revision: 5.10 $\n"); 221 | printf(HLINE); 222 | BytesPerWord = sizeof(STREAM_TYPE); 223 | printf("This system uses %d bytes per array element.\n", 224 | BytesPerWord); 225 | 226 | printf(HLINE); 227 | #ifdef N 228 | printf("***** WARNING: ******\n"); 229 | printf(" It appears that you set the preprocessor variable N when compiling this code.\n"); 230 | printf(" This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n"); 231 | printf(" Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE); 232 | printf("***** WARNING: ******\n"); 233 | #endif 234 | 235 | printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET); 236 | printf("Memory per array = %.1f MiB (= %.1f GiB).\n", 237 | BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0), 238 | 
BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
239 |     printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
240 | 	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
241 | 	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
242 |     printf("Each kernel will be executed %d times.\n", NTIMES);
243 |     printf(" The *best* time for each kernel (excluding the first iteration)\n");
244 |     printf(" will be used to compute the reported bandwidth.\n");
245 | 
246 | #ifdef _OPENMP
247 |     printf(HLINE);
248 | #pragma omp parallel
249 |     {
250 | #pragma omp master
251 | 	{
252 | 	    k = omp_get_num_threads();
253 | 	    printf ("Number of Threads requested = %i\n",k);
254 |         }
255 |     }
256 | #endif
257 | 
258 | #ifdef _OPENMP
259 | 	k = 0;
260 | #pragma omp parallel
261 | #pragma omp atomic
262 | 		k++;
263 |     printf ("Number of Threads counted = %i\n",k);
264 | #endif
265 | 
266 |     /* Get initial value for system clock. */
267 | #pragma omp parallel for
268 |     for (j=0; j<STREAM_ARRAY_SIZE; j++) {
269 | 	    a[j] = 1.0;
270 | 	    b[j] = 2.0;
271 | 	    c[j] = 0.0;
272 | 	}
273 | 
274 |     printf(HLINE);
275 | 
276 |     if  ( (quantum = checktick()) >= 1)
277 | 	printf("Your clock granularity/precision appears to be "
278 | 	    "%d microseconds.\n", quantum);
279 |     else {
280 | 	printf("Your clock granularity appears to be "
281 | 	    "less than one microsecond.\n");
282 | 	quantum = 1;
283 |     }
284 | 
285 |     t = mysecond();
286 | #pragma omp parallel for
287 |     for (j = 0; j < STREAM_ARRAY_SIZE; j++)
288 | 		a[j] = 2.0E0 * a[j];
289 |     t = 1.0E6 * (mysecond() - t);
290 | 
291 |     printf("Each test below will take on the order"
292 | 	" of %d microseconds.\n", (int) t  );
293 |     printf("   (= %d clock ticks)\n", (int) (t/quantum) );
294 |     printf("Increase the size of the arrays if this shows that\n");
295 |     printf("you are not getting at least 20 clock ticks per test.\n");
296 | 
297 |     printf(HLINE);
298 | 
299 |     printf("WARNING -- The above is only a rough guideline.\n");
300 |     printf("For best results, please be sure you know the\n");
301 |     printf("precision of your system timer.\n");
302 |     printf(HLINE);
303 | 
304 |     /* --- MAIN LOOP --- repeat test cases NTIMES times --- */
305 | 
306 |     scalar = 3.0;
307 |     for (k=0; k<NTIMES; k++)
308 | 	{
309 | 	times[0][k] = mysecond();
310 | #ifdef TUNED
311 |         tuned_STREAM_Copy();
312 | #else
313 | #pragma omp parallel for
314 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
315 | 	    c[j] = a[j];
316 | #endif
317 | 	times[0][k] = mysecond() - times[0][k];
318 | 
319 | 	times[1][k] = mysecond();
320 | #ifdef TUNED
321 |         tuned_STREAM_Scale(scalar);
322 | #else
323 | #pragma omp parallel for
324 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
325 | 	    b[j] = scalar*c[j];
326 | #endif
327 | 	times[1][k] = mysecond() - times[1][k];
328 | 
329 | 	times[2][k] = mysecond();
330 | #ifdef TUNED
331 |         tuned_STREAM_Add();
332 | #else
333 | #pragma omp parallel for
334 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
335 | 	    c[j] = a[j]+b[j];
336 | #endif
337 | 	times[2][k] = mysecond() - times[2][k];
338 | 
339 | 	times[3][k] = mysecond();
340 | #ifdef TUNED
341 |         tuned_STREAM_Triad(scalar);
342 | #else
343 | #pragma omp parallel for
344 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
345 | 	    a[j] = b[j]+scalar*c[j];
346 | #endif
347 | 	times[3][k] = mysecond() - times[3][k];
348 | 	}
349 | 
350 |     /* --- SUMMARY --- */
351 | 
352 |     for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
353 | 	{
354 | 	for (j=0; j<4; j++)
355 | 	    {
356 | 	    avgtime[j] = avgtime[j] + times[j][k];
357 | 	    mintime[j] = MIN(mintime[j], times[j][k]);
358 | 	    maxtime[j] = MAX(maxtime[j], times[j][k]);
359 | 	    }
360 | 	}
361 | 
362 |     printf("Function    Best Rate MB/s  Avg time     Min time     Max time\n");
363 |     for (j=0; j<4; j++) {
364 | 	avgtime[j] = avgtime[j]/(double)(NTIMES-1);
365 | 
366 | 	printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
367 | 	       1.0E-06 * bytes[j]/mintime[j],
368 | 	       avgtime[j],
369 | 	       mintime[j],
370 | 	       maxtime[j]);
371 |     }
372 |     printf(HLINE);
373 | 
374 |     /* --- Check Results --- */
375 |     checkSTREAMresults();
376 |     printf(HLINE);
377 | 
378 |     return 0;
379 | }
380 | 
381 | # define M 20
382 | 
383 | int
384 | checktick()
385 |     {
386 |     int		i, minDelta, Delta;
387 |     double	t1, t2, timesfound[M];
388 | 
389 | /*  Collect a sequence of M unique time values from the system. */
390 | 
391 |     for (i = 0; i < M; i++) {
392 | 	t1 = mysecond();
393 | 	while( ((t2=mysecond()) - t1) < 1.0E-6 )
394 | 	    ;
395 | 	timesfound[i] = t1 = t2;
396 | 	}
397 | 
398 | /*
399 |  * Determine the minimum difference between these M values.
400 |  * This result will be an estimate of the granularity.
401 |  */
402 | 
403 |     minDelta = 1000000;
404 |     for (i = 1; i < M; i++) {
405 | 	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
406 | 	minDelta = MIN(minDelta, MAX(Delta,0));
407 | 	}
408 | 
409 |    return(minDelta);
410 |     }
411 | 
412 | /*
413 |  * A gettimeofday routine to give access to the
414 |  * wall clock timer on most UNIX-like systems.
415 |  */
416 | 
417 | # include <sys/time.h>
418 | 
419 | 
420 | double mysecond()
421 | {
422 |         struct timeval tp;
423 |         struct timezone tzp;
424 |         int i;
425 | 
426 |         i = gettimeofday(&tp,&tzp);
427 |         return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
428 | }
429 | 
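/*
 * NOTE: checkSTREAMresults() below verifies that every element of a[],
 * b[] and c[] holds the value it would have after NTIMES trips through
 * the four kernels.  It replays the same arithmetic on three scalars
 * starting from the initial values (1.0, 2.0, 0.0) and compares the
 * average per-element error against epsilon (1.e-13 for 8-byte
 * STREAM_TYPE, 1.e-6 for 4-byte).  abs() is (re)defined as a macro
 * first because the C library abs() takes and returns int.
 */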
430 | #ifndef abs
431 | #define abs(a) ((a) >= 0 ? (a) : -(a))
432 | #endif
433 | void checkSTREAMresults ()
434 | {
435 | 	STREAM_TYPE aj,bj,cj,scalar;
436 | 	STREAM_TYPE aSumErr,bSumErr,cSumErr;
437 | 	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
438 | 	double epsilon;
439 | 	ssize_t	j;
440 | 	int	k,ierr,err;
441 | 
442 |     /* reproduce initialization */
443 | 	aj = 1.0;
444 | 	bj = 2.0;
445 | 	cj = 0.0;
446 |     /* a[] is modified during timing check */
447 | 	aj = 2.0E0 * aj;
448 |     /* now execute timing loop */
449 | 	scalar = 3.0;
450 | 	for (k=0; k<NTIMES; k++)
451 |         {
452 |             cj = aj;
453 |             bj = scalar*cj;
454 |             cj = aj+bj;
455 |             aj = bj+scalar*cj;
456 |         }
457 | 
458 |     /* accumulate deltas between observed and expected results */
459 | 	aSumErr = 0.0;
460 | 	bSumErr = 0.0;
461 | 	cSumErr = 0.0;
462 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
463 | 		aSumErr += abs(a[j] - aj);
464 | 		bSumErr += abs(b[j] - bj);
465 | 		cSumErr += abs(c[j] - cj);
466 | 	}
467 | 	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
468 | 	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
469 | 	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
470 | 
471 | 	if (sizeof(STREAM_TYPE) == 4) {
472 | 		epsilon = 1.e-6;
473 | 	}
474 | 	else if (sizeof(STREAM_TYPE) == 8) {
475 | 		epsilon = 1.e-13;
476 | 	}
477 | 	else {
478 | 		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
479 | 		epsilon = 1.e-6;
480 | 	}
481 | 
482 | 	err = 0;
483 | 
484 | 	if (abs(aAvgErr/aj) > epsilon) {
485 | 		err++;
486 | 		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
487 | 		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
488 | 		ierr = 0;
489 | 		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
490 | 			if (abs(a[j]/aj-1.0) > epsilon) {
491 | 				ierr++;
492 | #ifdef VERBOSE
493 | 				if (ierr < 10) {
494 | 					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
495 | 						j,aj,a[j],abs((aj-a[j])/aAvgErr));
496 | 				}
497 | #endif
498 | 			}
499 | 		}
500 | 		printf("     For array a[], %d errors were found.\n",ierr);
501 | 	}
502 | 	if (abs(bAvgErr/bj) > epsilon) {
503 | 		err++;
504 | 		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
505 | 		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
506 | 		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
507 | 		ierr = 0;
508 | 		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
509 | 			if (abs(b[j]/bj-1.0) > epsilon) {
510 | 				ierr++;
511 | #ifdef VERBOSE
512 | 				if (ierr < 10) {
513 | 					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
514 | 						j,bj,b[j],abs((bj-b[j])/bAvgErr));
515 | 				}
516 | #endif
517 | 			}
518 | 		}
519 | 		printf("     For array b[], %d errors were found.\n",ierr);
520 | 	}
521 | 	if (abs(cAvgErr/cj) > epsilon) {
522 | 		err++;
523 | 		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
524 | 		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
525 | 		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
526 | 		ierr = 0;
527 | 		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
528 | 			if (abs(c[j]/cj-1.0) > epsilon) {
529 | 				ierr++;
530 | #ifdef VERBOSE
531 | 				if (ierr < 10) {
532 | 					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
533 | 						j,cj,c[j],abs((cj-c[j])/cAvgErr));
534 | 				}
535 | #endif
536 | 			}
537 | 		}
538 | 		printf("     For array c[], %d errors were found.\n",ierr);
539 | 	}
540 | 	if (err == 0) {
541 | 		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
542 | 	}
543 | #ifdef VERBOSE
544 | 	printf ("Results Validation Verbose Results: \n");
545 | 	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
546 | 	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
547 | 	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
548 | #endif
549 | }
550 | 
551 | #ifdef TUNED
552 | /* stubs for "tuned" versions of the kernels */
553 | void tuned_STREAM_Copy()
554 | {
555 | 	ssize_t j;
556 | #pragma omp parallel for
557 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
558 |             c[j] = a[j];
559 | }
560 | 
561 | void tuned_STREAM_Scale(STREAM_TYPE scalar)
562 | {
563 | 	ssize_t j;
564 | #pragma omp parallel for
565 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
566 | 	    b[j] = scalar*c[j];
567 | }
568 | 
569 | void tuned_STREAM_Add()
570 | {
571 | 	ssize_t j;
572 | #pragma omp parallel for
573 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
574 | 	    c[j] = a[j]+b[j];
575 | }
576 | 
577 | void tuned_STREAM_Triad(STREAM_TYPE scalar)
578 | {
579 | 	ssize_t j;
580 | #pragma omp parallel for
581 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
582 | 	    a[j] = b[j]+scalar*c[j];
583 | }
584 | /* end of stubs for the "tuned" versions of the kernels */
585 | #endif
586 | 
--------------------------------------------------------------------------------
/scheduler.py:
--------------------------------------------------------------------------------
86 |         if start_burst > 0:
87 |             for idx in range(start_burst):
88 |                 schedule[idx].ts_arrival = 0
89 |         return schedule
90 | 
91 |     def update_resources(self):
92 |         for s in self.servers:
93 |             s.get_resources()
94 |
95 |     def find_server_fits(self, workload):
96 |         if not self.servers:
97 |             return None
98 | 
99 |         # first try to fit the workload normally
100 |         for s in self.servers:
101 |             if s.fits_normally(workload):
102 |                 return s
103 | 
104 |         # normal placement didn't work, are we using remote memory?
105 |         if not self.remotemem:
106 |             return None
107 | 
108 |         # we are using remote memory. for every server, check if we
109 |         # can fit it using remote mem
110 |         total_far_mem = sum(max(0, ss.alloc_mem - ss.total_mem) for ss in self.servers)
111 |         for s in self.servers:
112 |             if s.fits_remotemem(workload, self.max_far_mem, total_far_mem):
113 |                 return s
114 | 
115 |         return None
116 | 
117 |     def start_schedule(self):
118 |         print("Will execute {} tasks.".format(len(self.schedule)))
119 | 
120 |         while self.schedule or self.pending or self.executing:
121 | 
122 |             ''' Update the Server instances with the latest resource information from their
123 |                 server.py counterparts '''
124 |             self.update_resources()
125 | 
126 |             # move workloads from schedule to pendingq when they arrive
127 |             if self.schedule:
128 |                 self.schedule = self.move_to_pending()
129 | 
130 |             # move from pendingq to executing when we place them on a server
131 |             if self.pending:
132 |                 successfully_executed = self.exec_one()
133 |                 if successfully_executed:
134 |                     self.servers = list(self.original_servers)
135 |                     random.shuffle(self.servers)
136 | 
137 |             if not self.pending and self.schedule:
138 |                 pass
139 | 
140 |             # move from executing to finishq when they finish execution
141 |             if self.executing:
142 |                 self.check_finished()
143 | 
144 |             time.sleep(MAIN_LOOP_SLEEP)
145 | 
146 |         return self.finished
147 | 
148 |     def exec_one(self):
149 |         """ try to place one pending workload on a server;
150 |             returns True if a workload was sent for execution. """
151 |         servers = list(self.servers)
152 |         futures = []  # list of tuples (future, workload, server)
153 | 
154 |         def execute_done(future, base_time, workload, executing, server):
155 |             assert future.result().success
156 |             workload.ts_sent = time.time() - base_time
157 |             print("Sent {} to {}".format(workload.get_name(), server.name))
158 |             executing[workload.idd] = workload
159 | 
160 |         for workload in list(self.pending):
161 |             s = self.find_server_fits(workload)
162 |             if s:
163 |                 future = s.execute_future(workload)
164 |                 futures.append((future, workload, s))
165 |                 self.pending.remove(workload)
166 |                 servers.remove(s)
167 |                 future.add_done_callback(functools.partial(execute_done,
168 |                     base_time=self.base_time, workload=workload, executing=self.executing, server=s))
169 |                 return True
170 |         return False
171 | 
172 |     def move_to_pending(self):
173 |         """ returns a new scheduleq with the workloads that couldn't be
174 |             scheduled """
175 |         elapsed = time.time() - self.base_time
176 | 
177 |         new_schedule = []
178 |         for workload in self.schedule:
179 |             if workload.ts_arrival <= elapsed:
180 |                 self.pending.append(workload)
181 |                 print("{} arrived".format(workload.name + str(workload.idd)))
182 |             else:
183 |                 new_schedule.append(workload)
184 | 
185 |         return new_schedule
186 | 
187 |     def check_finished(self):
188 |         for s in self.servers:
189 |             finish_times, start_times = s.get_finished()
190 |             for idd in finish_times.keys():
191 |                 workload = self.executing[idd]
192 |                 workload.ts_start = start_times[idd]
193 |                 workload.ts_finish = finish_times[idd]
194 |                 self.finished.append(workload)
195 |                 del self.executing[idd]
196 | 
197 | 
198 | 
199 | class SchedWorkload:
200 |     def __init__(self, name, idd, cpu_req, mem_req, max_arrival, min_mem):
201 |         self.name = name
202 |         self.idd = idd
203 |         self.cpu_req = cpu_req
204 |         self.mem_req = mem_req
205 |         self.min_mem = min_mem
206 | 
207 |         self.ts_arrival = random.uniform(0, max_arrival)
208 |         self.ts_sent = 0
209 |         self.ts_start = 0
210 |         self.ts_finish = 0
211 | 
212 |     def get_name(self):
213 |         return self.name + str(self.idd)
214 | 
215 |     def get_duration(self):
216 |         return
self.ts_finish - self.ts_start 217 | 218 | def get_jct(self): 219 | return self.ts_finish - self.ts_arrival 220 | 221 | 222 | class Server: 223 | def __init__(self, addr, remotemem, max_cpus, max_mem, 224 | uniform_ratio, variable_ratios, 225 | max_far, optimal): 226 | self.channel = grpc.insecure_channel(addr) 227 | self.stub = protocol_pb2_grpc.SchedulerStub(self.channel) 228 | self.checkin(remotemem, max_cpus, max_mem, uniform_ratio, 229 | variable_ratios, max_far > 0, optimal) 230 | self.addr = addr 231 | 232 | print("connected to server={}".format(self.name)) 233 | 234 | def __del__(self): 235 | self.close() 236 | 237 | def checkin(self, remotemem, max_cpus, max_mem, 238 | uniform_ratio, variable_ratios, 239 | limit_remote_mem, optimal): 240 | """ returns the server name if successful """ 241 | 242 | self.remotemem = remotemem 243 | self.free_cpus = max_cpus 244 | self.total_cpus = max_cpus 245 | self.free_mem = max_mem 246 | self.total_mem = max_mem 247 | self.uniform_ratio = uniform_ratio 248 | self.variable_ratios = variable_ratios 249 | self.uniform_ratio = uniform_ratio 250 | 251 | req = protocol_pb2.CheckinReq(use_remote_mem=remotemem, 252 | max_cpus=max_cpus, 253 | max_mem=max_mem, 254 | uniform_ratio=uniform_ratio, 255 | variable_ratios=variable_ratios, 256 | limit_remote_mem=limit_remote_mem, 257 | optimal=optimal) 258 | reply = self.stub.checkin(req) 259 | if not reply.success: 260 | raise RuntimeError("Not enough memory or cpus") 261 | 262 | self.name = reply.server_name 263 | 264 | 265 | def close(self): 266 | req = protocol_pb2.ShutdownReq() 267 | _ = self.stub.shutdown(req) 268 | self.channel.close() 269 | 270 | def execute_future(self, workload): 271 | """ returns a future of the execution request """ 272 | req = protocol_pb2.ExecuteReq(wname=workload.name, idd=workload.idd) 273 | return self.stub.execute.future(req) 274 | 275 | def get_resources(self): 276 | req = protocol_pb2.GetResourcesReq() 277 | reply = self.stub.get_resources(req) 278 | self.free_cpus = reply.free_cpus 279 | self.alloc_mem = reply.alloc_mem 280 | self.min_mem_sum = reply.min_mem_sum 281 | 282 | def fits_farmem_uniform(self, w, max_far_mem, total_far_mem): 283 | """ assumes everything from fits_remotemem() plus the workload 284 | fits in cpus """ 285 | local_alloc_mem = self.alloc_mem + w.mem_req 286 | local_ratio = min(1, self.total_mem / local_alloc_mem) 287 | if local_ratio < self.uniform_ratio: 288 | return False 289 | 290 | # check if (1 - local_ratio) that makes the incoming job fit results in 291 | # a far memory usage above the max 292 | if max_far_mem > 0: 293 | additional_far_mem = (1 - local_ratio) * w.mem_req 294 | if additional_far_mem + total_far_mem > max_far_mem: 295 | return False 296 | return True 297 | 298 | def fits_farmem_variable(self, w, max_far_mem, total_far_mem): 299 | local_min_mem_sum = self.min_mem_sum + w.min_mem 300 | if local_min_mem_sum > self.total_mem: 301 | return False 302 | 303 | if max_far_mem > 0: 304 | curr_far_mem = max(0, self.alloc_mem - self.total_mem) 305 | if curr_far_mem > 0: 306 | additional_far_mem = w.mem_req 307 | else: 308 | additional_far_mem = max(0, w.mem_req + self.alloc_mem - self.total_mem) 309 | 310 | if total_far_mem + additional_far_mem > max_far_mem: 311 | return False 312 | return True 313 | 314 | 315 | def fits_remotemem(self, w, max_far_mem, total_far_mem): 316 | """ assumes the workload didn't fit normally, try to fit it with 317 | remote memory. 
we only want to determine whether the workload fits,
318 |             but will let the server compute its own ratio (to avoid consistency
319 |             issues).
320 |             total_far_mem is the far memory currently in use, summed across
321 |             all servers. """
322 |         if not self.fits_cpu_remote(w):
323 |             return False
324 | 
325 |         if self.uniform_ratio:
326 |             return self.fits_farmem_uniform(w, max_far_mem, total_far_mem)
327 | 
328 |         # Variable Policy
329 |         return self.fits_farmem_variable(w, max_far_mem, total_far_mem)
330 | 
331 | 
332 |     def fits_normally(self, w):
333 |         free_mem = self.total_mem - self.alloc_mem
334 |         return self.fits_cpu(w) and free_mem >= w.mem_req
335 | 
336 |     def fits_cpu(self, w):
337 |         return self.free_cpus >= w.cpu_req
338 | 
339 |     def fits_cpu_remote(self, w):
340 |         return self.free_cpus - 1 >= w.cpu_req
341 | 
342 |     def get_finished(self):
343 |         req = protocol_pb2.GetFinishedReq()
344 |         finished = self.stub.get_finished(req)
345 |         return (finished.finished_times, finished.start_times)
346 | 
347 |     def get_samples(self):
348 |         req = protocol_pb2.GetSamplesReq()
349 |         samples = self.stub.get_samples(req)
350 |         return samples
351 | 
352 | 
353 | def print_finished_stats(finishq, base_time):
354 |     print("\nfinished {} workloads".format(len(finishq)))
355 |     latest_finish = max(map(lambda w: w.ts_finish, finishq))
356 |     print("makespan={}".format(round(latest_finish, 3)))
357 |     print("\nName,Arrival,Sent,Finish")
358 |     for workload in sorted(finishq, key=lambda w: w.get_name()):
359 |         print("{},{},{},{}".format(workload.get_name(),
360 |                                    round(workload.ts_arrival, 3),
361 |                                    round(workload.ts_sent, 3),
362 |                                    round(workload.ts_finish, 3)))
363 | 
364 | def average_samples_by_time(sample_list):  # Takes in a list of lists
365 |     # '*' unpacks an iterable into multiple args for a function
366 |     tuples_by_time = zip(*sample_list)
367 | 
368 |     # Compute the mean for each time step
369 |     means = map(statistics.mean, tuples_by_time)
370 | 
371 |     return means
372 | 
373 | def sum_samples_by_time(sample_list):  # Takes in a list of lists
374 |     # '*' unpacks an iterable into multiple args for a function
375 |     tuples_by_time = zip(*sample_list)
376 | 
377 |     # Compute the sum for each time step
378 |     sums = map(sum, tuples_by_time)
379 | 
380 |     return sums
381 | 
382 | def combine_samples(servers):
383 |     mem_samples = list()
384 |     cpu_samples = list()
385 |     swap_samples = dict()
386 |     bw_in_samples = dict()
387 |     bw_out_samples = dict()
388 |     bytes_in_samples = list()
389 |     bytes_out_samples = list()
390 |     curr_pages_samples = dict()
391 | 
392 |     # Compose a list of lists
393 |     for s in servers:
394 |         samples = s.get_samples()
395 |         mem_samples.append(samples.mem_util)
396 |         cpu_samples.append(samples.cpu_util)
397 |         swap_samples[s.addr] = samples.swap_util
398 |         bw_in_samples[s.addr] = samples.bw_in
399 |         bw_out_samples[s.addr] = samples.bw_out
400 |         bytes_in_samples.append(samples.bytes_in)
401 |         bytes_out_samples.append(samples.bytes_out)
402 |         curr_pages_samples[s.addr] = samples.curr_pages
403 | 
404 | 
405 |     # Get the maximum run time
406 |     max_len = max(map(len, mem_samples))
407 | 
408 |     # Padding each list so that they're all the same length
409 |     [lst.extend([0]*(max_len - len(lst))) for lst in mem_samples]
410 |     [lst.extend([0]*(max_len - len(lst))) for lst in cpu_samples]
411 | 
412 |     # Averaging the samples at each time step
413 |     mem = average_samples_by_time(mem_samples)
414 |     cpu = average_samples_by_time(cpu_samples)
415 | 
416 |     # Round values
417 |     rounded_mem = map(lambda num: round(num, 3), mem)
418 |     rounded_cpu = map(lambda num: round(num, 3), cpu)
419 |     swap_samples = {s: list(map(lambda num: round(num, 3), lst)) for s, lst in swap_samples.items()}
420 |     bw_out_samples = {s: list(map(lambda num: round(num, 3), lst)) for s, lst in bw_out_samples.items()}
421 |     bw_in_samples = {s: list(map(lambda num: round(num, 3), lst)) for s, lst in bw_in_samples.items()}
422 |     curr_pages_samples = {s: list(lst) for s, lst in curr_pages_samples.items()}
423 | 
424 |     return (rounded_mem, rounded_cpu, bw_in_samples, bw_out_samples,
425 |             swap_samples, bytes_in_samples, bytes_out_samples, curr_pages_samples)
426 | 
427 | def write_samples_to_file(filename, samples):
428 |     mem, cpu, bw_in, bw_out, swap, bytes_in, bytes_out, curr_pages = samples
429 | 
430 |     with open(filename, 'w') as f:
431 |         combined = zip(mem, cpu)
432 |         combined = [{'Mem': m, 'CPU': c}
433 |                     for m, c in combined]
434 |         numbered = dict(enumerate(combined))
435 |         numbered['bytes in'] = sum(bytes_in)
436 |         numbered['bytes out'] = sum(bytes_out)
437 |         numbered['swap samples'] = swap
438 |         numbered['bw out'] = bw_out
439 |         numbered['bw in'] = bw_in
440 |         numbered['curr_pages'] = curr_pages
441 |         f.write(json.dumps(numbered, indent=4))
442 | 
443 | def generate_filename(args):
444 |     cpus = str(args.cpus)
445 |     mem = str(args.mem)
446 |     size = str(args.size)
447 |     if not args.remotemem:
448 |         policy = "nofar"
449 |     elif args.uniform_ratio:
450 |         policy = "uniform"
451 |     elif args.optimal:
452 |         policy = "optimal"
453 |     else:
454 |         policy = "variable"
455 | 
456 |     filename = 'cpus_{}_mem_{}_size_{}'
457 |     filename = filename.format(cpus, mem, size)
458 |     if args.uniform_ratio is not None:
459 |         filename += '_uniform_ratio_{}'.format(args.uniform_ratio)
460 | 
461 |     filename += '_policy_{}'.format(policy)
462 |     cur_time = time.localtime()
463 |     time_string = '_{}-{}-{}:{}:{}:{}'.format(cur_time.tm_year, cur_time.tm_mon,
464 |                                               cur_time.tm_mday, cur_time.tm_hour,
465 |                                               cur_time.tm_min, cur_time.tm_sec)
466 |     filename += time_string + '.json'
467 |     return filename
468 | 
469 | def check_args(args):
470 |     if not args.remotemem:
471 |         assert(not args.uniform_ratio), "uniform_ratio must be used with remote memory"
472 |         assert(not args.variable_ratios), "variable_ratios must be used with remote memory"
473 |         assert(not args.optimal), "optimal must be used with remote memory"
474 |     else:
475 |         # Exactly one of these three must be active
476 |         uniform, variable, optimal = map(bool, (args.uniform_ratio, args.variable_ratios, args.optimal))
477 |         print(uniform, variable, optimal)
478 |         assert sum((uniform, variable, optimal)) == 1,\
479 |             ("You must specify one (and only one) of the following options: "
480 |              "uniform_ratio, variable_ratios, optimal.")
481 | 
482 | 
483 | def main():
484 |     parser = argparse.ArgumentParser()
485 |     parser.add_argument('seed', type=int,
486 |                         help="Used to seed randomization")
487 |     parser.add_argument('servers', type=lambda s: s.split(','),
488 |                         help='comma separated list of servers')
489 |     parser.add_argument('cpus', type=int,
490 |                         help='number of cpus required for each server')
491 |     parser.add_argument('mem', type=int,
492 |                         help='memory required for each server (MB)')
493 |     parser.add_argument('--remotemem', '-r', action='store_true',
494 |                         help='enable remote memory')
495 |     parser.add_argument('--max_far', '-s', type=int, default=0,
496 |                         help='max size of far memory, default=0 (unlimited)')
497 |     parser.add_argument('--size', type=int,
498 |                         help='size of workload (num of tasks) ' \
499 |                              'default=200', default=200)
500 |
parser.add_argument('--workload', type=lambda s: s.split(','), 501 | help='tasks that comprise the workload ' \ 502 | 'default=quicksort,kmeans,memaslap', 503 | default='quicksort,kmeans,memaslap') 504 | parser.add_argument('--ratios', type=lambda s: s.split(':'), 505 | help='ratios of tasks in workload, default=2:1:1', 506 | default="2:1:1") 507 | parser.add_argument('--until', type=int, 508 | help='max arrival time in minutes default=20', 509 | default=20) 510 | parser.add_argument('--uniform_ratio', type=float, 511 | help='Smallest allowable memory ratio', 512 | default=0) 513 | parser.add_argument('--variable_ratios', type= lambda s: s.split(','), 514 | help='Min ratio for each workload', 515 | default=[]) 516 | parser.add_argument('--start_burst', type=int, 517 | help='Number of workloads that arrive immediately', 518 | default=0) 519 | parser.add_argument('--optimal', '-o', action='store_true', 520 | help='Use the optimal algorithm') 521 | 522 | cmdargs = parser.parse_args() 523 | 524 | # Check for options that shouldn't be used together 525 | check_args(cmdargs) 526 | 527 | # Put the workload_ratio values in a dictionary with the corresponding name 528 | if cmdargs.variable_ratios: 529 | assert len(cmdargs.variable_ratios) == len(cmdargs.workload) 530 | variable_ratios = map(float, cmdargs.variable_ratios) 531 | variable_ratios = dict(zip(cmdargs.workload, variable_ratios)) 532 | else: 533 | variable_ratios = dict() 534 | 535 | try: 536 | scheduler = Scheduler(cmdargs, variable_ratios) 537 | finished = scheduler.start_schedule() 538 | filename = generate_filename(cmdargs) 539 | print_finished_stats(finished, scheduler.base_time) 540 | samples = combine_samples(scheduler.servers) 541 | write_samples_to_file(filename, samples) 542 | except KeyboardInterrupt: 543 | for s in scheduler.servers[:]: 544 | del s 545 | 546 | if __name__ == '__main__': 547 | logging.basicConfig() 548 | main() 549 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """Server receives connection from scheduler""" 3 | 4 | from concurrent import futures 5 | import time 6 | import logging 7 | import argparse 8 | import socket 9 | from tensorflow.python.framework import test_util 10 | 11 | import multiprocessing 12 | import psutil 13 | 14 | import grpc 15 | 16 | from protocol import protocol_pb2 17 | from protocol import protocol_pb2_grpc 18 | 19 | from lib import workloads 20 | 21 | import re 22 | 23 | import numpy as np 24 | from scipy.optimize import Bounds, minimize 25 | 26 | MAIN_LOOP_SLEEP = 1 27 | DRIVER_PATH = "/sys/class/infiniband/mlx4_0/ports/1/counters/{}" 28 | MEGABYTE = 1024*1024 29 | CURR_PAGES_PATH = '/sys/kernel/debug/frontswap/curr_pages' 30 | SWAPPINESS_PATH = '/proc/sys/vm/swappiness' 31 | THP_PATH = "/sys/kernel/mm/transparent_hugepage/enabled" 32 | SOMAXCONN_PATH = "/proc/sys/net/core/somaxconn" 33 | SWAPPINESS_THRESHOLD = 60 34 | SWAP_REGEX = re.compile(rb"VmSwap:\s+(\d+)\s+\.*") 35 | 36 | 37 | def eq(x,mems,local_mem): 38 | return np.dot(x, mems) - local_mem 39 | 40 | def eq_grad(x,mems,local_mem): 41 | return mems 42 | 43 | def obj_new(x, ideal_mems, percents, profiles, gradients=None, mem_gradients=None, beta=0): 44 | r1 = 0 45 | r2 = 0 46 | r3 = 0 47 | r4 = 0 48 | for i in range(ideal_mems.shape[0]): 49 | r1 += ideal_mems[i]*(1-percents[i])*(x[i]*profiles[i](x[i]) - profiles[i](1))/1000 50 | r2 += 
ideal_mems[i]*(1-percents[i])*(1-x[i])*profiles[i](x[i])/1000 51 | r3 += ideal_mems[i]*(1-percents[i])*x[i]*profiles[i](x[i])/1000 52 | r4 += ideal_mems[i]*(1-percents[i])*profiles[i](1)/1000 53 | return r1/r2 + beta*r3/r4 54 | 55 | def obj_grad_new(x, ideal_mems, percents, profiles, gradients, mem_gradients, beta=0): 56 | r1 = 0 57 | r2 = 0 58 | r4 = 0 59 | g1 = np.empty(ideal_mems.shape) 60 | g2 = np.empty(ideal_mems.shape) 61 | for i in range(ideal_mems.shape[0]): 62 | r1 += ideal_mems[i]*(1-percents[i])*(x[i]*profiles[i](x[i]) - profiles[i](1))/1000 63 | r2 += ideal_mems[i]*(1-percents[i])*(1-x[i])*profiles[i](x[i])/1000 64 | r4 += ideal_mems[i]*(1-percents[i])*profiles[i](1)/1000 65 | 66 | g1[i] = ideal_mems[i]*(1-percents[i])*mem_gradients[i](x[i]) 67 | g2[i] = ideal_mems[i]*(1-percents[i])*(gradients[i](x[i]) - mem_gradients[i](x[i])) 68 | 69 | grads = np.empty(ideal_mems.shape) 70 | for i in range(ideal_mems.shape[0]): 71 | grads[i] = (g1[i]*r2 - r1*g2[i])/r2**2 + beta*g1[i]/r4 # r3 has the same gradient as r1 72 | return grads 73 | 74 | class Machine: 75 | def __init__(self): 76 | self.total_cpus = 0 # number of cpus this machine can use 77 | self.free_cpus = 0 78 | self.total_mem = 0 # amount of memory this machine can use 79 | self.alloc_mem = 0 80 | self.min_mem_sum = 0 81 | self.cur_ratio = 1 82 | 83 | # how much memory we have placed in this machine. 84 | # can be > total_mem when using remote memory 85 | self.remote_mem = False 86 | self.executing = [] 87 | self.finished = [] 88 | self.running = False 89 | self.shutdown_now = False 90 | self.using_remote_mem = False 91 | 92 | # Sampling 93 | self.cpu_samples = [] 94 | self.mem_samples = [] 95 | self.swap_samples = [] 96 | self.bw_in_samples = [] 97 | self.bw_out_samples = [] 98 | self.bytes_in_samples = 0 99 | self.bytes_out_samples = 0 100 | self.curr_pages = [] 101 | 102 | # Bandwidth state 103 | self.prev_recv = 0 104 | self.prev_sent = 0 105 | 106 | # State for calculating percents 107 | self.last_time = 0 108 | self.slow_downs = {} 109 | for wname in ['quicksort', 'kmeans', 'memaslap', 'linpack', 'spark', 'tf-inception']: 110 | self.slow_downs[wname] = 1 111 | 112 | def checkin(self, max_cpus, max_mem, use_remote, uniform_ratio, variable_ratios, limit_remote_mem, optimal): 113 | """ 114 | the scheduler checks in with these params. 115 | we return whether we have enough resources to do the checkin. 116 | if True, this machine will start executing jobs 117 | """ 118 | machine_cpus = multiprocessing.cpu_count() 119 | machine_mem = psutil.virtual_memory().total / 1024 / 1024 120 | 121 | if max_cpus > machine_cpus or max_mem > machine_mem: 122 | logging.info("Checkin Unsuccessful") 123 | return False 124 | 125 | # the checkin used feasible num. of cpus and mem. 
112 |     def checkin(self, max_cpus, max_mem, use_remote, uniform_ratio, variable_ratios, limit_remote_mem, optimal):
113 |         """
114 |         the scheduler checks in with these params.
115 |         we return whether we have enough resources to do the checkin.
116 |         if True, this machine will start executing jobs
117 |         """
118 |         machine_cpus = multiprocessing.cpu_count()
119 |         machine_mem = psutil.virtual_memory().total / 1024 / 1024
120 | 
121 |         if max_cpus > machine_cpus or max_mem > machine_mem:
122 |             logging.info("Checkin Unsuccessful")
123 |             return False
124 | 
125 |         # the checkin asked for a feasible number of cpus and mem;
126 |         # now initialize the machine's resource accounting
127 |         self.total_mem = max_mem
128 |         self.total_cpus = max_cpus
129 |         self.free_cpus = max_cpus
130 |         self.remote_mem = use_remote
131 |         self.uniform_ratio = uniform_ratio
132 |         self.running = True
133 |         self.variable_ratios = variable_ratios
134 |         self.limit_remote_mem = limit_remote_mem
135 |         self.unpinned_cpus = set(range(self.total_cpus))
136 |         self.cpu_assignments = {c: None for c in self.unpinned_cpus}
137 |         self.base_time = time.time()
138 |         self.reclaimer_cpu = self.total_cpus - 1
139 | 
140 |         self.optimal = optimal
141 | 
142 |         if self.remote_mem:
143 |             try:
144 |                 with open(DRIVER_PATH.format("port_xmit_data")) as tx_file:
145 |                     tx_bytes = int(tx_file.read()) * 4  # InfiniBand data counters are in 4-byte units
146 |             except FileNotFoundError:
147 |                 tx_bytes = 0
148 | 
149 |             try:
150 |                 with open(DRIVER_PATH.format("port_rcv_data")) as recv_file:
151 |                     recv_bytes = int(recv_file.read()) * 4
152 |             except FileNotFoundError:
153 |                 recv_bytes = 0
154 | 
155 |             self.prev_sent = tx_bytes
156 |             self.prev_recv = recv_bytes
157 | 
158 |             logging.info("Initial tx value: {}".format(tx_bytes / MEGABYTE))
159 |             logging.info("Initial recv value: {}".format(recv_bytes / MEGABYTE))
160 | 
161 | 
162 |         #self.check_swappiness()
163 |         self.check_thp()
164 |         self.check_somaxconn()
165 |         self.check_tf_mkl()
166 | 
167 |         logging.info("Checkin Successful")
168 | 
169 |         return True
170 | 
171 |     def check_state(self):
172 |         if self.using_remote_mem:
173 |             if self.alloc_mem <= self.total_mem:
174 |                 self.using_remote_mem = False
175 |                 print("Transitioning to {} cpus".format(self.total_cpus))
176 |         else:
177 |             if self.alloc_mem > self.total_mem:
178 |                 self.using_remote_mem = True
179 |                 print("Transitioning to {} cpus".format(self.total_cpus - 1))
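
check_reclaimer_cpu below moves every process of an offending workload off the reclaimer core using psutil's cpu_affinity getter/setter. The same pattern in isolation (a minimal sketch with a hypothetical core number, Linux-only):

    import psutil

    p = psutil.Process()                 # current process as a stand-in for a workload pid
    reclaimer = 7                        # e.g. total_cpus - 1
    mask = p.cpu_affinity()              # CPUs this process may run on, e.g. [0, ..., 7]
    if reclaimer in mask and len(mask) > 1:
        p.cpu_affinity([c for c in mask if c != reclaimer])
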
180 | 
181 |     def check_reclaimer_cpu(self):  # check if the reclaimer CPU is in use and move any workload off of it
182 |         all_cpus = set(range(self.reclaimer_cpu))  # all CPUs except the reclaimer
183 |         pinnable_cpus = self.unpinned_cpus.intersection(all_cpus)  # only CPUs not currently executing a workload
184 | 
185 |         ''' We're now using far memory but a workload is executing on
186 |         the reclaimer CPU. Need to move it off.'''
187 |         if self.cpu_assignments[self.reclaimer_cpu]:
188 |             workload_on_reclaimer = self.cpu_assignments[self.reclaimer_cpu]
189 |             pids = workload_on_reclaimer.get_pids()  # potentially offending pids
190 |             replacement_cpu = pinnable_cpus.pop()  # get a replacement CPU
191 |             print("Moving {} off of the reclaimer CPU".format(workload_on_reclaimer.get_name()))
192 | 
193 |             ''' Not just the parent, but the children too '''
194 |             for pid in pids:
195 |                 process = psutil.Process(pid)
196 |                 affinity_list = process.cpu_affinity()  # CPUs this pid may run on
197 |                 if self.reclaimer_cpu in affinity_list:
198 |                     print("Moving {} off of the reclaimer CPU and to {}".format(pid, replacement_cpu))
199 |                     new_affinity_list = [cpu for cpu in affinity_list if cpu != self.reclaimer_cpu]
200 |                     new_affinity_list.append(replacement_cpu)
201 |                     process.cpu_affinity(new_affinity_list)
202 | 
203 |             self.cpu_assignments[self.reclaimer_cpu] = None
204 |             self.cpu_assignments[replacement_cpu] = workload_on_reclaimer
205 |             self.unpinned_cpus.remove(replacement_cpu)
206 |             self.unpinned_cpus.add(self.reclaimer_cpu)
207 |             existing_pinned_cpus = set(workload_on_reclaimer.pinned_cpus)
208 |             existing_pinned_cpus.remove(self.reclaimer_cpu)
209 |             existing_pinned_cpus.add(replacement_cpu)
210 |             workload_on_reclaimer.pinned_cpus = existing_pinned_cpus
211 | 
212 |         return pinnable_cpus
213 | 
214 |     def wait_for_swap_to_fall(self):
215 |         start = time.time()
216 |         while True:
217 |             allowed_far = max(0, self.alloc_mem - self.total_mem)
218 |             allowed_far = 1024 if allowed_far == 0 else allowed_far
219 |             far_mem = self.get_swap()
220 |             print("allowed_far={} far_mem={}".format(allowed_far, far_mem))
221 | 
222 |             if far_mem <= allowed_far or far_mem < 32:
223 |                 break
224 | 
225 |             if time.time() - start > 20:
226 |                 print("waited for 20 seconds. let it go")
227 |                 break
228 | 
229 |             print("wait for swap usage to go down")
230 |             time.sleep(0.5)
231 |         end = time.time()
232 |         print('waited for {} s'.format(end - start))
233 |         global total_wait_time
234 |         total_wait_time += end - start
235 | 
236 |     def execute(self, new_workload_name, idd):
237 |         new_workload_class = workloads.get_workload_class(new_workload_name)
238 |         self.alloc_mem += new_workload_class.ideal_mem
239 |         self.check_state()  # update self.using_remote_mem
240 | 
241 |         if self.using_remote_mem:
242 |             pinnable_cpus = self.check_reclaimer_cpu()
243 |         else:
244 |             pinnable_cpus = set(self.unpinned_cpus)
245 | 
246 |         new_workload_cpus = set([pinnable_cpus.pop() for i in range(new_workload_class.cpu_req)])
247 |         self.unpinned_cpus.difference_update(new_workload_cpus)  # remove these cpus from the unpinned set
248 |         new_workload = new_workload_class(idd, new_workload_cpus)
249 | 
250 |         for cpu in new_workload_cpus:
251 |             self.cpu_assignments[cpu] = new_workload
252 | 
253 |         if new_workload_name in self.variable_ratios:
254 |             new_workload.set_min_ratio(self.variable_ratios[new_workload_name])
255 | 
256 |         self.min_mem_sum += new_workload.min_mem
257 |         self.free_cpus -= new_workload_class.cpu_req
258 | 
259 | 
260 |         all_workloads = self.executing + [new_workload]
261 | 
262 |         if self.remote_mem:
263 |             if self.uniform_ratio:
264 |                 self.shrink_all_uniformly(all_workloads)
265 |             elif self.optimal:
266 |                 self.shrink_all_optimally(all_workloads, idd)
267 |                 self.last_time = time.time() * 1000  # to ms
268 |             else:
269 |                 self.shrink_all_proportionally(all_workloads)
270 | 
271 |         else:
272 |             assert self.alloc_mem <= self.total_mem
273 | 
274 |         assert self.free_cpus >= 0
275 | 
276 |         new_workload.start()
277 |         self.executing.append(new_workload)
278 |         print("started {} at {} s".format(new_workload.get_name(), round(new_workload.ts_start - self.base_time, 3)))
mem".format(SWAPPINESS_THRESHOLD) 286 | 287 | assert(self.remote_mem or swappiness == 1),\ 288 | "Swappiness needs to be == 1 when not using remote mem" 289 | 290 | def check_thp(self): 291 | with open(THP_PATH, 'r') as f: 292 | assert('[never]' in f.read()), 'Transparent Hugepage is not disabled' 293 | 294 | def check_somaxconn(self): 295 | with open(SOMAXCONN_PATH, 'r') as f: 296 | assert('65536' == f.read().strip('\n')), 'somaxconn is set to an incorrect value' 297 | 298 | def check_tf_mkl(self): 299 | assert(test_util.IsMklEnabled()), "tensorflow doesn't have mkl enabled" 300 | 301 | def set_cur_ratio(self): 302 | try: 303 | # Ratio > 1 means that we're haven't fully utilized local memory 304 | self.cur_ratio = min(1, self.total_mem / self.alloc_mem) 305 | except ZeroDivisionError: 306 | self.cur_ratio = 1 307 | 308 | def shrink_all_uniformly(self, workloads): 309 | total_ideal_mem = sum([w.ideal_mem for w in workloads]) 310 | try: 311 | local_ratio = min(1, self.total_mem / total_ideal_mem) 312 | except ZeroDivisionError: 313 | local_ratio = 1 314 | 315 | assert local_ratio >= self.uniform_ratio 316 | self.set_cur_ratio() 317 | 318 | for w in workloads: 319 | w.modify_ratio(local_ratio) 320 | 321 | def shrink_all_proportionally(self, workloads): 322 | assert self.min_mem_sum <= self.total_mem 323 | 324 | total_ideal_mem = sum([w.ideal_mem for w in workloads]) 325 | total_min_mem = sum([w.min_mem for w in workloads]) 326 | 327 | memory_pool = total_ideal_mem - total_min_mem 328 | 329 | # Prevent containers from overgrowing 330 | excess_mem = max(0, total_ideal_mem - self.total_mem) 331 | 332 | # Shrink each container 333 | for w in workloads: 334 | try: 335 | share_of_excess = (w.ideal_mem - w.min_mem) / memory_pool * excess_mem 336 | except ZeroDivisionError: 337 | # The pool of memory allowed to be pushed to remote storage is empty 338 | share_of_excess = 0 339 | ratio = (w.ideal_mem - share_of_excess) / w.ideal_mem 340 | w.modify_ratio(ratio) 341 | 342 | def shrink_all_optimally(self, workloads, new_idd=None): 343 | total_ideal_mem = sum([w.ideal_mem for w in workloads]) 344 | total_min_mem = sum([w.min_mem for w in workloads]) 345 | memory_pool = total_ideal_mem - total_min_mem 346 | 347 | excess_mem = max(0, total_ideal_mem - self.total_mem) 348 | 349 | # Shrink each container 350 | init_ratios = [] 351 | for w in workloads: 352 | try: 353 | share_of_excess = (w.ideal_mem - w.min_mem) / memory_pool * excess_mem 354 | except ZeroDivisionError: 355 | # The pool of memory allowed to be pushed to remote storage is empty 356 | share_of_excess = 0 357 | ratio = (w.ideal_mem - share_of_excess) / w.ideal_mem 358 | init_ratios.append(ratio) 359 | 360 | if excess_mem <= 0: 361 | opt_ratios = init_ratios 362 | else: 363 | ratios,_ = self.compute_opt_ratios(workloads,init_ratios, new_idd) 364 | opt_ratios = ratios.tolist() 365 | 366 | if self.last_time == 0: 367 | el_time = 0 368 | else: 369 | el_time = time.time()*1000 - self.last_time 370 | 371 | for w,ratio in zip(workloads,opt_ratios): 372 | w.update(el_time, ratio, new_idd) 373 | 374 | def compute_opt_ratios(self, workloads, init_ratios, new_idd): 375 | el_time = time.time()*1000 - self.last_time 376 | ideal_mems = np.array([w.ideal_mem for w in workloads]) 377 | percents = np.array([(1-(w.idd==new_idd))*min( (w.percent+el_time/w.profile(w.ratio))/self.slow_downs[w.wname], 0.95) for w in workloads]) 378 | profiles = [w.profile for w in workloads] 379 | mem_gradients = [w.mem_gradient for w in workloads] 380 | gradients = [w.gradient for 
342 |     def shrink_all_optimally(self, workloads, new_idd=None):
343 |         total_ideal_mem = sum([w.ideal_mem for w in workloads])
344 |         total_min_mem = sum([w.min_mem for w in workloads])
345 |         memory_pool = total_ideal_mem - total_min_mem
346 | 
347 |         excess_mem = max(0, total_ideal_mem - self.total_mem)
348 | 
349 |         # Use the proportional shrink as the optimizer's starting point
350 |         init_ratios = []
351 |         for w in workloads:
352 |             try:
353 |                 share_of_excess = (w.ideal_mem - w.min_mem) / memory_pool * excess_mem
354 |             except ZeroDivisionError:
355 |                 # The pool of memory allowed to be pushed to remote storage is empty
356 |                 share_of_excess = 0
357 |             ratio = (w.ideal_mem - share_of_excess) / w.ideal_mem
358 |             init_ratios.append(ratio)
359 | 
360 |         if excess_mem <= 0:
361 |             opt_ratios = init_ratios
362 |         else:
363 |             ratios, _ = self.compute_opt_ratios(workloads, init_ratios, new_idd)
364 |             opt_ratios = ratios.tolist()
365 | 
366 |         if self.last_time == 0:
367 |             el_time = 0
368 |         else:
369 |             el_time = time.time()*1000 - self.last_time
370 | 
371 |         for w, ratio in zip(workloads, opt_ratios):
372 |             w.update(el_time, ratio, new_idd)
373 | 
374 |     def compute_opt_ratios(self, workloads, init_ratios, new_idd):
375 |         el_time = time.time()*1000 - self.last_time
376 |         ideal_mems = np.array([w.ideal_mem for w in workloads])
377 |         percents = np.array([(1-(w.idd==new_idd))*min((w.percent+el_time/w.profile(w.ratio))/self.slow_downs[w.wname], 0.95) for w in workloads])
378 |         profiles = [w.profile for w in workloads]
379 |         mem_gradients = [w.mem_gradient for w in workloads]
380 |         gradients = [w.gradient for w in workloads]
381 | 
382 |         x0 = np.array(init_ratios)
383 | 
384 |         eq_cons = {'type': 'eq', 'fun': eq, 'jac': eq_grad, 'args': (ideal_mems, self.total_mem)}
385 |         bounds = Bounds(0.5, 1.0)
386 |         beta = 0
387 |         res = minimize(obj_new, x0, method='SLSQP', jac=obj_grad_new, args=(ideal_mems, percents, profiles, gradients, mem_gradients, beta), constraints=eq_cons, options={'disp': False}, bounds=bounds)
388 |         final_ratios = res.x
389 |         return np.round(final_ratios, 3), res.fun
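
The same SLSQP call can be run standalone as a toy (assumes eq, eq_grad, obj_new and obj_grad_new from the top of this file; the linear profile shapes and all numbers are made up, with gradients and mem_gradients matching the 1/1000 scaling the objective uses):

    import numpy as np
    from scipy.optimize import Bounds, minimize

    a = [2000.0, 4000.0]                                       # profile f(x) = a*(2 - x), f'(x) = -a
    profiles      = [lambda x, a=a_: a*(2 - x) for a_ in a]
    gradients     = [lambda x, a=a_: -a/1000 for a_ in a]                      # f'(x)/1000
    mem_gradients = [lambda x, a=a_: (a*(2 - x) - x*a)/1000 for a_ in a]       # d(x*f(x))/dx / 1000

    ideal_mems = np.array([3000.0, 2000.0])
    percents = np.array([0.0, 0.0])
    total_mem = 4000.0                                         # forces 1000 MB to go remote
    x0 = np.array([0.8, 0.8])                                  # feasible: dot(x0, ideal_mems) == total_mem

    cons = {'type': 'eq', 'fun': eq, 'jac': eq_grad, 'args': (ideal_mems, total_mem)}
    res = minimize(obj_new, x0, method='SLSQP', jac=obj_grad_new,
                   args=(ideal_mems, percents, profiles, gradients, mem_gradients, 0),
                   constraints=cons, bounds=Bounds(0.5, 1.0))
    print(np.round(res.x, 3))                                  # optimized per-workload ratios
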
390 | 
391 |     def check_finished(self):
392 |         new_finished = []
393 |         old_alloc_mem = self.alloc_mem
394 |         for workload in self.executing[:]:
395 |             if not workload.is_alive():
396 |                 finished_string = "{} finished at {} s (duration={})"
397 |                 print(finished_string.format(workload.get_name(),
398 |                       round(workload.ts_finish - self.base_time, 3),
399 |                       workload.get_process_duration()))
400 | 
401 |                 self.unpinned_cpus.update(workload.pinned_cpus)
402 | 
403 |                 for cpu in workload.pinned_cpus:
404 |                     self.cpu_assignments[cpu] = None
405 |                 self.free_cpus += workload.cpu_req
406 |                 self.alloc_mem -= workload.ideal_mem
407 |                 self.min_mem_sum -= workload.min_mem
408 |                 self.executing.remove(workload)
409 |                 new_finished.append(workload)
410 | 
411 |                 # adjust percents
412 |                 el_time = time.time()*1000 - self.last_time
413 |                 final_percent = workload.percent + el_time/workload.profile(workload.ratio)
414 |                 if workload.wname in self.slow_downs:
415 |                     self.slow_downs[workload.wname] = 0.05*final_percent + 0.95*self.slow_downs[workload.wname]
416 |                     logging.info('{} new slow down is {}'.format(workload.wname, self.slow_downs[workload.wname]))
417 |         self.finished.extend(new_finished)
418 | 
419 |         if new_finished:
420 |             print("{} tasks finished".format(len(new_finished)))
421 |             if self.remote_mem:
422 |                 if self.uniform_ratio:
423 |                     self.shrink_all_uniformly(self.executing)
424 |                 elif self.optimal:
425 |                     self.shrink_all_optimally(self.executing, None)
426 |                     self.last_time = time.time()*1000
427 |                 else:
428 |                     self.shrink_all_proportionally(self.executing)
429 | 
430 |             self.check_state()
431 | 
432 |     def clear_finished(self):
433 |         self.finished = []
434 | 
435 |     def get_resources(self):
436 |         return {'free_cpus': self.free_cpus,
437 |                 'alloc_mem': self.alloc_mem,
438 |                 'min_mem_sum': self.min_mem_sum}
439 | 
440 |     def shutdown(self):
441 |         for workload in self.executing:
442 |             print("Terminating {}".format(workload.get_name()))
443 |             workload.kill()
444 |         self.shutdown_now = True
445 |         print("Shutting Down")
446 | 
447 |     def get_swap(self):
448 |         # Get list of pids
449 |         pids = list()
450 |         for workload in self.executing:
451 |             '''Only get pids for things in the container.
452 |             This prevents the memaslap client from being counted with memcached'''
453 |             pids.extend(workload.container.get_pids())
454 | 
455 |         total_swap = 0
456 |         for pid in pids:
457 |             try:
458 |                 path = '/proc/{}/status'.format(pid)
459 |                 with open(path, 'rb', buffering=0) as f:
460 |                     swap = int(SWAP_REGEX.findall(f.read())[0])
461 |                     total_swap += swap
462 |             except Exception:
463 |                 continue
464 |         total_swap = total_swap / 1024  # convert from KB to MB
465 |         return total_swap
466 | 
467 |     def sample(self):
468 |         if self.running:
469 |             cpu = psutil.cpu_percent()
470 |             mem = psutil.virtual_memory()
471 |             swap = self.get_swap()
472 | 
473 |             # get bandwidth measurements
474 |             if self.remote_mem:
475 |                 try:
476 |                     with open(DRIVER_PATH.format("port_xmit_data")) as tx_file:
477 |                         tx_bytes = int(tx_file.read()) * 4
478 |                 except FileNotFoundError:
479 |                     tx_bytes = 0
480 |                 try:
481 |                     with open(DRIVER_PATH.format("port_rcv_data")) as recv_file:
482 |                         recv_bytes = int(recv_file.read()) * 4
483 |                 except FileNotFoundError:
484 |                     recv_bytes = 0
485 | 
486 |                 bw_tx = tx_bytes - self.prev_sent
487 |                 bw_recv = recv_bytes - self.prev_recv
488 | 
489 |                 try:
490 |                     with open(CURR_PAGES_PATH, 'r') as f_curr_pages:
491 |                         curr_pages = int(f_curr_pages.read())
492 |                 except FileNotFoundError:
493 |                     curr_pages = 0
494 | 
495 |             stats = "CPU: {}, Total Mem: {}, Used Mem: {}, Used Swap: {}".format(cpu,
496 |                     mem.total, mem.used, round(swap, 3))
497 | 
498 |             logging.info(stats)
499 | 
500 |             self.cpu_samples.append(cpu)
501 |             self.mem_samples.append(mem.used / mem.total * 100)
502 |             self.swap_samples.append(swap)
503 |             if self.remote_mem:
504 |                 self.bw_in_samples.append(bw_recv)
505 |                 self.bw_out_samples.append(bw_tx)
506 |                 self.bytes_in_samples += bw_recv
507 |                 self.bytes_out_samples += bw_tx
508 |                 self.prev_recv = recv_bytes
509 |                 self.prev_sent = tx_bytes
510 |                 self.curr_pages.append(curr_pages)
511 | 
512 |                 logging.info("bw_tx: {}".format(bw_tx / MEGABYTE))
513 |                 logging.info("bw_recv: {}".format(bw_recv / MEGABYTE))
514 |         else:
515 |             pass  # machine hasn't checked in yet; nothing to sample
516 | 
517 | 
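
The VmSwap parsing in get_swap can be sanity-checked in isolation against a canned buffer (the sample bytes below mimic a /proc/<pid>/status line):

    status = b"Name:\tmemcached\nVmSwap:\t    2048 kB\n"
    print(int(SWAP_REGEX.findall(status)[0]) / 1024)   # -> 2.0 MB
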
518 | class Scheduler(protocol_pb2_grpc.SchedulerServicer):
519 |     def __init__(self, machine, servername):
520 |         self.machine = machine
521 |         self.name = servername
522 | 
523 |     def checkin(self, req, context):
524 |         success = self.machine.checkin(req.max_cpus, req.max_mem,
525 |                                        req.use_remote_mem, req.uniform_ratio,
526 |                                        req.variable_ratios, req.limit_remote_mem, req.optimal)
527 | 
528 |         return protocol_pb2.CheckinReply(server_name=self.name, success=success)
529 | 
530 |     def execute(self, request, context):
531 |         """ executes the request.wname workload.
532 |         if we are using remote memory, computes the new ratio
533 |         that will be required after placing the workload."""
534 |         self.machine.check_finished()
535 |         self.machine.execute(request.wname, request.idd)
536 |         return protocol_pb2.ExecuteReply(success=True)
537 | 
538 |     def get_resources(self, request, context):
539 |         self.machine.check_finished()
540 |         resources = self.machine.get_resources()
541 |         # ** expands the dictionary into named arguments
542 |         reply = protocol_pb2.GetResourcesReply(**resources)
543 |         return reply
544 | 
545 |     def get_finished(self, request, context):
546 |         self.machine.check_finished()
547 |         start_times = {f.idd: f.ts_start - self.machine.base_time
548 |                        for f in self.machine.finished}
549 |         finished_times = {f.idd: f.ts_finish - self.machine.base_time
550 |                           for f in self.machine.finished}
551 |         reply = protocol_pb2.GetFinishedReply(start_times=start_times,
552 |                                               finished_times=finished_times)
553 |         self.machine.clear_finished()
554 |         return reply
555 | 
556 |     def shutdown(self, request, context):
557 |         self.machine.shutdown()
558 |         reply = protocol_pb2.ShutdownReply(success=True)
559 |         return reply
560 | 
561 |     def get_samples(self, request, context):
562 |         reply = protocol_pb2.GetSamplesReply()
563 |         reply.cpu_util.extend(self.machine.cpu_samples)
564 |         reply.mem_util.extend(self.machine.mem_samples)
565 |         reply.swap_util.extend(self.machine.swap_samples)
566 |         reply.curr_pages.extend(self.machine.curr_pages)
567 | 
568 |         bw_in_mb = map(lambda x: x / MEGABYTE, self.machine.bw_in_samples)
569 |         reply.bw_in.extend(bw_in_mb)
570 |         bw_out_mb = map(lambda x: x / MEGABYTE, self.machine.bw_out_samples)
571 |         reply.bw_out.extend(bw_out_mb)
572 | 
573 |         reply.bytes_in = self.machine.bytes_in_samples / MEGABYTE
574 |         reply.bytes_out = self.machine.bytes_out_samples / MEGABYTE
575 |         return reply
576 | 
577 | def serve():
578 |     hostname = socket.gethostname()
579 |     thismachine = Machine()
580 | 
581 |     server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
582 |     scheduler = Scheduler(thismachine, hostname)
583 |     protocol_pb2_grpc.add_SchedulerServicer_to_server(scheduler, server)
584 | 
585 |     server.add_insecure_port('[::]:50051')
586 |     server.start()
587 | 
588 |     total_cpus = multiprocessing.cpu_count()
589 |     total_mem = psutil.virtual_memory().total
590 |     print("server {} waiting for connection, avail cpus={} mem={} MB".format(hostname,
591 |           total_cpus, int(total_mem/(1024*1024))))
592 | 
593 |     try:
594 |         while not thismachine.shutdown_now:
595 |             t0 = time.time()
596 |             thismachine.sample()
597 |             t1 = time.time()
598 |             time.sleep(max(0, MAIN_LOOP_SLEEP - (t1 - t0)))
599 |     except KeyboardInterrupt:
600 |         server.stop(0)
601 | 
602 | if __name__ == '__main__':
603 |     parser = argparse.ArgumentParser()
604 |     parser.add_argument('--log', action='store_true',
605 |                         help='Write out log to file')
606 |     args = parser.parse_args()
607 | 
608 |     if args.log:
609 |         logging.basicConfig(format='%(asctime)s.%(msecs)03d %(message)s', filename='log.txt', level=logging.DEBUG, filemode='w')
610 |     else:
611 |         logging.basicConfig()
612 | 
613 |     total_wait_time = 0
614 |     serve()
615 |     print('total wait time: {} s'.format(total_wait_time))
616 | 
--------------------------------------------------------------------------------
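
For reference, a checkin/execute round-trip against this server looks roughly like the sketch below. The CheckinRequest and ExecuteRequest message names and their field values are assumptions read off the servicer methods above; the authoritative definitions live in protocol/protocol.proto.

    import grpc
    from protocol import protocol_pb2, protocol_pb2_grpc

    channel = grpc.insecure_channel('server-host:50051')   # hypothetical host
    stub = protocol_pb2_grpc.SchedulerStub(channel)

    # Field names mirror what Scheduler.checkin reads off the request
    reply = stub.checkin(protocol_pb2.CheckinRequest(
        max_cpus=8, max_mem=16000, use_remote_mem=True,
        uniform_ratio=0.0, variable_ratios={}, limit_remote_mem=0, optimal=False))
    if reply.success:
        stub.execute(protocol_pb2.ExecuteRequest(wname='quicksort', idd=1))
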