├── linpack
    ├── .gitignore
    ├── xlinpack_xeon64
    ├── lininput_xeon64
    ├── run_cont_perf.sh
    ├── run_cont.sh
    └── runme_xeon64
├── quicksort
    ├── .gitignore
    ├── quicksort.sh
    ├── Makefile
    └── quicksort.cpp
├── spark
    └── pagerank
    │   ├── .gitignore
    │   ├── build.sbt
    │   └── src
    │       └── main
    │           └── scala
    │               └── PageRankExample.scala
├── stream
    ├── stream_c.exe
    ├── Makefile
    ├── mysecond.c
    ├── READ.ME
    ├── stream.f
    └── stream.c
├── setup
    ├── destroy_cgroups.sh
    └── init_bench_cgroups.sh
├── protocol
    ├── gen_protocol.sh
    └── protocol.proto
├── memaslap
    ├── memaslap_fill
    └── memaslap_etc
├── .gitmodules
├── lib
    ├── constants.py
    ├── utils.py
    ├── container.py
    ├── ftracer.py
    └── workloads.py
├── kmeans
    └── kmeans.py
├── tensorflow
    ├── tf-resnet.sh
    └── tf-inception.sh
├── benchmark.py
├── README.md
├── scheduler.py
└── server.py


/linpack/.gitignore:
--------------------------------------------------------------------------------
1 | lin_xeon64.txt
2 | 


--------------------------------------------------------------------------------
/quicksort/.gitignore:
--------------------------------------------------------------------------------
1 | quicksort
2 | 


--------------------------------------------------------------------------------
/spark/pagerank/.gitignore:
--------------------------------------------------------------------------------
1 | project
2 | target
3 | 


--------------------------------------------------------------------------------
/quicksort/quicksort.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | /usr/bin/time -v ./quicksort 2047
4 | 


--------------------------------------------------------------------------------
/stream/stream_c.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clusterfarmem/cfm/HEAD/stream/stream_c.exe


--------------------------------------------------------------------------------
/linpack/xlinpack_xeon64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clusterfarmem/cfm/HEAD/linpack/xlinpack_xeon64


--------------------------------------------------------------------------------
/setup/destroy_cgroups.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | sudo rmdir /cgroup2/benchmarks
4 | sudo umount /cgroup2
5 | 


--------------------------------------------------------------------------------
/protocol/gen_protocol.sh:
--------------------------------------------------------------------------------
1 | cd ..
2 | python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. protocol/protocol.proto
3 | 


--------------------------------------------------------------------------------
/quicksort/Makefile:
--------------------------------------------------------------------------------
1 | quicksort: quicksort.cpp
2 | 	g++ -std=c++11 -O3 -g quicksort.cpp -o quicksort
3 | 
4 | clean:
5 | 	rm quicksort
6 | 


--------------------------------------------------------------------------------
/memaslap/memaslap_fill:
--------------------------------------------------------------------------------
 1 | generated keys
 2 | key
 3 | 16 16 1
 4 | total generated values
 5 | value
 6 | 16 512 0.9
 7 | 1 15 0.1
 8 | cmd
 9 | 0 1
10 | 


--------------------------------------------------------------------------------
/memaslap/memaslap_etc:
--------------------------------------------------------------------------------
 1 | generated keys
 2 | key
 3 | 16 16 1
 4 | total generated values
 5 | value
 6 | 16 512 0.9
 7 | 1 15 0.1
 8 | cmd
 9 | 0 0.05
10 | 1 0.95
11 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tensorflow/benchmarks"]
2 | 	path = tensorflow/benchmarks
3 | 	url = https://github.com/tensorflow/benchmarks.git
4 | 	branch = cnn_tf_v1.14_compatible
5 | 


--------------------------------------------------------------------------------
/lib/constants.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | CGROUP_PATH = "/cgroup2/benchmarks"
4 | PROCS = CGROUP_PATH + '/' + "{}/cgroup.procs"
5 | WORK_DIR = os.getcwd()
6 | TRACING_DIR = '/sys/kernel/debug/tracing/'
7 | SPARK_HOME = '~/spark-2.4.0-bin-hadoop2.7/'
8 | 


--------------------------------------------------------------------------------
/spark/pagerank/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "pagerank"
 2 | 
 3 | version := "1.0"
 4 | 
 5 | scalaVersion := "2.11.12"
 6 | 
 7 | libraryDependencies ++= Seq(
 8 |     "org.apache.spark" %% "spark-sql" % "2.4.0",
 9 |     "org.apache.spark" %% "spark-graphx" % "2.4.0",
10 | )
11 | 


--------------------------------------------------------------------------------
/linpack/lininput_xeon64:
--------------------------------------------------------------------------------
1 | Sample Intel(R) Optimized LINPACK Benchmark data file (lininput_xeon64)
2 | Intel(R) Optimized LINPACK Benchmark data
3 | 1                     # number of tests
4 | 14000 # problem sizes
5 | 14000 # leading dimensions
6 | 1 # times to run a test
7 | 4 # alignment values (in KBytes)
8 | 


--------------------------------------------------------------------------------
/kmeans/kmeans.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.cluster import KMeans
 3 | from sklearn.datasets.samples_generator import make_blobs
 4 | 
 5 | np.random.seed(42)
 6 | 
 7 | samples, labels = make_blobs(n_samples=15000000, centers=10, random_state=0)
 8 | k_means = KMeans(10, precompute_distances=True)
 9 | k_means.fit(samples)
10 | 


--------------------------------------------------------------------------------
/tensorflow/tf-resnet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4 | 
5 | pushd $DIR/benchmarks/scripts/tf_cnn_benchmarks
6 | /usr/bin/time -v python3 tf_cnn_benchmarks.py --forward_only=True --data_format=NHWC --device=cpu --batch_size=64 --num_inter_threads=0 --num_intra_threads=2 --nodistortions --model=resnet50 --kmp_blocktime=0 --num_batches=20 --num_warmup_batches 0
7 | 


--------------------------------------------------------------------------------
/tensorflow/tf-inception.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
4 | 
5 | pushd $DIR/benchmarks/scripts/tf_cnn_benchmarks
6 | /usr/bin/time -v python3 tf_cnn_benchmarks.py --forward_only=True --data_format=NHWC --device=cpu --batch_size=64 --num_inter_threads=0 --num_intra_threads=2 --nodistortions --model=inception3 --kmp_blocktime=0 --num_batches=20 --num_warmup_batches 0
7 | 


--------------------------------------------------------------------------------
/setup/init_bench_cgroups.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | CGROUP_ROOT=/cgroup2
 4 | CGROUP_BENCH=$CGROUP_ROOT/benchmarks
 5 | USER=$(whoami)
 6 | 
 7 | echo "will setup cgroups at $CGROUP_ROOT for user $USER"
 8 | sudo mount -t cgroup2 nodev $CGROUP_ROOT
 9 | sudo sh -c "echo '+memory' > $CGROUP_ROOT/cgroup.subtree_control"
10 | 
11 | sudo mkdir $CGROUP_BENCH
12 | sudo sh -c "echo '+memory' > $CGROUP_BENCH/cgroup.subtree_control"
13 | 
14 | sudo chown $USER -R $CGROUP_ROOT
15 | 
16 | echo "enabling readahead"
17 | sudo sh -c "echo 3 > /proc/sys/vm/page-cluster"
18 | 
19 | echo "done"
20 | 


--------------------------------------------------------------------------------
/linpack/run_cont_perf.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # the workload uses 1.5G of memory
 4 | set -e
 5 | PERF="perf stat -d"
 6 | 
 7 | #echo "running linpack without memory limit"
 8 | #sudo $PERF ./runme_xeon64
 9 | #
10 | #echo "running linpack 1152M"
11 | #../scripts/changemem_cgroup2.sh 1152M
12 | #sudo ../scripts/exec_cgroupv2.sh $PERF ./runme_xeon64
13 | #
14 | #echo "running linpack 768M"
15 | #../scripts/changemem_cgroup2.sh 768M
16 | #sudo ../scripts/exec_cgroupv2.sh $PERF ./runme_xeon64
17 | 
18 | echo "running linpack 384M"
19 | ../scripts/changemem_cgroup2.sh 384M
20 | sudo ../scripts/exec_cgroupv2.sh ./runme_xeon64
21 | 


--------------------------------------------------------------------------------
/linpack/run_cont.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # the workload uses 1.5G of memory
 4 | set -e
 5 | 
 6 | echo "running linpack without memory limit"
 7 | sudo /usr/bin/time -v ./runme_xeon64
 8 | 
 9 | echo "running linpack 1152M"
10 | ../scripts/changemem_cgroup2.sh 1152M
11 | sudo /usr/bin/time -v ../scripts/exec_cgroupv2.sh ./runme_xeon64
12 | 
13 | echo "running linpack 768M"
14 | ../scripts/changemem_cgroup2.sh 768M
15 | sudo /usr/bin/time -v ../scripts/exec_cgroupv2.sh ./runme_xeon64
16 | 
17 | echo "running linpack 384M"
18 | ../scripts/changemem_cgroup2.sh 384M
19 | sudo /usr/bin/time -v ../scripts/exec_cgroupv2.sh ./runme_xeon64
20 | 


--------------------------------------------------------------------------------
/stream/Makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | CFLAGS = -O2 -DSTREAM_ARRAY_SIZE=178955000 -DNTIMES=21 -mcmodel=medium -fopenmp
 3 | 
 4 | FF = gfortran -std=legacy
 5 | FFLAGS = -O2
 6 | 
 7 | all: stream_c.exe
 8 | 
 9 | stream_f.exe: stream.f mysecond.o
10 | 	$(CC) $(CFLAGS) -c mysecond.c
11 | 	$(FF) $(FFLAGS) -c stream.f
12 | 	$(FF) $(FFLAGS) stream.o mysecond.o -o stream_f.exe
13 | 
14 | stream_c.exe: stream.c
15 | 	$(CC) $(CFLAGS) stream.c -o stream_c.exe
16 | 
17 | clean:
18 | 	rm -f stream_f.exe stream_c.exe *.o
19 | 
20 | # an example of a more complex build line for the Intel icc compiler
21 | stream.icc: stream.c
22 | 	icc -O3 -xCORE-AVX2 -ffreestanding -qopenmp -DSTREAM_ARRAY_SIZE=80000000 -DNTIMES=20 stream.c -o stream.omp.AVX2.80M.20x.icc
23 | 


--------------------------------------------------------------------------------
/stream/mysecond.c:
--------------------------------------------------------------------------------
 1 | /* A gettimeofday routine to give access to the wall
 2 |    clock timer on most UNIX-like systems.
 3 | 
 4 |    This version defines two entry points -- with 
 5 |    and without appended underscores, so it *should*
 6 |    automagically link with FORTRAN */
 7 | 
 8 | #include <sys/time.h>
 9 | 
10 | double mysecond()
11 | {
12 | /* struct timeval { long        tv_sec;
13 |             long        tv_usec;        };
14 | 
15 | struct timezone { int   tz_minuteswest;
16 |              int        tz_dsttime;      };     */
17 | 
18 |         struct timeval tp;
19 |         struct timezone tzp;
20 |         int i;
21 | 
22 |         i = gettimeofday(&tp,&tzp);
23 |         return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
24 | }
25 | 
26 | double mysecond_() {return mysecond();}
27 | 
28 | 


--------------------------------------------------------------------------------
/quicksort/quicksort.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <vector>
 3 | #include <algorithm>
 4 | #include <cstdlib>
 5 | #include <chrono>
 6 | 
 7 | const size_t MB = 1024 * 1024;
 8 | using namespace std::chrono;
 9 | 
10 | void die(const char *msg, bool printErrno) {
11 | 	std::cerr << msg << "\n";
12 | 	exit(1);
13 | }
14 | 
15 | void print_time_diff(time_point<high_resolution_clock> start,
16 | 	time_point<high_resolution_clock> end) {
17 | 	auto diff = end - start;
18 | 	std::cout << "time " << duration<double, std::nano> (diff).count() << "\n";
19 | }
20 | 
21 | void print_time_diff_ms(time_point<high_resolution_clock> start,
22 | 	time_point<high_resolution_clock> end) {
23 | 	auto diff = end - start;
24 | 	std::cout << "time " << duration<double, std::milli> (diff).count() << " ms\n";
25 | }
26 | 
27 | int main(int argc, char *argv[]) {
28 | 	if (argc != 2)
29 | 		die("need MB of integers to sort", false);
30 | 
31 | 	long size = std::stoi(argv[1]) * MB;
32 | 	long numInts = size / sizeof(int);
33 | 
34 | 	std::cout << "will sort " << numInts << " integers (" << size / MB << " MB)\n";
35 | 	std::vector<int> v(numInts);
36 | 
37 | 	std::srand(std::time(0));
38 | 	time_point<high_resolution_clock> start, end;
39 | 
40 | 	std::generate(v.begin(), v.end(), std::rand);
41 | 	start = high_resolution_clock::now();
42 | 	std::sort(v.begin(), v.end(), std::greater<int>());
43 | 
44 | 	end = high_resolution_clock::now();
45 | 	print_time_diff_ms(start, end);
46 | 
47 | 	return 0;
48 | }
49 | 


--------------------------------------------------------------------------------
/protocol/protocol.proto:
--------------------------------------------------------------------------------
 1 | syntax = "proto3";
 2 | 
 3 | //option java_multiple_files = true;
 4 | //option java_package = "io.grpc.examples.helloworld";
 5 | //option java_outer_classname = "HelloWorldProto";
 6 | //option objc_class_prefix = "HLW";
 7 | 
 8 | package scheduler;
 9 | 
10 | service Scheduler {
11 |   rpc checkin (CheckinReq) returns (CheckinReply) {}
12 |   rpc execute (ExecuteReq) returns (ExecuteReply) {}
13 |   rpc get_resources (GetResourcesReq) returns (GetResourcesReply) {}
14 |   rpc get_finished (GetFinishedReq) returns (GetFinishedReply) {}
15 |   rpc shutdown (ShutdownReq) returns (ShutdownReply) {}
16 |   rpc get_samples (GetSamplesReq) returns (GetSamplesReply) {}
17 | }
18 | 
19 | message CheckinReq {
20 |   bool use_remote_mem = 1;
21 |   uint32 max_cpus = 2;
22 |   uint32 max_mem = 3;
23 |   float uniform_ratio = 5;
24 |   map<string, float> variable_ratios = 6;
25 |   bool limit_remote_mem = 7;
26 |   bool optimal = 8;
27 | }
28 | 
29 | message CheckinReply {
30 |   string server_name = 1;
31 |   bool success = 2;
32 | }
33 | 
34 | message ExecuteReq {
35 |   string wname = 1;
36 |   uint32 idd = 2;
37 | }
38 | 
39 | message ExecuteReply {
40 |   bool success = 1;
41 | }
42 | 
43 | message GetResourcesReq { }
44 | 
45 | message GetResourcesReply {
46 |   float free_cpus = 1;
47 |   float alloc_mem = 2;
48 |   float min_mem_sum = 3;
49 | }
50 | 
51 | message GetFinishedReq { }
52 | 
53 | message GetFinishedReply {
54 |   map<uint32, float> start_times = 1;
55 |   map<uint32, float> finished_times = 2;
56 | }
57 | 
58 | message ShutdownReq { }
59 | 
60 | message ShutdownReply {
61 |   bool success = 1;
62 | }
63 | 
64 | message GetSamplesReq { }
65 | 
66 | message GetSamplesReply {
67 |   repeated float cpu_util = 1;
68 |   repeated float mem_util = 2;
69 |   repeated float swap_util = 3;
70 |   repeated float bw_in = 4;
71 |   repeated float bw_out = 5;
72 |   repeated uint32 curr_pages = 6;
73 |   float bytes_in = 7;
74 |   float bytes_out = 8;
75 | }
76 | 


--------------------------------------------------------------------------------
/benchmark.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | import argparse
 3 | from lib import workloads
 4 | from lib import utils
 5 | 
 6 | def print_output(workload, args):
 7 |     print(workload.stdout.decode('utf-8'), '\n')
 8 |     print(workload.stderr.decode('utf-8'), '\n')
 9 |     process_duration = workload.get_process_duration()
10 |     print('Python Wall Time: {}'.format(process_duration), '\n')
11 | 
12 |     usr_bin_time = workload.get_usr_bin_time()
13 |     usr_bin_time = sorted(usr_bin_time.items(), key=lambda x:x[0])
14 |     header, values = zip(*usr_bin_time)
15 |     header = ','.join(header)
16 |     values = map(str, values)
17 |     values = ','.join(values)
18 |     print(header, values, sep='\n')
19 | 
20 | def run_benchmark(args):
21 |     workload_class = workloads.get_workload_class(args.name)
22 |     
23 |     # Use user-specified cpus, otherwise use first n cpus
24 |     if args.cpus:
25 |         pinned_cpus = args.cpus
26 |     else:
27 |         pinned_cpus = range(workload_class.cpu_req)
28 | 
29 |     workload = workload_class(args.id, pinned_cpus, args.ratio)
30 | 
31 |     try:
32 |         workload.start()
33 |         workload.thread.join()  # Block until thread is finished
34 |         print_output(workload, args)
35 |     except KeyboardInterrupt:
36 |         workload.kill()
37 | 
38 | 
39 | def main():
40 |     # Parse Command Line Arguments
41 |     workload_choices = ['quicksort', 'linpack', 'tf-inception',
42 |                         'tf-resnet', 'spark', 'kmeans', 'memaslap',
43 |                         'stream']
44 | 
45 |     parser = argparse.ArgumentParser(description='Run a workload in a '
46 |                                      'parameterized container')
47 |     parser.add_argument('name', help="Name of the binary to run",
48 |                         choices=workload_choices)
49 |     parser.add_argument('ratio',
50 |                         help="Ratio of the workload's max memory to use",
51 |                         type=utils.check_ratio)
52 |     parser.add_argument('--id', default=0,
53 |                         help="Workload id used for container name")
54 |     parser.add_argument('--cpus', default=[],type=lambda l:list(map(int, l.split(','))),
55 |                         help="List of cpus to use for workloads that support it")
56 | 
57 |     args = parser.parse_args()
58 |     run_benchmark(args)
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     main()
63 | 


--------------------------------------------------------------------------------
/linpack/runme_xeon64:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #===============================================================================
 3 | # Copyright 2001-2017 Intel Corporation All Rights Reserved.
 4 | #
 5 | # The source code,  information  and material  ("Material") contained  herein is
 6 | # owned by Intel Corporation or its  suppliers or licensors,  and  title to such
 7 | # Material remains with Intel  Corporation or its  suppliers or  licensors.  The
 8 | # Material  contains  proprietary  information  of  Intel or  its suppliers  and
 9 | # licensors.  The Material is protected by  worldwide copyright  laws and treaty
10 | # provisions.  No part  of  the  Material   may  be  used,  copied,  reproduced,
11 | # modified, published,  uploaded, posted, transmitted,  distributed or disclosed
12 | # in any way without Intel's prior express written permission.  No license under
13 | # any patent,  copyright or other  intellectual property rights  in the Material
14 | # is granted to  or  conferred  upon  you,  either   expressly,  by implication,
15 | # inducement,  estoppel  or  otherwise.  Any  license   under such  intellectual
16 | # property rights must be express and approved by Intel in writing.
17 | #
18 | # Unless otherwise agreed by Intel in writing,  you may not remove or alter this
19 | # notice or  any  other  notice   embedded  in  Materials  by  Intel  or Intel's
20 | # suppliers or licensors in any way.
21 | #===============================================================================
22 | 
23 | echo "This is a SAMPLE run script for SMP LINPACK. Change it to reflect"
24 | echo "the correct number of CPUs/threads, problem input files, etc.."
25 | 
26 | # Setting up affinity for better threading performance
27 | export KMP_AFFINITY=nowarnings,compact,1,0,granularity=fine
28 | #export MKL_NUM_THREADS=4
29 | #export OMP_NUM_THREADS=4
30 | #export MKL_DOMAIN_NUM_THREADS=4
31 | 
32 | # Get Directory of Script
33 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
34 | 
35 | # Use numactl for better performance on multi-socket machines.
36 | nnodes=`numactl -H 2>&1 | awk '/available:/ {print $2}'`
37 | cpucores=`cat /proc/cpuinfo | awk '/cpu cores/ {print $4; exit}'`
38 | 
39 | if [  $nnodes -gt 1 -a $cpucores -gt 8 ]
40 | then
41 |     numacmd="numactl --interleave=all"
42 | else
43 |     numacmd=
44 | fi
45 | 
46 | arch=xeon64
47 | {
48 |   date
49 |   /usr/bin/time -v $numacmd $DIR/xlinpack_$arch $DIR/lininput_$arch
50 |   echo -n "Done: "
51 |   date
52 | } | tee $DIR/lin_$arch.txt
53 | 
54 | 


--------------------------------------------------------------------------------
/spark/pagerank/src/main/scala/PageRankExample.scala:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 3 |  * contributor license agreements.  See the NOTICE file distributed with
 4 |  * this work for additional information regarding copyright ownership.
 5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 6 |  * (the "License"); you may not use this file except in compliance with
 7 |  * the License.  You may obtain a copy of the License at
 8 |  *
 9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | // scalastyle:off println
19 | //package org.apache.spark.examples.graphx
20 | 
21 | // $example on$
22 | import org.apache.spark.graphx.GraphLoader
23 | // $example off$
24 | import org.apache.spark.sql.SparkSession
25 | import org.apache.log4j.{Level, Logger}
26 | 
27 | /**
28 |  * A PageRank example on social network dataset
29 |  * Run with
30 |  * {{{
31 |  * bin/run-example graphx.PageRankExample
32 |  * }}}
33 |  */
34 | object pagerank {
35 |   def main(args: Array[String]): Unit = {
36 |     // Creates a SparkSession.
37 |     val spark = SparkSession
38 |       .builder
39 |       .appName(s"${this.getClass.getSimpleName}")
40 |       .getOrCreate()
41 |     val sc = spark.sparkContext
42 | 
43 |     val rootLogger = Logger.getRootLogger()
44 |     rootLogger.setLevel(Level.ERROR)
45 | 
46 |     val home_dir = System.getProperty("user.home")
47 |     val spark_home =  home_dir + "/spark-2.4.0-bin-hadoop2.7/"
48 |     val data_file = spark_home + "data/sosp/web-BerkStan.txt"
49 |     // $example on$
50 |     // Load the edges as a graph
51 |     //val graph = GraphLoader.edgeListFile(sc, "data/berkeley_stanford/web-BerkStan.txt")
52 |     val graph = GraphLoader.edgeListFile(sc, data_file)
53 |     // Run PageRank
54 |     val ranks = graph.pageRank(0.0001).vertices
55 |     // Join the ranks with the usernames
56 |     //val users = sc.textFile("data/graphx/users.txt").map { line =>
57 | /*    val users = sc.textFile("users.txt").map { line =>
58 |       val fields = line.split(",")
59 |       (fields(0).toLong, fields(1))
60 |     }
61 |     val ranksByUsername = users.join(ranks).map {
62 |       case (id, (username, rank)) => (username, rank)
63 |     }
64 |     // Print the result
65 |     println(ranksByUsername.collect().mkString("\n"))
66 | */
67 |     // $example off$
68 |     spark.stop()
69 |   }
70 | }
71 | // scalastyle:on println
72 | 


--------------------------------------------------------------------------------
/lib/utils.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import subprocess
 3 | import re
 4 | import os
 5 | import argparse
 6 | 
 7 | g_sim_start = 0
 8 | 
 9 | 
10 | def get_current_ts():
11 |     global g_sim_start
12 |     curr_ts = int(round(time.time() * 1000))
13 | 
14 |     if g_sim_start == 0:
15 |         g_sim_start = curr_ts
16 |         return 0
17 | 
18 |     return curr_ts - g_sim_start
19 | 
20 | 
21 | def shell_exec(cmdline):
22 |     p = subprocess.Popen(cmdline, stdout=subprocess.PIPE,
23 |                          stderr=subprocess.PIPE, shell=True)
24 |     out, err = p.communicate()
25 |     return (p.returncode, out.decode('utf-8'), err.decode('utf-8'))
26 | 
27 | 
28 | def check_sudo():
29 |     if os.geteuid() != 0:
30 |         raise RuntimeError("Run with sudo.")
31 | 
32 | 
33 | def check_ratio(arg):
34 |     ''' Check the validity of the argument passed for ratio.
35 |         This function is passed to the argument parser.
36 |     '''
37 |     if arg == 'max':
38 |         return 'max'
39 |     else:
40 |         try:
41 |             value = float(arg)
42 |         except ValueError:
43 |             msg = "Value provided for ratio is neither a number or max"
44 |             raise argparse.ArgumentTypeError(msg)
45 |         if (0 < value):
46 |             return value
47 |         else:
48 |             raise argparse.ArgumentTypeError("Ratio value must be > 0")
49 | 
50 | 
51 | class BinTimeParser:
52 |     def __init__(self):
53 |         pass
54 | 
55 |     def parse(self, string):
56 |         header = ','.join(('User Time', 'System Time',
57 |                            'Wall Time', 'Major Page Faults'))
58 |         values = {'User Time': self.get_user_time(string),
59 |                   'System Time': self.get_sys_time(string),
60 |                   'Wall Time': self.get_wall_time(string),
61 |                   'Major Page Faults': self.get_page_faults(string)}
62 |         return values
63 | 
64 |     def get_user_time(self, string):
65 |         regex = re.compile(r"User time \(seconds\): (\d+.\d+)")
66 |         return float(regex.search(string).groups()[0])
67 | 
68 |     def get_sys_time(self, string):
69 |         regex = re.compile(r"System time \(seconds\): (\d+.\d+)")
70 |         return float(regex.search(string).groups()[0])
71 | 
72 |     def get_wall_time(self, string):
73 |         regex = re.compile(r"\(h:mm:ss or m:ss\): (\d*?):*(\d+):(\d+\.\d+)")
74 |         hours, minutes, seconds = regex.search(string).groups()
75 |         hours = float(hours) if hours else 0  # hours may be None
76 |         minutes, seconds = float(minutes), float(seconds)
77 |         return round(hours * 3600 + minutes * 60 + seconds, 3)
78 | 
79 |     def get_page_faults(self, string):
80 |         regex = re.compile(r"Major \(requiring I/O\) page faults: (\d+)")
81 |         return int(regex.search(string).groups()[0])
82 | 


--------------------------------------------------------------------------------
/lib/container.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from lib import utils
 3 | from lib import constants
 4 | 
 5 | 
 6 | class Container:
 7 |     def __init__(self, name, mem_req, ratio):
 8 |         self.name = name
 9 |         self.mem_req = mem_req  # in MB
10 |         self.ratio = ratio
11 | 
12 |     def exists(self):
13 |         """ Returns whether this container still exists """
14 |         return os.path.isdir(self.get_cont_path())
15 | 
16 |     def delete(self):
17 |         path = self.get_cont_path()
18 |         ret = utils.shell_exec("rmdir {0}".format(path))[0]
19 |         if ret:
20 |             raise RuntimeError("Error deleting {}".format(path))
21 | 
22 |     def set_memory_limit(self):
23 |         # this is possible if the caller is multithreaded
24 |         # and hasn't realized the container has been deleted
25 |         if not self.exists():
26 |             return
27 | 
28 |         if self.ratio == 'max':
29 |             memory_limit = 'max'
30 |             print("Setting container memory limit to Max")
31 |         else:
32 |             memory_limit = str(round(self.ratio*self.mem_req)) + 'M'
33 |             print("Setting {} memory limit to "
34 |                   "{}% ({}) of max".format(self.name,
35 |                                            round(self.ratio*100),
36 |                                            memory_limit))
37 | 
38 |         mem_high_path = self.get_cont_path() + '/memory.high'
39 |         with open(mem_high_path, 'w') as f:
40 |             f.write(memory_limit)
41 | 
42 |     def set_new_size(self, local_ratio):
43 |         self.ratio = local_ratio
44 |         self.set_memory_limit()
45 | 
46 |     def get_cont_path(self):
47 |         return "{}/{}".format(constants.CGROUP_PATH, self.name)
48 | 
49 |     def get_procs_path(self):
50 |         return self.get_cont_path() + '/cgroup.procs'
51 | 
52 |     def create(self):
53 |         """creates new container as child of CGROUP_PATH"""
54 |         new_cont_path = self.get_cont_path()
55 |         try:
56 |             os.mkdir(new_cont_path)
57 |             assert self.exists()
58 |         except FileExistsError:
59 |             print("container {} already exists, trying to delete".format(self.name))
60 |             self.delete()
61 |             os.mkdir(new_cont_path)
62 |         self.set_memory_limit()
63 | 
64 |     def get_pids(self):
65 |         try:
66 |             with open(self.get_procs_path(), 'r') as f:
67 |                 pids = f.readlines()
68 |                 pids = map(lambda p: p.rstrip('\n'), pids)
69 |                 pids = tuple(map(int, pids))
70 |                 return pids
71 |         except Exception as e:
72 |             print("Exception of type: {}".format(type(e)))
73 |             print("Procs path: {}".format(self.get_procs_path()))
74 |             return ()
75 | 
76 | def check():
77 |     '''Check that the cgroup path exists and that the memory controller is enabled'''
78 |     if not os.path.isdir(constants.CGROUP_PATH):
79 |         raise RuntimeError("{} does not exist".format(constants.CGROUP_PATH))
80 | 
81 |     with open(constants.CGROUP_PATH + '/cgroup.subtree_control', 'r') as f:
82 |         content = f.read()
83 |         if 'memory' not in content:
84 |             raise RuntimeError('memory controller not enabled')
85 | 


--------------------------------------------------------------------------------
/lib/ftracer.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | from multiprocessing import cpu_count
 4 | from lib import constants
 5 | from lib import utils
 6 | 
 7 | BUFFER_SIZE_DEFAULT = 1408
 8 | BUFFER_SIZE_MAX = 500000
 9 | 
10 | 
11 | class FTracer:
12 |     def __init__(self, filter_functions):
13 |         self.filter_functions = filter_functions
14 | 
15 |     def read_trace_stats(self):
16 |         regex = re.compile(r'([^\s]+)\s+(\d+)\s+(\d+\.*\d*)'
17 |                            '\s+us\s+(\d+\.*\d*)\s+us\s+(\d+\.*\d*)')
18 |         stats = dict()
19 | 
20 |         for cpu in range(cpu_count()):
21 |             filename = constants.TRACING_DIR + 'trace_stat/function' + str(cpu)
22 |             with open(filename, 'r') as f:
23 |                 for line in f:
24 |                     match = regex.search(line)
25 |                     if match:
26 |                         func_name, hit, time, avg, std_dev = match.groups()
27 |                         if func_name in stats:
28 |                             stats[func_name]['hits'] += int(hit)
29 |                             stats[func_name]['sum_time'] += float(time)
30 |                         else:
31 |                             stats[func_name] = {'hits': int(hit),
32 |                                                 'sum_time': float(time)}
33 |                     else:
34 |                         pass
35 |         for func, values in stats.items():
36 |             values['avg'] = values['sum_time']/values['hits']
37 |         return stats
38 | 
39 |     def set_ftrace_filter(self):
40 |         filter_file = constants.TRACING_DIR + 'set_ftrace_filter'
41 |         with open(filter_file, 'w') as f:
42 |             f.write('\n'.join(self.filter_functions))
43 | 
44 |     def enable_function_profile(self):
45 |         filename = constants.TRACING_DIR + 'function_profile_enabled'
46 |         with open(filename, 'w') as f:
47 |             f.write('1')
48 | 
49 |     def disable_function_profile(self):
50 |         filename = constants.TRACING_DIR + 'function_profile_enabled'
51 |         with open(filename, 'w') as f:
52 |             f.write('0')
53 | 
54 |     def set_buffer_size_kb(self, size):
55 |         with open(constants.TRACING_DIR + 'buffer_size_kb', 'w') as f:
56 |             f.write(str(size))
57 | 
58 |     def enable_tracing_on(self):
59 |         with open(constants.TRACING_DIR + 'tracing_on', 'w') as f:
60 |             f.write('1')
61 | 
62 |     def disable_tracing_on(self):
63 |         with open(constants.TRACING_DIR + 'tracing_on', 'w') as f:
64 |             f.write('0')
65 | 
66 |     def set_current_tracer(self, tracer):
67 |         with open(constants.TRACING_DIR + 'current_tracer', 'w') as f:
68 |             f.write(tracer)
69 | 
70 |     def copy_trace(self, name, mem_ratio):
71 |         print("Copying trace to current directory")
72 |         cp_trace = ' '.join(('sudo cp',
73 |                              constants.TRACING_DIR + 'trace',
74 |                              '{}_{}_{}')).format(name, mem_ratio,
75 |                                                  '_'.join(self.filter_functions))
76 |         utils.shell_exec(cp_trace)
77 | 
78 |     def setup_profile(self):
79 |         self.set_current_tracer('function')
80 |         self.set_ftrace_filter()
81 |         self.disable_function_profile()
82 |         self.enable_function_profile()
83 | 
84 |     def teardown_profile(self):
85 |         self.disable_function_profile()
86 | 
87 |     def setup_timestamp(self):
88 |         self.set_current_tracer('function')
89 |         self.set_ftrace_filter()
90 |         self.set_buffer_size_kb(BUFFER_SIZE_MAX)
91 |         self.enable_tracing_on()
92 | 
93 |     def teardown_timestamp(self):
94 |         self.disable_tracing_on()
95 |         self.set_buffer_size_kb(BUFFER_SIZE_DEFAULT)
96 | 


--------------------------------------------------------------------------------
/stream/READ.ME:
--------------------------------------------------------------------------------
  1 | ===============================================
  2 | 
  3 | STREAM is the de facto industry standard benchmark
  4 | for measuring sustained memory bandwidth.
  5 | 
  6 | Documentation for STREAM is on the web at:
  7 |    http://www.cs.virginia.edu/stream/ref.html
  8 | 
  9 | ===============================================
 10 | NEWS
 11 | ===============================================
 12 | UPDATE: October 28 2014:
 13 | 
 14 | "stream_mpi.c" released in the Versions directory.
 15 | 
 16 | Based on Version 5.10 of stream.c, stream_mpi.c
 17 | brings the following new features:
 18 | * MPI implementation that *distributes* the arrays
 19 |   across all MPI ranks. (The older Fortran version
 20 |   of STREAM in MPI *replicates* the arrays across
 21 |   all MPI ranks.)
 22 | * Data is allocated using "posix_memalign" 
 23 |   rather than using static arrays.  Different
 24 |   compiler flags may be needed for both portability
 25 |   and optimization.
 26 |   See the READ.ME file in the Versions directory
 27 |   for more details.
 28 | * Error checking and timing done by all ranks and
 29 |   gathered by rank 0 for processing and output.
 30 | * Timing code uses barriers to ensure correct
 31 |   operation even when multiple MPI ranks run on
 32 |   shared memory systems.
 33 | 
 34 | NOTE: MPI is not a preferred implementation for
 35 |   STREAM, which is intended to measure memory
 36 |   bandwidth in shared-memory systems.  In stream_mpi,
 37 |   the MPI calls are only used to properly synchronize
 38 |   the timers (using MPI_Barrier) and to gather
 39 |   timing and error data, so the performance should 
 40 |   scale linearly with the size of the cluster.
 41 |   But it may be useful, and was an interesting 
 42 |   exercise to develop and debug.
 43 | 
 44 | ===============================================
 45 | UPDATE: January 17 2013:
 46 | 
 47 | Version 5.10 of stream.c is finally available!
 48 | 
 49 | There are no changes to what is being measured, but
 50 | a number of long-awaited improvements have been made:
 51 | 
 52 | * Updated validation code does not suffer from 
 53 |   accumulated roundoff error for large arrays.
 54 | * Defining the preprocessor variable "VERBOSE"
 55 |   when compiling will (1) cause the code to print the
 56 |   measured average relative absolute error (rather than
 57 |   simply printing "Solution Validates", and (2) print
 58 |   the first 10 array entries with relative error exceeding
 59 |   the error tolerance.
 60 | * Array index variables have been upgraded from
 61 |   "int" to "ssize_t" to allow arrays with more
 62 |   than 2 billion elements on 64-bit systems.
 63 | * Substantial improvements to the comments in 
 64 |   the source on how to configure/compile/run the
 65 |   benchmark.
 66 | * The proprocessor variable controlling the array
 67 |   size has been changed from "N" to "STREAM_ARRAY_SIZE".
 68 | * A new preprocessor variable "STREAM_TYPE" can be
 69 |   used to override the data type from the default
 70 |   "double" to "float".
 71 |   This mechanism could also be used to change to 
 72 |   non-floating-point types, but several "printf"
 73 |   statements would need to have their formats changed
 74 |   to accomodate the modified data type.
 75 | * Some small changes in output, including printing
 76 |   array sizes is GiB as well as MiB.
 77 | * Change to the default output format to print fewer
 78 |   decimals for the bandwidth and more decimals for
 79 |   the min/max/avg execution times.
 80 | 
 81 | 
 82 | ===============================================
 83 | UPDATE: February 19 2009:
 84 | 
 85 | The most recent "official" versions have been renamed
 86 | "stream.f" and "stream.c" -- all other versions have
 87 | been moved to the "Versions" subdirectory and should be
 88 | considered obsolete.
 89 | 
 90 | The "official" timer (was "second_wall.c") has been
 91 | renamed "mysecond.c".   This is embedded in the C version
 92 | ("stream.c"), but still needs to be externally linked to
 93 | the FORTRAN version ("stream.f").  The new version defines
 94 | entry points both with and without trailing underscores,
 95 | so it *should* link automagically with any Fortran compiler.
 96 | 
 97 | ===============================================
 98 | 
 99 | STREAM is a project of "Dr. Bandwidth":
100 | 	John D. McCalpin, Ph.D.
101 | 	john@mccalpin.com
102 | 
103 | ===============================================
104 | 
105 | The STREAM web and ftp sites are currently hosted at
106 | the Department of Computer Science at the University of
107 | Virginia under the generous sponsorship of Professor Bill
108 | Wulf and Professor Alan Batson.
109 | 
110 | ===============================================
111 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Setup and pre-requisites
  2 | 
  3 | On the client node the fastswap kernel and driver must be loaded. On the far memory node the server binary `rmserver` must be running. Please see https://github.com/clusterfarmem/fastswap for more details.
  4 | 
  5 | ## General pre-requisites
  6 | 
  7 | You'll need python3, grpcio, grpcio-tools, numpy and scipy to execute various parts of our framework. Please make sure your python environment can see these modules.
  8 | 
  9 | ## Workload setup (for single and multi-workload benchmarks)
 10 | 
 11 | * quicksort
 12 |     * Change directory to quicksort and type make
 13 | * linpack
 14 |     * No setup required, but most likely you'll need an Intel CPU
 15 | * tf-inception
 16 |     * tensorflow 1.14 is required
 17 |     * Init submodules `git submodule update --init`
 18 | * spark
 19 |     * We assume the user has installed [spark 2.4](https://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz) at `~/spark-2.4.0-bin-hadoop2.7`
 20 | * kmeans
 21 |     * Requires sklearn available in python3
 22 | * memcached
 23 |     * Requires `memcached` and `memaslap` to be installed and available in your $PATH environment.
 24 | * stream
 25 |     * Change directory to stream and type make
 26 | 
 27 | ## Setting up cgroups
 28 | ### Disable cgroup v1
 29 | * Open /boot/grub/grub.cfg in your editor of choice
 30 | * Find the `menuentry` for the fastswap kernel
 31 | * Add `cgroup_no_v1=memory` to the end of the line beginning in `linux   /boot/vmlinuz-4.11.0-sswap`
 32 | * Save and exit the file
 33 | * Run: sudo update-grub
 34 | * Reboot
 35 | 
 36 | ### Enable cgroup v2
 37 | The framework and scripts rely on the cgroup system to be mounted at /cgroup2. Perform the following actions:
 38 | * Run `sudo mkdir /cgroup2` to create root mount point
 39 | * Execute `setup/init_bench_cgroups.sh`
 40 |     * Mounts cgroup system
 41 |     * Changes ownership of the mount point (and all nested files) to the current user
 42 |     * Enables prefetching
 43 | 
 44 | ## Protocol Buffers
 45 | We use [the grpc framework](https://grpc.io) and [protocol buffers](https://developers.google.com/protocol-buffers/docs/pythontutorial) to communicate between the scheduler and servers. The messages that we've defined are in `protocol/protocol.proto`. To generate them the corresponding `.py` files, execute the following command in the `protocol` directory:
 46 |     
 47 |     source gen_protocol.sh
 48 | 
 49 | # Single Workload Benchmarks
 50 | ## `benchmark.py`
 51 | 
 52 | `Benchmark.py` is the command center from which you can run local, single benchmarks. It accepts numerous arguments but only two, `workload` and `ratio`, are required. Its minimum invocation is the following:
 53 | 
 54 |     ./benchmark.py <workload> <ratio>
 55 | 
 56 | Where `workload` is an application that the toolset has been configured to benchmark (Ex: linpack) and `ratio` is the portion of its resident set size that you want to keep in local memory, expressed as a decimal.
 57 | 
 58 | Running the tool in this way will set the appropriate limits in the applications cgroup, run it to completion, then print statistics to stdout.
 59 | 
 60 | ## Arguments
 61 | Argument            | Description | Required
 62 | --------------------------------|-----------------------|----------------------
 63 | workload | An application that the toolset has been configured to benchmark (Ex: linpack) | Y
 64 | ratio | The portion of the workload's resident set size that you want to keep in local memory, expressed as a decimal | Y
 65 | --id | The workload ID that's appended to the workload's name to create its container name. If let unset, it will default to 0 | N
 66 | --cpus | A comma separated list of CPUs to pin the workload to. If both this is left unset, the workload will be pinned to CPUs `[0, N-1]` where `N` is the number of CPUs listed in the workload's class | N
 67 | 
 68 | ## Examples
 69 | ### Linpack with 50% local memory on CPUs 4,5,6,7
 70 |     ./benchmark.py linpack 0.5 --cpus 4,5,6,7
 71 | 
 72 | ### Quicksort with 30% local memory with an ID of 5
 73 |     ./benchmark.py quicksort 0.3 --id 5
 74 | 
 75 | ## Adding Additional Workloads
 76 | New workloads can be added by modifying the workload_choices variable in `benchmark.py` and creating a new class for it in `lib/workloads.py`. 
 77 | 
 78 | # Multi-workload Benchmarks
 79 | 
 80 | ## `server.py`
 81 | `server.py` runs on a separate (or even the same) machine from `scheduler.py`. Multiple `server.py` instances send execution-related data to a single `scheduler.py` instance, receiving workload execution directions in turn. `server.py` takes a single, optional flag, --log, that directs it to save a timestamped account of events to a file named `log.txt` in the same directory. 
 82 | 
 83 | ## Potential Issues
 84 | We made a lot of assumptions about system configuration. `server.py` expects several files to exist on your system, mostly for sampling purposes. If they don't exist, we insert zeroes instead of reading their values.
 85 | 
 86 | ## `scheduler.py`
 87 | This is the brains of the server-scheduler system. The scheduler is responsible for determining the arrival order of workloads, setting the shrinking policy, and aggregating all of the data from the server(s).
 88 | 
 89 | ## Arguments
 90 | Argument            | Description | Required
 91 | --------------------------------|-------------------------------|--------------
 92 | seed | The seed used to initialize the randomized operations that the scheduler performs | Y
 93 | servers | A comma-separated list of ip:port combinations on which `server.py` instances are listening | Y
 94 | cpus | The number of cpus that each server is allowed to use | Y
 95 | mem | The amount of local memory that each server is allowed to use | Y
 96 | --remotemem, -r | Enables remote memory on each of the `server.py` instances | N
 97 | --max_far, -s | The maximum aggregate remote memory that servers are allowed to use. Enforced entirely in the scheduler. Default = Unlimited | N
 98 | --size | The total number of workloads to run. Default = 200| N
 99 | --workload | A comma-separated set of unique workloads to run. Default = quicksort,kmeans,memaslap | N
100 | --ratios | A colon-separated set of ratios that correspond to the arguments for --workload. This determines how well-represented a particular workload type is in the aggregate. Default = 2:1:1 | N
101 | --until | The maximum arrival time of a workload. Default = 20 | N
102 | --uniform_ratio | Smallest local memory ratio for the uniform shrinking policy | N
103 | --variable_ratios | A comma-separated list of minimum local memory ratios that correspond to the arguments for --workload | N
104 | --start_burst | The number of workloads that will have their arrival time set to 0 instead of randomized. Default = 0 | N
105 | --optimal | Use the optimal shrinking policy | N
106 | 
107 | ## Examples
108 | 
109 |     ./scheduler.py 123 192.168.0.1:50051 8 8192 -r --max_far 4096 --size 100 \
110 |     --workload quicksort,kmeans,linpack --ratios 3:1:1 --until 30 --optimal
111 | 
112 | Parameter            | Value | Explanation
113 | --------------|-----------------|------
114 | seed | 123 | Randomization seed. The same seed creates the same arrival pattern
115 | servers | 192.168.0.1:50051 | Connect to a `server.py` instance at IP 192.168.0.1 that's listening on port 50051
116 | cpus | 8 | The `server.py` instance can use a total of 8 CPUs
117 | mem | 8192 (8192 = 8GB) | The `server.py` instance can use a total of 8GB of local memory
118 | -r | Set | Enable the use of remote memory (for swapping)
119 | --max_far | 4096 | The `server.py` instance can use a total of 4GB of remote memory
120 | --size | 100 | A total of 100 workloads will be scheduled. The type/number are determined by `--workload` and `--ratios`
121 | --workload | quicksort,kmeans,linpack | The previously-specified 100 workloads will consist of quicksort, kmeans, and linpack. The mixture is determined by `--ratios`
122 | --ratios | 3:1:1 | The first, second, and third workloads in the comma-separated list passed to `--workload` constitute 60% (3/(3+1+1)), 20% (1/(3+1+1)), and 20% (1/(3+1+1)) of the 100 workloads respectively. In this example, there will be 60 quicksorts, 20 kmeans, and 20 linpacks scheduled.
123 | --until | 30 | Each of the 30 workloads will have a random arrival time between 0 and 30 seconds
124 | --optimal | Set | The `server.py` and `scheduler.py` will use the optimal shrinking policy. Setting this precludes using both `--uniform_ratio` and `--variable_ratios`
125 | 
126 |     ./scheduler.py 123 192.168.0.1:50051 8 8192 -r --size 100 --workload quicksort,kmeans,linpack \
127 |     --ratios 3:1:1 --until 30 --variable_ratios 0.5,0.6,0.7
128 | 
129 | Parameter            | Value | Explanation
130 | --------------|-----------------|------
131 | seed | 123 | Randomization seed. The same seed creates the same arrival pattern
132 | servers | 192.168.0.1:50051 | Connect to a `server.py` instance at IP 192.168.0.1 that's listening on port 50051
133 | cpus | 8 | The `server.py` instance can use a total of 8 CPUs
134 | mem | 8192 (8192 = 8GB) | The `server.py` instance can use a total of 8GB of local memory
135 | -r | Set | Enable the use of remote memory (for swapping)
136 | --max_far | Unset | The `server.py` instance can use unlimited remote memory
137 | --size | 100 | A total of 100 workloads will be scheduled. The type/number are determined by `--workload` and `--ratios`
138 | --workload | quicksort,kmeans,linpack | The previously-specified 100 workloads will consist of quicksort, kmeans, and linpack. The mixture is determined by `--ratios`
139 | --ratios | 3:1:1 | The first, second, and third workloads in the comma-separated list passed to `--workload` constitute 60% (3/(3+1+1)), 20% (1/(3+1+1)), and 20% (1/(3+1+1)) of the 100 workloads respectively. In this example, there will be 60 quicksorts, 20 kmeans, and 20 linpacks scheduled.
140 | --until | 30 | Each of the 30 workloads will have a random arrival time between 0 and 30 seconds
141 | --variable_ratios | 0.5,0.6,0.7 | The three workloads (quicksort, kmeans, and linpack) will have their minimum ratios set to 0.5, 0.6, and 0.7 respectively. `server.py` and `scheduler.py` will use the variable shrinking policy. Setting this precludes using both `--uniform_ratio` and `--optimal`
142 | 
143 |     ./scheduler.py 123 192.168.0.1:50051,192.168.0.2:50051 8 8192 -r --size 250 \
144 |     --workload quicksort,kmeans,linpack --ratios 3:1:1 --uniform_ratio 0.5 \
145 |     --until 30 --start_burst 2
146 | 
147 | Parameter            | Value | Explanation
148 | --------------|-----------------|------
149 | seed | 123 | Randomization seed. The same seed creates the same arrival pattern
150 | servers | 192.168.0.1:50051,192.168.0.2:50051 | Connect to `server.py` instances at IPs 192.168.0.1 and 192.168.0.2 that are both listening on port 50051
151 | cpus | 8 | Each `server.py` instance can use a total of 8 CPUs
152 | mem | 8192 (8192 = 8GB) | Each `server.py` instance can use a total of 8GB of local memory
153 | -r | Set | Enable the use of remote memory (for swapping)
154 | --max_far | Unset | Each `server.py` instance can use unlimited remote memory
155 | --size | 250 | A total of 250 workloads will be scheduled. The type/number are determined by `--workload` and `--ratios`
156 | --workload | quicksort,kmeans,linpack | The previously-specified 250 workloads will consist of quicksort, kmeans, and linpack. The mixture is determined by `--ratios`
157 | --ratios | 3:1:1 | The first, second, and third workloads in the comma-separated list passed to `--workload` constitute 60% (3/(3+1+1)), 20% (1/(3+1+1)), and 20% (1/(3+1+1)) of the 100 workloads respectively. In this example, there will be 150 quicksorts, 50 kmeans, and 50 linpacks scheduled.
158 | --uniform_ratio | 0.5 | The three workloads (quicksort, kmeans, and linpack) will have their minimum ratios set to 0.5. `server.py` and `scheduler.py` will use the uniform shrinking policy. Setting this precludes using both `--optimal` and `--variable_ratios`
159 | --until | 30 | Each of the 30 workloads will have a random arrival time between 0 and 30 seconds
160 | --start_burst | 2 | The first 2 workloads in the schedule will have their arrival times modified to be 0. This causes them to arrive immediately. 
161 | 
162 | ## Further reading
163 | For more information, please refer to our [paper](https://dl.acm.org/doi/abs/10.1145/3342195.3387522) accepted at [EUROSYS 2020](https://www.eurosys2020.org/)
164 | 
165 | ## Questions
166 | For additional questions please contact us at cfm@lists.eecs.berkeley.edu
167 | 


--------------------------------------------------------------------------------
/stream/stream.f:
--------------------------------------------------------------------------------
  1 | *=======================================================================
  2 | * Program: STREAM
  3 | * Programmer: John D. McCalpin
  4 | * RCS Revision: $Id: stream.f,v 5.6 2005/10/04 00:20:48 mccalpin Exp mccalpin $
  5 | *-----------------------------------------------------------------------
  6 | * Copyright 1991-2003: John D. McCalpin
  7 | *-----------------------------------------------------------------------
  8 | * License:
  9 | *  1. You are free to use this program and/or to redistribute
 10 | *     this program.
 11 | *  2. You are free to modify this program for your own use,
 12 | *     including commercial use, subject to the publication
 13 | *     restrictions in item 3.
 14 | *  3. You are free to publish results obtained from running this
 15 | *     program, or from works that you derive from this program,
 16 | *     with the following limitations:
 17 | *     3a. In order to be referred to as "STREAM benchmark results",
 18 | *         published results must be in conformance to the STREAM
 19 | *         Run Rules, (briefly reviewed below) published at
 20 | *         http://www.cs.virginia.edu/stream/ref.html
 21 | *         and incorporated herein by reference.
 22 | *         As the copyright holder, John McCalpin retains the
 23 | *         right to determine conformity with the Run Rules.
 24 | *     3b. Results based on modified source code or on runs not in
 25 | *         accordance with the STREAM Run Rules must be clearly
 26 | *         labelled whenever they are published.  Examples of
 27 | *         proper labelling include:
 28 | *         "tuned STREAM benchmark results" 
 29 | *         "based on a variant of the STREAM benchmark code"
 30 | *         Other comparable, clear and reasonable labelling is
 31 | *         acceptable.
 32 | *     3c. Submission of results to the STREAM benchmark web site
 33 | *         is encouraged, but not required.
 34 | *  4. Use of this program or creation of derived works based on this
 35 | *     program constitutes acceptance of these licensing restrictions.
 36 | *  5. Absolutely no warranty is expressed or implied.
 37 | *-----------------------------------------------------------------------
 38 | * This program measures sustained memory transfer rates in MB/s for
 39 | * simple computational kernels coded in FORTRAN.
 40 | *
 41 | * The intent is to demonstrate the extent to which ordinary user
 42 | * code can exploit the main memory bandwidth of the system under
 43 | * test.
 44 | *=======================================================================
 45 | * The STREAM web page is at:
 46 | *          http://www.streambench.org
 47 | *
 48 | * Most of the content is currently hosted at:
 49 | *          http://www.cs.virginia.edu/stream/
 50 | *
 51 | * BRIEF INSTRUCTIONS: 
 52 | *       0) See http://www.cs.virginia.edu/stream/ref.html for details
 53 | *       1) STREAM requires a timing function called mysecond().
 54 | *          Several examples are provided in this directory.
 55 | *          "CPU" timers are only allowed for uniprocessor runs.
 56 | *          "Wall-clock" timers are required for all multiprocessor runs.
 57 | *       2) The STREAM array sizes must be set to size the test.
 58 | *          The value "N" must be chosen so that each of the three
 59 | *          arrays is at least 4x larger than the sum of all the last-
 60 | *          level caches used in the run, or 1 million elements, which-
 61 | *          ever is larger.
 62 | *          ------------------------------------------------------------
 63 | *          Note that you are free to use any array length and offset
 64 | *          that makes each array 4x larger than the last-level cache.
 65 | *          The intent is to determine the *best* sustainable bandwidth
 66 | *          available with this simple coding.  Of course, lower values
 67 | *          are usually fairly easy to obtain on cached machines, but 
 68 | *          by keeping the test to the *best* results, the answers are
 69 | *          easier to interpret.
 70 | *          You may put the arrays in common or not, at your discretion.
 71 | *          There is a commented-out COMMON statement below.
 72 | *          Fortran90 "allocatable" arrays are fine, too.
 73 | *          ------------------------------------------------------------
 74 | *       3) Compile the code with full optimization.  Many compilers
 75 | *          generate unreasonably bad code before the optimizer tightens
 76 | *          things up.  If the results are unreasonably good, on the
 77 | *          other hand, the optimizer might be too smart for me
 78 | *          Please let me know if this happens.
 79 | *       4) Mail the results to mccalpin@cs.virginia.edu
 80 | *          Be sure to include:
 81 | *               a) computer hardware model number and software revision
 82 | *               b) the compiler flags
 83 | *               c) all of the output from the test case.
 84 | *          Please let me know if you do not want your name posted along
 85 | *          with the submitted results.
 86 | *       5) See the web page for more comments about the run rules and
 87 | *          about interpretation of the results.
 88 | *
 89 | * Thanks,
 90 | *   Dr. Bandwidth
 91 | *=========================================================================
 92 | *
 93 |       PROGRAM stream
 94 | *     IMPLICIT NONE
 95 | C     .. Parameters ..
 96 |       INTEGER n,offset,ndim,ntimes
 97 |       PARAMETER (n=2000000,offset=0,ndim=n+offset,ntimes=10)
 98 | C     ..
 99 | C     .. Local Scalars ..
100 |       DOUBLE PRECISION scalar,t
101 |       INTEGER j,k,nbpw,quantum
102 | C     ..
103 | C     .. Local Arrays ..
104 |       DOUBLE PRECISION maxtime(4),mintime(4),avgtime(4),
105 |      $                 times(4,ntimes)
106 |       INTEGER bytes(4)
107 |       CHARACTER label(4)*11
108 | C     ..
109 | C     .. External Functions ..
110 |       DOUBLE PRECISION mysecond
111 |       INTEGER checktick,realsize
112 |       EXTERNAL mysecond,checktick,realsize
113 | !$    INTEGER omp_get_num_threads
114 | !$    EXTERNAL omp_get_num_threads
115 | C     ..
116 | C     .. Intrinsic Functions ..
117 | C
118 |       INTRINSIC dble,max,min,nint,sqrt
119 | C     ..
120 | C     .. Arrays in Common ..
121 |       DOUBLE PRECISION a(ndim),b(ndim),c(ndim)
122 | C     ..
123 | C     .. Common blocks ..
124 | *     COMMON a,b,c
125 | C     ..
126 | C     .. Data statements ..
127 |       DATA avgtime/4*0.0D0/,mintime/4*1.0D+36/,maxtime/4*0.0D0/
128 |       DATA label/'Copy:      ','Scale:     ','Add:       ',
129 |      $     'Triad:     '/
130 |       DATA bytes/2,2,3,3/
131 | C     ..
132 | 
133 | *       --- SETUP --- determine precision and check timing ---
134 | 
135 |       nbpw = realsize()
136 | 
137 |       PRINT *,'----------------------------------------------'
138 |       PRINT *,'STREAM Version $Revision: 5.6 $'
139 |       PRINT *,'----------------------------------------------'
140 |       WRITE (*,FMT=9010) 'Array size = ',n
141 |       WRITE (*,FMT=9010) 'Offset     = ',offset
142 |       WRITE (*,FMT=9020) 'The total memory requirement is ',
143 |      $  3*nbpw*n/ (1024*1024),' MB'
144 |       WRITE (*,FMT=9030) 'You are running each test ',ntimes,' times'
145 |       WRITE (*,FMT=9030) '--'
146 |       WRITE (*,FMT=9030) 'The *best* time for each test is used'
147 |       WRITE (*,FMT=9030) '*EXCLUDING* the first and last iterations'
148 | 
149 | !$OMP PARALLEL
150 | !$OMP MASTER
151 |       PRINT *,'----------------------------------------------'
152 | !$    PRINT *,'Number of Threads = ',OMP_GET_NUM_THREADS()
153 | !$OMP END MASTER
154 | !$OMP END PARALLEL
155 | 
156 |       PRINT *,'----------------------------------------------'
157 | !$OMP PARALLEL
158 |       PRINT *,'Printing one line per active thread....'
159 | !$OMP END PARALLEL
160 | 
161 | !$OMP PARALLEL DO
162 |       DO 10 j = 1,n
163 |           a(j) = 2.0d0
164 |           b(j) = 0.5D0
165 |           c(j) = 0.0D0
166 |    10 CONTINUE
167 |       t = mysecond()
168 | !$OMP PARALLEL DO
169 |       DO 20 j = 1,n
170 |           a(j) = 0.5d0*a(j)
171 |    20 CONTINUE
172 |       t = mysecond() - t
173 |       PRINT *,'----------------------------------------------------'
174 |       quantum = checktick()
175 |       WRITE (*,FMT=9000)
176 |      $  'Your clock granularity/precision appears to be ',quantum,
177 |      $  ' microseconds'
178 |       PRINT *,'----------------------------------------------------'
179 | 
180 | *       --- MAIN LOOP --- repeat test cases NTIMES times ---
181 |       scalar = 0.5d0*a(1)
182 |       DO 70 k = 1,ntimes
183 | 
184 |           t = mysecond()
185 |           a(1) = a(1) + t
186 | !$OMP PARALLEL DO
187 |           DO 30 j = 1,n
188 |               c(j) = a(j)
189 |    30     CONTINUE
190 |           t = mysecond() - t
191 |           c(n) = c(n) + t
192 |           times(1,k) = t
193 | 
194 |           t = mysecond()
195 |           c(1) = c(1) + t
196 | !$OMP PARALLEL DO
197 |           DO 40 j = 1,n
198 |               b(j) = scalar*c(j)
199 |    40     CONTINUE
200 |           t = mysecond() - t
201 |           b(n) = b(n) + t
202 |           times(2,k) = t
203 | 
204 |           t = mysecond()
205 |           a(1) = a(1) + t
206 | !$OMP PARALLEL DO
207 |           DO 50 j = 1,n
208 |               c(j) = a(j) + b(j)
209 |    50     CONTINUE
210 |           t = mysecond() - t
211 |           c(n) = c(n) + t
212 |           times(3,k) = t
213 | 
214 |           t = mysecond()
215 |           b(1) = b(1) + t
216 | !$OMP PARALLEL DO
217 |           DO 60 j = 1,n
218 |               a(j) = b(j) + scalar*c(j)
219 |    60     CONTINUE
220 |           t = mysecond() - t
221 |           a(n) = a(n) + t
222 |           times(4,k) = t
223 |    70 CONTINUE
224 | 
225 | *       --- SUMMARY ---
226 |       DO 90 k = 2,ntimes
227 |           DO 80 j = 1,4
228 |               avgtime(j) = avgtime(j) + times(j,k)
229 |               mintime(j) = min(mintime(j),times(j,k))
230 |               maxtime(j) = max(maxtime(j),times(j,k))
231 |    80     CONTINUE
232 |    90 CONTINUE
233 |       WRITE (*,FMT=9040)
234 |       DO 100 j = 1,4
235 |           avgtime(j) = avgtime(j)/dble(ntimes-1)
236 |           WRITE (*,FMT=9050) label(j),n*bytes(j)*nbpw/mintime(j)/1.0D6,
237 |      $      avgtime(j),mintime(j),maxtime(j)
238 |   100 CONTINUE
239 |       PRINT *,'----------------------------------------------------'
240 |       CALL checksums (a,b,c,n,ntimes)
241 |       PRINT *,'----------------------------------------------------'
242 | 
243 |  9000 FORMAT (1x,a,i6,a)
244 |  9010 FORMAT (1x,a,i10)
245 |  9020 FORMAT (1x,a,i4,a)
246 |  9030 FORMAT (1x,a,i3,a,a)
247 |  9040 FORMAT ('Function',5x,'Rate (MB/s)  Avg time   Min time  Max time'
248 |      $       )
249 |  9050 FORMAT (a,4 (f10.4,2x))
250 |       END
251 | 
252 | *-------------------------------------
253 | * INTEGER FUNCTION dblesize()
254 | *
255 | * A semi-portable way to determine the precision of DOUBLE PRECISION
256 | * in Fortran.
257 | * Here used to guess how many bytes of storage a DOUBLE PRECISION
258 | * number occupies.
259 | *
260 |       INTEGER FUNCTION realsize()
261 | *     IMPLICIT NONE
262 | 
263 | C     .. Local Scalars ..
264 |       DOUBLE PRECISION result,test
265 |       INTEGER j,ndigits
266 | C     ..
267 | C     .. Local Arrays ..
268 |       DOUBLE PRECISION ref(30)
269 | C     ..
270 | C     .. External Subroutines ..
271 |       EXTERNAL confuse
272 | C     ..
273 | C     .. Intrinsic Functions ..
274 |       INTRINSIC abs,acos,log10,sqrt
275 | C     ..
276 | 
277 | C       Test #1 - compare single(1.0d0+delta) to 1.0d0
278 | 
279 |    10 DO 20 j = 1,30
280 |           ref(j) = 1.0d0 + 10.0d0** (-j)
281 |    20 CONTINUE
282 | 
283 |       DO 30 j = 1,30
284 |           test = ref(j)
285 |           ndigits = j
286 |           CALL confuse(test,result)
287 |           IF (test.EQ.1.0D0) THEN
288 |               GO TO 40
289 |           END IF
290 |    30 CONTINUE
291 |       GO TO 50
292 | 
293 |    40 WRITE (*,FMT='(a)')
294 |      $  '----------------------------------------------'
295 |       WRITE (*,FMT='(1x,a,i2,a)') 'Double precision appears to have ',
296 |      $  ndigits,' digits of accuracy'
297 |       IF (ndigits.LE.8) THEN
298 |           realsize = 4
299 |       ELSE
300 |           realsize = 8
301 |       END IF
302 |       WRITE (*,FMT='(1x,a,i1,a)') 'Assuming ',realsize,
303 |      $  ' bytes per DOUBLE PRECISION word'
304 |       WRITE (*,FMT='(a)')
305 |      $  '----------------------------------------------'
306 |       RETURN
307 | 
308 |    50 PRINT *,'Hmmmm.  I am unable to determine the size.'
309 |       PRINT *,'Please enter the number of Bytes per DOUBLE PRECISION',
310 |      $  ' number : '
311 |       READ (*,FMT=*) realsize
312 |       IF (realsize.NE.4 .AND. realsize.NE.8) THEN
313 |           PRINT *,'Your answer ',realsize,' does not make sense.'
314 |           PRINT *,'Try again.'
315 |           PRINT *,'Please enter the number of Bytes per ',
316 |      $      'DOUBLE PRECISION number : '
317 |           READ (*,FMT=*) realsize
318 |       END IF
319 |       PRINT *,'You have manually entered a size of ',realsize,
320 |      $  ' bytes per DOUBLE PRECISION number'
321 |       WRITE (*,FMT='(a)')
322 |      $  '----------------------------------------------'
323 |       END
324 | 
325 |       SUBROUTINE confuse(q,r)
326 | *     IMPLICIT NONE
327 | C     .. Scalar Arguments ..
328 |       DOUBLE PRECISION q,r
329 | C     ..
330 | C     .. Intrinsic Functions ..
331 |       INTRINSIC cos
332 | C     ..
333 |       r = cos(q)
334 |       RETURN
335 |       END
336 | 
337 | * A semi-portable way to determine the clock granularity
338 | * Adapted from a code by John Henning of Digital Equipment Corporation
339 | *
340 |       INTEGER FUNCTION checktick()
341 | *     IMPLICIT NONE
342 | 
343 | C     .. Parameters ..
344 |       INTEGER n
345 |       PARAMETER (n=20)
346 | C     ..
347 | C     .. Local Scalars ..
348 |       DOUBLE PRECISION t1,t2
349 |       INTEGER i,j,jmin
350 | C     ..
351 | C     .. Local Arrays ..
352 |       DOUBLE PRECISION timesfound(n)
353 | C     ..
354 | C     .. External Functions ..
355 |       DOUBLE PRECISION mysecond
356 |       EXTERNAL mysecond
357 | C     ..
358 | C     .. Intrinsic Functions ..
359 |       INTRINSIC max,min,nint
360 | C     ..
361 |       i = 0
362 | 
363 |    10 t2 = mysecond()
364 |       IF (t2.EQ.t1) GO TO 10
365 | 
366 |       t1 = t2
367 |       i = i + 1
368 |       timesfound(i) = t1
369 |       IF (i.LT.n) GO TO 10
370 | 
371 |       jmin = 1000000
372 |       DO 20 i = 2,n
373 |           j = nint((timesfound(i)-timesfound(i-1))*1d6)
374 |           jmin = min(jmin,max(j,0))
375 |    20 CONTINUE
376 | 
377 |       IF (jmin.GT.0) THEN
378 |           checktick = jmin
379 |       ELSE
380 |           PRINT *,'Your clock granularity appears to be less ',
381 |      $      'than one microsecond'
382 |           checktick = 1
383 |       END IF
384 |       RETURN
385 | 
386 | *      PRINT 14, timesfound(1)*1d6
387 | *      DO 20 i=2,n
388 | *         PRINT 14, timesfound(i)*1d6,
389 | *     &       nint((timesfound(i)-timesfound(i-1))*1d6)
390 | *   14    FORMAT (1X, F18.4, 1X, i8)
391 | *   20 CONTINUE
392 | 
393 |       END
394 | 
395 | 
396 | 
397 | 
398 |       SUBROUTINE checksums(a,b,c,n,ntimes)
399 | *     IMPLICIT NONE
400 | C     ..
401 | C     .. Arguments ..
402 |       DOUBLE PRECISION a(*),b(*),c(*)
403 |       INTEGER n,ntimes
404 | C     ..
405 | C     .. Local Scalars ..
406 |       DOUBLE PRECISION aa,bb,cc,scalar,suma,sumb,sumc,epsilon
407 |       INTEGER k
408 | C     ..
409 | 
410 | C     Repeat the main loop, but with scalars only.
411 | C     This is done to check the sum & make sure all
412 | C     iterations have been executed correctly.
413 | 
414 |       aa = 2.0D0
415 |       bb = 0.5D0
416 |       cc = 0.0D0
417 |       aa = 0.5D0*aa
418 |       scalar = 0.5d0*aa
419 |       DO k = 1,ntimes
420 |           cc = aa
421 |           bb = scalar*cc
422 |           cc = aa + bb
423 |           aa = bb + scalar*cc
424 |       END DO
425 |       aa = aa*DBLE(n-2)
426 |       bb = bb*DBLE(n-2)
427 |       cc = cc*DBLE(n-2)
428 | 
429 | C     Now sum up the arrays, excluding the first and last
430 | C     elements, which are modified using the timing results
431 | C     to confuse aggressive optimizers.
432 | 
433 |       suma = 0.0d0
434 |       sumb = 0.0d0
435 |       sumc = 0.0d0
436 | !$OMP PARALLEL DO REDUCTION(+:suma,sumb,sumc)
437 |       DO 110 j = 2,n-1
438 |           suma = suma + a(j)
439 |           sumb = sumb + b(j)
440 |           sumc = sumc + c(j)
441 |   110 CONTINUE
442 | 
443 |       epsilon = 1.D-6
444 | 
445 |       IF (ABS(suma-aa)/suma .GT. epsilon) THEN
446 |           PRINT *,'Failed Validation on array a()'
447 |           PRINT *,'Target   Sum of a is = ',aa
448 |           PRINT *,'Computed Sum of a is = ',suma
449 |       ELSEIF (ABS(sumb-bb)/sumb .GT. epsilon) THEN
450 |           PRINT *,'Failed Validation on array b()'
451 |           PRINT *,'Target   Sum of b is = ',bb
452 |           PRINT *,'Computed Sum of b is = ',sumb
453 |       ELSEIF (ABS(sumc-cc)/sumc .GT. epsilon) THEN
454 |           PRINT *,'Failed Validation on array c()'
455 |           PRINT *,'Target   Sum of c is = ',cc
456 |           PRINT *,'Computed Sum of c is = ',sumc
457 |       ELSE
458 |           PRINT *,'Solution Validates!'
459 |       ENDIF
460 | 
461 |       END
462 | 
463 | 


--------------------------------------------------------------------------------
/lib/workloads.py:
--------------------------------------------------------------------------------
  1 | import threading
  2 | import subprocess
  3 | import os
  4 | import signal
  5 | import time
  6 | import psutil
  7 | import numpy as np
  8 | import shlex
  9 | 
 10 | from lib import utils
 11 | from lib.container import Container
 12 | from lib import constants
 13 | 
 14 | class Workload:
 15 |     ''' This class is not meant to be used by itself. It's only purpose
 16 |         is to provide definitions that are common to all of its children.
 17 |     '''
 18 |     # These variables are defined in child classes
 19 |     # that inherit from this class. Their definition here is
 20 |     # just done for clarity.
 21 |     wname = None
 22 |     ideal_mem = None
 23 |     min_ratio = None
 24 |     cpu_req = None
 25 | 
 26 |     def __init__(self, idd, pinned_cpus, mem_ratio=1):
 27 | 
 28 |         self.idd = idd  # a unique uint id for this workload
 29 | 
 30 |         # process handling
 31 |         self.thread = None
 32 |         self.popen = None
 33 |         self.stdout = None
 34 |         self.stderr = None
 35 | 
 36 |         # Container creation
 37 |         self.mem_ratio = mem_ratio
 38 |         self.container = Container(self.get_name(), self.ideal_mem, self.mem_ratio)
 39 |         self.container.create()
 40 |         
 41 |         # Pin CPUs
 42 |         self.pinned_cpus = pinned_cpus
 43 | 
 44 |         # Get shell command
 45 |         procs_path = self.container.get_procs_path()
 46 |         self.cmdline = self.get_cmdline(procs_path, pinned_cpus)
 47 | 
 48 |         # task timings
 49 |         self.ts_start = 0
 50 |         self.ts_finish = 0
 51 | 
 52 |         # Getting gradient coeffs ready
 53 |         self.percent = 0
 54 |         self.ratio = 1
 55 |         self.get_gradient()
 56 | 
 57 |     def __exec(self):
 58 |         " execute in self.thread "
 59 |         print(self.cmdline)
 60 | 
 61 |         self.ts_start = time.time()
 62 |         self.popen = subprocess.Popen(self.cmdline, stdout=subprocess.PIPE,
 63 |                                       stderr=subprocess.PIPE, shell=True)
 64 |         self.stdout, self.stderr = self.popen.communicate()  # blocks process exit
 65 |         assert(self.popen.returncode == 0)
 66 |         self.ts_finish = time.time()
 67 | 
 68 |         self.container.delete()
 69 | 
 70 |     def start(self):
 71 |         self.thread = threading.Thread(target=self.__exec)
 72 |         self.thread.start()
 73 | 
 74 |         while not self.is_alive():
 75 |             pass
 76 | 
 77 |     def modify_ratio(self, new_ratio):
 78 |         self.container.set_new_size(new_ratio)
 79 | 
 80 |     def get_name(self):
 81 |         return self.wname + str(self.idd)
 82 | 
 83 |     def get_retcode(self):
 84 |         return self.popen.returncode
 85 | 
 86 |     def is_alive(self):
 87 |         return self.thread.is_alive() and self.popen
 88 | 
 89 |     def get_process_duration(self):
 90 |         return self.ts_finish - self.ts_start
 91 | 
 92 |     def get_usr_bin_time(self):
 93 |         ''' Parse the output of /usr/bin/time from stderr'''
 94 |         parser = utils.BinTimeParser()
 95 |         return parser.parse(self.stderr.decode('utf-8'))
 96 | 
 97 |     def kill(self):
 98 |         pg_id = os.getpgid(self.popen.pid)
 99 |         os.killpg(pg_id, signal.SIGKILL)
100 |         self.thread.join()
101 | 
102 |     def set_min_ratio(self, new_min_ratio):
103 |         self.min_ratio = new_min_ratio
104 |         self.min_mem = self.min_ratio * self.ideal_mem
105 | 
106 |     def update(self, el_time, new_ratio, new_idd=None): # ratio = 0 is no remote memory mode
107 |         assert el_time >= 0
108 | 
109 |         if (new_idd is not None) and self.idd == new_idd:
110 |             assert self.percent == 0
111 |         else:
112 |             self.update_percent(el_time)
113 |         self.ratio = new_ratio
114 |         self.modify_ratio(new_ratio)
115 | 
116 |     def update_percent(self, el_time):
117 |         self.percent = self.percent + el_time/self.profile(self.ratio)
118 | 
119 |     def profile(self,ratio): 
120 |         return self.compute_ratio_from_coeff(self.coeff, ratio)*1000 # from second to millisecond
121 | 
122 |     def get_gradient(self):
123 |         tmp_coeff = self.coeff + [0]
124 |         self.gd_coeff = np.polyder(self.coeff)
125 |         self.mem_gd_coeff = np.polyder(tmp_coeff)
126 | 
127 |     def gradient(self, ratio):
128 |         return self.compute_ratio_from_coeff(self.gd_coeff, ratio)
129 | 
130 |     def mem_gradient(self,ratio):
131 |         return self.compute_ratio_from_coeff(self.mem_gd_coeff, ratio)
132 | 
133 |     def compute_ratio_from_coeff(self, coeffs, ratio):
134 |         p = 0
135 |         order = len(coeffs)
136 |         for i in range(order):
137 |             p += coeffs[i] * ratio**(order-1-i)
138 |         return p
139 | 
140 |     def get_pids(self):
141 |         return self.container.get_pids()
142 | 
143 | class Quicksort(Workload):
144 |     wname = "quicksort"
145 |     ideal_mem = 8250
146 |     min_ratio = 0.65
147 |     min_mem = int(min_ratio * ideal_mem)
148 |     binary_name = "quicksort"
149 |     cpu_req = 1
150 |     coeff = [-1984.129, 4548.033, -3588.554, 1048.644, 252.997]
151 | 
152 |     def get_cmdline(self, procs_path, pinned_cpus):
153 |         prefix = "echo $$ > {} &&".format(procs_path)
154 |         arg = '8192'
155 |         shell_cmd = '/usr/bin/time -v' + ' ' + constants.WORK_DIR + '/quicksort/quicksort {}'.format(arg)
156 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
157 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
158 |         full_command = ' '.join((prefix, 'exec', set_cpu, shell_cmd))
159 |         return full_command
160 | 
161 | 
162 | class Linpack(Workload):
163 |     wname = "linpack"
164 |     ideal_mem = 1600
165 |     min_ratio = 0.9
166 |     min_mem = int(min_ratio * ideal_mem)
167 |     binary_name = "xlinpack_xeon64"
168 |     cpu_req = 4
169 |     coeff = [38.52, -77.88, 26.86, 36.70]
170 | 
171 |     def get_cmdline(self, procs_path, pinned_cpus):
172 |         linpack_dir = constants.WORK_DIR + '/linpack'
173 |         prefix = "echo $$ > {} &&".format(procs_path)
174 |         set_vars = ' '.join(('MKL_NUM_THREADS=4',
175 |                              'OMP_NUM_THREADS=4',
176 |                              'MKL_DOMAIN_NUM_THREADS=4'))
177 |         
178 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
179 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
180 | 
181 |         set_vars = ' '.join(('KMP_AFFINITY=nowarnings,compact,1,0,granularity=fine',
182 |                              set_vars))
183 | 
184 |         bin_path = '{}/xlinpack_xeon64'.format(linpack_dir)
185 |         cmdline = '{}/lininput_xeon64'.format(linpack_dir)
186 |         after_exec = ' '.join(('/usr/bin/time -v', bin_path, cmdline))
187 |         full_command = ' '.join((prefix, set_vars, 'exec', set_cpu, after_exec))
188 |         return full_command
189 | 
190 | 
191 | class Tfinception(Workload):
192 |     wname = "tf-inception"
193 |     ideal_mem = 2120
194 |     min_ratio = 0.9
195 |     min_mem = int(min_ratio * ideal_mem)
196 |     binary_name = "python3"
197 |     cpu_req = 2
198 |     coeff = [-1617.416, 3789.953, -2993.734, 1225.477]
199 | 
200 |     def get_cmdline(self, procs_path, pinned_cpus):
201 |         work_dir = ''.join((constants.WORK_DIR,
202 |                             '/tensorflow/benchmarks/scripts/tf_cnn_benchmarks'))
203 |         
204 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
205 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
206 | 
207 |         cd_dir = ' '.join(('cd', work_dir, '&&'))
208 |         prefix = "echo $$ > {} &&".format(procs_path)
209 |         set_vars = ' '.join(('KMP_BLOCK_TIME=0',
210 |                              'KMP_SETTINGS=1 OMP_NUM_THREADS=2'))
211 |         
212 |         set_vars = ' '.join(('KMP_AFFINITY=granularity=fine,verbose,compact,1,0',
213 |                              set_vars))
214 | 
215 |         shell_cmd = ' '.join(("/usr/bin/time -v python3 tf_cnn_benchmarks.py",
216 |                               "--forward_only=True --data_format=NHWC --device=cpu",
217 |                               "--batch_size=64 --num_inter_threads=1",
218 |                               "--num_intra_threads=2 --nodistortions",
219 |                               "--model=inception3",
220 |                               "--kmp_blocktime=0 --num_batches=20",
221 |                               "--num_warmup_batches 0"))
222 |         full_command = ' '.join((cd_dir, prefix, set_vars, 'exec', set_cpu, shell_cmd))
223 |         return full_command
224 | 
225 | 
226 | class Tfresnet(Workload):
227 |     wname = "tf-resnet"
228 |     ideal_mem = 1268
229 |     min_ratio = 0.9
230 |     min_mem = int(min_ratio * ideal_mem)
231 |     binary_name = "python3"
232 |     cpu_req = 2
233 |     coeff = [-1617.416, 3789.953, -2993.734, 1225.477]
234 | 
235 |     def get_cmdline(self, procs_path, pinned_cpus):
236 |         work_dir = ''.join((constants.WORK_DIR,
237 |                             '/tensorflow/benchmarks/scripts/tf_cnn_benchmarks'))
238 | 
239 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
240 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
241 |         
242 |         cd_dir = ' '.join(('cd', work_dir, '&&'))
243 |         prefix = "echo $$ > {} &&".format(procs_path)
244 |         set_vars = ' '.join(('KMP_BLOCK_TIME=0',
245 |                              'KMP_SETTINGS=1 OMP_NUM_THREADS=2'))
246 |         
247 |         set_vars = ' '.join(('KMP_AFFINITY=granularity=fine,verbose,compact,1,0',
248 |                              set_vars))
249 | 
250 |         shell_cmd = ' '.join(("/usr/bin/time -v python3 tf_cnn_benchmarks.py",
251 |                               "--forward_only=True --data_format=NHWC --device=cpu",
252 |                               "--batch_size=64 --num_inter_threads=1",
253 |                               "--num_intra_threads=2 --nodistortions",
254 |                               "--model=resnet50",
255 |                               "--kmp_blocktime=0 --num_batches=20",
256 |                               "--num_warmup_batches 0"))
257 |         full_command = ' '.join((cd_dir, prefix, set_vars, 'exec', set_cpu, shell_cmd))
258 |         return full_command
259 | 
260 | 
261 | class Kmeans(Workload):
262 |     wname = "kmeans"
263 |     ideal_mem = 4847
264 |     binary_name = "python3"
265 |     min_ratio = 0.75
266 |     min_mem = int(min_ratio * ideal_mem)
267 |     cpu_req = 1
268 |     coeff = [-10341.875,  31554.403, -34346.894,  15214.428,  -1730.533]
269 | 
270 |     def get_cmdline(self, procs_path, pinned_cpus):
271 |         prefix = "echo $$ > {} && OMP_NUM_THREADS={}".format(procs_path, self.cpu_req)
272 |         bin_path = constants.WORK_DIR + '/kmeans/kmeans.py'
273 |         shell_cmd = '/usr/bin/time -v python3' + ' ' + bin_path
274 |         
275 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
276 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
277 |         
278 |         full_command = ' '.join((prefix, 'exec', set_cpu, shell_cmd))
279 | 
280 |         return full_command
281 | 
282 | 
283 | class Spark(Workload):
284 |     wname = "spark"
285 |     ideal_mem = 4400
286 |     min_ratio = 0.75
287 |     min_mem = int(min_ratio * ideal_mem)
288 |     binary_name = "java"
289 |     cpu_req = 3
290 |     coeff = [4689.05, -10841.59, 7709.92, -1486.13]
291 | 
292 |     def get_cmdline(self, procs_path, pinned_cpus):
293 |         target_dir = ''.join((constants.WORK_DIR, '/spark/pagerank'))
294 |         cd_dir = ' '.join(('cd', target_dir, '&&'))
295 |         prefix = 'echo $$ > {} &&'.format(procs_path)
296 | 
297 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
298 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
299 | 
300 |         shell_cmd = ' '.join(('/usr/bin/time -v',
301 |                               constants.SPARK_HOME + 'bin/spark-submit',
302 |                               '--driver-memory 10g',
303 |                               '--class \"pagerank\"',
304 |                               '--master local[2]',
305 |                               'target/scala-2.11/pagerank_2.11-1.0.jar'))
306 |         full_command = ' '.join((cd_dir, prefix, 'exec', set_cpu, shell_cmd))
307 |         return full_command
308 | 
309 | class Memaslap(Workload):
310 |     wname = "memaslap"
311 |     ideal_mem = 12288
312 |     min_ratio = 0.5
313 |     min_mem = int(min_ratio * ideal_mem)
314 |     binary_name = "memcached"
315 |     port_number = 11211
316 |     cpu_req = 2
317 |     coeff = [-11626.894, 32733.914, -31797.375, 11484.578, 113.33]
318 | 
319 |     def __init__(self, idd, pinned_cpus, mem_ratio=1):
320 |         super().__init__(idd, pinned_cpus, mem_ratio)
321 |         self.port_number = Memaslap.port_number
322 |         self.memaslap_pids = set()
323 |         Memaslap.port_number += 1
324 | 
325 |     def get_cmdline(self, procs_path, pinned_cpus):
326 |         prefix = 'echo $$ > {} &&'
327 |         memcached_serv = "/usr/bin/time -v memcached -l localhost -p {} -m {} -t 1".format(self.port_number, 
328 |                                                         self.ideal_mem)
329 |         cpu_list = list(pinned_cpus)
330 |         taskset_serv = 'taskset -c {}'.format(cpu_list[0])
331 |         memcached_serv = ' '.join((prefix, 'exec', taskset_serv, memcached_serv))
332 |         memcached_serv = memcached_serv.format(procs_path)
333 | 
334 |         taskset_memaslap = 'taskset -c {}'.format(cpu_list[1])
335 |         memaslap_fill = taskset_memaslap + ' ' + "memaslap -s localhost:{} -T 1 -F {} --execute_number 30000000"
336 |         memaslap_fill = memaslap_fill.format(self.port_number, "memaslap/memaslap_fill")
337 | 
338 |         memaslap_query = taskset_memaslap + ' ' + "memaslap -s localhost:{} -T 1 -F {} --execute_number 100000000"
339 |         memaslap_query = memaslap_query.format(self.port_number, "memaslap/memaslap_etc")
340 |         sleep = 'sleep 5'
341 |         memaslap_cmd = ' && '.join((memaslap_fill, sleep, memaslap_query))
342 |         return (memcached_serv, memaslap_fill, memaslap_query)
343 | 
344 |     def start(self):
345 |         self.thread = threading.Thread(target=self.__exec)
346 |         self.thread.start()
347 | 
348 |         while not self.is_alive():
349 |             pass
350 | 
351 |     def __exec(self):
352 |         memcached, memaslap_fill, memaslap_query = self.cmdline
353 | 
354 |         " execute in self.thread "
355 |         print(self.cmdline)
356 | 
357 |         self.ts_start = time.time()
358 | 
359 |         self.popen = subprocess.Popen(memcached, stdout=subprocess.PIPE,
360 |                                       stderr=subprocess.PIPE, shell=True,
361 |                                       preexec_fn=os.setsid)
362 | 
363 |         time.sleep(3) # Wait for memcached to boot
364 |         memaslap_proc = subprocess.Popen(shlex.split(memaslap_fill), stdout=subprocess.PIPE,
365 |                                         stderr=subprocess.PIPE, shell=False)
366 |         self.memaslap_pids.add(memaslap_proc.pid)
367 |         stdout, stderr = memaslap_proc.communicate()
368 |         self.memaslap_pids.remove(memaslap_proc.pid)
369 | 
370 |         time.sleep(5)
371 |         memaslap_proc = subprocess.Popen(shlex.split(memaslap_query), stdout=subprocess.PIPE,
372 |                                         stderr=subprocess.PIPE, shell=False)
373 |         self.memaslap_pids.add(memaslap_proc.pid)
374 |         stdout, stderr = memaslap_proc.communicate()
375 |         self.memaslap_pids.remove(memaslap_proc.pid)
376 |         
377 |         print(stdout.decode('utf-8'))
378 |         print(stderr.decode('utf-8'))
379 | 
380 |         os.killpg(os.getpgid(self.popen.pid), signal.SIGINT)
381 | 
382 |         self.stdout, self.stderr = self.popen.communicate()
383 |         self.ts_finish = time.time()
384 |         print(self.stdout.decode('utf-8'))
385 |         print(self.stderr.decode('utf-8'))
386 | 
387 |         self.container.delete()
388 | 
389 |     def get_pids(self):
390 |         pids = list(self.container.get_pids())
391 |         pids.extend(self.memaslap_pids)
392 |         return pids
393 | 
394 | class Stream(Workload):
395 |     wname = "stream"
396 |     ideal_mem = 4150
397 |     min_ratio = 0.50
398 |     min_mem = int(min_ratio * ideal_mem)
399 |     binary_name = "stream_c.exe"
400 |     cpu_req = 1
401 |     coeff = [0]
402 | 
403 |     def get_cmdline(self, procs_path, pinned_cpus):
404 |         target_dir = ''.join((constants.WORK_DIR, '/stream'))
405 |         cd_dir = ' '.join(('cd', target_dir, '&&'))
406 |         prefix = 'echo $$ > {} && OMP_NUM_THREADS={}'.format(procs_path, len(pinned_cpus))
407 | 
408 |         pinned_cpus_string = ','.join(map(str, pinned_cpus))
409 |         set_cpu = 'taskset -c {}'.format(pinned_cpus_string)
410 | 
411 |         shell_cmd = 'nice -n -2 /usr/bin/time -v ./stream_c.exe'.format(len(pinned_cpus))
412 |         full_command = ' '.join((cd_dir, prefix, 'exec', set_cpu, shell_cmd))
413 |         return full_command
414 | 
415 | def get_workload_class(wname):
416 |     return {'quicksort': Quicksort,
417 |             'linpack': Linpack,
418 |             'tf-inception': Tfinception,
419 |             'tf-resnet': Tfresnet,
420 |             'spark': Spark,
421 |             'kmeans': Kmeans,
422 |             'memaslap': Memaslap,
423 |             'stream': Stream}[wname]
424 | 


--------------------------------------------------------------------------------
/stream/stream.c:
--------------------------------------------------------------------------------
  1 | /*-----------------------------------------------------------------------*/
  2 | /* Program: STREAM                                                       */
  3 | /* Revision: $Id: stream.c,v 5.10 2013/01/17 16:01:06 mccalpin Exp mccalpin $ */
  4 | /* Original code developed by John D. McCalpin                           */
  5 | /* Programmers: John D. McCalpin                                         */
  6 | /*              Joe R. Zagar                                             */
  7 | /*                                                                       */
  8 | /* This program measures memory transfer rates in MB/s for simple        */
  9 | /* computational kernels coded in C.                                     */
 10 | /*-----------------------------------------------------------------------*/
 11 | /* Copyright 1991-2013: John D. McCalpin                                 */
 12 | /*-----------------------------------------------------------------------*/
 13 | /* License:                                                              */
 14 | /*  1. You are free to use this program and/or to redistribute           */
 15 | /*     this program.                                                     */
 16 | /*  2. You are free to modify this program for your own use,             */
 17 | /*     including commercial use, subject to the publication              */
 18 | /*     restrictions in item 3.                                           */
 19 | /*  3. You are free to publish results obtained from running this        */
 20 | /*     program, or from works that you derive from this program,         */
 21 | /*     with the following limitations:                                   */
 22 | /*     3a. In order to be referred to as "STREAM benchmark results",     */
 23 | /*         published results must be in conformance to the STREAM        */
 24 | /*         Run Rules, (briefly reviewed below) published at              */
 25 | /*         http://www.cs.virginia.edu/stream/ref.html                    */
 26 | /*         and incorporated herein by reference.                         */
 27 | /*         As the copyright holder, John McCalpin retains the            */
 28 | /*         right to determine conformity with the Run Rules.             */
 29 | /*     3b. Results based on modified source code or on runs not in       */
 30 | /*         accordance with the STREAM Run Rules must be clearly          */
 31 | /*         labelled whenever they are published.  Examples of            */
 32 | /*         proper labelling include:                                     */
 33 | /*           "tuned STREAM benchmark results"                            */
 34 | /*           "based on a variant of the STREAM benchmark code"           */
 35 | /*         Other comparable, clear, and reasonable labelling is          */
 36 | /*         acceptable.                                                   */
 37 | /*     3c. Submission of results to the STREAM benchmark web site        */
 38 | /*         is encouraged, but not required.                              */
 39 | /*  4. Use of this program or creation of derived works based on this    */
 40 | /*     program constitutes acceptance of these licensing restrictions.   */
 41 | /*  5. Absolutely no warranty is expressed or implied.                   */
 42 | /*-----------------------------------------------------------------------*/
 43 | # include <stdio.h>
 44 | # include <unistd.h>
 45 | # include <math.h>
 46 | # include <float.h>
 47 | # include <limits.h>
 48 | # include <sys/time.h>
 49 | 
 50 | /*-----------------------------------------------------------------------
 51 |  * INSTRUCTIONS:
 52 |  *
 53 |  *	1) STREAM requires different amounts of memory to run on different
 54 |  *           systems, depending on both the system cache size(s) and the
 55 |  *           granularity of the system timer.
 56 |  *     You should adjust the value of 'STREAM_ARRAY_SIZE' (below)
 57 |  *           to meet *both* of the following criteria:
 58 |  *       (a) Each array must be at least 4 times the size of the
 59 |  *           available cache memory. I don't worry about the difference
 60 |  *           between 10^6 and 2^20, so in practice the minimum array size
 61 |  *           is about 3.8 times the cache size.
 62 |  *           Example 1: One Xeon E3 with 8 MB L3 cache
 63 |  *               STREAM_ARRAY_SIZE should be >= 4 million, giving
 64 |  *               an array size of 30.5 MB and a total memory requirement
 65 |  *               of 91.5 MB.
 66 |  *           Example 2: Two Xeon E5's with 20 MB L3 cache each (using OpenMP)
 67 |  *               STREAM_ARRAY_SIZE should be >= 20 million, giving
 68 |  *               an array size of 153 MB and a total memory requirement
 69 |  *               of 458 MB.
 70 |  *       (b) The size should be large enough so that the 'timing calibration'
 71 |  *           output by the program is at least 20 clock-ticks.
 72 |  *           Example: most versions of Windows have a 10 millisecond timer
 73 |  *               granularity.  20 "ticks" at 10 ms/tic is 200 milliseconds.
 74 |  *               If the chip is capable of 10 GB/s, it moves 2 GB in 200 msec.
 75 |  *               This means the each array must be at least 1 GB, or 128M elements.
 76 |  *
 77 |  *      Version 5.10 increases the default array size from 2 million
 78 |  *          elements to 10 million elements in response to the increasing
 79 |  *          size of L3 caches.  The new default size is large enough for caches
 80 |  *          up to 20 MB.
 81 |  *      Version 5.10 changes the loop index variables from "register int"
 82 |  *          to "ssize_t", which allows array indices >2^32 (4 billion)
 83 |  *          on properly configured 64-bit systems.  Additional compiler options
 84 |  *          (such as "-mcmodel=medium") may be required for large memory runs.
 85 |  *
 86 |  *      Array size can be set at compile time without modifying the source
 87 |  *          code for the (many) compilers that support preprocessor definitions
 88 |  *          on the compile line.  E.g.,
 89 |  *                gcc -O -DSTREAM_ARRAY_SIZE=100000000 stream.c -o stream.100M
 90 |  *          will override the default size of 10M with a new size of 100M elements
 91 |  *          per array.
 92 |  */
 93 | #ifndef STREAM_ARRAY_SIZE
 94 | #   define STREAM_ARRAY_SIZE	10000000
 95 | #endif
 96 | 
 97 | /*  2) STREAM runs each kernel "NTIMES" times and reports the *best* result
 98 |  *         for any iteration after the first, therefore the minimum value
 99 |  *         for NTIMES is 2.
100 |  *      There are no rules on maximum allowable values for NTIMES, but
101 |  *         values larger than the default are unlikely to noticeably
102 |  *         increase the reported performance.
103 |  *      NTIMES can also be set on the compile line without changing the source
104 |  *         code using, for example, "-DNTIMES=7".
105 |  */
106 | #ifdef NTIMES
107 | #if NTIMES<=1
108 | #   define NTIMES	10
109 | #endif
110 | #endif
111 | #ifndef NTIMES
112 | #   define NTIMES	10
113 | #endif
114 | 
115 | /*  Users are allowed to modify the "OFFSET" variable, which *may* change the
116 |  *         relative alignment of the arrays (though compilers may change the
117 |  *         effective offset by making the arrays non-contiguous on some systems).
118 |  *      Use of non-zero values for OFFSET can be especially helpful if the
119 |  *         STREAM_ARRAY_SIZE is set to a value close to a large power of 2.
120 |  *      OFFSET can also be set on the compile line without changing the source
121 |  *         code using, for example, "-DOFFSET=56".
122 |  */
123 | #ifndef OFFSET
124 | #   define OFFSET	0
125 | #endif
126 | 
127 | /*
128 |  *	3) Compile the code with optimization.  Many compilers generate
129 |  *       unreasonably bad code before the optimizer tightens things up.
130 |  *     If the results are unreasonably good, on the other hand, the
131 |  *       optimizer might be too smart for me!
132 |  *
133 |  *     For a simple single-core version, try compiling with:
134 |  *            cc -O stream.c -o stream
135 |  *     This is known to work on many, many systems....
136 |  *
137 |  *     To use multiple cores, you need to tell the compiler to obey the OpenMP
138 |  *       directives in the code.  This varies by compiler, but a common example is
139 |  *            gcc -O -fopenmp stream.c -o stream_omp
140 |  *       The environment variable OMP_NUM_THREADS allows runtime control of the
141 |  *         number of threads/cores used when the resulting "stream_omp" program
142 |  *         is executed.
143 |  *
144 |  *     To run with single-precision variables and arithmetic, simply add
145 |  *         -DSTREAM_TYPE=float
146 |  *     to the compile line.
147 |  *     Note that this changes the minimum array sizes required --- see (1) above.
148 |  *
149 |  *     The preprocessor directive "TUNED" does not do much -- it simply causes the
150 |  *       code to call separate functions to execute each kernel.  Trivial versions
151 |  *       of these functions are provided, but they are *not* tuned -- they just
152 |  *       provide predefined interfaces to be replaced with tuned code.
153 |  *
154 |  *
155 |  *	4) Optional: Mail the results to mccalpin@cs.virginia.edu
156 |  *	   Be sure to include info that will help me understand:
157 |  *		a) the computer hardware configuration (e.g., processor model, memory type)
158 |  *		b) the compiler name/version and compilation flags
159 |  *      c) any run-time information (such as OMP_NUM_THREADS)
160 |  *		d) all of the output from the test case.
161 |  *
162 |  * Thanks!
163 |  *
164 |  *-----------------------------------------------------------------------*/
165 | 
166 | # define HLINE "-------------------------------------------------------------\n"
167 | 
168 | # ifndef MIN
169 | # define MIN(x,y) ((x)<(y)?(x):(y))
170 | # endif
171 | # ifndef MAX
172 | # define MAX(x,y) ((x)>(y)?(x):(y))
173 | # endif
174 | 
175 | #ifndef STREAM_TYPE
176 | #define STREAM_TYPE double
177 | #endif
178 | 
179 | static STREAM_TYPE	a[STREAM_ARRAY_SIZE+OFFSET],
180 | 			b[STREAM_ARRAY_SIZE+OFFSET],
181 | 			c[STREAM_ARRAY_SIZE+OFFSET];
182 | 
183 | static double	avgtime[4] = {0}, maxtime[4] = {0},
184 | 		mintime[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
185 | 
186 | static char	*label[4] = {"Copy:      ", "Scale:     ",
187 |     "Add:       ", "Triad:     "};
188 | 
189 | static double	bytes[4] = {
190 |     2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
191 |     2 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
192 |     3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE,
193 |     3 * sizeof(STREAM_TYPE) * STREAM_ARRAY_SIZE
194 |     };
195 | 
196 | extern double mysecond();
197 | extern void checkSTREAMresults();
198 | #ifdef TUNED
199 | extern void tuned_STREAM_Copy();
200 | extern void tuned_STREAM_Scale(STREAM_TYPE scalar);
201 | extern void tuned_STREAM_Add();
202 | extern void tuned_STREAM_Triad(STREAM_TYPE scalar);
203 | #endif
204 | #ifdef _OPENMP
205 | extern int omp_get_num_threads();
206 | #endif
207 | int
208 | main()
209 |     {
210 |     int			quantum, checktick();
211 |     int			BytesPerWord;
212 |     int			k;
213 |     ssize_t		j;
214 |     STREAM_TYPE		scalar;
215 |     double		t, times[4][NTIMES];
216 | 
217 |     /* --- SETUP --- determine precision and check timing --- */
218 | 
219 |     printf(HLINE);
220 |     printf("STREAM version $Revision: 5.10 $\n");
221 |     printf(HLINE);
222 |     BytesPerWord = sizeof(STREAM_TYPE);
223 |     printf("This system uses %d bytes per array element.\n",
224 | 	BytesPerWord);
225 | 
226 |     printf(HLINE);
227 | #ifdef N
228 |     printf("*****  WARNING: ******\n");
229 |     printf("      It appears that you set the preprocessor variable N when compiling this code.\n");
230 |     printf("      This version of the code uses the preprocesor variable STREAM_ARRAY_SIZE to control the array size\n");
231 |     printf("      Reverting to default value of STREAM_ARRAY_SIZE=%llu\n",(unsigned long long) STREAM_ARRAY_SIZE);
232 |     printf("*****  WARNING: ******\n");
233 | #endif
234 | 
235 |     printf("Array size = %llu (elements), Offset = %d (elements)\n" , (unsigned long long) STREAM_ARRAY_SIZE, OFFSET);
236 |     printf("Memory per array = %.1f MiB (= %.1f GiB).\n",
237 | 	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0),
238 | 	BytesPerWord * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.0/1024.0));
239 |     printf("Total memory required = %.1f MiB (= %.1f GiB).\n",
240 | 	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024.),
241 | 	(3.0 * BytesPerWord) * ( (double) STREAM_ARRAY_SIZE / 1024.0/1024./1024.));
242 |     printf("Each kernel will be executed %d times.\n", NTIMES);
243 |     printf(" The *best* time for each kernel (excluding the first iteration)\n");
244 |     printf(" will be used to compute the reported bandwidth.\n");
245 | 
246 | #ifdef _OPENMP
247 |     printf(HLINE);
248 | #pragma omp parallel
249 |     {
250 | #pragma omp master
251 | 	{
252 | 	    k = omp_get_num_threads();
253 | 	    printf ("Number of Threads requested = %i\n",k);
254 |         }
255 |     }
256 | #endif
257 | 
258 | #ifdef _OPENMP
259 | 	k = 0;
260 | #pragma omp parallel
261 | #pragma omp atomic
262 | 		k++;
263 |     printf ("Number of Threads counted = %i\n",k);
264 | #endif
265 | 
266 |     /* Get initial value for system clock. */
267 | #pragma omp parallel for
268 |     for (j=0; j<STREAM_ARRAY_SIZE; j++) {
269 | 	    a[j] = 1.0;
270 | 	    b[j] = 2.0;
271 | 	    c[j] = 0.0;
272 | 	}
273 | 
274 |     printf(HLINE);
275 | 
276 |     if  ( (quantum = checktick()) >= 1)
277 | 	printf("Your clock granularity/precision appears to be "
278 | 	    "%d microseconds.\n", quantum);
279 |     else {
280 | 	printf("Your clock granularity appears to be "
281 | 	    "less than one microsecond.\n");
282 | 	quantum = 1;
283 |     }
284 | 
285 |     t = mysecond();
286 | #pragma omp parallel for
287 |     for (j = 0; j < STREAM_ARRAY_SIZE; j++)
288 | 		a[j] = 2.0E0 * a[j];
289 |     t = 1.0E6 * (mysecond() - t);
290 | 
291 |     printf("Each test below will take on the order"
292 | 	" of %d microseconds.\n", (int) t  );
293 |     printf("   (= %d clock ticks)\n", (int) (t/quantum) );
294 |     printf("Increase the size of the arrays if this shows that\n");
295 |     printf("you are not getting at least 20 clock ticks per test.\n");
296 | 
297 |     printf(HLINE);
298 | 
299 |     printf("WARNING -- The above is only a rough guideline.\n");
300 |     printf("For best results, please be sure you know the\n");
301 |     printf("precision of your system timer.\n");
302 |     printf(HLINE);
303 | 
304 |     /*	--- MAIN LOOP --- repeat test cases NTIMES times --- */
305 | 
306 |     scalar = 3.0;
307 |     for (k=0; k<NTIMES; k++)
308 | 	{
309 | 	times[0][k] = mysecond();
310 | #ifdef TUNED
311 |         tuned_STREAM_Copy();
312 | #else
313 | #pragma omp parallel for
314 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
315 | 	    c[j] = a[j];
316 | #endif
317 | 	times[0][k] = mysecond() - times[0][k];
318 | 
319 | 	times[1][k] = mysecond();
320 | #ifdef TUNED
321 |         tuned_STREAM_Scale(scalar);
322 | #else
323 | #pragma omp parallel for
324 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
325 | 	    b[j] = scalar*c[j];
326 | #endif
327 | 	times[1][k] = mysecond() - times[1][k];
328 | 
329 | 	times[2][k] = mysecond();
330 | #ifdef TUNED
331 |         tuned_STREAM_Add();
332 | #else
333 | #pragma omp parallel for
334 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
335 | 	    c[j] = a[j]+b[j];
336 | #endif
337 | 	times[2][k] = mysecond() - times[2][k];
338 | 
339 | 	times[3][k] = mysecond();
340 | #ifdef TUNED
341 |         tuned_STREAM_Triad(scalar);
342 | #else
343 | #pragma omp parallel for
344 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
345 | 	    a[j] = b[j]+scalar*c[j];
346 | #endif
347 | 	times[3][k] = mysecond() - times[3][k];
348 | 	}
349 | 
350 |     /*	--- SUMMARY --- */
351 | 
352 |     for (k=1; k<NTIMES; k++) /* note -- skip first iteration */
353 | 	{
354 | 	for (j=0; j<4; j++)
355 | 	    {
356 | 	    avgtime[j] = avgtime[j] + times[j][k];
357 | 	    mintime[j] = MIN(mintime[j], times[j][k]);
358 | 	    maxtime[j] = MAX(maxtime[j], times[j][k]);
359 | 	    }
360 | 	}
361 | 
362 |     printf("Function    Average Rate MB/s  Avg time     Min time     Max time\n");
363 |     for (j=0; j<4; j++) {
364 | 		avgtime[j] = avgtime[j]/(double)(NTIMES-1);
365 | 
366 | 		printf("%s%12.1f  %11.6f  %11.6f  %11.6f\n", label[j],
367 | 	       1.0E-06 * bytes[j]/avgtime[j],
368 | 	       avgtime[j],
369 | 	       mintime[j],
370 | 	       maxtime[j]);
371 |     }
372 |     printf(HLINE);
373 | 
374 |     /* --- Check Results --- */
375 |     checkSTREAMresults();
376 |     printf(HLINE);
377 | 
378 |     return 0;
379 | }
380 | 
381 | # define	M	20
382 | 
383 | int
384 | checktick()
385 |     {
386 |     int		i, minDelta, Delta;
387 |     double	t1, t2, timesfound[M];
388 | 
389 | /*  Collect a sequence of M unique time values from the system. */
390 | 
391 |     for (i = 0; i < M; i++) {
392 | 	t1 = mysecond();
393 | 	while( ((t2=mysecond()) - t1) < 1.0E-6 )
394 | 	    ;
395 | 	timesfound[i] = t1 = t2;
396 | 	}
397 | 
398 | /*
399 |  * Determine the minimum difference between these M values.
400 |  * This result will be our estimate (in microseconds) for the
401 |  * clock granularity.
402 |  */
403 | 
404 |     minDelta = 1000000;
405 |     for (i = 1; i < M; i++) {
406 | 	Delta = (int)( 1.0E6 * (timesfound[i]-timesfound[i-1]));
407 | 	minDelta = MIN(minDelta, MAX(Delta,0));
408 | 	}
409 | 
410 |    return(minDelta);
411 |     }
412 | 
413 | 
414 | 
415 | /* A gettimeofday routine to give access to the wall
416 |    clock timer on most UNIX-like systems.  */
417 | 
418 | #include <sys/time.h>
419 | 
420 | double mysecond()
421 | {
422 |         struct timeval tp;
423 |         struct timezone tzp;
424 |         int i;
425 | 
426 |         i = gettimeofday(&tp,&tzp);
427 |         return ( (double) tp.tv_sec + (double) tp.tv_usec * 1.e-6 );
428 | }
429 | 
430 | #ifndef abs
431 | #define abs(a) ((a) >= 0 ? (a) : -(a))
432 | #endif
433 | void checkSTREAMresults ()
434 | {
435 | 	STREAM_TYPE aj,bj,cj,scalar;
436 | 	STREAM_TYPE aSumErr,bSumErr,cSumErr;
437 | 	STREAM_TYPE aAvgErr,bAvgErr,cAvgErr;
438 | 	double epsilon;
439 | 	ssize_t	j;
440 | 	int	k,ierr,err;
441 | 
442 |     /* reproduce initialization */
443 | 	aj = 1.0;
444 | 	bj = 2.0;
445 | 	cj = 0.0;
446 |     /* a[] is modified during timing check */
447 | 	aj = 2.0E0 * aj;
448 |     /* now execute timing loop */
449 | 	scalar = 3.0;
450 | 	for (k=0; k<NTIMES; k++)
451 |         {
452 |             cj = aj;
453 |             bj = scalar*cj;
454 |             cj = aj+bj;
455 |             aj = bj+scalar*cj;
456 |         }
457 | 
458 |     /* accumulate deltas between observed and expected results */
459 | 	aSumErr = 0.0;
460 | 	bSumErr = 0.0;
461 | 	cSumErr = 0.0;
462 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++) {
463 | 		aSumErr += abs(a[j] - aj);
464 | 		bSumErr += abs(b[j] - bj);
465 | 		cSumErr += abs(c[j] - cj);
466 | 		// if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj);	// MCCALPIN
467 | 	}
468 | 	aAvgErr = aSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
469 | 	bAvgErr = bSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
470 | 	cAvgErr = cSumErr / (STREAM_TYPE) STREAM_ARRAY_SIZE;
471 | 
472 | 	if (sizeof(STREAM_TYPE) == 4) {
473 | 		epsilon = 1.e-6;
474 | 	}
475 | 	else if (sizeof(STREAM_TYPE) == 8) {
476 | 		epsilon = 1.e-13;
477 | 	}
478 | 	else {
479 | 		printf("WEIRD: sizeof(STREAM_TYPE) = %lu\n",sizeof(STREAM_TYPE));
480 | 		epsilon = 1.e-6;
481 | 	}
482 | 
483 | 	err = 0;
484 | 	if (abs(aAvgErr/aj) > epsilon) {
485 | 		err++;
486 | 		printf ("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
487 | 		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",aj,aAvgErr,abs(aAvgErr)/aj);
488 | 		ierr = 0;
489 | 		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
490 | 			if (abs(a[j]/aj-1.0) > epsilon) {
491 | 				ierr++;
492 | #ifdef VERBOSE
493 | 				if (ierr < 10) {
494 | 					printf("         array a: index: %ld, expected: %e, observed: %e, relative error: %e\n",
495 | 						j,aj,a[j],abs((aj-a[j])/aAvgErr));
496 | 				}
497 | #endif
498 | 			}
499 | 		}
500 | 		printf("     For array a[], %d errors were found.\n",ierr);
501 | 	}
502 | 	if (abs(bAvgErr/bj) > epsilon) {
503 | 		err++;
504 | 		printf ("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
505 | 		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",bj,bAvgErr,abs(bAvgErr)/bj);
506 | 		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
507 | 		ierr = 0;
508 | 		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
509 | 			if (abs(b[j]/bj-1.0) > epsilon) {
510 | 				ierr++;
511 | #ifdef VERBOSE
512 | 				if (ierr < 10) {
513 | 					printf("         array b: index: %ld, expected: %e, observed: %e, relative error: %e\n",
514 | 						j,bj,b[j],abs((bj-b[j])/bAvgErr));
515 | 				}
516 | #endif
517 | 			}
518 | 		}
519 | 		printf("     For array b[], %d errors were found.\n",ierr);
520 | 	}
521 | 	if (abs(cAvgErr/cj) > epsilon) {
522 | 		err++;
523 | 		printf ("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n",epsilon);
524 | 		printf ("     Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n",cj,cAvgErr,abs(cAvgErr)/cj);
525 | 		printf ("     AvgRelAbsErr > Epsilon (%e)\n",epsilon);
526 | 		ierr = 0;
527 | 		for (j=0; j<STREAM_ARRAY_SIZE; j++) {
528 | 			if (abs(c[j]/cj-1.0) > epsilon) {
529 | 				ierr++;
530 | #ifdef VERBOSE
531 | 				if (ierr < 10) {
532 | 					printf("         array c: index: %ld, expected: %e, observed: %e, relative error: %e\n",
533 | 						j,cj,c[j],abs((cj-c[j])/cAvgErr));
534 | 				}
535 | #endif
536 | 			}
537 | 		}
538 | 		printf("     For array c[], %d errors were found.\n",ierr);
539 | 	}
540 | 	if (err == 0) {
541 | 		printf ("Solution Validates: avg error less than %e on all three arrays\n",epsilon);
542 | 	}
543 | #ifdef VERBOSE
544 | 	printf ("Results Validation Verbose Results: \n");
545 | 	printf ("    Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj);
546 | 	printf ("    Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]);
547 | 	printf ("    Rel Errors on a, b, c:     %e %e %e \n",abs(aAvgErr/aj),abs(bAvgErr/bj),abs(cAvgErr/cj));
548 | #endif
549 | }
550 | 
551 | #ifdef TUNED
552 | /* stubs for "tuned" versions of the kernels */
553 | void tuned_STREAM_Copy()
554 | {
555 | 	ssize_t j;
556 | #pragma omp parallel for
557 |         for (j=0; j<STREAM_ARRAY_SIZE; j++)
558 |             c[j] = a[j];
559 | }
560 | 
561 | void tuned_STREAM_Scale(STREAM_TYPE scalar)
562 | {
563 | 	ssize_t j;
564 | #pragma omp parallel for
565 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
566 | 	    b[j] = scalar*c[j];
567 | }
568 | 
569 | void tuned_STREAM_Add()
570 | {
571 | 	ssize_t j;
572 | #pragma omp parallel for
573 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
574 | 	    c[j] = a[j]+b[j];
575 | }
576 | 
577 | void tuned_STREAM_Triad(STREAM_TYPE scalar)
578 | {
579 | 	ssize_t j;
580 | #pragma omp parallel for
581 | 	for (j=0; j<STREAM_ARRAY_SIZE; j++)
582 | 	    a[j] = b[j]+scalar*c[j];
583 | }
584 | /* end of stubs for the "tuned" versions of the kernels */
585 | #endif
586 | 


--------------------------------------------------------------------------------
/scheduler.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | """Scheduler connects to all servers"""
  3 | 
  4 | from __future__ import print_function
  5 | import logging
  6 | import argparse
  7 | import random
  8 | import time
  9 | import functools
 10 | import json
 11 | import statistics
 12 | 
 13 | import grpc
 14 | 
 15 | from protocol import protocol_pb2
 16 | from protocol import protocol_pb2_grpc
 17 | 
 18 | import lib.workloads
 19 | 
 20 | MAIN_LOOP_SLEEP = 0.1 # 100 ms
 21 | SAMPLE_FREQ = 1 * 1000  # 1 secs in ms
 22 | 
 23 | class Scheduler:
 24 |     def __init__(self, args, variable_ratios):
 25 |         # Instantiate Servers
 26 |         self.wid = 0
 27 |         self.servers = []
 28 |         random.seed(args.seed)
 29 |         self.schedule = self.get_schedule(args.size, args.until,
 30 |                                           args.workload, args.ratios,
 31 |                                           variable_ratios, args.start_burst)
 32 |         self.pending = [] # Workloads that have arrived but haven't been scheduled
 33 |         self.executing = {} # Currently executing workloads
 34 |         self.finished = [] # Workloads that have finished executing
 35 | 
 36 |         self.remotemem = args.remotemem
 37 |         self.max_far_mem = args.max_far
 38 |         self.base_time = time.time()
 39 |         
 40 |         for addr in sorted(args.servers):
 41 |             self.servers.append(Server(addr, args.remotemem, args.cpus, args.mem,
 42 |                                   args.uniform_ratio, variable_ratios,
 43 |                                   args.max_far, args.optimal))
 44 |         
 45 |         self.original_servers = list(self.servers) # Retain the original ordering for later shuffling operations
 46 | 
 47 |         print("Size of workload is {}; max arrival time is {} seconds".format(
 48 |                     args.size, args.until))
 49 | 
 50 |     
 51 |     def get_schedule(self, size, max_arrival, workloads, ratios, variable_ratios, start_burst):
 52 |         
 53 |         assert len(workloads) == len(ratios)
 54 | 
 55 |         ratios = list(map(int, ratios))
 56 |         
 57 |         # this is what a ratio of 1 corresponds to
 58 |         unit = size / sum(ratios)
 59 | 
 60 |         schedule = []
 61 |         wid = 0
 62 | 
 63 |         for workload_name, ratio in zip(workloads, ratios):
 64 |             times = int(unit * ratio)
 65 |             print("schedule will have {} {} times".format(workload_name, times))
 66 |             
 67 |             workload_class = lib.workloads.get_workload_class(workload_name)
 68 |             cpu_req = workload_class.cpu_req
 69 |             ideal_mem = workload_class.ideal_mem
 70 |             
 71 |             # Use user-provided min_mem if applicable
 72 |             if workload_name in variable_ratios:
 73 |                 min_mem = variable_ratios[workload_name] * ideal_mem
 74 |             else:
 75 |                 min_mem = workload_class.min_mem
 76 | 
 77 |             # Add 'times' instances of this workload to the scheduler     
 78 |             for _ in range(times):
 79 |                 schedule.append(SchedWorkload(workload_name, wid, cpu_req, ideal_mem,
 80 |                                             max_arrival, min_mem))
 81 |                 wid += 1
 82 | 
 83 |         schedule.sort(key=lambda x: x.ts_arrival)
 84 | 
 85 |         # Change the arrival of the first 'start_bust' workloads to 0
 86 |         if start_burst > 0:
 87 |             for idx in range(start_burst):
 88 |                 schedule[idx].ts_arrival = 0
 89 |         return schedule
 90 | 
 91 |     def update_resources(self):
 92 |         for s in self.servers:
 93 |             s.get_resources()
 94 | 
 95 |     def find_server_fits(self, workload):
 96 |         if not self.servers:
 97 |             return None
 98 | 
 99 |         # first try to fit the workload normally
100 |         for s in self.servers:
101 |             if s.fits_normally(workload):
102 |                 return s
103 | 
104 |         # normal placement didn't work, are we using remote memory?
105 |         if not self.remotemem:
106 |             return None
107 | 
108 |         # we are using remote memory. for every server, check if we
109 |         # can fit it using remote mem
110 |         total_far_mem = sum(max(0, ss.alloc_mem - ss.total_mem) for ss in self.servers)
111 |         for s in self.servers:
112 |             if s.fits_remotemem(workload, self.max_far_mem, total_far_mem):
113 |                 return s
114 | 
115 |         return None
116 | 
117 |     def start_schedule(self):
118 |         print("Will execute {} tasks.".format(len(self.schedule)))
119 | 
120 |         while self.schedule or self.pending or self.executing:
121 | 
122 |             ''' Update the Server instances with the latest resource information from their
123 |                 server.py counterparts'''
124 |             self.update_resources()
125 | 
126 |             # move workloads from schedule to pendingq when they arrive
127 |             if self.schedule:
128 |                 self.schedule = self.move_to_pending()
129 | 
130 |             # move from pendingq to executing when we place them on a server
131 |             if self.pending:
132 |                 successfully_executed = self.exec_one()
133 |                 if successfully_executed:
134 |                     self.servers = list(self.original_servers)
135 |                     random.shuffle(self.servers)
136 | 
137 |             if not self.pending and self.schedule:
138 |                 pass
139 | 
140 |             # move from executing to finishq when they finish execution
141 |             if self.executing:
142 |                 self.check_finished()
143 | 
144 |             time.sleep(MAIN_LOOP_SLEEP)
145 | 
146 |         return self.finished
147 | 
148 |     def exec_one(self):
149 |         """ check if any machine fits the workload.
150 |         each server can fit one new workload per exec_one() call. """
151 |         servers = list(self.servers)
152 |         futures = [] # list of tuples (future, workload, server)
153 | 
154 |         def execute_done(future, base_time, workload, executing, server):
155 |             assert future.result().success
156 |             workload.ts_sent = time.time() - base_time
157 |             print("Sent {} to {}".format(workload.get_name(), server.name))
158 |             executing[workload.idd] = workload
159 | 
160 |         for workload in list(self.pending):
161 |             s = self.find_server_fits(workload)
162 |             if s:
163 |                 future = s.execute_future(workload)
164 |                 futures.append((future, workload, s))
165 |                 self.pending.remove(workload)
166 |                 servers.remove(s)
167 |                 future.add_done_callback(functools.partial(execute_done,
168 |                             base_time=self.base_time, workload=workload, executing=self.executing, server=s))
169 |                 return True
170 |         return False
171 | 
172 |     def move_to_pending(self):
173 |         """ returns a new scheduleq with the workloads that couldn't be
174 |         scheduled"""
175 |         elapsed = time.time() - self.base_time
176 | 
177 |         new_schedule = []
178 |         for workload in self.schedule:
179 |             if workload.ts_arrival <= elapsed:
180 |                 self.pending.append(workload)
181 |                 print("{} arrived".format(workload.name + str(workload.idd)))
182 |             else:
183 |                 new_schedule.append(workload)
184 | 
185 |         return new_schedule
186 | 
187 |     def check_finished(self):
188 |         for s in self.servers:
189 |             finish_times, start_times = s.get_finished()
190 |             for idd in finish_times.keys():
191 |                 workload = self.executing[idd]
192 |                 workload.ts_start = start_times[idd]
193 |                 workload.ts_finish = finish_times[idd]
194 |                 self.finished.append(workload)
195 |                 del self.executing[idd]
196 |         
197 | 
198 | 
199 | class SchedWorkload:
200 |     def __init__(self, name, idd, cpu_req, mem_req, max_arrival, min_mem):
201 |         self.name = name
202 |         self.idd = idd
203 |         self.cpu_req = cpu_req
204 |         self.mem_req = mem_req
205 |         self.min_mem = min_mem
206 | 
207 |         self.ts_arrival = random.uniform(0, max_arrival)
208 |         self.ts_sent = 0
209 |         self.ts_start = 0
210 |         self.ts_finish = 0
211 | 
212 |     def get_name(self):
213 |         return self.name + str(self.idd)
214 | 
215 |     def get_duration(self):
216 |         return self.ts_finish - self.ts_start
217 | 
218 |     def get_jct(self):
219 |         return self.ts_finish - self.ts_arrival
220 | 
221 | 
222 | class Server:
223 |     def __init__(self, addr, remotemem, max_cpus, max_mem,
224 |                  uniform_ratio, variable_ratios,
225 |                  max_far, optimal):
226 |         self.channel = grpc.insecure_channel(addr)
227 |         self.stub = protocol_pb2_grpc.SchedulerStub(self.channel)
228 |         self.checkin(remotemem, max_cpus, max_mem, uniform_ratio,
229 |                      variable_ratios, max_far > 0, optimal)
230 |         self.addr = addr
231 | 
232 |         print("connected to server={}".format(self.name))
233 | 
234 |     def __del__(self):
235 |         self.close()
236 | 
237 |     def checkin(self, remotemem, max_cpus, max_mem,
238 |                 uniform_ratio, variable_ratios,
239 |                 limit_remote_mem, optimal):
240 |         """ returns the server name if successful """
241 |         
242 |         self.remotemem = remotemem
243 |         self.free_cpus = max_cpus
244 |         self.total_cpus = max_cpus
245 |         self.free_mem = max_mem
246 |         self.total_mem = max_mem
247 |         self.uniform_ratio = uniform_ratio
248 |         self.variable_ratios = variable_ratios
249 |         self.uniform_ratio = uniform_ratio
250 | 
251 |         req = protocol_pb2.CheckinReq(use_remote_mem=remotemem,
252 |                                       max_cpus=max_cpus,
253 |                                       max_mem=max_mem,
254 |                                       uniform_ratio=uniform_ratio,
255 |                                       variable_ratios=variable_ratios,
256 |                                       limit_remote_mem=limit_remote_mem,
257 |                                       optimal=optimal)
258 |         reply = self.stub.checkin(req)
259 |         if not reply.success:
260 |             raise RuntimeError("Not enough memory or cpus")
261 | 
262 |         self.name = reply.server_name
263 | 
264 | 
265 |     def close(self):
266 |         req = protocol_pb2.ShutdownReq()
267 |         _ = self.stub.shutdown(req)
268 |         self.channel.close()
269 | 
270 |     def execute_future(self, workload):
271 |         """ returns a future of the execution request """
272 |         req = protocol_pb2.ExecuteReq(wname=workload.name, idd=workload.idd)
273 |         return self.stub.execute.future(req)
274 | 
275 |     def get_resources(self):
276 |         req = protocol_pb2.GetResourcesReq()
277 |         reply = self.stub.get_resources(req)
278 |         self.free_cpus = reply.free_cpus
279 |         self.alloc_mem = reply.alloc_mem
280 |         self.min_mem_sum = reply.min_mem_sum
281 | 
282 |     def fits_farmem_uniform(self, w, max_far_mem, total_far_mem):
283 |         """ assumes everything from fits_remotemem() plus the workload
284 |         fits in cpus """
285 |         local_alloc_mem = self.alloc_mem + w.mem_req
286 |         local_ratio = min(1, self.total_mem / local_alloc_mem)
287 |         if local_ratio < self.uniform_ratio:
288 |             return False
289 | 
290 |         # check if (1 - local_ratio) that makes the incoming job fit results in
291 |         # a far memory usage above the max
292 |         if max_far_mem > 0:
293 |             additional_far_mem = (1 - local_ratio) * w.mem_req
294 |             if additional_far_mem + total_far_mem > max_far_mem:
295 |                 return False
296 |         return True
297 | 
298 |     def fits_farmem_variable(self, w, max_far_mem, total_far_mem):
299 |         local_min_mem_sum = self.min_mem_sum + w.min_mem
300 |         if local_min_mem_sum > self.total_mem:
301 |             return False
302 | 
303 |         if max_far_mem > 0:
304 |             curr_far_mem = max(0, self.alloc_mem - self.total_mem)
305 |             if curr_far_mem > 0:
306 |                 additional_far_mem = w.mem_req
307 |             else:
308 |                 additional_far_mem = max(0, w.mem_req + self.alloc_mem - self.total_mem)
309 | 
310 |             if total_far_mem + additional_far_mem > max_far_mem:
311 |                 return False
312 |         return True
313 | 
314 | 
315 |     def fits_remotemem(self, w, max_far_mem, total_far_mem):
316 |         """ assumes the workload didn't fit normally, try to fit it with
317 |         remote memory. we only want to determine whether the workload fits,
318 |         but will let the server compute its own ratio (to avoid consistency
319 |         issues).
320 |         others_far_mem is the far memory in use minus far memory used
321 |         by this server. """
322 |         if not self.fits_cpu_remote(w):
323 |             return False
324 | 
325 |         if self.uniform_ratio:
326 |             return self.fits_farmem_uniform(w, max_far_mem, total_far_mem)
327 | 
328 |         # Variable Policy
329 |         return self.fits_farmem_variable(w, max_far_mem, total_far_mem)
330 | 
331 | 
332 |     def fits_normally(self, w):
333 |         free_mem = self.total_mem - self.alloc_mem
334 |         return self.fits_cpu(w) and free_mem >= w.mem_req
335 | 
336 |     def fits_cpu(self, w):
337 |         return self.free_cpus >= w.cpu_req
338 | 
339 |     def fits_cpu_remote(self, w):
340 |         return self.free_cpus - 1 >= w.cpu_req
341 | 
342 |     def get_finished(self):
343 |         req = protocol_pb2.GetFinishedReq()
344 |         finished = self.stub.get_finished(req)
345 |         return (finished.finished_times, finished.start_times)
346 | 
347 |     def get_samples(self):
348 |         req = protocol_pb2.GetSamplesReq()
349 |         samples = self.stub.get_samples(req)
350 |         return samples
351 | 
352 | 
353 | def print_finished_stats(finishq, base_time):
354 |     print("\nfinished {} workloads".format(len(finishq)))
355 |     latest_finish = max(map(lambda w: w.ts_finish, finishq))
356 |     print("makespan={}".format(round(latest_finish, 3)))
357 |     print("\nName,Arrival,Start,Finish")
358 |     for workload in sorted(finishq, key=lambda w: w.get_name()):
359 |         print("{},{},{},{}".format(workload.get_name(),
360 |                                    round(workload.ts_arrival, 3),
361 |                                    round(workload.ts_sent, 3),
362 |                                    round(workload.ts_finish, 3)))
363 | 
364 | def average_samples_by_time(sample_list): # Takes in a list of lists
365 |     # '*' unpacks an iterable into multiple args for a function
366 |     tuples_by_time = zip(*sample_list)
367 | 
368 |     # Compute the mean for each time step
369 |     means = map(statistics.mean, tuples_by_time)
370 | 
371 |     return means
372 | 
373 | def sum_samples_by_time(sample_list): # Takes in a list of lists
374 |     # '*' unpacks an iterable into multiple args for a function
375 |     tuples_by_time = zip(*sample_list)
376 | 
377 |     # Compute the mean for each time step
378 |     sums = map(sum, tuples_by_time)
379 | 
380 |     return sums
381 | 
382 | def combine_samples(servers):
383 |     mem_samples = list()
384 |     cpu_samples = list()
385 |     swap_samples = dict()
386 |     bw_in_samples = dict()
387 |     bw_out_samples = dict()
388 |     bytes_in_samples = list()
389 |     bytes_out_samples = list()
390 |     curr_pages_samples = dict()
391 | 
392 |     # Compose of list of lists
393 |     for s in servers:
394 |         samples = s.get_samples()
395 |         mem_samples.append(samples.mem_util)
396 |         cpu_samples.append(samples.cpu_util)
397 |         swap_samples[s.addr] = samples.swap_util
398 |         bw_in_samples[s.addr] = samples.bw_in
399 |         bw_out_samples[s.addr] = samples.bw_out
400 |         bytes_in_samples.append(samples.bytes_in)
401 |         bytes_out_samples.append(samples.bytes_out)
402 |         curr_pages_samples[s.addr] = samples.curr_pages
403 | 
404 | 
405 |     # Get the maximum run time
406 |     max_len = max(map(len, mem_samples))
407 | 
408 |     # Padding each list so that they're all the same length
409 |     [lst.extend([0]*(max_len - len(lst))) for lst in mem_samples]
410 |     [lst.extend([0]*(max_len - len(lst))) for lst in cpu_samples]
411 | 
412 |     # Averaging the samples at each time step
413 |     mem = average_samples_by_time(mem_samples)
414 |     cpu = average_samples_by_time(cpu_samples)
415 | 
416 |     # Round values
417 |     rounded_mem = map(lambda num: round(num, 3), mem)
418 |     rounded_cpu = map(lambda num: round(num, 3), cpu)
419 |     swap_samples = {s: list(map(lambda num: round(num, 3), lst)) for s, lst in swap_samples.items()}
420 |     bw_out_samples = {s: list(map(lambda num: round(num, 3), lst)) for s, lst in bw_out_samples.items()}
421 |     bw_in_samples = {s: list(map(lambda num: round(num, 3), lst)) for s, lst in bw_in_samples.items()}
422 |     curr_pages_samples = {s: list(lst) for s, lst in curr_pages_samples.items()}
423 | 
424 |     return (rounded_mem, rounded_cpu, bw_in_samples, bw_out_samples,
425 |             swap_samples, bytes_in_samples, bytes_out_samples, curr_pages_samples)
426 | 
427 | def write_samples_to_file(filename, samples):
428 |     mem, cpu, bw_in, bw_out, swap, bytes_in, bytes_out, curr_pages = samples
429 | 
430 |     with open(filename, 'w') as f:
431 |         combined = zip(mem, cpu)
432 |         combined = [{'Mem':m, 'CPU':c}
433 |                       for m,c in combined]
434 |         numbered = dict(enumerate(combined))
435 |         numbered['bytes in'] = sum(bytes_in)
436 |         numbered['bytes out'] = sum(bytes_out)
437 |         numbered['swap samples'] = swap
438 |         numbered['bw out'] = bw_out
439 |         numbered['bw in'] = bw_in
440 |         numbered['curr_pages'] = curr_pages
441 |         f.write(json.dumps(numbered, indent=4))
442 | 
443 | def generate_filename(args):
444 |     cpus = str(args.cpus)
445 |     mem = str(args.mem)
446 |     size = str(args.size)
447 |     if not args.remotemem:
448 |         policy = "nofar"
449 |     elif args.uniform_ratio:
450 |         policy = "uniform"
451 |     elif args.optimal:
452 |         policy = "optimal"
453 |     else:
454 |         policy = "variable"
455 | 
456 |     filename = 'cpus_{}_mem_{}_size_{}'
457 |     filename = filename.format(cpus, mem, size)
458 |     if args.uniform_ratio != None:
459 |         filename += '_uniform_ratio_{}'.format(args.uniform_ratio)
460 | 
461 |     filename += '_policy_{}'.format(policy)
462 |     cur_time = time.localtime()
463 |     time_string = '_{}-{}-{}:{}:{}:{}'.format(cur_time.tm_year, cur_time.tm_mon,
464 |                                               cur_time.tm_mday, cur_time.tm_hour,
465 |                                               cur_time.tm_min, cur_time.tm_sec)
466 |     filename += time_string + '.json'
467 |     return filename
468 | 
469 | def check_args(args):
470 |     if not args.remotemem:
471 |         assert(not args.uniform_ratio), "uniform_ratio must be used with remote memory"
472 |         assert(not args.variable_ratios), "variable_ratio must be used with remote memory"
473 |         assert(not args.optimal), "optimal must be used with remote memory"
474 |     else:
475 |         # No two of these three can be active simultaneously
476 |         uniform, variable, optimal = map(bool, (args.uniform_ratio, args.variable_ratios, args.optimal))
477 |         print(uniform, variable, optimal)
478 |         assert(uniform ^ variable ^ optimal),\
479 |                ("You must specify one (and only one) of the following options: "
480 |                 "uniform_ratio, variable_ratio.")
481 | 
482 | 
483 | def main():
484 |     parser = argparse.ArgumentParser()
485 |     parser.add_argument('seed', type=int,
486 |                         help="Used to seed randomization")
487 |     parser.add_argument('servers', type=lambda s: s.split(','),
488 |                         help='comma separated list of servers')
489 |     parser.add_argument('cpus', type=int,
490 |                         help='number of cpus required for each server')
491 |     parser.add_argument('mem', type=int,
492 |                         help='memory required for each server (MB)')
493 |     parser.add_argument('--remotemem', '-r', action='store_true',
494 |                         help='enable remote memory')
495 |     parser.add_argument('--max_far', '-s', type=int, default=0,
496 |                         help='max size of far memory, default=0 (unlimited)')
497 |     parser.add_argument('--size', type=int,
498 |                         help='size of workload (num of tasks) ' \
499 |                         'default=200', default=200)
500 |     parser.add_argument('--workload', type=lambda s: s.split(','),
501 |                         help='tasks that comprise the workload ' \
502 |                         'default=quicksort,kmeans,memaslap',
503 |                         default='quicksort,kmeans,memaslap')
504 |     parser.add_argument('--ratios', type=lambda s: s.split(':'),
505 |                         help='ratios of tasks in workload, default=2:1:1',
506 |                         default="2:1:1")
507 |     parser.add_argument('--until', type=int,
508 |                         help='max arrival time in minutes default=20',
509 |                         default=20)
510 |     parser.add_argument('--uniform_ratio', type=float,
511 |                         help='Smallest allowable memory ratio',
512 |                         default=0)
513 |     parser.add_argument('--variable_ratios', type= lambda s: s.split(','),
514 |                         help='Min ratio for each workload',
515 |                         default=[])
516 |     parser.add_argument('--start_burst', type=int,
517 |                         help='Number of workloads that arrive immediately',
518 |                         default=0)
519 |     parser.add_argument('--optimal', '-o', action='store_true',
520 |                         help='Use the optimal algorithm')
521 | 
522 |     cmdargs = parser.parse_args()
523 | 
524 |     # Check for options that shouldn't be used together
525 |     check_args(cmdargs)
526 | 
527 |     # Put the workload_ratio values in a dictionary with the corresponding name
528 |     if cmdargs.variable_ratios:
529 |         assert len(cmdargs.variable_ratios) == len(cmdargs.workload)
530 |         variable_ratios = map(float, cmdargs.variable_ratios)
531 |         variable_ratios = dict(zip(cmdargs.workload, variable_ratios))
532 |     else:
533 |         variable_ratios = dict()
534 | 
535 |     try:
536 |         scheduler = Scheduler(cmdargs, variable_ratios)
537 |         finished = scheduler.start_schedule()
538 |         filename = generate_filename(cmdargs)
539 |         print_finished_stats(finished, scheduler.base_time)
540 |         samples = combine_samples(scheduler.servers)
541 |         write_samples_to_file(filename, samples)
542 |     except KeyboardInterrupt:
543 |         for s in scheduler.servers[:]:
544 |             del s
545 | 
546 | if __name__ == '__main__':
547 |     logging.basicConfig()
548 |     main()
549 | 


--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | """Server receives connection from scheduler"""
  3 | 
  4 | from concurrent import futures
  5 | import time
  6 | import logging
  7 | import argparse
  8 | import socket
  9 | from tensorflow.python.framework import test_util
 10 | 
 11 | import multiprocessing
 12 | import psutil
 13 | 
 14 | import grpc
 15 | 
 16 | from protocol import protocol_pb2
 17 | from protocol import protocol_pb2_grpc
 18 | 
 19 | from lib import workloads
 20 | 
 21 | import re
 22 | 
 23 | import numpy as np
 24 | from scipy.optimize import Bounds, minimize
 25 | 
 26 | MAIN_LOOP_SLEEP = 1
 27 | DRIVER_PATH = "/sys/class/infiniband/mlx4_0/ports/1/counters/{}"
 28 | MEGABYTE = 1024*1024
 29 | CURR_PAGES_PATH = '/sys/kernel/debug/frontswap/curr_pages'
 30 | SWAPPINESS_PATH = '/proc/sys/vm/swappiness'
 31 | THP_PATH = "/sys/kernel/mm/transparent_hugepage/enabled"
 32 | SOMAXCONN_PATH = "/proc/sys/net/core/somaxconn"
 33 | SWAPPINESS_THRESHOLD = 60
 34 | SWAP_REGEX = re.compile(rb"VmSwap:\s+(\d+)\s+\.*")
 35 | 
 36 | 
 37 | def eq(x,mems,local_mem):
 38 |     return np.dot(x, mems) - local_mem
 39 | 
 40 | def eq_grad(x,mems,local_mem):
 41 |     return mems
 42 | 
 43 | def obj_new(x, ideal_mems, percents, profiles, gradients=None, mem_gradients=None, beta=0):
 44 |     r1 = 0
 45 |     r2 = 0 
 46 |     r3 = 0
 47 |     r4 = 0 
 48 |     for i in range(ideal_mems.shape[0]):
 49 |         r1 += ideal_mems[i]*(1-percents[i])*(x[i]*profiles[i](x[i]) - profiles[i](1))/1000
 50 |         r2 += ideal_mems[i]*(1-percents[i])*(1-x[i])*profiles[i](x[i])/1000
 51 |         r3 += ideal_mems[i]*(1-percents[i])*x[i]*profiles[i](x[i])/1000
 52 |         r4 += ideal_mems[i]*(1-percents[i])*profiles[i](1)/1000
 53 |     return r1/r2 + beta*r3/r4 
 54 | 
 55 | def obj_grad_new(x, ideal_mems, percents, profiles, gradients, mem_gradients, beta=0):
 56 |     r1 = 0
 57 |     r2 = 0 
 58 |     r4 = 0
 59 |     g1 = np.empty(ideal_mems.shape)
 60 |     g2 = np.empty(ideal_mems.shape)
 61 |     for i in range(ideal_mems.shape[0]):
 62 |         r1 += ideal_mems[i]*(1-percents[i])*(x[i]*profiles[i](x[i]) - profiles[i](1))/1000
 63 |         r2 += ideal_mems[i]*(1-percents[i])*(1-x[i])*profiles[i](x[i])/1000
 64 |         r4 += ideal_mems[i]*(1-percents[i])*profiles[i](1)/1000
 65 | 
 66 |         g1[i] = ideal_mems[i]*(1-percents[i])*mem_gradients[i](x[i]) 
 67 |         g2[i] = ideal_mems[i]*(1-percents[i])*(gradients[i](x[i]) - mem_gradients[i](x[i]))
 68 |     
 69 |     grads = np.empty(ideal_mems.shape)
 70 |     for i in range(ideal_mems.shape[0]): 
 71 |         grads[i] = (g1[i]*r2 - r1*g2[i])/r2**2 + beta*g1[i]/r4 # r3 has the same gradient as r1
 72 |     return grads 
 73 | 
 74 | class Machine:
 75 |     def __init__(self):
 76 |         self.total_cpus = 0 # number of cpus this machine can use
 77 |         self.free_cpus = 0
 78 |         self.total_mem = 0 # amount of memory this machine can use
 79 |         self.alloc_mem = 0
 80 |         self.min_mem_sum = 0
 81 |         self.cur_ratio = 1
 82 | 
 83 |         # how much memory we have placed in this machine.
 84 |         # can be > total_mem when using remote memory
 85 |         self.remote_mem = False
 86 |         self.executing = []
 87 |         self.finished = []
 88 |         self.running = False
 89 |         self.shutdown_now = False
 90 |         self.using_remote_mem = False
 91 | 
 92 |         # Sampling
 93 |         self.cpu_samples = []
 94 |         self.mem_samples = []
 95 |         self.swap_samples = []
 96 |         self.bw_in_samples = []
 97 |         self.bw_out_samples = []
 98 |         self.bytes_in_samples = 0
 99 |         self.bytes_out_samples = 0
100 |         self.curr_pages = []
101 | 
102 |         # Bandwidth state
103 |         self.prev_recv = 0
104 |         self.prev_sent = 0
105 | 
106 |         # State for calculating percents
107 |         self.last_time = 0
108 |         self.slow_downs = {}
109 |         for wname in ['quicksort', 'kmeans', 'memaslap', 'linpack', 'spark', 'tf-inception']:
110 |             self.slow_downs[wname] = 1
111 | 
112 |     def checkin(self, max_cpus, max_mem, use_remote, uniform_ratio, variable_ratios, limit_remote_mem, optimal):
113 |         """
114 |         the scheduler checks in with these params.
115 |         we return whether we have enough resources to do the checkin.
116 |         if True, this machine will start executing jobs
117 |         """
118 |         machine_cpus = multiprocessing.cpu_count()
119 |         machine_mem = psutil.virtual_memory().total / 1024 / 1024
120 | 
121 |         if max_cpus > machine_cpus or max_mem > machine_mem:
122 |             logging.info("Checkin Unsuccessful")
123 |             return False
124 | 
125 |         # the checkin used feasible num. of cpus and mem. now initialize
126 |         # the machine resources
127 |         self.total_mem = max_mem
128 |         self.total_cpus = max_cpus
129 |         self.free_cpus = max_cpus
130 |         self.remote_mem = use_remote
131 |         self.uniform_ratio = uniform_ratio
132 |         self.running = True
133 |         self.variable_ratios = variable_ratios
134 |         self.limit_remote_mem = limit_remote_mem
135 |         self.unpinned_cpus = set(range(self.total_cpus))
136 |         self.cpu_assignments = {c: None for c in self.unpinned_cpus}
137 |         self.base_time = time.time()
138 |         self.reclaimer_cpu = self.total_cpus - 1
139 | 
140 |         self.optimal = optimal
141 | 
142 |         if self.remote_mem:
143 |             try:
144 |                 with open(DRIVER_PATH.format("port_xmit_data")) as tx_file:
145 |                     tx_bytes = int(tx_file.read()) * 4
146 |             except FileNotFoundError:
147 |                     tx_bytes = 0
148 |             
149 |             try:       
150 |                 with open(DRIVER_PATH.format("port_rcv_data")) as recv_file:
151 |                     recv_bytes = int(recv_file.read()) * 4
152 |             except FileNotFoundError:
153 |                     recv_bytes = 0
154 |                     
155 |             self.prev_sent = tx_bytes
156 |             self.prev_recv = recv_bytes
157 | 
158 |             logging.info("Initial tx value: {}".format(tx_bytes / MEGABYTE))
159 |             logging.info("Initial recv value: {}".format(tx_bytes / MEGABYTE))
160 | 
161 | 
162 |         #self.check_swappiness()
163 |         self.check_thp()
164 |         self.check_somaxconn()
165 |         self.check_tf_mkl()
166 | 
167 |         logging.info("Checkin Successful")
168 | 
169 |         return True
170 | 
171 |     def check_state(self):
172 |         if self.using_remote_mem:
173 |             if self.alloc_mem <= self.total_mem:
174 |                 self.using_remote_mem = False
175 |                 print("Transitioning to 8 cpus")
176 |         else:
177 |             if self.alloc_mem > self.total_mem:
178 |                 self.using_remote_mem = True
179 |                 print("Transitioning to 7 cpus")
180 | 
181 |     def check_reclaimer_cpu(self): # Check if reclaimer CPU is being used and move workload off of it
182 |         all_cpus = set(range(self.reclaimer_cpu)) # All CPUs except the reclaimer
183 |         pinnable_cpus = self.unpinned_cpus.intersection(all_cpus) # Only the CPUs that aren't executing
184 |         
185 |         ''' We're now using far memory but a workload is executing on
186 |             the reclaimer CPU. Need to move it off.'''
187 |         if self.cpu_assignments[self.reclaimer_cpu]:
188 |             workload_on_reclaimer = self.cpu_assignments[self.reclaimer_cpu]
189 |             pids = workload_on_reclaimer.get_pids() # Potentially offending pids
190 |             replacement_cpu = pinnable_cpus.pop() # Get a replacement CPU
191 |             print("Moving {} off of the reclaimer CPU".format(workload_on_reclaimer.get_name()))
192 |             
193 |             ''' Not just the parent. But the children too'''
194 |             for pid in pids:
195 |                 process = psutil.Process(pid)
196 |                 affinity_list = process.cpu_affinity() # 
197 |                 if self.reclaimer_cpu in affinity_list:
198 |                     print("Moving {} off of the reclaimer CPU and to {}".format(pid, replacement_cpu))
199 |                     new_affinity_list = [cpu for cpu in affinity_list if cpu != self.reclaimer_cpu]
200 |                     new_affinity_list.append(replacement_cpu)
201 |                     process.cpu_affinity(new_affinity_list)
202 |             
203 |             self.cpu_assignments[self.reclaimer_cpu] = None
204 |             self.cpu_assignments[replacement_cpu] = workload_on_reclaimer
205 |             self.unpinned_cpus.remove(replacement_cpu)
206 |             self.unpinned_cpus.add(self.reclaimer_cpu)
207 |             existing_pinned_cpus = set(workload_on_reclaimer.pinned_cpus)
208 |             existing_pinned_cpus.remove(self.reclaimer_cpu)
209 |             existing_pinned_cpus.add(replacement_cpu)
210 |             workload_on_reclaimer.pinned_cpus = existing_pinned_cpus
211 |         
212 |         return pinnable_cpus
213 |         
214 |     def wait_for_swap_to_fall(self):
215 |         start = time.time()
216 |         while True:
217 |             allowed_far = max(0, self.alloc_mem - self.total_mem)
218 |             allowed_far = 1024 if allowed_far == 0 else allowed_far
219 |             far_mem = self.get_swap()
220 |             print("allowed_far={} far_mem={}".format(allowed_far, far_mem))
221 | 
222 |             if far_mem <= allowed_far or far_mem < 32:
223 |                 break
224 | 
225 |             if time.time() - start > 20:
226 |                 print("waited for 20 seconds. let it go")
227 |                 break
228 | 
229 |             print("wait for swap usage to go down")
230 |             time.sleep(0.5)
231 |         end = time.time()
232 |         print('waited for {} s'.format(end - start))
233 |         global total_wait_time
234 |         total_wait_time += end - start      
235 | 
236 |     def execute(self, new_workload_name, idd):
237 |         new_workload_class = workloads.get_workload_class(new_workload_name)
238 |         self.alloc_mem += new_workload_class.ideal_mem
239 |         self.check_state() # Update self.using_remote_mem
240 |         
241 |         if self.using_remote_mem:
242 |             pinnable_cpus = self.check_reclaimer_cpu()
243 |         else:
244 |             pinnable_cpus = set(self.unpinned_cpus)
245 |         
246 |         new_workload_cpus = set([pinnable_cpus.pop() for i in range(new_workload_class.cpu_req)])
247 |         self.unpinned_cpus.difference_update(new_workload_cpus) # Remove these cpus from the unpinned set
248 |         new_workload = new_workload_class(idd, new_workload_cpus)
249 | 
250 |         for cpu in new_workload_cpus:
251 |             self.cpu_assignments[cpu] = new_workload
252 | 
253 |         if new_workload_name in self.variable_ratios:
254 |             new_workload.set_min_ratio(self.variable_ratios[new_workload_name])
255 | 
256 |         self.min_mem_sum += new_workload.min_mem
257 |         self.free_cpus -= new_workload_class.cpu_req
258 | 
259 | 
260 |         all_workloads = self.executing + [new_workload]
261 | 
262 |         if self.remote_mem:
263 |             if self.uniform_ratio:
264 |                 self.shrink_all_uniformly(all_workloads)
265 |             elif self.optimal:
266 |                 self.shrink_all_optimally(all_workloads, idd)
267 |                 self.last_time = time.time() * 1000 # to ms
268 |             else:
269 |                 self.shrink_all_proportionally(all_workloads)
270 | 
271 |         else:
272 |             assert self.alloc_mem <= self.total_mem
273 | 
274 |         assert self.free_cpus >= 0
275 | 
276 |         new_workload.start()
277 |         self.executing.append(new_workload)
278 |         print("started {} at {} s".format(new_workload.get_name(), round(new_workload.ts_start - self.base_time, 3)))
279 | 
280 |     def check_swappiness(self):
281 |         with open(SWAPPINESS_PATH, 'r') as f:
282 |             swappiness = int(f.read())
283 | 
284 |         assert(not self.remote_mem or swappiness >= SWAPPINESS_THRESHOLD),\
285 |             "Swappiness needs to be >= {} when using remote mem".format(SWAPPINESS_THRESHOLD)
286 |         
287 |         assert(self.remote_mem or swappiness == 1),\
288 |             "Swappiness needs to be == 1 when not using remote mem"
289 |     
290 |     def check_thp(self):
291 |         with open(THP_PATH, 'r') as f:
292 |             assert('[never]' in f.read()), 'Transparent Hugepage is not disabled' 
293 | 
294 |     def check_somaxconn(self):
295 |         with open(SOMAXCONN_PATH, 'r') as f:
296 |             assert('65536' == f.read().strip('\n')), 'somaxconn is set to an incorrect value'
297 |     
298 |     def check_tf_mkl(self):
299 |         assert(test_util.IsMklEnabled()), "tensorflow doesn't have mkl enabled"
300 | 
301 |     def set_cur_ratio(self):
302 |         try:
303 |             # Ratio > 1 means that we're haven't fully utilized local memory
304 |             self.cur_ratio = min(1, self.total_mem / self.alloc_mem)
305 |         except ZeroDivisionError:
306 |             self.cur_ratio = 1
307 | 
308 |     def shrink_all_uniformly(self, workloads):
309 |         total_ideal_mem = sum([w.ideal_mem for w in workloads])
310 |         try:
311 |             local_ratio = min(1, self.total_mem / total_ideal_mem)
312 |         except ZeroDivisionError:
313 |             local_ratio = 1
314 | 
315 |         assert local_ratio >= self.uniform_ratio
316 |         self.set_cur_ratio()
317 | 
318 |         for w in workloads:
319 |             w.modify_ratio(local_ratio)
320 | 
321 |     def shrink_all_proportionally(self, workloads):
322 |         assert self.min_mem_sum <= self.total_mem
323 | 
324 |         total_ideal_mem = sum([w.ideal_mem for w in workloads])
325 |         total_min_mem = sum([w.min_mem for w in workloads])
326 | 
327 |         memory_pool = total_ideal_mem - total_min_mem
328 | 
329 |         # Prevent containers from overgrowing
330 |         excess_mem = max(0, total_ideal_mem - self.total_mem)
331 | 
332 |         # Shrink each container
333 |         for w in workloads:
334 |             try:
335 |                 share_of_excess = (w.ideal_mem - w.min_mem) / memory_pool * excess_mem
336 |             except ZeroDivisionError:
337 |                 # The pool of memory allowed to be pushed to remote storage is empty
338 |                 share_of_excess = 0
339 |             ratio = (w.ideal_mem - share_of_excess) / w.ideal_mem
340 |             w.modify_ratio(ratio)
341 | 
342 |     def shrink_all_optimally(self, workloads, new_idd=None):
343 |         total_ideal_mem = sum([w.ideal_mem for w in workloads])
344 |         total_min_mem = sum([w.min_mem for w in workloads])
345 |         memory_pool = total_ideal_mem - total_min_mem
346 | 
347 |         excess_mem = max(0, total_ideal_mem - self.total_mem)
348 | 
349 |         # Shrink each container
350 |         init_ratios = []
351 |         for w in workloads:
352 |             try:
353 |                 share_of_excess = (w.ideal_mem - w.min_mem) / memory_pool * excess_mem
354 |             except ZeroDivisionError:
355 |                 # The pool of memory allowed to be pushed to remote storage is empty
356 |                 share_of_excess = 0
357 |             ratio = (w.ideal_mem - share_of_excess) / w.ideal_mem
358 |             init_ratios.append(ratio)
359 | 
360 |         if excess_mem <= 0:
361 |             opt_ratios = init_ratios
362 |         else:
363 |             ratios,_ = self.compute_opt_ratios(workloads,init_ratios, new_idd)
364 |             opt_ratios = ratios.tolist()
365 |         
366 |         if self.last_time == 0:
367 |             el_time = 0
368 |         else:
369 |             el_time = time.time()*1000 - self.last_time
370 | 
371 |         for w,ratio in zip(workloads,opt_ratios):
372 |             w.update(el_time, ratio, new_idd)
373 | 
374 |     def compute_opt_ratios(self, workloads, init_ratios, new_idd):
375 |         el_time = time.time()*1000 - self.last_time
376 |         ideal_mems = np.array([w.ideal_mem for w in workloads])
377 |         percents = np.array([(1-(w.idd==new_idd))*min( (w.percent+el_time/w.profile(w.ratio))/self.slow_downs[w.wname], 0.95) for w in workloads])
378 |         profiles = [w.profile for w in workloads]
379 |         mem_gradients = [w.mem_gradient for w in workloads]
380 |         gradients = [w.gradient for w in workloads]
381 | 
382 |         x0 = np.array(init_ratios)
383 | 
384 |         eq_cons = {'type': 'eq',  'fun' : eq, 'jac': eq_grad, 'args': (ideal_mems,self.total_mem)}
385 |         bounds = Bounds(0.5, 1.0)
386 |         beta = 0
387 |         res = minimize(obj_new, x0, method='SLSQP', jac=obj_grad_new, args=(ideal_mems, percents, profiles, gradients, mem_gradients, beta), constraints=eq_cons, options={'disp': False}, bounds=bounds)
388 |         final_ratios = res.x
389 |         return np.round(final_ratios,3), res.fun
390 | 
391 |     def check_finished(self):
392 |         new_finished = []
393 |         old_alloc_mem = self.alloc_mem
394 |         for workload in self.executing[:]:
395 |             if not workload.is_alive():
396 |                 finished_string = "{} finished at {} s (duration={})"
397 |                 print(finished_string.format(workload.get_name(),
398 |                                             round(workload.ts_finish - self.base_time, 3),
399 |                                             workload.get_process_duration()))
400 |                 
401 |                 self.unpinned_cpus.update(workload.pinned_cpus)
402 |                 
403 |                 for cpu in workload.pinned_cpus:
404 |                     self.cpu_assignments[cpu] = None
405 |                 self.free_cpus += workload.cpu_req
406 |                 self.alloc_mem -= workload.ideal_mem
407 |                 self.min_mem_sum -= workload.min_mem
408 |                 self.executing.remove(workload)
409 |                 new_finished.append(workload)
410 |                 
411 |                 # adjust percents
412 |                 el_time = time.time()*1000 - self.last_time
413 |                 final_percent = workload.percent + el_time/workload.profile(workload.ratio)
414 |                 if workload.wname in self.slow_downs:
415 |                     self.slow_downs[workload.wname] = 0.05*final_percent + 0.95*self.slow_downs[workload.wname]
416 |                     logging.info('{} new slow down is {}'.format(workload.wname,self.slow_downs[workload.wname]))
417 |         self.finished.extend(new_finished)
418 | 
419 |         if new_finished:
420 |             print("{} tasks finished".format(len(new_finished)))
421 |             if self.remote_mem:
422 |                 if self.uniform_ratio:
423 |                     self.shrink_all_uniformly(self.executing)
424 |                 elif self.optimal:
425 |                     self.shrink_all_optimally(self.executing, None)
426 |                     self.last_time = time.time()*1000
427 |                 else:
428 |                     self.shrink_all_proportionally(self.executing)
429 | 
430 |         self.check_state()
431 | 
432 |     def clear_finished(self):
433 |         self.finished = []
434 | 
435 |     def get_resources(self):
436 |         return {'free_cpus': self.free_cpus,
437 |                 'alloc_mem': self.alloc_mem,
438 |                 'min_mem_sum': self.min_mem_sum}
439 | 
440 |     def shutdown(self):
441 |         for workload in self.executing:
442 |             print("Terminating {}".format(workload.get_name()))
443 |             workload.kill()
444 |         self.shutdown_now = True
445 |         print("Shutting Down")
446 | 
447 |     def get_swap(self):
448 |         # Get list of pids
449 |         pids = list()
450 |         for workload in self.executing:
451 |             '''Only get pids for things in the container
452 |                This prevents the memaslap from being included with memcached'''
453 |             pids.extend(workload.container.get_pids())
454 | 
455 |         total_swap = 0
456 |         for pid in pids:
457 |             try:
458 |                 path = '/proc/{}/status'.format(pid)
459 |                 with open(path, 'rb', buffering=0) as f:
460 |                     swap = int(SWAP_REGEX.findall(f.read())[0])
461 |                 total_swap += swap
462 |             except Exception:
463 |                 continue
464 |         total_swap = total_swap / 1024 # Convert from KB to MB
465 |         return total_swap
466 | 
467 |     def sample(self):
468 |         if self.running:
469 |             cpu = psutil.cpu_percent()
470 |             mem = psutil.virtual_memory()
471 |             swap = self.get_swap()
472 | 
473 |             # get bandwidth measurements
474 |             if self.remote_mem:
475 |                 try:
476 |                     with open(DRIVER_PATH.format("port_xmit_data")) as tx_file:
477 |                         tx_bytes = int(tx_file.read()) * 4
478 |                 except FileNotFoundError:
479 |                         tx_bytes = 0
480 |                 try:
481 |                     with open(DRIVER_PATH.format("port_rcv_data")) as recv_file:
482 |                         recv_bytes = int(recv_file.read()) * 4
483 |                 except FileNotFoundError:
484 |                         recv_bytes = 0
485 | 
486 |                 bw_tx = tx_bytes - self.prev_sent
487 |                 bw_recv = recv_bytes - self.prev_recv
488 | 
489 |                 try:
490 |                     with open(CURR_PAGES_PATH, 'r') as f_curr_pages:
491 |                         curr_pages = int(f_curr_pages.read())
492 |                 except FileNotFoundError:
493 |                         curr_pages = 0
494 | 
495 |             stats = "CPU: {}, Total Mem: {}, Used Mem: {}, Used Swap: {}".format(cpu,
496 |                     mem.total, mem.used, round(swap, 3))
497 | 
498 |             logging.info(stats)
499 | 
500 |             self.cpu_samples.append(cpu)
501 |             self.mem_samples.append(mem.used / mem.total * 100)
502 |             self.swap_samples.append(swap)
503 |             if self.remote_mem:
504 |                 self.bw_in_samples.append(bw_recv)
505 |                 self.bw_out_samples.append(bw_tx)
506 |                 self.bytes_in_samples += bw_recv
507 |                 self.bytes_out_samples += bw_tx
508 |                 self.prev_recv = recv_bytes
509 |                 self.prev_sent = tx_bytes
510 |                 self.curr_pages.append(curr_pages)
511 | 
512 |                 logging.info("bw_tx: {}".format(bw_tx / MEGABYTE))
513 |                 logging.info("bw_recv: {}".format(bw_recv / MEGABYTE))
514 |         else:
515 |             pass
516 | 
517 | 
518 | class Scheduler(protocol_pb2_grpc.SchedulerServicer):
519 |     def __init__(self, machine, servername):
520 |         self.machine = machine
521 |         self.name = servername
522 | 
523 |     def checkin(self, req, context):
524 |         success = self.machine.checkin(req.max_cpus, req.max_mem,
525 |                                        req.use_remote_mem, req.uniform_ratio,
526 |                                        req.variable_ratios, req.limit_remote_mem, req.optimal)
527 | 
528 |         return protocol_pb2.CheckinReply(server_name=self.name, success=success)
529 | 
530 |     def execute(self, request, context):
531 |         """ executes the request.wname workload.
532 |         if we are using remote memory, computes the new ratio
533 |         that will be required after placing the workload."""
534 |         self.machine.check_finished()
535 |         self.machine.execute(request.wname, request.idd)
536 |         return protocol_pb2.ExecuteReply(success=True)
537 | 
538 |     def get_resources(self, request, context):
539 |         self.machine.check_finished()
540 |         resources = self.machine.get_resources()
541 |         # ** Expands dictionary into named arguments for a function
542 |         reply = protocol_pb2.GetResourcesReply(**resources)
543 |         return reply
544 | 
545 |     def get_finished(self, request, context):
546 |         self.machine.check_finished()
547 |         start_times = {f.idd: f.ts_start - self.machine.base_time
548 |                           for f in self.machine.finished}
549 |         finished_times = {f.idd: f.ts_finish - self.machine.base_time
550 |                           for f in self.machine.finished}
551 |         reply = protocol_pb2.GetFinishedReply(start_times = start_times,
552 |                                               finished_times=finished_times)
553 |         self.machine.clear_finished()
554 |         return reply
555 | 
556 |     def shutdown(self, request, context):
557 |         self.machine.shutdown()
558 |         reply = protocol_pb2.ShutdownReply(success=True)
559 |         return reply
560 | 
561 |     def get_samples(self, request, context):
562 |         reply = protocol_pb2.GetSamplesReply()
563 |         reply.cpu_util.extend(self.machine.cpu_samples)
564 |         reply.mem_util.extend(self.machine.mem_samples)
565 |         reply.swap_util.extend(self.machine.swap_samples)
566 |         reply.curr_pages.extend(self.machine.curr_pages)
567 | 
568 |         bw_in_mb = map(lambda x: x / MEGABYTE, self.machine.bw_in_samples)
569 |         reply.bw_in.extend(bw_in_mb)
570 |         bw_out_mb = map(lambda x: x / MEGABYTE, self.machine.bw_out_samples)
571 |         reply.bw_out.extend(bw_out_mb)
572 | 
573 |         reply.bytes_in = self.machine.bytes_in_samples / MEGABYTE
574 |         reply.bytes_out = self.machine.bytes_out_samples / MEGABYTE
575 |         return reply
576 | 
577 | def serve():
578 |     hostname = socket.gethostname()
579 |     thismachine = Machine()
580 | 
581 |     server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
582 |     scheduler = Scheduler(thismachine, hostname)
583 |     protocol_pb2_grpc.add_SchedulerServicer_to_server(scheduler, server)
584 | 
585 |     server.add_insecure_port('[::]:50051')
586 |     server.start()
587 | 
588 |     total_cpus = multiprocessing.cpu_count()
589 |     total_mem = psutil.virtual_memory().total
590 |     print("server {} waiting for connection, avail cpus={} mem={} MB".format(hostname,
591 |                                               total_cpus, int(total_mem/(1024*1024))))
592 | 
593 |     try:
594 |         while not thismachine.shutdown_now:
595 |             t0 = time.time()
596 |             thismachine.sample()
597 |             t1 = time.time()
598 |             time.sleep(max(0, MAIN_LOOP_SLEEP - (t1 - t0)))
599 |     except KeyboardInterrupt:
600 |         server.stop(0)
601 | 
602 | if __name__ == '__main__':
603 |     parser = argparse.ArgumentParser()
604 |     parser.add_argument('--log', action='store_true',
605 |                         help='Write out log to file')
606 |     args = parser.parse_args()
607 | 
608 |     if args.log:
609 |         logging.basicConfig(format='%(asctime)s.%(msecs)03d %(message)s', filename='log.txt', level=logging.DEBUG, filemode='w')
610 |     else:
611 |         logging.basicConfig()
612 |     
613 |     total_wait_time = 0
614 |     serve()
615 |     print('total wait tims: {} s'.format(total_wait_time))
616 | 


--------------------------------------------------------------------------------