├── LAB1 ├── allpairsping.py ├── emulab │ └── howto_get_an_account.png ├── helloworld.py ├── homework │ ├── README │ ├── dist_demo.py │ ├── geoip_uvic.repy │ └── math_uvic.repy ├── infloop.py ├── udpforward.py ├── udpping.py └── udppingserver.py ├── LAB2 ├── README └── lab2.ns ├── LAB3 └── README.md ├── LAB4 └── README.md ├── LAB5 └── README.md ├── LAB6 └── README.md ├── LAB8 ├── README.md ├── mpi_mmul.c ├── mpi_test.c └── mpilab.pbs ├── LAB9 ├── GEE-README.md └── README.md ├── Lab-10 └── Readme.md ├── License ├── PortalCrashUpdate.md ├── ProjectIdea.md └── README.md /LAB1/allpairsping.py: -------------------------------------------------------------------------------- 1 | # send a probe message to each neighbor 2 | def probe_neighbors(port): 3 | 4 | for neighborip in mycontext["neighborlist"]: 5 | mycontext['sendtime'][neighborip] = getruntime() 6 | sendmess(neighborip, port, 'ping',getmyip(),port) 7 | 8 | sendmess(neighborip, port,'share'+encode_row(getmyip(), mycontext["neighborlist"], mycontext['latency'].copy())) 9 | # sleep in between messages to prevent us from getting a huge number of 10 | # responses all at once... 
11 | sleep(.5) 12 | 13 | # Call me again in 10 seconds 14 | while True: 15 | try: 16 | settimer(10,probe_neighbors,(port,)) 17 | return 18 | except Exception, e: 19 | if "Resource 'events'" in str(e): 20 | # there are too many events scheduled, I should wait and try again 21 | sleep(.5) 22 | continue 23 | raise 24 | 25 | 26 | 27 | # Handle an incoming message 28 | def got_message(srcip,srcport,mess,ch): 29 | if mess == 'ping': 30 | sendmess(srcip,srcport,'pong') 31 | elif mess == 'pong': 32 | # elapsed time is now - time when I sent the ping 33 | mycontext['latency'][srcip] = getruntime() - mycontext['sendtime'][srcip] 34 | 35 | elif mess.startswith('share'): 36 | mycontext['row'][srcip] = mess[len('share'):] 37 | 38 | 39 | 40 | def encode_row(rowip, neighborlist, latencylist): 41 | 42 | retstring = ""+rowip+"" 43 | for neighborip in neighborlist: 44 | if neighborip in latencylist: 45 | retstring = retstring + ""+str(latencylist[neighborip])[:4]+"" 46 | else: 47 | retstring = retstring + "Unknown" 48 | 49 | retstring = retstring + "" 50 | return retstring 51 | 52 | 53 | # Displays a web page with the latency information 54 | def show_status(srcip,srcport,connobj, ch, mainch): 55 | 56 | webpage = "Latency Information

Latency information from "+getmyip()+'

' 57 | 58 | webpage = webpage + "" 59 | 60 | # copy to prevent a race 61 | # connobj.send(encode_row(getmyip(), mycontext['neighborlist'], mycontext['latency'].copy())) 62 | 63 | for nodeip in mycontext['neighborlist']: 64 | if nodeip in mycontext['row']: 65 | webpage = webpage + mycontext['row'][nodeip]+'\n' 66 | else: 67 | webpage = webpage + '\n' 68 | 69 | # now the footer... 70 | webpage = webpage + '
"+ "".join(mycontext['neighborlist'])+"
'+nodeip+'No Data Reported
' 71 | 72 | # send the header and page 73 | connobj.send('HTTP/1.1 200 OK\nContent-Type: text/html\nContent-Length: '+str(len(webpage))+'\nServer: Seattle Testbed\n\n'+webpage) 74 | 75 | # and we're done, so let's close this connection... 76 | connobj.close() 77 | 78 | 79 | 80 | if callfunc == 'initialize': 81 | 82 | # this holds the response information (i.e. when nodes responded) 83 | mycontext['latency'] = {} 84 | 85 | # this remembers when we sent a probe 86 | mycontext['sendtime'] = {} 87 | 88 | # this remembers row data from the other nodes 89 | mycontext['row'] = {} 90 | 91 | # get the nodes to probe 92 | mycontext['neighborlist'] = [] 93 | for line in file("neighboriplist.txt"): 94 | mycontext['neighborlist'].append(line.strip()) 95 | 96 | ip = getmyip() 97 | if len(callargs) != 1: 98 | raise Exception, "Must specify the port to use" 99 | pingport = int(callargs[0]) 100 | 101 | # call gotmessage whenever receiving a message 102 | recvmess(ip,pingport,got_message) 103 | 104 | probe_neighbors(pingport) 105 | 106 | # we want to register a function to show a status webpage (TCP port) 107 | pageport = int(callargs[0]) 108 | waitforconn(ip,pageport,show_status) 109 | 110 | -------------------------------------------------------------------------------- /LAB1/emulab/howto_get_an_account.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ycoady/UVic-Distributed-Systems/1ca6df8cd213f5dc6063e5f551f304e232fd02a3/LAB1/emulab/howto_get_an_account.png -------------------------------------------------------------------------------- /LAB1/helloworld.py: -------------------------------------------------------------------------------- 1 | if callfunc == 'initialize': 2 | print "Yo, Hello World from " + getmyip() -------------------------------------------------------------------------------- /LAB1/homework/README: -------------------------------------------------------------------------------- 1 | HOMEWORK 
========

FILES
=====
geoip_uvic.repy - seattle code calculating the distance between two IPs
math_uvic.repy - math routines needed
dist_demo.py - demo of how to calculate the distance between two IPs


DUE
===
To be completed in the lab OR to be demonstrated at the beginning of the next lab.

INSTRUCTIONS
============
Modify the "allpairsping.py" you used in this lab such that,

- the latency matrix includes the geographical distance in kilometers
  for each seattle node pair.


--------------------------------------------------------------------------------
/LAB1/homework/dist_demo.py:
--------------------------------------------------------------------------------
include geoip_uvic.repy

# Demo: print the great-circle distance from this vessel to a UVic host.
if callfunc == 'initialize':
  print "Distance to Victoria %.2f km." % (geoip_distance("142.104.193.247", getmyip()))
--------------------------------------------------------------------------------
/LAB1/homework/geoip_uvic.repy:
--------------------------------------------------------------------------------
include math.repy
include math_uvic.repy
include httpretrieve.repy

# get geo-location record (by-passing unreliable seattle geoip API)
# Returns a dict with keys 'city', 'countryName', 'latitude', 'longitude'
# (lat/long as floats); on an empty argument or any lookup/parse failure
# the safe defaults below are returned instead of raising.
def get_geoip_record (ip_address=""):
  GEOIP_SRV_URL = 'http://www.geoplugin.net/json.gp'

  geoip_dict = {}

  #set some safe default values...
  geoip_dict['city'] = "n/a"
  geoip_dict['countryName'] = "n/a"
  geoip_dict['latitude'] = 0.0
  geoip_dict['longitude'] = 0.0
  ret_val = geoip_dict


  if ip_address != "":
    querydata = "ip=" + ip_address


    try:
      geoip_json = httpretrieve_get_string(GEOIP_SRV_URL, querydata, \
                                           postdata=None, httpheaders=None, \
                                           proxy=None, timeout=200);
      if geoip_json != '':
        geoip_lines = geoip_json.split("\n")
        if (len(geoip_lines) >= 3):
          geoip_dict = {}
          # drop the opening/closing brace lines of the JSON object
          geoip_lines = geoip_lines[1:-1]


          # Hand-rolled "key": "value" parsing; the [13:-1] slice strips
          # what is presumably the quoted 'geoplugin_' key prefix.
          # NOTE(review): fragile -- assumes geoplugin's exact
          # one-pair-per-line response formatting; confirm against a live
          # response before modifying.
          for line in geoip_lines:
            key_val = line.split(":")
            key = key_val[0][13:-1]
            val = key_val[1].replace("\"","").replace(",","")
            #print key + " / " + val + "\n"
            if key == "latitude" or key == "longitude":
              geoip_dict[key] = float(val)
            else:
              geoip_dict[key] = val;
          ret_val = geoip_dict
    except:
      # deliberate best-effort: any network/parse error falls back to the
      # default record rather than aborting the caller
      ret_val = geoip_dict

  return ret_val


# check if geo-location is valid
# location is a [latitude, longitude] pair; both entries must be numeric.
def location_is_valid (location):
  if location != None and len(location) == 2:
    try:
      float(location[0])
      float(location[1])
      return True
    except ValueError:
      return False
  else:
    return False


# calculates gepraphic distance between IPs of a and b
# Haversine formula on a spherical Earth (radius 6371 km); returns -1.0 when
# either IP cannot be geolocated.  The math_sin/math_cos helpers from
# math_uvic.repy take degrees and convert to radians internally.
def geoip_distance (ip_a, ip_b):
  location_rec = get_geoip_record(ip_a)
  lat_a = location_rec['latitude']
  long_a = location_rec['longitude']
  if not location_is_valid([lat_a, long_a]):
    return -1.0

  location_rec = get_geoip_record(ip_b)
  lat_b = location_rec['latitude']
  long_b = location_rec['longitude']
  if not location_is_valid([lat_b, long_b]):
    return -1.0

  earth_radius = 6371.0

  diff_lat = lat_b - lat_a
  diff_long = long_b - long_a
  tmp = math_sin (diff_lat / 2) * math_sin (diff_lat / 2) \
        + math_cos ((lat_a)) \
        * math_cos ((lat_b)) * math_sin (diff_long / 2) \
        * math_sin (diff_long / 2)

  c_in_rad = 2 * math_atan2 (math_sqrt2 (tmp), math_sqrt2 (1 - tmp))
  return earth_radius * c_in_rad
--------------------------------------------------------------------------------
/LAB1/homework/math_uvic.repy:
--------------------------------------------------------------------------------
# Pure-Repy math helpers (the sandbox has no stdlib math module).

# calculate x^2
def square (x):
  return x * x


# calculate average of a and b
def average (a, b):
  return (a + b) / 2.0


# mean square between a and b
def mean_square (a, b):
  return average (square(a), square(b))


# computes Newton's square root approximation
def math_sqrt2 (x):
  def is_good_enough (guess):
    return abs (square (guess) - x) < 0.0000001

  def improve (guess):
    return average (guess, x/guess)

  def sqrt_iter (guess):
    if is_good_enough (guess):
      return guess
    else:
      return sqrt_iter (improve (guess))

  return sqrt_iter (1)


# converts from degrees to radians
# (math_pi is supplied by the included math.repy)
def math_radians (x):
  return (math_pi / 180) * x


# computes factorial of n: n!
39 | def math_factorial (n): 40 | if (n <= 1): 41 | return 1 42 | else: 43 | return n * math_factorial (n-1) 44 | 45 | 46 | # computes maclaurin's series approximation for sin(x) 47 | def math_sin (x): 48 | k = 100 49 | first_value = 0.0 50 | for i in xrange (1, k, 4): 51 | next_value = math_radians(x) ** i / math_factorial (i) 52 | first_value += next_value 53 | 54 | for i in xrange (3, k, 4): 55 | next_value = -1 * math_radians(x) ** i / math_factorial (i) 56 | first_value += next_value 57 | 58 | 59 | return first_value 60 | 61 | 62 | # computes maclaurin's series approximation for cos(x) 63 | def math_cos (x): 64 | k = 100 65 | first_value = 0.0 66 | 67 | for i in xrange (0, k, 4): 68 | next_value = math_radians (x) ** i / math_factorial (i) 69 | first_value += next_value 70 | 71 | for i in xrange(2, k, 4): 72 | next_value = -1 * math_radians (x) ** i / math_factorial (i) 73 | first_value += next_value 74 | 75 | return first_value 76 | 77 | 78 | # calculates approximate arctan2 value 79 | def math_atan2 (y, x): 80 | math_pi_div2 = 1.5707963 81 | if (x == 0.0): 82 | if (y > 0.0): 83 | return math_pi_div2 84 | if (y == 0.0): 85 | return 0.0 86 | return -math_pi_div2; 87 | atan = 0.0 88 | z = y / x 89 | if abs (z < 1.0): 90 | atan = z / (1.0 + 0.28 * z * z) 91 | if (x < 0.0): 92 | if ( y < 0.0): 93 | return atan - math_pi; 94 | return atan + math_pi; 95 | else: 96 | atan = math_pi_div2 - z / (z * z + 0.28) 97 | if (y < 0.0): 98 | return atan - math_pi; 99 | return atan -------------------------------------------------------------------------------- /LAB1/infloop.py: -------------------------------------------------------------------------------- 1 | while True: 2 | sleep(.1) 3 | -------------------------------------------------------------------------------- /LAB1/udpforward.py: -------------------------------------------------------------------------------- 1 | # Handle an incoming message 2 | def got_message(srcip,srcport,mess,ch): 3 | if srcip in 
mycontext['forwardIPs']: 4 | # forward the packet to the correct destination 5 | sendmess(mycontext['forwardIPs'][srcip],srcport,mess,getmyip(),srcport) 6 | 7 | 8 | if callfunc == 'initialize': 9 | if len(callargs) != 3: 10 | raise Exception("Must specify 'IP1 IP2 port' to forward traffic") 11 | mycontext['forwardIPs'] = {} 12 | mycontext['forwardIPs'][callargs[0]] = callargs[1] 13 | mycontext['forwardIPs'][callargs[1]] = callargs[0] 14 | recvmess(getmyip(),int(callargs[2]),got_message) 15 | 16 | -------------------------------------------------------------------------------- /LAB1/udpping.py: -------------------------------------------------------------------------------- 1 | # Handle an incoming message 2 | def got_reply(srcip,srcport,mess,ch): 3 | print "received message: '"+mess+"' from "+srcip+":"+str(srcport) 4 | 5 | if callfunc == 'initialize': 6 | if len(callargs) != 2: 7 | raise Exception("Must specify 'IP port' to send a ping packet") 8 | 9 | # my port is a command line arg 10 | recvmess(getmyip(),int(callargs[1]),got_reply) 11 | sendmess(callargs[0],int(callargs[1]),"Ping message from:"+getmyip()+":"+str(callargs[1]), getmyip(),int(callargs[1])) 12 | # exit in five seconds 13 | settimer(5,exitall,()) 14 | 15 | 16 | -------------------------------------------------------------------------------- /LAB1/udppingserver.py: -------------------------------------------------------------------------------- 1 | # Handle an incoming message 2 | def got_message(srcip,srcport,mess,ch): 3 | print "Received message: '"+mess+"' from "+srcip+":"+str(srcport) 4 | sendmess(srcip,srcport,"Ping response from "+getmyip()+":"+callargs[0],getmyip(), int(callargs[0])) 5 | 6 | 7 | if callfunc == 'initialize': 8 | if len(callargs) != 1: 9 | raise Exception("Must specify 'port' to wait for packets on") 10 | 11 | recvmess(getmyip(),int(callargs[0]),got_message) 12 | 13 | -------------------------------------------------------------------------------- /LAB2/README: 
--------------------------------------------------------------------------------
EMULAB
======
Emulab has 888 nodes available and is designed for short duration experiments but can also be used as a development platform. You may allocate dedicated machines or VMs using a web interface. You can configure the nodes by specifying an OS (e.g., "UBUNTU14-64-STD" for Ubuntu 14.04) and network links/topology between the nodes. This is unlike Seattle where machines come with whatever OS is already installed on them and you only have access to a scripting engine to run a restricted subset of the Python language. In Emulab you are not restricted to a specific language; you can log into the allocated machines via your private key or your Emulab account/password (the username on your machines is the same as your Emulab user name). You may log in to each node, load any software you need, then run your experiment by running any scripts etc. you need on the various nodes. You have full root access via the sudo command.

In Emulab, before passing the requested nodes to the user, an operating system image specified/created by the user is installed. This means that unlike in Seattle, users can do destructive work such as kernel modifications if a custom OS image is used. One may define complex network topologies and configurations within Emulab using ns-2 scripting, which also allows the automatic configuration of multiple machines and switches according to custom experiment requirements.


IMPORTANT NOTICE:
=================
All modifications done to an instantiation of an OS image on an Emulab node are lost after an experiment is swapped out. The files in your home directory are kept, though.
11 | 12 | 13 | USEFUL LINKS // GETTING STARTED 14 | =============================== 15 | https://wiki.emulab.net/wiki/Emulab/wiki/Tutorial 16 | https://wiki.emulab.net/wiki/Emulab/wiki/AdvancedExample 17 | https://wiki.emulab.net/wiki/nscommands 18 | 19 | EXAMPLE NS2 FILE (ONE NODE) 20 | ========================== 21 | # 22 | # Create a new simulator object 23 | # 24 | set ns [new Simulator] 25 | 26 | # 27 | # Load in testbed-specific commands 28 | # 29 | source tb_compat.tcl 30 | 31 | # 32 | # Network topology, traffic specification, events 33 | # 34 | 35 | 36 | # Set up a server node (use same OS for client) 37 | set server [$ns node] 38 | tb-set-node-os $server CENTOS63-64-STD #<- YOU NEED TO MAKE A CHANGE HERE 39 | 40 | # Set up a server node. 41 | # We use the Fedora Core 15 Linux as the OS. 42 | # =========TO BE FILLED IN BY YOU ========== 43 | 44 | # Set up the link between server and client 45 | # the link is should be named bridge, it's full duplex, 100 Mb/s, 46 | # 20ms latency, 5% packet loss. 47 | # (use DropTail for link, it specifies link properties under congestion) 48 | # =========TO BE FILLED IN BY YOU ========== 49 | 50 | 51 | # Begin the experiment (in NS, this would begin the simulation) 52 | $ns run 53 | 54 | END EXAMPLE 55 | =========== 56 | 57 | 58 | 59 | LAB #2 60 | ====== 61 | 62 | 1. SETUP YOUR EXPERIMENT 63 | ======================== 64 | - Modify the above ns-2 file to use only "32 bit Fedora 15 standard" 65 | OS image instead of the 64 bit CENTOS63-64-STD image. 66 | 67 | - Add a node called "client" with Fedora 15 standard 68 | 69 | - Add a duplex network link called "bridge" connecting server and client node. 70 | Use the following settings for the bride: 71 | linkspeed : 100Mbps 72 | latency : 10ms 73 | packetloss rate: 0.0001 74 | 75 | 76 | 2. 
MEASURING THE EFFECTS OF PACKETLOSS & LATENCY ON BANDWIDTH 77 | ============================================================= 78 | For each combination of link packetloss rate (PLR) and latency (L) do: 79 | PLR = [0.001, 0.05, 0.1] 80 | L = [1ms, 10ms, 50ms, 100ms] 81 | 82 | 1) In a table take note of the round-trip ping time (RTT) and ping's estimated packetloss. 83 | Why do you see the results you are seeing? 84 | Were these results what you expected? and if so why? 85 | 86 | 2) On your server node run the command: 87 | /usr/local/etc/emulab/emulab-iperf -s 88 | 89 | On your client node run the command: 90 | /usr/local/etc/emulab/emulab-iperf -c server -f k -i 60 -t 60 91 | 92 | - What do the above commands do? (HINT: run emulab-iperf --help) 93 | - Use the emulab-iperf commands above to examine how the throughput of your 94 | link changes as you change latency and packetloss rate. 95 | 96 | - For the packetloss rates (PLR) of 0% and 10% plot the average link throughput 97 | vs. the latencies = [1ms, 10ms, 50ms, 100ms]. E.g. make a plot for each PLR. 98 | 99 | HELPFUL COMMANDS 100 | ================ 101 | In "ns" script: 102 | $link0 bandwidth 10Mb duplex 103 | $link0 delay 10ms 104 | $link0 plr 0.05 105 | With "tevc": 106 | tevc ... link_name modify bandwidth=20000 # In kbits/second; 20000 = 20Mbps 107 | tevc ... link_name modify delay=10ms # In msecs (the "ms" is ignored) 108 | tevc ... 
link_name modify plr=0.05 # packet loss rate; 0.05 = 5% loss 109 | Both: 110 | $link_name up 111 | $link_name down 112 | 113 | MODIFY LINK SPEED AT RUNTIME 114 | ============================ 115 | /usr/testbed/bin/tevc -e vikelab/apache-loss-latency now bridge modify bandwidth=100000 116 | /usr/testbed/bin/tevc -e vikelab/apache-loss-latency now bridge modify delay=30ms 117 | /usr/testbed/bin/tevc -e vikelab/apache-loss-latency now bridge modify plr=0.05 -------------------------------------------------------------------------------- /LAB2/lab2.ns: -------------------------------------------------------------------------------- 1 | # 2 | # Create a new simulator object 3 | # 4 | set ns [new Simulator] 5 | 6 | # 7 | # Load in testbed-specific commands 8 | # 9 | source tb_compat.tcl 10 | 11 | # 12 | # Network topology, traffic specification, events 13 | # 14 | 15 | 16 | # Set up a server node (use same OS for client) 17 | set server [$ns node] 18 | tb-set-node-os $server CENTOS63-64-STD #<- YOU NEED TO MAKE A CHANGE HERE 19 | 20 | # Set up a server node. 21 | # We use the Fedora Core 15 Linux as the OS. 22 | # =========TO BE FILLED IN BY YOU ========== 23 | 24 | # Set up the link between server and client 25 | # the link is should be named bridge, it's full duplex, 100 Mb/s, 26 | # 20ms latency, 5% packet loss. 27 | # (use DropTail for link, it specifies link properties under congestion) 28 | # =========TO BE FILLED IN BY YOU ========== 29 | 30 | 31 | # Begin the experiment (in NS, this would begin the simulation) 32 | $ns run -------------------------------------------------------------------------------- /LAB3/README.md: -------------------------------------------------------------------------------- 1 | ## *"Hello GEE!"* in 10 easy steps by: 2 | 3 | 1. Allocating yourself a slicelet 4 | 5 | 2. 
Writing a script to install the necessary software (geoip-bin and wget), and a geoIP [database](http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat)

3. Writing a script to make sure everything installed correctly

4. Putting both scripts on the nodes using the [Fabric](http://www.cs.cornell.edu/projects/fabric/) 'put' command

5. Using Fabric with a parallel decorator to run the install and check scripts

6. Writing a script to get the local IP address out of ifconfig

7. Extending the script to get the host IP address as well

8. Using that IP address to get the lat/long of the server

9. Writing a [curl](http://curl.haxx.se/docs/manpage.html) command to send slicename, private_ip, public_ip, hostname, lat, long to the Lively server Rick will set up

10. Writing a Fabric command to do this in parallel on all the nodes

Details [here!](https://docs.google.com/document/d/1RVRtNavLLjECwuChpxinR_PFqTgf6yQxMfLnvcpW4vw/edit?usp=sharing)
--------------------------------------------------------------------------------
/LAB4/README.md:
--------------------------------------------------------------------------------
## BitTorrent versus HTTP!

### Goal and Objective
In this lab, we’re going to compare the performance of a file download over HTTP and using BitTorrent.
You’re going to: 5 | * Learn how to set up a webserver 6 | * Learn how to set up a Torrent and a private tracker 7 | * Run an experiment using Fabric (timing is critical here) 8 | * Extract the results and present them 9 | 10 | ### Stuff you’ll Need 11 | * Webserver: Apache2, BitTorrent 12 | * Clients: aria2c 13 | 14 | ## What You’ll Produce 15 | A report showing: 16 | * Bandwidth between the http server and each client 17 | * Bandwidth between the http server and each client, when each client is trying to transmit in parallel 18 | * Download time for a standard test file, when each client tries to download in turn 19 | * Download time for a standard test file, when the clients try to download simultaneously 20 | * Torrent time for a standard test file, one client 21 | * Torrent time for a standard test file, three clients working in parallel 22 | * Fabric Operations and Concepts 23 | * Running a task in parallel (@parallel) 24 | * Roles (env.roledefs.update, @roles) 25 | * File operations (put/get) 26 | 27 | ### Code You’ll Write (all straightforward, but fun in teams!) 28 | * Something that generates a big file (100MB or so) for the test 29 | * A shell script to capture the begin time of a job and save to a file 30 | * A shell script to capture the end time of a job and save to a file 31 | * A short python or shell script to look at the times and file size and report duration and rate 32 | 33 | The model solution that we had had about five lines of code in each of these; as before, not a lot of programming in this lab. The goal is to learn about the infrastructure and conducting experiments! 34 | 35 | Step by Step details are [here!] (https://docs.google.com/document/d/1qyXj94BPk-SwQmTlVwnb0lyLIrLny-Z2o5g_P3UDHhs/edit?usp=sharing) 36 | -------------------------------------------------------------------------------- /LAB5/README.md: -------------------------------------------------------------------------------- 1 | ## Project Party! 
This lab has 2 parts (3 if you don't have a repo yet!): 2 | 3 | 4 | 0. If you haven't already, set up a repo for your project, and put a link to it under "OUR PROJECTS" in the CourseSpaces Forum! You will be demo-ing next week (Lab 6) based on what is there, and roughly it should contain: 5 | 6 | * a copy of your source code, and instructions on how we can compile / run your code 7 | * a description of the following: 8 | * the structure of your code, including any major interfaces you implemented 9 | * for example, in the 2PC project, the RPC interface your replicas expose to the coordinator 10 | * how you handle failures 11 | * for example, in the 2PC project, how are failures reflected to clients via the RPC interface that the coordinator exposes, if at all 12 | * the test cases you explored, and why you picked those, along with test cases you would do if you had more time! 13 | 14 | 15 | 1. Work on making progress on your project, and concretely identify where you could use a little help from someone who is on the outside! This could be brainstorming, sanity checking, or actual digging through code/tests! It may involve methodology, tools, coding, testing, project management... anything! Just be ready to articulate what it is you could use and hand with, as clearly as you can. Add this as an issue to the repo for your project. 16 | 17 | 2. Helping 3 other projects make progress, and having some else help you make progress with yours! Spend about 20 minutes with someone from another project, exchanging where you are at, and explaining the issue you have identified. By the end of this week, post a best-effort response for each of the issues you are helping with on the other projects. 18 | -------------------------------------------------------------------------------- /LAB6/README.md: -------------------------------------------------------------------------------- 1 | ## Lab 6: GLORY LAP for Projects and MPI! 2 | 3 | In this lab you will: 4 | 1. 
Demo your awesome project to Niko 5 | 2. Post two cool things about other peoples projects in the "COOL THINGS ABOUT OUR PROJECTS" issue 6 | 3. Try to fire up Compute Canada and MPI (if we have accounts in time!), or we can even use EC2... Check out this [tutorial!](http://mpitutorial.com/beginner-mpi-tutorial/) 7 | 8 | (more soon!) 9 | -------------------------------------------------------------------------------- /LAB8/README.md: -------------------------------------------------------------------------------- 1 | LAB #8 TASKS: 2 | ============= 3 | 4 | PART 1 (WARM UP): 5 | ================= 6 | 1. Login to a westgrid cluster (see below PART 2). 7 | 2. Upload mpi_test.c, mpi_mmul.c and mpilab.pbs 8 | 3. Compile mpi_test.c and run it using PBS (mpilab.pbs) on 4 nodes. 9 | 10 | mpicc -o mpitest mpitest.c 11 | 12 | PART 2 (MATRIX MULTIPLICATION): 13 | =============================== 14 | 1. Let A, B, and C be square matrices of size NxN. 15 | What is the running time in N of a NAIVE and SERIAL 16 | implementation of the matrix multiplication algorithm 17 | computing C = A*B? 18 | Is matrix multiplication a "nice" problem? 19 | Can we do better than the naive implementation? 20 | 21 | 2. To familiarize yourself with matrix multiplication in C 22 | compile mpi_mmul.c as non-MPI-program and then running the 23 | sanity_check() function: 24 | 25 | gcc mpi_mmul.c -o mpi_mmul 26 | ./mpi_mmul 27 | 28 | 3. Modify mpi_mmul.c to run as an MPI program that partitions, and distributes the work to multiply the matrices A and B among the number of nodes you specified when submitting your job to the PBS system. 29 | 30 | 4. Record the running time of your MPI program when setting the matrix size to NxN=4096x4096 and using the following number of MPI processes: 31 | NPROC = {32, 16, 8, 4, 2, 1} 32 | 33 | 5. Calculate and plot the speedup as a function of used parallel MPI processes. Speedup is defined as: 34 | 35 | (time req. for serial run)/(time req. 
for parallel run) 36 | 37 | 38 | 39 | AVAILABLE WESTGRID SYSTEMS WITH MPI INSTALLED 40 | ============================================= 41 | UVIC: hermes.westgrid.ca / nestor.westgrid.ca 42 | 43 | SFU: bugaboo.westgrid.ca 44 | 45 | U. ALBERTA: jasper.westgrid.ca 46 | 47 | LOGIN: 48 | ssh username@hermes.westgrid.ca 49 | 50 | 51 | USEFUL LINKS 52 | ============ 53 | WESTGRID: 54 | https://www.westgrid.ca/support/systems/hermesnestor 55 | https://www.westgrid.ca/support/running_jobs 56 | 57 | PBS SCRIPTING: 58 | https://www.westgrid.ca/files/PBS%20Script_0.pdf 59 | 60 | MPI API DOCUMENTATION: 61 | http://www.mpich.org/static/docs/v3.1/ 62 | 63 | API CALLS YOU WILL NEED: 64 | - MPI_Init 65 | - MPI_Comm_size 66 | - MPI_Comm_rank 67 | - MPI_Bcast 68 | - MPI_Gather 69 | - MPI_Finalize 70 | 71 | 72 | GENERAL PROCEDURE OF RUNNING MPI PROGRAMS ON WESTGRID 73 | ===================================================== 74 | 1. Compile your MPI program 75 | mpicc mpi_test.c -o mpi_test 76 | 77 | 2. Run a PBS script to submit a batch job to a westgrid cluster. 78 | The preferred way of starting MPI programs on westgrid is mpiexec. The main advantage of using mpiexec over mpirun is that there is no need to source any setup files before executing your program. The example MPI program mpi_test.c could be executed with the PBS script contained with this lab as follows 79 | 80 | qsub -l procs=16,pmem=1gb,walltime=1:00:00 mpilab.pbs 81 | 82 | in which 16 processors are requested (procs parameter), using at most 1 GB of memory per process (pmem parameter) and running for at most 1 hour (walltime parameter) (see https://www.westgrid.ca/support/running_jobs for more details). 83 | 84 | 3. Check your MPI program output: 85 | The output files for the job are created in the same directory where the qsub command was executed. 
The output files are created using the same filename as your PBS script filename with an extension that includes "e" for "error" or "o" for "output" followed by the process number. -------------------------------------------------------------------------------- /LAB8/mpi_mmul.c: -------------------------------------------------------------------------------- 1 | /* #include */ 2 | #include 3 | #include 4 | 5 | const int size = 3; 6 | /* const int size = 4096; */ 7 | 8 | float a[size][size]; 9 | float b[size][size]; 10 | float c[size][size]; 11 | 12 | /* Print matrix values to console */ 13 | void print_matrix(float matrix[size][size]) 14 | { 15 | int i, j; 16 | for (i = 0; i < size; ++i) 17 | { 18 | for(j = 0; j < size; j++) 19 | { 20 | printf("%6.2f\t", matrix[i][j]); 21 | } 22 | printf("\n"); 23 | } 24 | } 25 | 26 | /* Initialize matrices */ 27 | void init_matrices() 28 | { 29 | int i, j; 30 | // Initialize matrices. 31 | for (i = 0; i < size; ++i) 32 | { 33 | for (j = 0; j < size; ++j) 34 | { 35 | a[i][j] = (float)i + j; 36 | b[i][j] = (float)i - j; 37 | c[i][j] = 0.0f; 38 | } 39 | } 40 | } 41 | 42 | /* Multiply specified sections of the matrices */ 43 | void multiply(int idx_start, int idx_end) 44 | { 45 | int i, j, k; 46 | for (i = idx_start; i <= idx_end; ++i) 47 | { 48 | for (j = 0; j < size; ++j) 49 | { 50 | for (k = 0; k < size; ++k) 51 | { 52 | c[i][j] += a[i][k] * b[k][j]; 53 | } 54 | } 55 | } 56 | } 57 | 58 | /* Print values of matrices A, B, and C = A*B */ 59 | void sanity_check() 60 | { 61 | printf("A:\n"); 62 | print_matrix(a); 63 | printf("\n"); 64 | 65 | printf("B:\n"); 66 | print_matrix(b); 67 | printf("\n"); 68 | 69 | multiply(0, size-1); 70 | printf("C = A*B:\n"); 71 | print_matrix(c); 72 | } 73 | 74 | int main(int argc, char **argv) 75 | { 76 | int rank, nproc; 77 | int idx_start, idx_end; 78 | 79 | /* TODO: Initialize MPI subsystem 80 | will need MPI_COMM_WORLD as parameter what does it do?*/ 81 | 82 | /* TODO: deterimine (1) total number of 
MPI nodes (nproc) and 83 | (2) the nodes own MPI rank (rank)*/ 84 | 85 | /* TODO: populate matices with some values 86 | (Modify this such that only MPI node with rank 0 87 | is calling this routine!) */ 88 | init_matrices(); 89 | 90 | /* TODO: have rank 0 node BROADCAST values of 91 | matrix A, B, C to all MPI workers (workers have rank 1..(procs-1) ) 92 | (datatype: MPI_FLOAT, you may also use MPI_Scatter instead of broadcasting) */ 93 | 94 | /* TODO: partition work, that is: 95 | idx_start = (size / procs) * rank; 96 | idx_end = ? */ 97 | 98 | /* TODO: Compute matrix multiplication in assigned 99 | partition [idx_start, idx_end] */ 100 | 101 | /* TODO: Gather computed results */ 102 | 103 | /* Print matrices (sanity check, only run for small matrix size) 104 | (TODO: comment this line out when running as MPI program).*/ 105 | sanity_check(); 106 | } -------------------------------------------------------------------------------- /LAB8/mpi_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | int main(int argc, char **argv) 6 | { 7 | int rank; 8 | char hostname[256]; 9 | 10 | MPI_Init(&argc,&argv); 11 | MPI_Comm_rank(MPI_COMM_WORLD, &rank); 12 | gethostname(hostname,255); 13 | 14 | printf("Hello world! 
I am process number: %d on host %s\n", rank, hostname); 15 | 16 | MPI_Finalize(); 17 | 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /LAB8/mpilab.pbs: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #PBS -S /bin/bash 3 | 4 | # Sample script for running an MPI-based parallel program 5 | cd $PBS_O_WORKDIR 6 | echo "Current working directory is `pwd`" 7 | 8 | echo "Node file: $PBS_NODEFILE :" 9 | echo "---------------------" 10 | cat $PBS_NODEFILE 11 | echo "---------------------" 12 | 13 | # On many WestGrid systems a variable PBS_NP is automatically 14 | # assigned the number of cores requested of the batch system 15 | # and one could use 16 | # echo "Running on $PBS_NP cores." 17 | # On systems where $PBS_NP is not available, one could use: 18 | 19 | CORES=`/bin/awk 'END {print NR}' $PBS_NODEFILE` 20 | echo "Running on $CORES cores." 21 | 22 | echo "Starting run at: `date`" 23 | 24 | # On most WestGrid systems, mpiexec will automatically start 25 | # a number of MPI processes equal to the number of cores 26 | # requested. The -n arugment can be used to explicitly 27 | # use a specific number of cores. 28 | 29 | mpiexec -n ${CORES} ./mpi_test 30 | #mpiexec -n ${CORES} ./mpi_mmul 31 | echo "Program finished with exit code $? at: `date`" -------------------------------------------------------------------------------- /LAB9/GEE-README.md: -------------------------------------------------------------------------------- 1 | # Getting AbeBook's Scrunch Server running on GEE 2 | 3 | ## Log into GEE and run... 4 | 5 |
  6 | apt-get update
  7 | apt-get install -y build-essential
  8 | apt-get install -y python2.7-dev
  9 | apt-get install -y python-pip
 10 | apt-get install -y wget 
 11 | apt-get install -y unzip
 12 | apt-get install -y uwsgi uwsgi-plugin-python
 13 | 
14 | 15 | ## Prepare Web server directory and install dependencies: 16 | 17 |
 18 | cd /root
 19 | mkdir www
 20 | cd www
 21 | wget http://www.scrunch.ca/static/scrunch-code.zip
 22 | unzip scrunch-code.zip
 23 | chown www-data:www-data -R /root/www
 24 | pip install -r requirements/prod.txt
 25 | pip install -r requirements/dev.txt
 26 | 
27 | 28 | ## Install nginx: 29 | 30 |
 31 | apt-get install -y  nginx-full
 32 | rm /etc/nginx/sites-enabled/default
 33 | 
34 | 35 | ## Make nginx config file: 36 | 37 |
 38 | vi /etc/nginx/sites-enabled/scrunch
 39 | 
40 | 41 |
 42 | server {
 43 |         listen 80;
 44 |           server_name localhost;
 45 |           server_tokens off;
 46 |           access_log /var/log/nginx/scrunch_access.log;
 47 |           error_log /var/log/nginx/scrunch_error.log;
 48 | 
 49 |           location / {
 50 |                   root /root/www;
 51 |                   include uwsgi_params;
 52 |                   uwsgi_pass unix:/tmp/uwsgi.sock;
 53 |         }
 54 | 
 55 |         location /static {
 56 |                 alias /root/www;
 57 |         }
 58 | }
 59 | 
60 | 61 | 62 | ## Restart nginx: 63 | 64 |
 65 | service nginx restart
 66 | 
67 | 68 | ## Make 'Hello World' program: 69 | 70 |
 71 | cd /root/www
 72 | vi hello.py
 73 | 
74 | 75 |
 76 | from flask import Flask
 77 | app = Flask(__name__)
 78 | 
 79 | @app.route('/')
 80 | def hello_world():
 81 |     return 'Hello World!'
 82 | 
 83 | if __name__ == '__main__':
 84 |     app.run(host='0.0.0.0')
 85 | 
86 | 87 | ## Make uWSGI configuration yaml script: 88 | 89 |
 90 | vi app.yaml
 91 | uwsgi:
 92 |   socket: /tmp/uwsgi.sock
 93 |   master: 1
 94 |   workers: 1
 95 |   chmod-socket: 666
 96 |   auto-procname: 1
 97 |   plugins: python
 98 |   python-path: .
 99 |   uid: www-data
100 |   gid: www-data
101 |   pidfile: /tmp/uwsgi.pid
102 |   daemonize: /var/log/uwsgi.log
103 |   module: hello:app
104 | 
105 | 106 | ## Start up 'Hello world' program: 107 | 108 |
109 | uwsgi --yaml app.yaml
110 | 
111 | 112 | ## Setup SSH tunnel forwarding traffic to the GEE nginx server: 113 | 114 |
115 | ssh -L 8080:localhost:80 -i id_rsa -F ssh-config [your slice url] -N
116 | 
117 | 118 | ## Access your Scrunch server from your local browser: 119 | 120 |
121 | http://localhost:8080
122 | 
123 | -------------------------------------------------------------------------------- /LAB9/README.md: -------------------------------------------------------------------------------- 1 | # Installing AbeBook's Scrunch on Amazon EC2 2 | 3 | ## Get an Amazon Web Services Account 4 | 1. Get an account here: 5 | 6 | http://aws.amazon.com 7 | 8 | (you will need a Amazon Account/Credit Card & Cell phone to verify your account) 9 | 10 | 2. Create key pair (EC2 Dashboard) 11 | 12 | If you access EC2 from Linux change permission on key 13 |
 14 | 		chmod 400 mykey.pem 
 15 | 	
16 | 17 | If you access EC2 from Windows using putty you will have to convert your key from *.pem to *.ppk using puttygen, see here: 18 | 19 | http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/get-set-up-for-amazon-ec2.html#prepare-for-putty 20 | 21 | 3. Makes sure your Region is set to 'US-Oregon' (Top right EC2 menu bar) 22 | 23 | 4. Create a Security Group (EC2 Dashboard) 24 | 25 | Allow Inbound: HTTP/HTTPS/SSH 26 | 27 | 5. Launch AMI Instance (EC2 Dashboard) 28 | 29 | Check tick 'Free tier only' check box. 30 | 31 | Choose Amazon Linux AMI 2014.09.2 (HVM) 32 | 33 | Filter for 'Micro Instances' in next dialog (t2.micro) 34 | 35 | Edit Security Group 36 | 37 | 6. Connect to your Micro AWS Instance 38 | 39 | Use ssh with the key you generated/downloaded earlier. 40 | 41 |
 42 | 		ssh -i mykey.pem ec2-user@[machine IP]
 43 | 	
44 | 45 | 7. TERMINATE YOUR INSTANCE 46 | 47 | DO IT!!!!! PLEASE!!!!!! 48 | 49 | ## Install basic and required packages 50 | 51 | For the packages below are enough to run a Flask 'Hello World' program we use to get you started (you will need some more packes to run AbeBook's Scrunch server). 52 | 53 |
 54 | sudo su
 55 | yum upgrade -y
 56 | 
57 | 58 | To begin with, you must install some bare bone packages 59 | Flask, Jinja2 and Werkzeug will be automatically installed below. 60 | 61 |
 62 | yum install -y gcc-c++
 63 | yum install -y python-devel
 64 | yum -y install python-pip
 65 | 
 66 | pip install uwsgi
 67 | pip install flask
 68 | pip install flask-debugtoolbar
 69 | 
70 | 71 | ## Install and setup web-server 72 | 73 | You can host your Flask app directly, but nginx, is the better choice if you want your project to scale (you can choose other web-servers like Apache as well, but the instructions here are for nginx . 74 | 75 |
yum install -y nginx
76 | 77 | After installation, change configuration file for proxy setup. 78 | 79 |
vi /etc/nginx/nginx.conf
80 | 81 | Find the location / section, and change it to as follow: 82 | 83 |
 84 | location / {
 85 |     include uwsgi_params;
 86 |     uwsgi_pass 127.0.0.1:10080;
 87 | }
 88 | 
89 | 90 | Then, start your Web server 91 | 92 |
 93 | service nginx start
 94 | 
95 | 96 | ## Create your first 'Hello, World' Flask program 97 | 98 | In your project folder create your first python app. 99 | 100 |
101 | cd /usr/share/nginx/html/
102 | mkdir hello
103 | cd hello
104 | 
105 | 106 |
vi hello.py
107 | 108 |
109 | from flask import Flask
110 | app = Flask(__name__)
111 | 
112 | @app.route('/')
113 | def hello_world():
114 |     return 'Hello World!'
115 | 
116 | if __name__ == '__main__':
117 |     app.run(host='0.0.0.0', port=10080)
118 | 
119 | 120 | ## Setup uWSGI 121 | 122 | To configure uWSGI server create a config file as follows: 123 | 124 |
vi app.yaml
125 | 126 |
127 | uwsgi:
128 |   socket: 127.0.0.1:10080
129 |   master: 1
130 |   workers: 1
131 |   chmod-socket: 666
132 |   auto-procname: 1
133 |   python-path: .
134 |   pidfile: /tmp/uwsgi.pid
135 |   daemonize: /var/log/uwsgi.log
136 |   module: hello:app
137 | 
138 | 139 | ## Start uWSGI 140 | 141 | Using the config file, you can easily start uWSGI server: 142 | 143 |
uwsgi --yaml app.yaml
144 | 145 | In case, you want to run it as other user's, you can use --uid option additionally. 146 | And, because our config specify that uWSGI processes are executed as daemon, if you want to stop them all, you can run: 147 | 148 |
kill -INT `cat /tmp/uwsgi.pid`
149 | 150 | ## Installing AbeBook's Scrunch 151 | Now that you have a running Flask/nginx environment, adapt the code from AbeBook's Scrunch Server to work on your machine such that it will use a database to store the shortened URLs (you may use sqlite as a basic key value store or get fancier and make your key value store distributed using additional AWS instances ). 152 | 153 | Get code for scrunch here: 154 | 155 | http://www.scrunch.ca 156 | 157 |
158 | wget http://www.scrunch.ca/static/scrunch-code.zip
159 | 
160 | 161 | ## Shut down your running AWS instances! 162 | - In the EC2 Dashboard click 'Running Instances' and right click on your instance in the list and select 'Instance State' > 'Terminate'. 163 | 164 | -------------------------------------------------------------------------------- /Lab-10/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## Scaling Challenges and Project Repos set up! 3 | 4 | 5 | ### In Part 1 of this lab, you will design/implement/test as many strategies as you can to continue to scale your Scruncher! 6 | 7 | 1. You want to provide a list of the top 10 trending links, how do you do it? 8 | 2. A market research firm wants to pay you big bucks for a daily report with usage statistics, how do you do it? 9 | 3. Users are complaining they're receiving links to known malware and phishing websites, what do you do? 10 | 4. Users are asking for the ability to share links with only people they approve, how can you give it to them? 11 | 12 | In addition to this, think of another scaling challenge and pose for the rest to the class (and Cliff and Erik) to solve! 13 | Post this as a reply to the "My Scaling Challenge!" issue before Monday March 30th! 14 | 15 | ### In Part 2, make sure you Project 2 repo is set up. 16 | 17 | Feel free to share it under the "My Project 2" issue before the end of lab. For many of you, you are continuing to build on Project 1, so CLEARLY identify the vast improvements you have introduced relative to that early release of your system! 
:) 18 | -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Statement of Purpose 4 | 5 | The laws of most jurisdictions throughout the world automatically confer 6 | exclusive Copyright and Related Rights (defined below) upon the creator and 7 | subsequent owner(s) (each and all, an "owner") of an original work of 8 | authorship and/or a database (each, a "Work"). 9 | 10 | Certain owners wish to permanently relinquish those rights to a Work for the 11 | purpose of contributing to a commons of creative, cultural and scientific 12 | works ("Commons") that the public can reliably and without fear of later 13 | claims of infringement build upon, modify, incorporate in other works, reuse 14 | and redistribute as freely as possible in any form whatsoever and for any 15 | purposes, including without limitation commercial purposes. These owners may 16 | contribute to the Commons to promote the ideal of a free culture and the 17 | further production of creative, cultural and scientific works, or to gain 18 | reputation or greater distribution for their Work in part through the use and 19 | efforts of others. 20 | 21 | For these and/or other purposes and motivations, and without any expectation 22 | of additional consideration or compensation, the person associating CC0 with a 23 | Work (the "Affirmer"), to the extent that he or she is an owner of Copyright 24 | and Related Rights in the Work, voluntarily elects to apply CC0 to the Work 25 | and publicly distribute the Work under its terms, with knowledge of his or her 26 | Copyright and Related Rights in the Work and the meaning and intended legal 27 | effect of CC0 on those rights. 28 | 29 | 1. Copyright and Related Rights. A Work made available under CC0 may be 30 | protected by copyright and related or neighboring rights ("Copyright and 31 | Related Rights"). 
Copyright and Related Rights include, but are not limited 32 | to, the following: 33 | 34 | i. the right to reproduce, adapt, distribute, perform, display, communicate, 35 | and translate a Work; 36 | 37 | ii. moral rights retained by the original author(s) and/or performer(s); 38 | 39 | iii. publicity and privacy rights pertaining to a person's image or likeness 40 | depicted in a Work; 41 | 42 | iv. rights protecting against unfair competition in regards to a Work, 43 | subject to the limitations in paragraph 4(a), below; 44 | 45 | v. rights protecting the extraction, dissemination, use and reuse of data in 46 | a Work; 47 | 48 | vi. database rights (such as those arising under Directive 96/9/EC of the 49 | European Parliament and of the Council of 11 March 1996 on the legal 50 | protection of databases, and under any national implementation thereof, 51 | including any amended or successor version of such directive); and 52 | 53 | vii. other similar, equivalent or corresponding rights throughout the world 54 | based on applicable law or treaty, and any national implementations thereof. 55 | 56 | 2. Waiver. To the greatest extent permitted by, but not in contravention of, 57 | applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and 58 | unconditionally waives, abandons, and surrenders all of Affirmer's Copyright 59 | and Related Rights and associated claims and causes of action, whether now 60 | known or unknown (including existing as well as future claims and causes of 61 | action), in the Work (i) in all territories worldwide, (ii) for the maximum 62 | duration provided by applicable law or treaty (including future time 63 | extensions), (iii) in any current or future medium and for any number of 64 | copies, and (iv) for any purpose whatsoever, including without limitation 65 | commercial, advertising or promotional purposes (the "Waiver"). 
Affirmer makes 66 | the Waiver for the benefit of each member of the public at large and to the 67 | detriment of Affirmer's heirs and successors, fully intending that such Waiver 68 | shall not be subject to revocation, rescission, cancellation, termination, or 69 | any other legal or equitable action to disrupt the quiet enjoyment of the Work 70 | by the public as contemplated by Affirmer's express Statement of Purpose. 71 | 72 | 3. Public License Fallback. Should any part of the Waiver for any reason be 73 | judged legally invalid or ineffective under applicable law, then the Waiver 74 | shall be preserved to the maximum extent permitted taking into account 75 | Affirmer's express Statement of Purpose. In addition, to the extent the Waiver 76 | is so judged Affirmer hereby grants to each affected person a royalty-free, 77 | non transferable, non sublicensable, non exclusive, irrevocable and 78 | unconditional license to exercise Affirmer's Copyright and Related Rights in 79 | the Work (i) in all territories worldwide, (ii) for the maximum duration 80 | provided by applicable law or treaty (including future time extensions), (iii) 81 | in any current or future medium and for any number of copies, and (iv) for any 82 | purpose whatsoever, including without limitation commercial, advertising or 83 | promotional purposes (the "License"). The License shall be deemed effective as 84 | of the date CC0 was applied by Affirmer to the Work. Should any part of the 85 | License for any reason be judged legally invalid or ineffective under 86 | applicable law, such partial invalidity or ineffectiveness shall not 87 | invalidate the remainder of the License, and in such case Affirmer hereby 88 | affirms that he or she will not (i) exercise any of his or her remaining 89 | Copyright and Related Rights in the Work or (ii) assert any associated claims 90 | and causes of action with respect to the Work, in either case contrary to 91 | Affirmer's express Statement of Purpose. 
92 | 93 | 4. Limitations and Disclaimers. 94 | 95 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 96 | surrendered, licensed or otherwise affected by this document. 97 | 98 | b. Affirmer offers the Work as-is and makes no representations or warranties 99 | of any kind concerning the Work, express, implied, statutory or otherwise, 100 | including without limitation warranties of title, merchantability, fitness 101 | for a particular purpose, non infringement, or the absence of latent or 102 | other defects, accuracy, or the present or absence of errors, whether or not 103 | discoverable, all to the greatest extent permissible under applicable law. 104 | 105 | c. Affirmer disclaims responsibility for clearing rights of other persons 106 | that may apply to the Work or any use thereof, including without limitation 107 | any person's Copyright and Related Rights in the Work. Further, Affirmer 108 | disclaims responsibility for obtaining any necessary consents, permissions 109 | or other rights required for any use of the Work. 110 | 111 | d. Affirmer understands and acknowledges that Creative Commons is not a 112 | party to this document and has no duty or obligation with respect to this 113 | CC0 or use of the Work. 114 | 115 | For more information, please see 116 | 117 | -------------------------------------------------------------------------------- /PortalCrashUpdate.md: -------------------------------------------------------------------------------- 1 | ## GEE Portal Crash January 29 And Status 2 | 3 | ### The Crash 4 | The GEE Portal crashed at the start of CS 462, Lab #2 on Thursday, January 29, and was down for about 27 hours while the GEE team diagnosed and fixed the problem! 5 | 6 | ### The Cause 7 | There were roughly 250 simultaneous slice-creation events at the start of the lab. This caused the slice-creation scripts, which invoke Ansible to create the containers on the nodes, to overload and corrupted the Ansible template. 
8 | The 250 simultaneous slice-creation events were caused by an unfortunate combination of events. When the portal was written, slice “creation” was an instantaneous event; because populating slices on PlanetLab was somewhat slow, we had pre-allocated the slices and simply delivered the slice information and credentials to the user when he asked for a slice. However, when we moved to the Docker infrastructure, actual slice creation was faster than before, and we had the opportunity in future to permit users to use their own custom images from DockerHub. For these reasons, we decided to abandon pre-allocation, which meant that slice creation now takes on the order of a few seconds to a couple of minutes, depending on network conditions. We didn’t, however, update the portal to display an in-progress page, leaving the same page with a Get-a-Slicelet button, and no feedback to the user about what was happening. 9 | 10 | Slice creation slows down when a node is misbehaving, and by bad luck the GENI rack at the Naval Postgraduate School had gone offline. So a labful of students were staring at “Get a Slicelet!” screens, which weren’t responding. The natural assumption on a user’s part is that the link was broken, or not being followed, so naturally a number clicked a few times. Fifteen students clicking six times a minute (not unreasonable) can generate 250 clicks in an hour; worse, we weren’t checking whether we were already processing another request for that student, and this caused the crash. 11 | 12 | ### What We’ve Done 13 | First, when we get a slice-create request, we check to see if we’re already doing a request for that user. If we are, we simply hand him his dashboard. Second, when we get a slice-create request, the first thing we do is display an in-progress page to the user, with a link to take him to his dashboard. Third, we now display the slice status (Processing/Running/Error) on the dashboard. 
“Processing” means that the slice is in the process of being created. Fourth, we prevent the user from deleting a slice that is still in-process. The combination of these things should give the user better visibility into what’s going on with his slice status, and prevent incidents like the one we had last week. Further, since we’re handing the user his slice credentials and fabfile, he can start work on his own machine even before his slice gets into Running state. 14 | 15 | ### Still In The Works 16 | We want to make slice creation faster. The only thing which slows it down right now is a malfunctioning node, so we’re going to detect this and simply omit misbehaving nodes from the slice. 17 | -------------------------------------------------------------------------------- /ProjectIdea.md: -------------------------------------------------------------------------------- 1 | ## Project Idea: RPC and Two-Phase Commit! 2 | #### Due Feb 23, work in groups of 2-3. 3 | 4 | This is a *very* classic exercise for all students, everywhere, learning about distributed systems! It is a great thought exercise, and will use things we are/will be talking about in the course. It has 3 pieces: 5 | 6 | 1. A durable key/value store. 7 | 2. A two-phase commit protocol to coordinate writes to replicas of the key/value store. 8 | 3. RPC to coordinate the replica processes. 9 | 10 | ## What is a durable key/value store? 11 | 12 | Using something like an sqlite database library (you can write your own, but you really don't have to!) make sure you support the following 3 operations: 13 | 14 | * put(k, v): stores the value "v" with key "k" 15 | * del(k): deletes the entry for key "k" 16 | * v = get(k): returns the value associated with key "k" 17 | 18 | in a way that is "durable": if your process quits and is restarted, all values that were "put" must be retrievable using a "get", and any value that was deleted must not be available. 
19 | 20 | ## What does this have to do with Two-Phase Commit? 21 | 22 | It is great that your key/value store is durable, but now your job is to replicate it by using a single *coordinator* process with multiple *replica* processes. 23 | 24 | The *coordinator* process should use RPC (use a library for this too, you could write your own but you don't have to!) for the 3 operations (put, del, get). Any state changing operations (put, del), should use two-phase commit to coordinate the operation between replicas. Operations that preserve state (get), can just be sent to a replica at random. Multiple clients should be able to connect to the coordinator and issue requests concurrently. 25 | 26 | The *replicas* should each be an instance of your durable key/value store, and expose an RPC interface for the coordinator to interact with them. Operations should be allowed to execute concurrently. To keep it simple, assume that if two or more operations manipulate the same key, the first to arrive should be able to proceed while the others should not be able to commit. 27 | 28 | Note that two-phase commit requires that the coordinator and the replicas will need to use logging to keep durable state. This means you need to: 29 | 30 | * implement logging 31 | * figure out what to log 32 | * determine how to replay the log when recovering 33 | 34 | ## Is this a Fault-Tolerant System? 35 | 36 | YES! If either the master or any of the replicas fail, you need to make sure your two-phase commit protocol does the right thing... and even more importantly, when whoever failed recovers, your two-phase commit needs to recover and proceed in the right way! 37 | 38 | ## How do you test this?! 39 | 40 | Implement a client program to interact with the coordinator, which in turn will interact with your replicas. 
You don't have to be too fancy with the configuration of the system, but you will have to figure out how to (at the very least) fire up a bunch of processes and hard wire which process exist and where they are running to configure your system. 41 | 42 | Launch multiple clients to hammer on the coordinator concurrently. Test as many cases as you can, in particular try knocking out the master during the period of uncertainty! 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Course Logo](https://cloud.githubusercontent.com/assets/1288637/5566593/a64ba4bc-8ee2-11e4-8612-28191f82fdd1.png) 2 | 3 | ## [UVic Dept of Computer Science](https://www.csc.uvic.ca/) 4 | ### CSC 462/562 Spring 2015! 5 | 6 | Welcome to our extravaganza in building cool systems that involve multiple computational devices, CSC 462/562 Distributed Computing. This course is for fearless 4th year [undergrads](http://courses.seng.uvic.ca/courses/2015/spring/csc/462) and [grads](http://courses.seng.uvic.ca/courses/2015/spring/csc/562). 7 | 8 | Date | Topics | Homework 9 | :-----:| ----------------- | ----- 10 | Jan 5 | Intro and basics of Distributed Systems | Read [Tutorial](http://www.hpcs.cs.tsukuba.ac.jp/~tatebe/lecture/h23/dsys/dsd-tutorial.html) and do exercises 3, 9, 11 (post to coursespaces!) 
11 | Jan 8 | Intro to RPC, DNS | Read [RPC](http://research.cs.wisc.edu/areas/os/Qual/papers/rpc.pdf), [DNS](http://pages.cs.wisc.edu/~akella/CS740/S08/740-Papers/MD88.pdf), Post comments and prepare for Lab 12 | Jan 12 | Intro to Keys and [Lab 1] (https://github.com/ycoady/UVic-Distributed-Systems/tree/master/LAB1) | Post comments for papers 13 | Jan 15 | RPC versus LPC | Post comments for papers 14 | Jan 19 | DNS and NTP and [Lab 2] (https://github.com/ycoady/UVic-Distributed-Systems/tree/master/LAB2) | Read [Logical Clocks](http://web.stanford.edu/class/cs240/readings/lamport.pdf) and [Distributed Snapshots](http://research.microsoft.com/en-us/um/people/lamport/pubs/chandy.pdf) 15 | Jan 22 | Intro to Logical Clocks | Post comments on Logical Clocks 16 | Jan 26 | Intro GEE (Rick) and [Lab 3] (https://github.com/ycoady/UVic-Distributed-Systems/tree/master/LAB3) | Post comments on Distributed Snapshots, Read [Two Phase Commit (Chapter 7, up to 7.5) ](http://research.microsoft.com/en-us/people/philbe/chapter7.pdf) 17 | Jan 29 | Overlay Networks (Rick) | Update on [Portal Crash] (https://github.com/ycoady/UVic-Distributed-Systems/blob/master/PortalCrashUpdate.md) 18 | Feb 2 | Microsoft (Rob) | Intro to [Lab 4] (https://github.com/ycoady/UVic-Distributed-Systems/tree/master/LAB4) and Check out the [BitTorrent Protocol] (http://www.bittorrent.org/beps/bep_0003.html) 19 | Feb 5 | Distributed Snapshots and Projects | What questions are you able to address on this [checklist] (http://monkey.org/~marius/checklist.pdf)? 20 | Feb 9/12 | Happy Family Day and Reading Week! | This classic paper (particularly the lattice!) clarifies issues involving [Consistent Global States] (http://www.eecs.harvard.edu/cs262/Readings/babaoglu93consistent.pdf) 21 | Feb 16/19 | Projects and Midterm! | This on the CAP Theorem [retrospective] (http://www.infoq.com/articles/cap-twelve-years-later-how-the-rules-have-changed) helps in understanding how distributed systems have evolved! 
22 | Feb 23 | Project 2 ideas! | GEE this is [terrific!]() 23 | Feb 26 | Peter from Heroku | Post any questions on forum! He will ask you about the CAP Theorum! 24 | Mar 2 | Abebooks Scrunch Part1 | Meet in ECS 660 25 | Mar 5/9/12 | Project Presentations! | 10 minutes, can use laptop/blackboard, be ready to ask questions! 26 | Mar 16 | Abebooks Scrunch Part2 | Meet in ECS 660 27 | Mar 19 | Do incentives work? | Read [BitTyrant](http://sing.stanford.edu/cs303-sp11/papers/BitTyrant.pdf) for Monday 28 | Mar 26 | Midterm | 29 | Mar 30 | Final Scrunch! | Meet in ECS 660 30 | Apr 2 | Project Presentations | Final wrap up! 31 | --------------------------------------------------------------------------------