├── .gitignore
├── Makefile
├── meson.build
├── plot_mig_mon_spike
├── utils.h
├── LICENSE
├── utils.c
├── README.md
├── mig_mon.h
├── mig_mon.c
├── mm_dirty.c
├── downtime.c
└── vm.c


/.gitignore:
--------------------------------------------------------------------------------
1 | mig_mon
2 | *.o
3 | cscope*
4 | push.sh
5 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Convenience wrapper around the meson build.  A plain `make` runs "default".
.PHONY: default clean cscope

# Configure the "build" directory (optimization level 3, debug info enabled,
# warning level 3, warnings treated as errors), then compile inside it.
default:
	@meson setup build -Doptimization=3 -Ddebug=true -Dwarning_level=3 -Dwerror=true
	@cd build && meson compile

# Build the cscope database from the C files in this directory.
cscope:
	@cscope -bq *.c

# Remove the build directory and any cscope files.
clean:
	@rm -rf build/ cscope*
12 | 


--------------------------------------------------------------------------------
/meson.build:
--------------------------------------------------------------------------------
# The project version is whatever `git describe --tags --dirty` prints.
project('mig_mon', 'c',
        version: run_command('git', 'describe', '--tags', '--dirty', check: true)
                 .stdout().strip())

# Hand the version to the C sources as the MIG_MON_VERSION string macro.
add_project_arguments('-DMIG_MON_VERSION="' + meson.project_version() + '"',
                      language: 'c')

sources = files('mig_mon.c', 'downtime.c', 'mm_dirty.c', 'utils.c', 'vm.c')

# A single binary is built from all of the sources listed above.
executable('mig_mon', sources)
11 | 


--------------------------------------------------------------------------------
/plot_mig_mon_spike:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import json
 5 | import matplotlib.pyplot as plt
 6 | 
 7 | args = sys.argv
 8 | prog_name = args[0]
 9 | help_msg = """
10 | usage: %s <spike_log>
11 | 
12 | Parse mig_mon spike log and generate graph.
13 | """ % prog_name
14 | 
15 | def usage ():
16 |     print help_msg
17 |     sys.exit(1)
18 | 
19 | if len(args) != 2:
20 |     usage()
21 | 
22 | spike_log = args[1]
23 | 
24 | data = []
25 | spike_fd = open(spike_log)
26 | while True:
27 |     line = spike_fd.readline().strip()
28 |     if not line:
29 |         break
30 |     data.append(line.split(","))
31 | 
32 | spike_fd.close()
33 | 
34 | start_ts = int(data[0][0])
35 | results = map(lambda x: [int(x[0]) - start_ts, int(x[1])], data)
36 | axis_x = [x[0] for x in results]
37 | axis_y = [x[1] for x in results]
38 | plt.plot(axis_x, axis_y, "b-", axis_x, axis_y, "ro")
39 | plt.xlabel("Time (s)")
40 | plt.ylabel("Downtime (ms)")
41 | plt.title("200 Loops DPDK Live Migration Network Downtime")
42 | plt.show()
43 | 


--------------------------------------------------------------------------------
/utils.h:
--------------------------------------------------------------------------------
 1 | #ifndef __UTILS_H__
 2 | #define __UTILS_H__
 3 | 
 4 | #include <stdint.h>
 5 | #include <pthread.h>
 6 | 
 7 | void fd_write(int fd, void *buffer, size_t size);
 8 | void fd_read(int fd, void *buffer, size_t size);
 9 | void socket_set_fast_reuse(int fd);
10 | void pthread_set_name(pthread_t thread, const char *name);
11 | unsigned long parse_size_to_mega(const char *str);
12 | 
13 | static inline uint64_t get_usec(void)
14 | {
15 |     uint64_t val = 0;
16 |     struct timespec t;
17 |     int ret = clock_gettime(CLOCK_MONOTONIC, &t);
18 |     if (ret == -1) {
19 |         perror("clock_gettime() failed");
20 |         /* should never happen */
21 |         exit(-1);
22 |     }
23 |     val = t.tv_nsec / 1000;     /* ns -> us */
24 |     val += t.tv_sec * 1000000;  /* s -> us */
25 |     return val;
26 | }
27 | 
/* Same clock as get_usec(), scaled down to milliseconds. */
static inline uint64_t get_msec(void)
{
    const uint64_t usec = get_usec();

    return usec / 1000;
}
32 | 
/* Current calendar time, i.e. the value of time(NULL), as a uint64_t. */
static inline uint64_t get_timestamp(void)
{
    time_t now = time(NULL);

    return (uint64_t)now;
}
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Peter Xu
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/utils.c:
--------------------------------------------------------------------------------
 1 | #include "mig_mon.h"
 2 | 
 3 | /* Return 0 when succeed, 1 for retry, assert on error */
 4 | void fd_write(int fd, void *buffer, size_t size)
 5 | {
 6 |     int ret;
 7 | 
 8 | retry:
 9 |     ret = write(fd, buffer, size);
10 |     if (ret < 0)
11 |         ret = -errno;
12 |     if (ret == -EAGAIN || ret == -EINTR)
13 |         goto retry;
14 | 
15 |     assert((size_t)ret == size);
16 | }
17 | 
/*
 * Read exactly @size bytes from @fd into @buffer.
 *
 * A short read is continued until the buffer is full, and EAGAIN/EINTR are
 * simply retried.  Any other failure, or hitting end-of-file before @size
 * bytes arrived, trips the assert.  Nothing is returned.
 */
void fd_read(int fd, void *buffer, size_t size)
{
    char *cursor = buffer;
    size_t left = size;

    while (left > 0) {
        /* ssize_t rather than int, so large transfer counts do not truncate */
        ssize_t ret = read(fd, cursor, left);

        if (ret < 0 && (errno == EAGAIN || errno == EINTR))
            continue;

        /* A hard error, or end-of-file (ret == 0) */
        assert(ret > 0);
        if (ret <= 0)
            return;

        cursor += ret;
        left -= (size_t)ret;
    }
}
32 | 
/* Turn on SO_REUSEADDR for @fd; the setsockopt() call is asserted to succeed. */
void socket_set_fast_reuse(int fd)
{
    const int enable = 1;
    int rc;

    rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
    assert(rc == 0);
}
42 | 
/*
 * Give @thread the name @name.
 *
 * Only implemented on Linux, via pthread_setname_np(), where a failure trips
 * the assert.  On other systems this is a no-op.
 */
void pthread_set_name(pthread_t thread, const char *name)
{
#ifdef __linux__
    int ret = pthread_setname_np(thread, name);
    assert(ret == 0);
    /* Keep "ret" used when assert() is compiled out */
    (void)ret;
#else
    /* Mark unused, otherwise a -Wextra -Werror build fails here */
    (void)thread;
    (void)name;
#endif
}
50 | 
/*
 * Parse a size such as "2G" and return it in megabytes.
 *
 * The number may be followed by a unit: m/M (the default when no unit is
 * given), g/G or t/T.  Only the first character after the number is looked
 * at, so "2GB" is accepted as well.
 *
 * On bad input (no digits, zero, a negative number, an out-of-range number,
 * an unknown unit, or a result that does not fit in 64 bits) a message is
 * printed and the process exits.
 */
uint64_t parse_size_to_mega(const char *str)
{
    unsigned long long value;
    uint64_t n = 1;
    char *endptr;

    errno = 0;
    value = strtoull(str, &endptr, 10);
    /*
     * strtoull() accepts "-N" and returns the negated value converted to
     * unsigned, so a leading minus sign has to be refused explicitly.
     */
    if (endptr == str || value == 0 || errno == ERANGE ||
        str[strspn(str, " \t\n\v\f\r")] == '-') {
        printf("Unknown size string: '%s'\n", str);
        exit(-1);
    }

    switch (*endptr) {
    case 't':
    case 'T':
        n *= 1024;
        /* fall through */
    case 'g':
    case 'G':
        n *= 1024;
        /* fall through */
    case 'm':
    case 'M':
    case '\0': /* This means, no unit, so MB by default */
        break;
    default:
        printf("Unknown unit '%c', try something else (MB/GB/...)\n", *endptr);
        exit(-1);
        break;
    }

    /* Refuse sizes whose megabyte count would wrap around */
    if (value > UINT64_MAX / n) {
        printf("Size too large: '%s'\n", str);
        exit(-1);
    }

    return value * n;
}
84 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | About
  2 | ======
  3 | 
  4 | `mig_mon` is the short form of `Migration Monitor`.  It's a set of tools for VM
  5 | migration testing and debugging.
  6 | 
  7 | Features
  8 | ===========
  9 | 
 10 | `mig_mon` provides a few sub-commands to use.
 11 | 
 12 | VM Live Migration Network Emulator
 13 | ----------------------------------------
 14 | 
 15 | This sub-tool can be used to emulate live migration TCP streams.
 16 | 
 17 | There are two types of live migration: (1) precopy and (2) postcopy.  This tool
 18 | can emulate (1), (2), or both (1+2) by specifying different '-t' parameters:
 19 | 
 20 |   - Enable precopy only: it emulates a TCP_STREAM workload from src->dst
 21 |   - Enable postcopy only: it emulates a TCP_RR workload from dst->src
 22 |   - Enable both: it emulates the above TCP_STREAM+TCP_RR on the same socket
 23 | 
 24 | For precopy stream, it's the bandwidth that matters.  The bandwidth
 25 | information will be dumped per-second on src VM.
 26 | 
 27 | For postcopy stream, it's the latency that matters.  The average/maximum
 28 | latency value of page requests will be dumped per-second on dst VM.
 29 | 
 30 | This sub-command has below parameters:
 31 | 
 32 |     ./mig_mon vm [options...]
 33 |       -d:    Emulate a dst VM
 34 |       -h:    Dump help message
 35 |       -H:    Specify dst VM IP (required for -s)
 36 |       -s:    Emulate a src VM
 37 |       -S:    Specify size of the VM (GB)
 38 |       -t:    Specify tests (precopy, postcopy)
 39 | 
 40 | Example usage:
 41 | 
 42 | To start the (emulated) destination VM, one can run this on dest host:
 43 | 
 44 |     ./mig_mon vm -d
 45 | 
 46 | Then, to start a src VM emulation and start both live migration streams,
 47 | one can run this command on src host:
 48 | 
 49 |     ./mig_mon vm -s -H $DEST_IP -t precopy -t postcopy
 50 | 
 51 | Specifying both '-t' will just enable both migration streams.
 52 | 
 53 | Memory Dirty
 54 | --------------
 55 | 
 56 | Sub-command "mm_dirty" can generate a constant dirty workload in the system.
 57 | 
 58 |     ./mig_mon mm_dirty [options...]
 59 |       -h:    Dump help message for mm_dirty sub-cmd
 60 |       -m:    Memory size in MB (default: 512)
 61 |       -r:    Dirty rate in MB/s (default: unlimited)
 62 |       -p:    Work pattern: "sequential", "random", or "once"
 63 |              (default: "sequential")
 64 |       -L:    Record and report memory access latencies
 65 |       -P:    Page size: "2m" or "1g" for huge pages
 66 | 
 67 | To generate a random dirty workload of 500MB/s upon 2GB memory range, we can
 68 | use:
 69 | 
 70 |     ./mig_mon mm_dirty -m 2000 -r 500 -p random
 71 |     
 72 | The dirty workload always dirties memory in 4K steps (even if huge pages are
 73 | used), because hypervisors normally track dirty pages at small-page granularity.
 74 | 
 75 | Pre-heat will be done before starting the real workload.
 76 | 
 77 | ### Memory Access Latency Measurement
 78 | 
 79 | *mm_dirty* can also measure memory access latencies while writing to
 80 | memory.  This is mostly useful when there is a potential cause of high
 81 | memory access latency (e.g. the VM is undergoing a postcopy live
 82 | migration): we can then get a distribution of memory access latencies for
 83 | the whole process.
 84 | 
 85 | To record and report memory access latencies, simply attach parameter *-L*
 86 | to the *mm_dirty* command.  Below is an example to start sequential writes
 87 | upon 16GB memory, measure / report memory access latencies:
 88 | 
 89 |     ./mig_mon mm_dirty -m 16G -L
 90 | 
 91 | The result (on a bare metal host) can look like this:
 92 | 
 93 |             1 (us): 23372101
 94 |             2 (us): 2399961
 95 |             4 (us): 2168
 96 |             8 (us): 1454
 97 |            16 (us): 76
 98 |            32 (us): 5
 99 |            64 (us): 0
100 |           128 (us): 0
101 |           256 (us): 0
102 |           512 (us): 0
103 |          1024 (us): 0
104 |          2048 (us): 0
105 |          4096 (us): 0
106 |          8192 (us): 0
107 |         16384 (us): 0
108 |         32768 (us): 0
109 |         65536 (us): 0
110 |        131072 (us): 0
111 |        262144 (us): 0
112 |        524288 (us): 0
113 |       1048576 (us): 0
114 | 
115 | Note that there are 21 buckets, each labelled with a power of 2.  For
116 | example, the number shown in bucket *8 (us)* means there were 1454 memory
117 | accesses that took *no more than 8 microseconds* to finish (but more than
118 | *4us*, or they would have fallen into the previous bucket).  The same applies
119 | to the remaining buckets.
120 | 
121 | Here only the last bucket is special: anything bigger than 1sec will be put
122 | there.
123 | 
124 | Network Downtime Measurement
125 | ---------------------------------
126 | 
127 | Sub-commands "server_rr/client_rr" can be used to measure guest OS network
128 | downtime during migration.  To use them, we can first start the UDP echo
129 | server in the guest using:
130 | 
131 |     ./mig_mon server_rr
132 |     
133 | Then from outside the guest, we can start the client, which sends a packet
134 | at a constant interval (e.g. 50ms) and waits for a response:
135 | 
136 |     ./mig_mon client_rr $GUEST_IP 50 $LOG
137 |     
138 | The client side will record the latency of each packet received, recording
139 | spikes into $LOG and also show the maximum latency detected.
140 | 


--------------------------------------------------------------------------------
/mig_mon.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MIG_MON_H__
  2 | #define __MIG_MON_H__
  3 | 
  4 | #ifdef __linux__
  5 | #define _GNU_SOURCE
  6 | #endif
  7 | 
  8 | #include <assert.h>
  9 | #include <stdio.h>
 10 | #include <string.h>
 11 | #include <unistd.h>
 12 | #include <stdlib.h>
 13 | #include <sys/socket.h>
 14 | #include <sys/types.h>
 15 | #include <sys/stat.h>
 16 | #include <sys/mman.h>
 17 | #include <netinet/in.h>
 18 | #include <arpa/inet.h>
 19 | #include <stdint.h>
 20 | #include <inttypes.h>
 21 | #include <time.h>
 22 | #include <fcntl.h>
 23 | #include <errno.h>
 24 | #include <pthread.h>
 25 | #include <stdbool.h>
 26 | 
 27 | #ifdef __linux__
 28 | #include <linux/mman.h>
 29 | #endif
 30 | 
 31 | #include "utils.h"
 32 | 
 33 | #define  MAX(a, b)  ((a > b) ? (a) : (b))
 34 | #define  MIN(a, b)  ((a < b) ? (a) : (b))
 35 | 
/*
 * debug() forwards its arguments to printf() when the build defines DEBUG;
 * otherwise it expands to nothing.
 */
#ifdef DEBUG
#define  debug(...)  printf(__VA_ARGS__)
#else
#define  debug(...)
#endif
 41 | 
/*
 * Work patterns of the mm_dirty sub-command.  The values double as indexes
 * into pattern_str[], which holds the matching command-line names.
 */
typedef enum {
    /* Dirty the pages one after another, in address order */
    PATTERN_SEQ = 0,
    /* Dirty a randomly chosen page each time */
    PATTERN_RAND = 1,
    /* Only prefault the memory, then sleep */
    PATTERN_ONCE = 2,
    /* Number of patterns; not a pattern itself */
    PATTERN_NUM,
} dirty_pattern;
 48 | 
/* whether allow client change its IP */
#define  MIG_MON_SINGLE_CLIENT       (0)
/* Initial value of mig_mon_port */
#define  MIG_MON_PORT                (12323)
/* Default client interval, in milliseconds */
#define  MIG_MON_INT_DEF             (1000)
/* NOTE(review): not used in mig_mon.c/mm_dirty.c/utils.c; confirm its users */
#define  BUF_LEN                     (1024)
/* Default path of the spike log */
#define  MIG_MON_SPIKE_LOG_DEF       ("/tmp/spike.log")
/* Default mm_dirty memory size, in MB */
#define  DEF_MM_DIRTY_SIZE           (512)
/* Default mm_dirty work pattern */
#define  DEF_MM_DIRTY_PATTERN        PATTERN_SEQ
 57 | 
 58 | /******************
 59 |  * For mig_mon.c  *
 60 |  ******************/
 61 | extern short mig_mon_port;
 62 | extern long n_cpus;
 63 | extern long page_size, huge_page_size;
 64 | extern const char *pattern_str[PATTERN_NUM];
 65 | extern const char *prog_name;
 66 | 
 67 | /******************
 68 |  * For downtime.c *
 69 |  ******************/
 70 | 
 71 | /* Mig_mon callbacks. Return 0 for continue, non-zero for errors. */
 72 | typedef int (*mon_server_cbk)(int sock, int spike_fd);
 73 | typedef int (*mon_client_cbk)(int sock, int spike_fd, int interval_ms);
 74 | 
 75 | int mon_server_callback(int sock, int spike_fd);
 76 | int mon_server_rr_callback(int sock, int spike_fd);
 77 | int mon_server(const char *spike_log, mon_server_cbk server_callback);
 78 | int mon_client_callback(int sock, int spike_fd, int interval_ms);
 79 | int mon_client_rr_callback(int sock, int spike_fd, int interval_ms);
 80 | int mon_client(const char *server_ip, int interval_ms,
 81 |                const char *spike_log, mon_client_cbk client_callback);
 82 | void usage_downtime_short(void);
 83 | void usage_downtime(void);
 84 | 
 85 | /******************
 86 |  * For mm_dirty.c *
 87 |  ******************/
/* Parameters of the mm_dirty sub-command, filled in by main() from its options */
typedef struct {
    /* Size of the memory to test on (in MB) */
    uint64_t mm_size;
    /* Dirty rate (in MB/s); 0 is reported as "Maximum", i.e. no limit set */
    uint64_t dirty_rate;
    /* mmap() flags to pass over */
    unsigned int map_flags;
    /* Dirty pattern */
    dirty_pattern pattern;
    /* Whether we're recording the memory access latencies */
    bool record_latencies;
} mm_dirty_args;
100 | int mon_mm_dirty(mm_dirty_args *args);
101 | void usage_mm_dirty_short(void);
102 | void usage_mm_dirty(void);
103 | 
104 | /**************
105 |  * For vm.c   *
106 |  **************/
107 | 
108 | /* If set, will generate precopy live migration stream */
109 | #define  VM_TEST_PRECOPY     (1UL << 0)
110 | /* If set, will generate postcopy page requests */
111 | #define  VM_TEST_POSTCOPY    (1UL << 1)
112 | 
113 | #define  DEF_VM_SIZE              (1UL << 40)  /* 1TB */
114 | 
/* Which side of the migration the vm sub-command emulates */
typedef enum {
    /* Nothing chosen; what main() starts with before parsing options */
    EMULATE_NONE = 0,
    /* Source VM, selected by -s */
    EMULATE_SRC = 1,
    /* Destination VM, selected by -d */
    EMULATE_DST = 2,
    /* Number of values; not a target itself */
    EMULATE_NUM,
} emulate_target;
121 | 
/*
 * Arguments and run-time state of the vm sub-command, handed to mon_vm().
 * main() fills in target, tests, vm_size and src_target_ip from the command
 * line; every other field starts out zeroed.
 */
typedef struct {
    /* NOTE(review): presumably the migration stream socket; confirm in vm.c */
    int sock;
    /* Side to emulate (-s / -d) */
    emulate_target target;
    /* Bitmask of VM_TEST_* flags; every -t option ORs one in */
    unsigned int tests;
    /* Whether we should quit */
    int quit;
    /* Guest memory size (emulated), in bytes */
    uint64_t vm_size;
    /*
     * Both the src/dst VMs have these threads, even if they do not mean the
     * same workload will be run, we share the fields.
     */
    pthread_t sender;
    pthread_t receiver;

    /*
     * Maintaining receiving sockets
     */
    /* Size = DEF_IO_BUF_SIZE */
    char *recv_buffer;
    /* Length of data consumed */
    int recv_cur;
    /* Length of data in recv_buffer */
    int recv_len;

    /*
     * When on src: used to emulate page req queue.
     * When on dst: used to notify when a page req is resolved.
     *
     * Data is page offset (u64), always.
     */
    int page_req_pipe[2];

    /* The two sides never coexist in one process, so their state overlaps */
    union {
        /* Only needed on src VM */
        struct {
            /* Size = MAX_IOV_SIZE * DEF_IO_BUF_SIZE */
            struct iovec *src_iov_buffer;
            /* Dest VM ip (copy of the -H argument) */
            const char *src_target_ip;
            /* Points to the current IOV being used */
            int src_cur;
            /* Length of current IOV that has been consumed */
            size_t src_cur_len;
        };
        /* Only needed on dst VM */
        struct {
            /* Current page to request */
            uint64_t dst_current_req;
        };
    };
} vm_args;
174 | 
175 | int mon_vm(vm_args *args);
176 | void usage_vm(void);
177 | void usage_vm_short(void);
178 | 
179 | #endif
180 | 


--------------------------------------------------------------------------------
/mig_mon.c:
--------------------------------------------------------------------------------
  1 | #include "mig_mon.h"
  2 | 
  3 | const char *prog_name = NULL;
  4 | long n_cpus;
  5 | short mig_mon_port = MIG_MON_PORT;
  6 | /*
  7 |  * huge_page_size stands for the real page size we used.  page_size will always
  8 |  * be the smallest page size of the system, as that's the size that guest
  9 |  * hypervisor will track dirty.
 10 |  */
 11 | long page_size, huge_page_size;
 12 | const char *pattern_str[PATTERN_NUM] = { "sequential", "random", "once" };
 13 | 
 14 | void version(void)
 15 | {
 16 |     printf("Version: %s\n", MIG_MON_VERSION);
 17 |     puts("");
 18 | }
 19 | 
 20 | void usage(void)
 21 | {
 22 |     puts("");
 23 |     puts("This tool is a toolset of VM live migration testing & debugging.");
 24 |     puts("For detailed usage, please try '-h/--help' for each sub-command.");
 25 |     puts("");
 26 |     puts("Usage:");
 27 |     printf("       %s [-h|--help]\tShow full help message\n", prog_name);
 28 |     usage_downtime_short();
 29 |     usage_mm_dirty_short();
 30 |     usage_vm_short();
 31 |     puts("");
 32 | }
 33 | 
 34 | dirty_pattern parse_dirty_pattern(const char *str)
 35 | {
 36 |     int i;
 37 | 
 38 |     for (i = 0; i < PATTERN_NUM; i++) {
 39 |         if (!strcmp(pattern_str[i], str)) {
 40 |             return i;
 41 |         }
 42 |     }
 43 | 
 44 |     /* Let's allow some short forms.. */
 45 |     if (!strcmp(str, "seq"))
 46 |         return PATTERN_SEQ;
 47 |     else if (!strcmp(str, "ran"))
 48 |         return PATTERN_RAND;
 49 | 
 50 |     fprintf(stderr, "Dirty pattern unknown: %s\n", str);
 51 |     exit(1);
 52 | }
 53 | 
 54 | int parse_huge_page_size(const char *size)
 55 | {
 56 | #ifdef __linux__
 57 |     if (!strcmp(size, "2m") || !strcmp(size, "2M")) {
 58 |         huge_page_size = 2UL << 20;
 59 |         return MAP_HUGETLB | MAP_HUGE_2MB;
 60 |     } else if (!strcmp(size, "1g") || !strcmp(size, "1G")) {
 61 |         huge_page_size = 1UL << 30;
 62 |         return MAP_HUGETLB | MAP_HUGE_1GB;
 63 |     } else if (!strcmp(size, "4k") || !strcmp(size, "4K")) {
 64 |         return 0;
 65 |     } else {
 66 |         printf("Unknown page size (%s), please specify 4K/2M/1G\n", size);
 67 |         exit(1);
 68 |     }
 69 | #else
 70 |     printf("Specify page size is not supported on non-Linux arch yet.\n");
 71 |     exit(1);
 72 | #endif
 73 | }
 74 | 
 75 | unsigned int vm_test_parse(const char *name)
 76 | {
 77 |     if (!strcmp(name, "precopy"))
 78 |         return VM_TEST_PRECOPY;
 79 |     else if (!strcmp(name, "postcopy"))
 80 |         return VM_TEST_POSTCOPY;
 81 |     printf("Unknown vm test type: '%s'\n", name);
 82 |     exit(1);
 83 | }
 84 | 
/*
 * Entry point: argv[1] selects the sub-command and the remaining arguments
 * belong to it.
 *
 * Returns -1 after printing help or version text, or on a usage error;
 * otherwise returns whatever the selected sub-command returned.
 */
int main(int argc, char *argv[])
{
    int ret = 0;
    int interval_ms = MIG_MON_INT_DEF;
    const char *work_mode = NULL;
    const char *server_ip = NULL;
    const char *spike_log = MIG_MON_SPIKE_LOG_DEF;

    /* Globals shared with the sub-commands */
    n_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    /* Both start as the base page size; -P of mm_dirty may raise the latter */
    page_size = huge_page_size = getpagesize();

    prog_name = argv[0];

    if (argc == 1) {
        usage();
        version();
        return -1;
    }

    /*
     * NOTE(review): mm_dirty.c draws its random pages from random(); whether
     * srand() also seeds that generator depends on the C library - confirm.
     */
    srand(time(NULL));

    work_mode = argv[1];

    if (!strcmp(work_mode, "-h") || !strcmp(work_mode, "--help")) {
        /* Full help: the detailed usage of every sub-command */
        usage_downtime();
        usage_mm_dirty();
        usage_vm();
        return -1;
    } else if (!strcmp(work_mode, "-v") || !strcmp(work_mode, "--version")) {
        version();
        return -1;
    } else if (!strcmp(work_mode, "server")) {
        /* server [spike_log] */
        puts("starting server mode...");
        if (argc >= 3) {
            spike_log = argv[2];
        }
        ret = mon_server(spike_log, mon_server_callback);
    } else if (!strcmp(work_mode, "client")) {
        /* client <server_ip> [interval_ms]; no spike log is passed on */
        if (argc < 3) {
            usage_downtime();
            return -1;
        }
        server_ip = argv[2];
        if (argc >= 4) {
            interval_ms = strtol(argv[3], NULL, 10);
        }
        puts("starting client mode...");
        printf("server ip: %s, interval: %d (ms)\n", server_ip, interval_ms);
        ret = mon_client(server_ip, interval_ms, NULL, mon_client_callback);
    } else if (!strcmp(work_mode, "server_rr")) {
        /* server_rr takes no arguments and no spike log */
        printf("starting server_rr...\n");
        ret = mon_server(NULL, mon_server_rr_callback);
    } else if (!strcmp(work_mode, "client_rr")) {
        /* client_rr <server_ip> [interval_ms [spike_log]] */
        if (argc < 3) {
            usage_downtime();
            return -1;
        }
        server_ip = argv[2];
        if (argc >= 4) {
            interval_ms = strtol(argv[3], NULL, 10);
        }
        if (argc >= 5) {
            spike_log = argv[4];
        }
        ret = mon_client(server_ip, interval_ms, spike_log,
                         mon_client_rr_callback);
    } else if (!strcmp(work_mode, "vm")) {
        vm_args args = {
            .target = EMULATE_NONE,
            .vm_size = DEF_VM_SIZE,
        };
        int c;

        /* Skip argv[0] so that getopt() sees "vm" as the program name */
        while ((c = getopt(argc-1, argv+1, "dhp:st:H:S:")) != -1) {
            switch (c) {
            case 'd':
                args.target = EMULATE_DST;
                break;
            case 'p':
                mig_mon_port = atoi(optarg);
                break;
            case 's':
                args.target = EMULATE_SRC;
                break;
            case 't':
                /* May be given more than once; the flags accumulate */
                args.tests |= vm_test_parse(optarg);
                break;
            case 'H':
                args.src_target_ip = strdup(optarg);
                break;
            case 'S':
                /* Given in GB, stored in bytes */
                args.vm_size = atoi(optarg) * (1UL << 30);
                break;
            case 'h':
            default:
                usage_vm();
                return -1;
            }
        }

        ret = mon_vm(&args);
    } else if (!strcmp(work_mode, "mm_dirty")) {
        mm_dirty_args args = {
            .dirty_rate = 0,
            .mm_size = DEF_MM_DIRTY_SIZE,
            .pattern = DEF_MM_DIRTY_PATTERN,
            .map_flags = MAP_ANONYMOUS | MAP_PRIVATE,
            .record_latencies = false,
        };
        int c;

        /* Skip argv[0] so that getopt() sees "mm_dirty" as the program name */
        while ((c = getopt(argc-1, argv+1, "hLm:p:P:r:")) != -1) {
            switch (c) {
            case 'm':
                args.mm_size = parse_size_to_mega(optarg);
                break;
            case 'r':
                args.dirty_rate = parse_size_to_mega(optarg);
                break;
            case 'p':
                args.pattern = parse_dirty_pattern(optarg);
                break;
            case 'L':
                args.record_latencies = true;
                break;
            case 'P':
                /* Adds the huge page flags on top of the default ones */
                args.map_flags |= parse_huge_page_size(optarg);
                break;
            case 'h':
            default:
                usage_mm_dirty();
                return -1;
            }
        }

        /*
         * We should have consumed all parameters.  This will dump an error if
         * the user used the old mig_mon mm_dirty parameters.
         */
        if (optind != argc-1) {
            printf("Unknown extra parameters detected.\n");
            usage_mm_dirty();
            return -1;
        }

        ret = mon_mm_dirty(&args);
    } else {
        /* Unknown sub-command */
        usage();
        return -1;
    }

    return ret;
}
238 | 


--------------------------------------------------------------------------------
/mm_dirty.c:
--------------------------------------------------------------------------------
  1 | #include "mig_mon.h"
  2 | #include <signal.h>
  3 | 
  4 | #define N_1M (1024 * 1024)
  5 | 
/* Work item of one prefault thread */
struct thread_info {
    /* First byte of the range this thread touches */
    unsigned char *buf;
    /* Number of page_size-sized steps to touch, starting at buf */
    unsigned long pages;
};
 10 | 
 11 | static void prefault_range(unsigned char *buf, unsigned long pages)
 12 | {
 13 |     unsigned long index = 0;
 14 | 
 15 |     while (index < pages) {
 16 |         *(buf) = 1;
 17 |         buf = (unsigned char *)((unsigned long)buf + page_size);
 18 | 
 19 |         /* Each 1GB for 4K page size, print a dot */
 20 |         if (++index % (256 * 1024) == 0) {
 21 |             printf(".");
 22 |             fflush(stdout);
 23 |         }
 24 |     }
 25 | }
 26 | 
 27 | static void *prefault_thread(void *data)
 28 | {
 29 |     struct thread_info *info = data;
 30 | 
 31 |     prefault_range(info->buf, info->pages);
 32 | 
 33 |     return NULL;
 34 | }
 35 | 
 36 | static void prefault_memory(unsigned char *buf, unsigned long pages)
 37 | {
 38 |     unsigned long each = pages / n_cpus;
 39 |     unsigned long left = pages % n_cpus;
 40 |     pthread_t *threads = calloc(n_cpus, sizeof(pthread_t));
 41 |     struct thread_info *infos = calloc(n_cpus, sizeof(struct thread_info));
 42 |     int i, ret;
 43 | 
 44 |     assert(threads);
 45 | 
 46 |     for (i = 0; i < n_cpus; i++) {
 47 |         struct thread_info *info = infos + i;
 48 |         pthread_t *thread = threads + i;
 49 | 
 50 |         info->buf = buf + each * page_size * i;
 51 |         info->pages = each;
 52 |         ret = pthread_create(thread, NULL, prefault_thread, info);
 53 |         assert(ret == 0);
 54 |     }
 55 | 
 56 |     if (left) {
 57 |         prefault_range(buf + each * page_size * n_cpus, left);
 58 |     }
 59 | 
 60 |     for (i = 0; i < n_cpus; i++) {
 61 |         ret = pthread_join(threads[i], NULL);
 62 |         assert(ret == 0);
 63 |     }
 64 |     printf("done\n");
 65 | }
 66 | 
 67 | static bool record_latencies;
 68 | /* 1us, 2us, 4us, ..., 1024us, 2048us, ..., 1048576us */
 69 | #define  BUCKET_SIZE  21
 70 | static uint64_t ts_bucket[BUCKET_SIZE];
 71 | 
 72 | static void mm_dirty_sig_handler(int sig)
 73 | {
 74 |     /* Mark unused */
 75 |     (void)sig;
 76 | 
 77 |     if (record_latencies) {
 78 |         unsigned int i, n = BUCKET_SIZE;
 79 | 
 80 |         puts("\nMemory Latencies:\n");
 81 |         for (i = 0; i < n; i++) {
 82 |             printf("%12u (us): %lu\n", 1<<i, ts_bucket[i]);
 83 |         }
 84 |         puts("");
 85 |     }
 86 |     exit(0);
 87 | }
 88 | 
 89 | int mon_mm_dirty(mm_dirty_args *args)
 90 | {
 91 |     unsigned char *mm_ptr, *mm_buf, *mm_end;
 92 |     /*
 93 |      * Prefault with 1, to skip migration zero detection, so the next value to
 94 |      * set is 2.
 95 |      */
 96 |     unsigned char cur_val = 2;
 97 |     uint64_t pages_per_mb = N_1M / page_size;
 98 |     uint64_t time_iter, time_now;
 99 |     uint64_t sleep_ms = 0, elapsed_ms;
100 |     uint64_t ts_start = 0, ts_lat;
101 |     uint64_t dirtied_mb = 0, mm_npages;
102 |     dirty_pattern pattern = args->pattern;
103 |     unsigned int map_flags = args->map_flags;
104 |     uint64_t dirty_rate = args->dirty_rate;
105 |     uint64_t mm_size = args->mm_size;
106 |     struct sigaction sigact = { 0 };
107 |     float speed;
108 |     uint64_t i;
109 | 
110 |     record_latencies = args->record_latencies;
111 | 
112 |     sigact.sa_handler = mm_dirty_sig_handler;
113 |     sigaction(SIGTERM, &sigact, NULL);
114 |     sigaction(SIGINT, &sigact, NULL);
115 | 
116 |     mm_buf = mmap(NULL, mm_size * N_1M, PROT_READ | PROT_WRITE,
117 |                   map_flags, -1, 0);
118 |     if (mm_buf == MAP_FAILED) {
119 |         fprintf(stderr, "%s: mmap() failed\n", __func__);
120 |         return -1;
121 |     }
122 | 
123 |     printf("Binary version: \t%s\n", MIG_MON_VERSION);
124 |     printf("Test memory size: \t%ld (MB)\n", mm_size);
125 |     printf("Backend page size: \t%ld (Bytes)\n", huge_page_size);
126 |     printf("Dirty step size: \t%ld (Bytes)\n", page_size);
127 |     if (dirty_rate) {
128 |         printf("Dirty memory rate: \t%ld (MB/s)\n", dirty_rate);
129 |     } else {
130 |         printf("Dirty memory rate: \tMaximum\n");
131 |     }
132 |     printf("Dirty pattern: \t\t%s\n", pattern_str[pattern]);
133 |     printf("Recording latencies: \t%s\n", record_latencies ? "yes" : "no");
134 | 
135 |     mm_ptr = mm_buf;
136 |     mm_end = mm_buf + mm_size * N_1M;
137 |     mm_npages = (unsigned long) ((mm_end - mm_ptr) / page_size);
138 |     time_iter = get_msec();
139 | 
140 |     puts("+------------------------+");
141 |     puts("|   Prefault Memory      |");
142 |     puts("+------------------------+");
143 |     prefault_memory(mm_buf, mm_npages);
144 | 
145 |     if (pattern == PATTERN_ONCE) {
146 |         puts("[Goes to sleep; please hit ctrl-c to stop this program]");
147 |         while (1) {
148 |             sleep(1000);
149 |         }
150 |     }
151 | 
152 |     puts("+------------------------+");
153 |     puts("|   Start Dirty Memory   |");
154 |     puts("+------------------------+");
155 | 
156 |     while (1) {
157 |         /* Dirty in MB unit */
158 |         for (i = 0; i < pages_per_mb; i++) {
159 |             if (record_latencies)
160 |                 ts_start = get_usec();
161 |             if (pattern == PATTERN_SEQ) {
162 |                 /* Validate memory if not the first round */
163 |                 unsigned char target = cur_val - 1;
164 | 
165 |                 if (*mm_ptr != target) {
166 |                     fprintf(stderr, "%s: detected corrupted memory (%d != %d)!\n",
167 |                             __func__, *mm_ptr, target);
168 |                     exit(-1);
169 |                 }
170 |                 *mm_ptr = cur_val;
171 |                 mm_ptr += page_size;
172 |             } else if (pattern == PATTERN_RAND) {
173 |                 /* Write something to a random page upon the range */
174 |                 unsigned long rand = random() % mm_npages;
175 | 
176 |                 *(mm_buf + rand * page_size) = cur_val++;
177 |             } else {
178 |                 assert(0);
179 |             }
180 |             if (record_latencies) {
181 |                 unsigned int index;
182 | 
183 |                 ts_lat = get_usec() - ts_start;
184 |                 /*
185 |                  * This puts the latency value into the bucket with index.
186 |                  * E.g., 3us will be put into <4us bucket (index=2).  So
187 |                  * it's not really accurate but just to show a pattern of
188 |                  * the latencies.
189 |                  *
190 |                  * If it goes over 1sec, always put into 1sec bucket.
191 |                  */
192 |                 if (ts_lat == 0)
193 |                     index = 0;
194 |                 else
195 |                     index = 64 - __builtin_clzll(ts_lat);
196 |                 printf("latency: %lu, index: %d\n", ts_lat, index);
197 |                 if (index > (sizeof(ts_bucket) - 1))
198 |                     index = (sizeof(ts_bucket) - 1);
199 |                 ts_bucket[index]++;
200 |             }
201 |         }
202 |         if (pattern == PATTERN_SEQ && mm_ptr + N_1M > mm_end) {
203 |             mm_ptr = mm_buf;
204 |             cur_val++;
205 |         }
206 |         dirtied_mb++;
207 |         if (dirty_rate && dirtied_mb >= dirty_rate) {
208 |             /*
209 |              * We have dirtied enough, wait for a while until we reach
210 |              * the next second.
211 |              */
212 |             sleep_ms = 1000 - get_msec() + time_iter;
213 |             if (sleep_ms > 0) {
214 |                 usleep(sleep_ms * 1000);
215 |             }
216 |             while (get_msec() - time_iter < 1000);
217 |         }
218 |         time_now = get_msec();
219 |         elapsed_ms = time_now - time_iter;
220 |         if (elapsed_ms >= 1000) {
221 |             speed = 1.0 * dirtied_mb / elapsed_ms * 1000;
222 |             printf("Dirty rate: %.0f (MB/s), duration: %"PRIu64" (ms), "
223 |                    "load: %.2f%%\n", speed, elapsed_ms,
224 |                    100.0 * (elapsed_ms - sleep_ms) / elapsed_ms);
225 |             time_iter = time_now;
226 |             sleep_ms = 0;
227 |             dirtied_mb = 0;
228 |         }
229 |     }
230 | 
231 |     /* Never reached */
232 |     return 0;
233 | }
234 | 
235 | void usage_mm_dirty_short(void)
236 | {
237 |     puts("");
238 |     printf("       %s mm_dirty [options...]\n", prog_name);
239 |     printf("       \t -h: \tDump help message for mm_dirty sub-cmd\n");
240 |     printf("       \t -m: \tMemory size in MB (default: %d)\n", DEF_MM_DIRTY_SIZE);
241 |     printf("       \t -r: \tDirty rate in MB/s (default: unlimited)\n");
242 |     printf("       \t -p: \tWork pattern: \"sequential\", \"random\", or \"once\"\n");
243 |     printf("       \t\t(default: \"%s\")\n", pattern_str[DEF_MM_DIRTY_PATTERN]);
244 |     printf("       \t -L: \tRecord and report memory access latencies\n");
245 |     printf("       \t -P: \tPage size: \"2m\" or \"1g\" for huge pages\n");
246 | }
247 | 
248 | void usage_mm_dirty(void)
249 | {
250 |     puts("");
251 |     puts("Usage:");
252 |     usage_mm_dirty_short();
253 |     puts("");
254 |     puts("======== Memory Dirty Workload ========");
255 |     puts("");
256 |     puts("This sub-tool can also generate dirty memory workload in different ways.");
257 |     puts("");
258 |     puts("Example 1: generate 100MB/s random dirty workload upon 500MB memory using:");
259 |     puts("");
260 |     printf("  %s mm_dirty -m 500M -r 100M -p random\n", prog_name);
261 |     puts("");
262 |     puts("Example 2: dirty 10GB memory then keep idle after dirtying:");
263 |     puts("");
264 |     printf("  %s mm_dirty -m 10G -p once\n", prog_name);
265 |     puts("");
266 |     puts("Example 3: dirty 1GB memory, record and report memory access latencies when quit:");
267 |     puts("");
268 |     printf("  %s mm_dirty -m 10G -L\n", prog_name);
269 |     puts("");
270 | }
271 | 


--------------------------------------------------------------------------------
/downtime.c:
--------------------------------------------------------------------------------
  1 | #include "mig_mon.h"
  2 | 
  3 | /*
  4 |  * State machine for the event handler. It just starts from 0 until
  5 |  * RUNNING.
  6 |  */
  7 | enum event_state {
  8 |     /* Idle, waiting for first time triggering event */
  9 |     STATE_WAIT_FIRST_TRIGGER = 0,
 10 |     /* Got first event, waiting for the 2nd one */
 11 |     STATE_WAIT_SECOND_TRIGGER = 1,
 12 |     /* Normal running state */
 13 |     STATE_RUNNING = 2,
 14 |     STATE_MAX
 15 | };
 16 | 
 17 | static void write_spike_log(int fd, uint64_t delay)
 18 | {
 19 |     char spike_buf[1024] = {0};
 20 |     int str_len = -1;
 21 |     str_len = snprintf(spike_buf, sizeof(spike_buf) - 1,
 22 |                        "%"PRIu64",%"PRIu64"\n", get_timestamp(), delay);
 23 |     spike_buf[sizeof(spike_buf) - 1] = 0x00;
 24 |     write(fd, spike_buf, str_len);
 25 |     /* not flushed to make it fast */
 26 | }
 27 | 
 28 | static int socket_set_timeout(int sock, int timeout_ms)
 29 | {
 30 |     struct timeval tv = {
 31 |         .tv_sec = timeout_ms / 1000,
 32 |         .tv_usec = (timeout_ms % 1000) * 1000
 33 |     };
 34 | 
 35 |     return setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
 36 |                       (void *)&tv, sizeof(tv));
 37 | }
 38 | 
 39 | /*
 40 |  * This is a state machine to handle the incoming event. Return code
 41 |  * is the state before calling this handler.
 42 |  */
 43 | static enum event_state handle_event(int spike_fd)
 44 | {
 45 |     /* Internal static variables */
 46 |     static enum event_state state = STATE_WAIT_FIRST_TRIGGER;
 47 |     static uint64_t last = 0, max_delay = 0;
 48 |     /*
 49 |      * this will store the 1st and 2nd UDP packet latency, as a
 50 |      * baseline of latency values (this is very, very possibly the
 51 |      * value that you provided as interval when you start the
 52 |      * client). This is used to define spikes, using formular:
 53 |      *
 54 |      *         spike_throttle = first_latency * 2
 55 |      */
 56 |     static uint64_t first_latency = 0, spike_throttle = 0;
 57 | 
 58 |     /* Temp variables */
 59 |     uint64_t cur = 0, delay = 0;
 60 |     enum event_state old_state = state;
 61 | 
 62 |     cur = get_msec();
 63 | 
 64 |     if (last) {
 65 |         /*
 66 |          * If this is not exactly the first event we got, we calculate
 67 |          * the delay.
 68 |          */
 69 |         delay = cur - last;
 70 |     }
 71 | 
 72 |     switch (state) {
 73 |     case STATE_WAIT_FIRST_TRIGGER:
 74 |         assert(last == 0);
 75 |         assert(max_delay == 0);
 76 |         /*
 77 |          * We need to do nothing here, just to init the "last", which
 78 |          * will be done after the switch().
 79 |          */
 80 |         state++;
 81 |         break;
 82 | 
 83 |     case STATE_WAIT_SECOND_TRIGGER:
 84 |         /*
 85 |          * if this is _exactly_ the 2nd packet we got, we need to note
 86 |          * this down as a baseline.
 87 |          */
 88 |         assert(first_latency == 0);
 89 |         first_latency = delay;
 90 |         printf("1st and 2nd packet latency: %"PRIu64" (ms)\n", first_latency);
 91 |         spike_throttle = delay * 2;
 92 |         printf("Setting spike throttle to: %"PRIu64" (ms)\n", spike_throttle);
 93 |         if (spike_fd != -1) {
 94 |             printf("Updating spike log initial timestamp\n");
 95 |             /* this -1 is meaningless, shows the init timestamp only. */
 96 |             write_spike_log(spike_fd, -1);
 97 |         }
 98 |         state++;
 99 |         break;
100 | 
101 |     case STATE_RUNNING:
102 |         if (delay > max_delay) {
103 |             max_delay = delay;
104 |         }
105 |         /*
106 |          * if we specified spike_log, we need to log spikes into that
107 |          * file.
108 |          */
109 |         if (spike_fd != -1 && delay >= spike_throttle) {
110 |             write_spike_log(spike_fd, delay);
111 |         }
112 |         printf("\r                                                       ");
113 |         printf("\r[%"PRIu64"] max_delay: %"PRIu64" (ms), cur: %"PRIu64" (ms)", cur,
114 |                max_delay, delay);
115 |         fflush(stdout);
116 |         break;
117 | 
118 |     default:
119 |         printf("Unknown state: %d\n", state);
120 |         exit(1);
121 |         break;
122 |     }
123 | 
124 |     /* update LAST */
125 |     last = cur;
126 | 
127 |     return old_state;
128 | }
129 | 
/*
 * Open (creating if needed) and empty the spike log.
 *
 * @spike_log: path of the log file, or NULL when no spike log is wanted.
 *
 * Returns the open descriptor, or -1 when @spike_log is NULL or the file
 * cannot be opened.  An open failure is reported via perror() and simply
 * leaves spike logging disabled.
 */
static int spike_log_open(const char *spike_log)
{
    int spike_fd;

    if (!spike_log) {
        return -1;
    }

    /* O_TRUNC empties the file as part of open(); no ftruncate() needed */
    spike_fd = open(spike_log, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (spike_fd == -1) {
        perror("failed to open spike log");
    }

    return spike_fd;
}
146 | 
147 | int mon_server_callback(int sock, int spike_fd)
148 | {
149 |     static in_addr_t target = 0;
150 |     int ret;
151 |     char buf[BUF_LEN];
152 |     struct sockaddr_in clnt_addr = { 0 };
153 |     socklen_t addr_len = sizeof(clnt_addr);
154 | 
155 |     ret = recvfrom(sock, buf, BUF_LEN, 0, (struct sockaddr *)&clnt_addr,
156 |                    &addr_len);
157 |     if (ret == -1) {
158 |         perror("recvfrom() error");
159 |         return -1;
160 |     }
161 | 
162 |     if (target == 0) {
163 |         /* this is the first packet we recved. we should init the
164 |            environment and remember the target client we are monitoring
165 |            for this round. */
166 |         printf("setting monitor target to client '%s'\n",
167 |                inet_ntoa(clnt_addr.sin_addr));
168 |         target = clnt_addr.sin_addr.s_addr;
169 |         /* Should be the first time calling */
170 |         assert(handle_event(spike_fd) == STATE_WAIT_FIRST_TRIGGER);
171 |         return 0;
172 |     }
173 | 
174 | #if MIG_MON_SINGLE_CLIENT
175 |     /* this is not the first packet we received, we will only monitor
176 |        the target client, and disgard all the other packets recved. */
177 |     if (clnt_addr.sin_addr.s_addr != target) {
178 |         printf("\nWARNING: another client (%s:%d) is connecting...\n",
179 |                inet_ntoa(clnt_addr.sin_addr),
180 |                ntohs(clnt_addr.sin_port));
181 |         /* disgard it! */
182 |         return 0;
183 |     }
184 | #endif
185 | 
186 |     handle_event(spike_fd);
187 | 
188 |     return 0;
189 | }
190 | 
191 | /* This is actually a udp ECHO server. */
192 | int mon_server_rr_callback(int sock, int spike_fd)
193 | {
194 |     int ret;
195 |     char buf[BUF_LEN];
196 |     struct sockaddr_in clnt_addr = { 0 };
197 |     socklen_t addr_len = sizeof(clnt_addr);
198 |     uint64_t cur;
199 | 
200 |     /* unused */
201 |     (void)spike_fd;
202 | 
203 |     ret = recvfrom(sock, buf, BUF_LEN, 0, (struct sockaddr *)&clnt_addr,
204 |                    &addr_len);
205 |     if (ret == -1) {
206 |         perror("recvfrom() error");
207 |         return -1;
208 |     }
209 | 
210 |     ret = sendto(sock, buf, ret, 0, (struct sockaddr *)&clnt_addr,
211 |                  addr_len);
212 |     if (ret == -1) {
213 |         perror("sendto() error");
214 |         return -1;
215 |     }
216 | 
217 |     cur = get_msec();
218 | 
219 |     printf("\r                                                  ");
220 |     printf("\r[%"PRIu64"] responding to client", cur);
221 |     fflush(stdout);
222 | 
223 |     return 0;
224 | }
225 | 
226 | /*
227 |  * spike_log is the file path to store spikes. Spikes will be
228 |  * stored in the form like (for each line):
229 |  *
230 |  * A,B
231 |  *
232 |  * Here, A is the timestamp in seconds. B is the latency value in
233 |  * ms.
234 |  */
235 | int mon_server(const char *spike_log, mon_server_cbk server_callback)
236 | {
237 |     int sock = 0;
238 |     int ret = 0;
239 |     struct sockaddr_in svr_addr = { 0 };
240 |     int spike_fd = spike_log_open(spike_log);
241 | 
242 |     sock = socket(AF_INET, SOCK_DGRAM, 0);
243 |     if (sock < 0) {
244 |         perror("socket() creation failed");
245 |         return -1;
246 |     }
247 | 
248 |     svr_addr.sin_family = AF_INET;
249 |     svr_addr.sin_addr.s_addr = htonl(INADDR_ANY);
250 |     svr_addr.sin_port = mig_mon_port;
251 | 
252 |     ret = bind(sock, (struct sockaddr *)&svr_addr, sizeof(svr_addr));
253 |     if (ret == -1) {
254 |         perror("bind() failed");
255 |         return -1;
256 |     }
257 | 
258 |     printf("listening on UDP port %d...\n", mig_mon_port);
259 | #if MIG_MON_SINGLE_CLIENT
260 |     printf("allowing single client only.\n");
261 | #else
262 |     printf("allowing multiple clients.\n");
263 | #endif
264 | 
265 |     while (1) {
266 |         ret = server_callback(sock, spike_fd);
267 |         if (ret) {
268 |             break;
269 |         }
270 |     }
271 | 
272 |     return ret;
273 | }
274 | 
275 | int mon_client_callback(int sock, int spike_fd, int interval_ms)
276 | {
277 |     int ret;
278 |     uint64_t cur;
279 |     char buf[BUF_LEN] = "echo";
280 |     int msg_len = strlen(buf);
281 |     int int_us = interval_ms * 1000;
282 | 
283 |     /* unused */
284 |     (void)spike_fd;
285 | 
286 |     ret = sendto(sock, buf, msg_len, 0, NULL, 0);
287 |     if (ret == -1) {
288 |         perror("sendto() failed");
289 |         return -1;
290 |     } else if (ret != msg_len) {
291 |         printf("sendto() returned %d?\n", ret);
292 |         return -1;
293 |     }
294 |     cur = get_msec();
295 |     printf("\r                                                  ");
296 |     printf("\r[%"PRIu64"] sending packet to server", cur);
297 |     fflush(stdout);
298 |     usleep(int_us);
299 | 
300 |     return 0;
301 | }
302 | 
303 | int mon_client_rr_callback(int sock, int spike_fd, int interval_ms)
304 | {
305 |     int ret;
306 |     uint64_t cur;
307 |     char buf[BUF_LEN] = "echo";
308 |     int msg_len = strlen(buf);
309 |     static int init = 0;
310 |     static uint64_t last = 0;
311 | 
312 |     if (!init) {
313 |         printf("Setting socket recv timeout to %d (ms)\n",
314 |                interval_ms);
315 |         socket_set_timeout(sock, interval_ms);
316 |         init = 1;
317 |     }
318 | 
319 |     cur = get_msec();
320 | 
321 |     if (last) {
322 |         /*
323 |          * This is not the first packet, we need to wait until we
324 |          * reaches the interval.
325 |          */
326 |         int64_t delta = last + interval_ms - cur;
327 |         if (delta > 0) {
328 |             usleep(delta * 1000);
329 |         }
330 |     }
331 | 
332 |     last = get_msec();
333 | 
334 |     ret = sendto(sock, buf, msg_len, 0, NULL, 0);
335 |     if (ret == -1) {
336 |         perror("sendto() failed");
337 |         return -1;
338 |     } else if (ret != msg_len) {
339 |         printf("sendto() returned %d?\n", ret);
340 |         return -1;
341 |     }
342 | 
343 |     ret = recvfrom(sock, buf, msg_len, 0, NULL, 0);
344 |     if (ret == -1) {
345 |         if (errno == ECONNREFUSED || errno == EAGAIN) {
346 |             /*
347 |              * This is when server is down, e.g., due to migration. So
348 |              * this is okay.
349 |              */
350 |             return 0;
351 |         } else {
352 |             printf("recvfrom() ERRNO: %d\n", errno);
353 |         }
354 |     } else if (ret != msg_len) {
355 |         printf("recvfrom() returned %d?\n", ret);
356 |         return -1;
357 |     }
358 | 
359 |     handle_event(spike_fd);
360 | 
361 |     return 0;
362 | }
363 | 
364 | int mon_client(const char *server_ip, int interval_ms,
365 |                const char *spike_log, mon_client_cbk client_callback)
366 | {
367 |     int ret = -1;
368 |     int sock = 0;
369 |     struct sockaddr_in addr;
370 |     int spike_fd = spike_log_open(spike_log);
371 | 
372 |     bzero(&addr, sizeof(addr));
373 | 
374 |     sock = socket(AF_INET, SOCK_DGRAM, 0);
375 |     if (sock == -1) {
376 |         perror("socket() failed");
377 |         return -1;
378 |     }
379 | 
380 |     addr.sin_family = AF_INET;
381 |     addr.sin_port = mig_mon_port;
382 |     if (inet_aton(server_ip, &addr.sin_addr) != 1) {
383 |         printf("server ip '%s' invalid\n", server_ip);
384 |         ret = -1;
385 |         goto close_sock;
386 |     }
387 | 
388 |     ret = connect(sock, (const struct sockaddr *)&addr, sizeof(addr));
389 |     if (ret) {
390 |         perror("connect() failed");
391 |         goto close_sock;
392 |     }
393 | 
394 |     while (1) {
395 |         ret = client_callback(sock, spike_fd, interval_ms);
396 |         if (ret) {
397 |             break;
398 |         }
399 |     }
400 | 
401 | close_sock:
402 |     close(sock);
403 |     return ret;
404 | }
405 | 
406 | void usage_downtime_short(void)
407 | {
408 |     puts("");
409 |     printf("       %s server [spike_log]\n", prog_name);
410 |     printf("       %s client server_ip [interval_ms]\n", prog_name);
411 |     printf("       %s server_rr\n", prog_name);
412 |     printf("       %s client_rr server_ip [interval_ms [spike_log]]\n", prog_name);
413 | }
414 | 
415 | void usage_downtime(void)
416 | {
417 |     puts("");
418 |     puts("Usage:");
419 |     usage_downtime_short();
420 |     puts("");
421 |     puts("======== VM Migration Downtime Measurement ========");
422 |     puts("");
423 |     puts("This is a program that could be used to measure");
424 |     puts("VM migration down time. Please specify work mode.");
425 |     puts("");
426 |     puts("Example usage to measure guest server downtime (single way):");
427 |     puts("");
428 |     printf("1. [on guest]  start server using '%s server /tmp/spike.log'\n",
429 |            prog_name);
430 |     printf("   this will start server, log all spikes into spike.log.\n");
431 |     printf("2. [on client] start client using '%s client GUEST_IP 50'\n",
432 |            prog_name);
433 |     printf("   this starts sending UDP packets to server, interval 50ms.\n");
434 |     printf("3. trigger loop migration (e.g., 100 times)\n");
435 |     printf("4. see the results on server side.\n");
436 |     puts("");
437 |     puts("Example usage to measure round-trip downtime:");
438 |     puts("(This is preferred since it simulates a simplest server behavior)");
439 |     puts("");
440 |     printf("1. [on guest]  start server using '%s server_rr'\n",
441 |            prog_name);
442 |     printf("   this will start a UDP echo server.\n");
443 |     printf("2. [on client] start client using '%s client GUEST_IP 50 spike.log'\n",
444 |            prog_name);
445 |     printf("   this starts sending UDP packets to server, then try to recv it.\n");
446 |     printf("   the timeout of recv() will be 50ms.\n");
447 |     printf("3. trigger loop migration (e.g., 100 times)\n");
448 |     printf("4. see the results on client side.\n");
449 |     puts("");
450 | }
451 | 


--------------------------------------------------------------------------------
/vm.c:
--------------------------------------------------------------------------------
  1 | #include "mig_mon.h"
  2 | 
/* Values stored in page_header.magic to tell the message types apart */
#define  MAGIC_SEND_PAGE          (0x123)      /* For sending page */
#define  MAGIC_REQ_PAGE           (0x124)      /* For requesting page */
#define  MAGIC_HANDSHAKE          (0x125)      /* For src->dst handshake */

/*
 * Size in bytes of each socket I/O buffer, and the number of iovec entries
 * batched on the sending side.  These emulate QEMU.
 */
#define  DEF_IO_BUF_SIZE    32768
#define  MAX_IOV_SIZE       64
 10 | 
 11 | typedef struct {
 12 |     uint64_t magic;
 13 |     uint64_t page_index;
 14 | } page_header;
 15 | 
 16 | void *mmap_anon(size_t size)
 17 | {
 18 |     return mmap(NULL, size, PROT_READ | PROT_WRITE,
 19 |                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 20 | }
 21 | 
 22 | /* Init vm_args shared fields for both src/dst */
 23 | void vm_args_init_shared(vm_args *args)
 24 | {
 25 |     char *buf;
 26 |     int ret;
 27 | 
 28 |     /* Setup receiver socket buffers on both src/dst */
 29 |     buf = mmap_anon(DEF_IO_BUF_SIZE);
 30 |     assert(buf != MAP_FAILED);
 31 | 
 32 |     args->recv_buffer = buf;
 33 |     args->recv_cur = 0;
 34 |     args->recv_len = 0;
 35 | 
 36 |     ret = pipe(args->page_req_pipe);
 37 |     assert(ret == 0);
 38 | }
 39 | 
 40 | void vm_args_init_src(vm_args *args)
 41 | {
 42 |     struct iovec *iov;
 43 |     int i, ret;
 44 | 
 45 |     vm_args_init_shared(args);
 46 | 
 47 |     /* Only the sender does bulk sending, set it up */
 48 |     iov = mmap_anon(sizeof(struct iovec) * MAX_IOV_SIZE);
 49 |     assert(iov != MAP_FAILED);
 50 | 
 51 |     for (i = 0; i < MAX_IOV_SIZE; i++) {
 52 |         iov[i].iov_len = DEF_IO_BUF_SIZE;
 53 |         iov[i].iov_base = mmap_anon(DEF_IO_BUF_SIZE);
 54 |         assert(iov[i].iov_base != MAP_FAILED);
 55 |     }
 56 | 
 57 |     if (args->tests & VM_TEST_PRECOPY) {
 58 |         /*
 59 |          * If src enabled precopy, making the page req channel non-block so
 60 |          * it can handle both precopy/postcopy.  Otherwise keep it blocking
 61 |          * so if we're only testing postcopy we don't eat up 100% core on
 62 |          * src host.
 63 |          */
 64 |         ret = fcntl(args->page_req_pipe[0], F_SETFL, O_NONBLOCK);
 65 |         assert(ret == 0);
 66 |     }
 67 | 
 68 |     args->src_iov_buffer = iov;
 69 |     args->src_cur = 0;
 70 |     args->src_cur_len = 0;
 71 | }
 72 | 
 73 | void vm_args_init_dst(vm_args *args)
 74 | {
 75 |     vm_args_init_shared(args);
 76 | 
 77 |     /* This means, "no request yet" */
 78 |     args->dst_current_req = (uint64_t)-1;
 79 | }
 80 | 
 81 | int sock_write_flush(vm_args *args)
 82 | {
 83 |     struct iovec *iov = args->src_iov_buffer;
 84 |     struct msghdr msg = { NULL };
 85 |     int ret;
 86 | 
 87 |     /* Limit src_cur IOV to only send partial of its buffer */
 88 |     iov[args->src_cur].iov_len = args->src_cur_len;
 89 | 
 90 |     msg.msg_iov = iov;
 91 |     msg.msg_iovlen = args->src_cur + 1;
 92 | 
 93 | retry:
 94 |     ret = sendmsg(args->sock, &msg, 0);
 95 |     if (ret < 0) {
 96 |         if (ret == -EAGAIN || ret == -EINTR)
 97 |             goto retry;
 98 |         printf("sendmsg() failed: %d\n", ret);
 99 |         return ret;
100 |     }
101 | 
102 |     /* Recover the iov_len field of last IOV */
103 |     iov[args->src_cur].iov_len = DEF_IO_BUF_SIZE;
104 |     /* Free all the IOV buffers by reset the fields */
105 |     args->src_cur = 0;
106 |     args->src_cur_len = 0;
107 | 
108 |     if (ret == 0)
109 |         return -1;
110 | 
111 |     return 0;
112 | }
113 | 
114 | int sock_write(vm_args *args, void *buffer, uint64_t size)
115 | {
116 |     struct iovec *iov = args->src_iov_buffer;
117 |     int ret;
118 | 
119 |     while (size) {
120 |         size_t to_move;
121 |         void *cur_ptr;
122 | 
123 |         assert(args->src_cur < MAX_IOV_SIZE);
124 |         assert(args->src_cur_len < DEF_IO_BUF_SIZE);
125 | 
126 |         /* Every IOV is the same len */
127 |         to_move = DEF_IO_BUF_SIZE - args->src_cur_len;
128 |         to_move = MIN(to_move, size);
129 | 
130 |         cur_ptr = (void *)((uint64_t)(iov[args->src_cur].iov_base) +
131 |                            args->src_cur_len);
132 |         if (buffer) {
133 |             memcpy(cur_ptr, buffer, to_move);
134 |             buffer = (void *)((uint64_t)buffer + to_move);
135 |         } else {
136 |             bzero(cur_ptr, to_move);
137 |         }
138 | 
139 |         args->src_cur_len += to_move;
140 |         size -= to_move;
141 | 
142 |         if (args->src_cur_len >= DEF_IO_BUF_SIZE) {
143 |             assert(args->src_cur_len == DEF_IO_BUF_SIZE);
144 |             args->src_cur++;
145 |             args->src_cur_len = 0;
146 | 
147 |             if (args->src_cur >= MAX_IOV_SIZE) {
148 |                 assert(args->src_cur == MAX_IOV_SIZE);
149 |                 /* Flush all the data in the iovec */
150 |                 ret = sock_write_flush(args);
151 |                 if (ret)
152 |                     return ret;
153 |             }
154 |         }
155 |     }
156 | 
157 |     return 0;
158 | }
159 | 
160 | int sock_read_refill(vm_args *args)
161 | {
162 |     int ret;
163 | 
164 |     /* Make sure we've consumed all */
165 |     assert(args->recv_cur == args->recv_len);
166 | retry:
167 |     ret = read(args->sock, args->recv_buffer, DEF_IO_BUF_SIZE);
168 |     if (ret < 0)
169 |         ret = -errno;
170 |     if (ret == -EAGAIN || ret == -EINTR)
171 |         goto retry;
172 |     if (ret == -ECONNRESET || ret == 0) {
173 |         printf("Connection reset\n");
174 |         return -1;
175 |     }
176 |     if (ret < 0) {
177 |         printf("%s: ret==%d\n", __func__, ret);
178 |         return -1;
179 |     }
180 | 
181 |     args->recv_len = ret;
182 |     args->recv_cur = 0;
183 | 
184 |     return 0;
185 | }
186 | 
187 | int sock_read(vm_args *args, void *buf, uint64_t size)
188 | {
189 |     int len;
190 | 
191 |     while (size) {
192 |         /* Out of data in the buffer, refill */
193 |         if (args->recv_cur >= args->recv_len) {
194 |             assert(args->recv_cur == args->recv_len);
195 |             len = sock_read_refill(args);
196 |             if (len < 0)
197 |                 return len;
198 |         }
199 | 
200 |         len = args->recv_len - args->recv_cur;
201 |         len = MIN((uint64_t)len, size);
202 | 
203 |         if (buf) {
204 |             memcpy(buf, &args->recv_buffer[args->recv_cur], len);
205 |             buf = (void *)((uint64_t)buf + len);
206 |         }
207 | 
208 |         args->recv_cur += len;
209 |         size -= len;
210 |     }
211 | 
212 |     return 0;
213 | }
214 | 
215 | int vm_src_send_page(vm_args *args, uint64_t page)
216 | {
217 |     page_header header = {
218 |         .magic = MAGIC_SEND_PAGE,
219 |         .page_index = page,
220 |     };
221 |     int ret;
222 | 
223 |     /* Send header */
224 |     ret = sock_write(args, &header, sizeof(header));
225 |     if (ret)
226 |         return ret;
227 | 
228 |     /* Send page (which is all zero..) */
229 |     ret = sock_write(args, NULL, page_size);
230 |     if (ret)
231 |         return ret;
232 | 
233 |     return 0;
234 | }
235 | 
236 | int vm_src_enable_postcopy_on_dst(vm_args *args)
237 | {
238 |     page_header header = { .magic = MAGIC_HANDSHAKE };
239 | 
240 |     if (sock_write(args, &header, sizeof(header)))
241 |         return -1;
242 |     if (sock_write_flush(args))
243 |         return -1;
244 |     return 0;
245 | }
246 | 
247 | void *vm_src_sender_thread(void *opaque)
248 | {
249 |     vm_args *args = opaque;
250 |     uint64_t index = 0, end = args->vm_size / page_size;
251 |     uint64_t total = 0, last, cur, requested_page;
252 |     int ret;
253 | 
254 |     /* Enable dst postcopy if necessary */
255 |     if (args->tests & VM_TEST_POSTCOPY) {
256 |         if (vm_src_enable_postcopy_on_dst(args))
257 |             goto fail;
258 |     }
259 | 
260 |     if (args->tests & VM_TEST_PRECOPY)
261 |         printf("Starting PRECOPY streaming test...\n");
262 | 
263 |     last = get_msec();
264 |     while (1) {
265 |         /* If no precopy test, we don't need this sender */
266 |         if (args->tests & VM_TEST_PRECOPY) {
267 |             debug("sending page %"PRIu64"\n", index);
268 |             ret = vm_src_send_page(args, index);
269 |             if (ret)
270 |                 goto fail;
271 |             total += sizeof(page_header) + page_size;
272 | 
273 |             /* Update index */
274 |             index++;
275 |             if (index >= end)
276 |                 index = 0;
277 | 
278 |             cur = get_msec();
279 |             if (cur - last >= 1000) {
280 |                 printf("Speed: %"PRIu64" (MB/s)\n",
281 |                         (total / (1UL << 20)) * 1000 / (cur - last));
282 |                 last = cur;
283 |                 total = 0;
284 |             }
285 |         }
286 | 
287 |         while (1) {
288 |             /* Request pipe read side is non-blocking */
289 |             debug("try reading page requests\n");
290 |             ret = read(args->page_req_pipe[0], &requested_page,
291 |                        sizeof(requested_page));
292 |             if (ret < 0)
293 |                 ret = -errno;
294 |             if (ret == 0 || ret == -EINTR || ret == -EAGAIN)
295 |                 break;
296 |             assert(ret == sizeof(requested_page));
297 |             debug("got request, sending page\n");
298 |             ret = vm_src_send_page(args, requested_page);
299 |             if (ret)
300 |                 goto fail;
301 |             ret = sock_write_flush(args);
302 |             if (ret)
303 |                 goto fail;
304 |             total += sizeof(page_header) + page_size;
305 |             /* See if there're more requests; normally none */
306 |             continue;
307 |         }
308 |     }
309 | 
310 |     return NULL;
311 | 
312 | fail:
313 |     return (void *)-1;
314 | }
315 | 
316 | void *vm_src_receiver_thread(void *opaque)
317 | {
318 |     vm_args *args = opaque;
319 |     page_header header = { 0 };
320 | 
321 |     while (1) {
322 |         if (sock_read(args, &header, sizeof(header)))
323 |             goto fail;
324 |         debug("src vm recv request\n");
325 |         if (header.magic != MAGIC_REQ_PAGE) {
326 |             printf("Page request magic incorrect: %"PRIx64"\n", header.magic);
327 |             goto fail;
328 |         }
329 |         /* Queue the page */
330 |         fd_write(args->page_req_pipe[1], &header.page_index,
331 |                  sizeof(header.page_index));
332 |         debug("src vm page queued\n");
333 |     }
334 | 
335 |     return NULL;
336 | 
337 | fail:
338 |     return (void *)-1;
339 | }
340 | 
341 | void vm_src_run(vm_args *args)
342 | {
343 |     int ret;
344 | 
345 |     printf("Connected to dst VM %s.\n", args->src_target_ip);
346 | 
347 |     ret = pthread_create(&args->sender, NULL,
348 |                          vm_src_sender_thread, args);
349 |     if (ret) {
350 |         printf("Sender thread creation failed: %s\n", strerror(ret));
351 |         return;
352 |     }
353 |     pthread_set_name(args->sender, "vm-src-sender");
354 | 
355 |     ret = pthread_create(&args->receiver, NULL,
356 |                          vm_src_receiver_thread, args);
357 |     if (ret) {
358 |         printf("Receiver thread creation failed: %s\n", strerror(ret));
359 |         return;
360 |     }
361 |     pthread_set_name(args->receiver, "vm-src-receiver");
362 | 
363 |     pthread_join(args->sender, NULL);
364 |     pthread_join(args->receiver, NULL);
365 | 
366 |     close(args->sock);
367 |     printf("Dropped connection to dst VM %s.\n", args->src_target_ip);
368 | }
369 | 
370 | int mon_start_src(vm_args *args)
371 | {
372 |     int sock, ret;
373 |     struct sockaddr_in server;
374 | 
375 |     vm_args_init_src(args);
376 | 
377 |     puts("Start emulation of src VM.");
378 | 
379 |     sock = socket(AF_INET, SOCK_STREAM, 0);
380 |     if (sock < 0) {
381 |         perror("Could not create socket.");
382 |         return -errno;
383 |     }
384 | 
385 |     server.sin_family = AF_INET;
386 |     server.sin_port = htons(mig_mon_port);
387 |     if (inet_aton(args->src_target_ip, &server.sin_addr) != 1) {
388 |         printf("Destination VM address '%s' invalid\n", args->src_target_ip);
389 |         return -1;
390 |     }
391 | 
392 |     ret = connect(sock, (struct sockaddr *)&server, sizeof(server));
393 |     if (ret < 0) {
394 |         perror("Could not connect to dst VM.");
395 |         return -1;
396 |     }
397 | 
398 |     args->sock = sock;
399 |     vm_src_run(args);
400 | 
401 |     return 0;
402 | }
403 | 
404 | void *vm_dst_sender_thread(void *opaque)
405 | {
406 |     vm_args *args = opaque;
407 |     page_header header = { .magic = MAGIC_REQ_PAGE };
408 |     uint64_t npages, page_index, last, cur, total, count, max_lat, now;
409 | 
410 |     /* If don't test postcopy, we don't really need this */
411 |     if (!(args->tests & VM_TEST_POSTCOPY))
412 |         return NULL;
413 | 
414 |     printf("Starting POSTCOPY request-response test...\n");
415 | 
416 |     npages = args->vm_size / page_size;
417 |     total = count = max_lat = 0;
418 |     last = get_usec();
419 |     while (1) {
420 |         args->dst_current_req = random() % npages;
421 |         header.page_index = args->dst_current_req;
422 | 
423 |         cur = get_usec();
424 |         debug("sending page req: sock=%d\n", args->sock);
425 |         fd_write(args->sock, &header, sizeof(header));
426 |         /* We send the request, wait for response */
427 |         debug("reading pipe\n");
428 |         fd_read(args->page_req_pipe[0], &page_index, sizeof(page_index));
429 |         debug("reading pipe done\n");
430 | 
431 |         if (args->quit)
432 |             break;
433 | 
434 |         if (page_index != header.page_index) {
435 |             printf("%s: Incorrect page index received!\n", __func__);
436 |             break;
437 |         }
438 | 
439 |         now = get_usec();
440 |         /* Measure the latency, record max */
441 |         cur = now - cur;
442 |         if (cur > max_lat)
443 |             max_lat = cur;
444 |         total += cur;
445 |         count++;
446 | 
447 |         /* For each second */
448 |         if (now - last >= 1000000) {
449 |             printf("Latency: average %"PRIu64" (us), max: %"PRIu64" (us)\n",
450 |                    total / count, max_lat);
451 |             total = count = max_lat = 0;
452 |             last = now;
453 |         }
454 |     }
455 | 
456 |     return NULL;
457 | }
458 | 
459 | void vm_dst_start_sender(vm_args *args)
460 | {
461 |     int ret;
462 | 
463 |     ret = pthread_create(&args->sender, NULL,
464 |                          vm_dst_sender_thread, args);
465 |     assert(ret == 0);
466 |     pthread_set_name(args->sender, "vm-dst-sender");
467 | }
468 | 
469 | void vm_dst_kick_sender_quit(vm_args *args)
470 | {
471 |     uint64_t tmp = 0;
472 | 
473 |     /* To make sure sender thread quits... write anything to pipe */
474 |     args->quit = 1;
475 |     fd_write(args->page_req_pipe[1], &tmp, sizeof(uint64_t));
476 | }
477 | 
478 | void *vm_dst_receiver_thread(void *opaque)
479 | {
480 |     vm_args *args = opaque;
481 |     uint64_t end = args->vm_size / page_size;
482 |     page_header header;
483 |     int ret;
484 | 
485 |     while (1) {
486 |         ret = sock_read(args, &header, sizeof(header));
487 |         if (ret)
488 |             goto out;
489 | 
490 |         switch (header.magic) {
491 |         case MAGIC_HANDSHAKE:
492 |             if (!(args->tests & VM_TEST_POSTCOPY)) {
493 |                 args->tests |= VM_TEST_POSTCOPY;
494 |                 vm_dst_start_sender(args);
495 |             }
496 |             continue;
497 |         case MAGIC_SEND_PAGE:
498 |             /* A common page received */
499 |             break;
500 |         default:
501 |             printf("magic error: 0x%"PRIx64"\n", header.magic);
502 |             goto out;
503 |         }
504 | 
505 |         if (header.page_index >= end) {
506 |             printf("page index overflow: 0x%"PRIx64"\n", header.page_index);
507 |             goto out;
508 |         }
509 |         ret = sock_read(args, NULL, page_size);
510 |         if (ret)
511 |             goto out;
512 |         debug("dst vm receiving page\n");
513 | 
514 |         /* Check if this is a postcopy request page */
515 |         if (header.page_index == args->dst_current_req) {
516 |             fd_write(args->page_req_pipe[1],
517 |                      &args->dst_current_req, sizeof(uint64_t));
518 |         }
519 |     }
520 | out:
521 |     /* Remember to kick the sender thread to quit */
522 |     vm_dst_kick_sender_quit(args);
523 |     return NULL;
524 | }
525 | 
526 | void vm_dst_run(vm_args *args, char *src_ip)
527 | {
528 |     int ret;
529 | 
530 |     vm_args_init_dst(args);
531 | 
532 |     printf("Connected from src VM %s.\n", src_ip);
533 | 
534 |     ret = pthread_create(&args->receiver, NULL,
535 |                          vm_dst_receiver_thread, args);
536 |     if (ret) {
537 |         printf("Receiver thread creation failed: %s\n", strerror(ret));
538 |         return;
539 |     }
540 |     pthread_set_name(args->receiver, "vm-dst-receiver");
541 | 
542 |     pthread_join(args->receiver, NULL);
543 | 
544 |     if (args->tests & VM_TEST_POSTCOPY)
545 |         pthread_join(args->sender, NULL);
546 | 
547 |     close(args->sock);
548 |     printf("Dropped connection from src VM %s.\n", src_ip);
549 | }
550 | 
551 | int mon_start_dst(vm_args *args)
552 | {
553 |     struct sockaddr_in server, cli_addr;
554 |     int sock, ret, new_sock, child;
555 |     socklen_t client_len = sizeof(server);
556 | 
557 |     puts("Start emulation of dst VM.");
558 | 
559 |     sock = socket(AF_INET, SOCK_STREAM, 0);
560 |     if (sock < 0) {
561 |         perror("Could not create socket.");
562 |         return -errno;
563 |     }
564 | 
565 |     memset((char *)&server, 0, sizeof(server));
566 | 
567 |     server.sin_family = AF_INET;
568 |     server.sin_addr.s_addr = INADDR_ANY;
569 |     server.sin_port = htons(mig_mon_port);
570 | 
571 |     ret = bind(sock, (struct sockaddr *)&server, sizeof(server));
572 |     if (ret < 0) {
573 |         perror("Could not bind");
574 |         return -errno;
575 |     }
576 | 
577 |     socket_set_fast_reuse(sock);
578 |     listen(sock, 5);
579 | 
580 |     while (1) {
581 |         new_sock = accept(sock, (struct sockaddr *)&cli_addr, &client_len);
582 |         if (new_sock < 0) {
583 |             perror("Could not accept client.");
584 |             return -errno;
585 |         }
586 | 
587 |         child = fork();
588 |         if (child == 0) {
589 |             args->sock = new_sock;
590 |             vm_dst_run(args, strdup(inet_ntoa(cli_addr.sin_addr)));
591 |             return 0;
592 |         }
593 | 
594 |         close(new_sock);
595 |     }
596 | 
597 |     return 0;
598 | }
599 | 
600 | int mon_vm(vm_args *args)
601 | {
602 |     emulate_target target = args->target;
603 |     int ret;
604 | 
605 |     /* Doing sanity check on the parameters */
606 |     if (target == EMULATE_NONE) {
607 |         printf("Please specify to emulate either src (-s) or dst (-d)\n");
608 |         return -1;
609 |     } else if (target == EMULATE_SRC) {
610 |         if (!args->src_target_ip) {
611 |             printf("Please specify dst VM address using '-H'.\n");
612 |             return -1;
613 |         }
614 |     } else {
615 |         /* EMULATE_DST */
616 |         if (args->tests) {
617 |             printf("precopy/postcopy need to be specified on src VM.\n");
618 |             return -1;
619 |         }
620 |     }
621 | 
622 |     if (target == EMULATE_SRC)
623 |         ret = mon_start_src(args);
624 |     else
625 |         ret = mon_start_dst(args);
626 | 
627 |     return ret;
628 | }
629 | 
630 | void usage_vm_short(void)
631 | {
632 |     puts("");
633 |     printf("       %s vm [options...]\n", prog_name);
634 |     printf("       \t -d: \tEmulate a dst VM\n");
635 |     printf("       \t -h: \tDump help message for vm sub-cmd\n");
636 |     printf("       \t -H: \tSpecify dst VM IP (required for -s)\n");
637 |     printf("       \t -p: \tSpecify connect/listen port\n");
638 |     printf("       \t -s: \tEmulate a src VM\n");
639 |     printf("       \t -S: \tSpecify size of the VM (GB)\n");
640 |     printf("       \t -t: \tSpecify tests (precopy, postcopy)\n");
641 | }
642 | 
643 | void usage_vm(void)
644 | {
645 |     puts("");
646 |     puts("Usage:");
647 |     usage_vm_short();
648 |     puts("");
649 |     puts("======== Emulate VM Live Migrations ========");
650 |     puts("");
651 |     puts("This sub-tool can be used to emulate live migration TCP streams.");
652 |     puts("");
653 |     puts("There're two types of live migration: (1) precopy (2) postcopy.");
654 |     puts("This tool can emulate (1) or (2) or (1+2) case by specifying");
655 |     puts("different '-t' parameters.");
656 |     puts("");
657 |     puts("For precopy stream, it's the bandwidth that matters.  The bandwidth");
658 |     puts("information will be dumped per-second on src VM.");
659 |     puts("");
660 |     puts("For postcopy stream, it's the latency that matters.  The average/maximum");
661 |     puts("latency value of page requests will be dumped per-second on dst VM.");
662 |     puts("");
663 |     puts("Example:");
664 |     puts("");
665 |     puts("To start the (emulated) destination VM, one can run this on dest host:");
666 |     puts("");
667 |     printf("  %s vm -d\n", prog_name);
668 |     puts("");
669 |     puts("Then, to start a src VM emulation and start both live migration streams,");
670 |     puts("one can run this command on src host:");
671 |     puts("");
672 |     printf("  %s vm -s -H $DEST_IP -t precopy -t postcopy\n", prog_name);
673 |     puts("");
674 |     puts("Specifying both '-t' will just enable both migration streams.");
675 |     puts("");
676 | }
677 | 


--------------------------------------------------------------------------------