├── .gitignore ├── .gitmodules ├── README.md ├── apps └── flexkvs │ ├── Makefile │ ├── bench │ ├── benchmark.h │ ├── commandline.c │ ├── main-flexnic.c │ ├── main.c │ ├── rng.c │ ├── rng.h │ └── workload.c │ ├── common │ └── include │ │ └── protocol_binary.h │ ├── flexkvs.conf │ ├── kvsbench.conf │ ├── server │ ├── hashtable.c │ ├── ialloc.c │ ├── iokvs.h │ ├── jenkins_hash.c │ ├── main-flexnic.c │ ├── main-ll.c │ ├── main-ll2.c │ ├── main.c │ └── settings.c │ └── unlink_socks.sh ├── build.sh ├── microbenchmarks ├── Makefile ├── gups-hotset-move.c ├── gups-random.c ├── gups-small.c ├── gups.c ├── gups.h ├── run-instantaneous.sh ├── run-random.sh ├── run-threads.sh ├── run.sh ├── test.c └── zipf.c └── src ├── Makefile ├── fifo.c ├── fifo.h ├── hemem.c ├── hemem.h ├── interpose.c ├── interpose.h ├── pebs.c ├── pebs.h ├── policies ├── hemem-mmgr.c ├── hemem-mmgr.h ├── lru.c ├── lru.h ├── paging.c ├── paging.h ├── simple.c └── simple.h ├── spsc-ring.c ├── spsc-ring.h ├── timer.c ├── timer.h ├── userfaultfd.h └── uthash.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.txt 4 | *.patch 5 | *.data 6 | *.sh 7 | *.csv 8 | *.pdf 9 | *.py 10 | microbenchmarks/gups 11 | microbenchmarks/gups-pebs 12 | microbenchmarks/gups-lru 13 | microbenchmarks/gups-simple 14 | microbenchmarks/gups-lru-swap 15 | microbenchmarks/gups-random 16 | microbenchmarks/gups-hotset-move 17 | memsim/mmgr_simple 18 | memsim/mmgr_linux 19 | memsim/mmgr_simple_mmm 20 | memsim/mmgr_hemem 21 | apps/ 22 | logs/ 23 | results/ 24 | scripts/ 25 | *tags 26 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "apps/silo"] 2 | path = apps/silo 3 | url = https://ajaustin@bitbucket.org/ajaustin/silo.git 4 | [submodule "apps/gapbs"] 5 | path = apps/gapbs 6 | url = https://ajaustin@bitbucket.org/ajaustin/gapbs.git 
7 | [submodule "Hoard"] 8 | path = Hoard 9 | url = https://github.com/emeryberger/Hoard.git 10 | [submodule "linux"] 11 | path = linux 12 | url = ../hemem-linux 13 | [submodule "syscall_intercept"] 14 | path = syscall_intercept 15 | url = https://github.com/pmem/syscall_intercept 16 | [submodule "capstone"] 17 | path = capstone 18 | url = https://github.com/capstone-engine/capstone 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HeMem 2 | 3 | This document describes the artifact for our [SOSP 2021 paper](https://dl.acm.org/doi/10.1145/3477132.3483550 "SOSP 2021 paper") on HeMem. HeMem is a tiered main memory management system designed from scratch for commercially available NVM and the big data applications that use it. HeMem manages tiered memory asynchronously, batching and amortizing memory access tracking, migration, and associated TLB synchronization overheads. HeMem monitors application memory use by sampling memory access via CPU events, rather than page tables. This allows HeMem to scale to terabytes of memory, keeping small and ephemeral data structures in fast memory, and allocating scarce, asymmetric NVM bandwidth according to access patterns. Finally, HeMem is flexible by placing per-application memory management policy at user-level. 
4 | 5 | ## Overview 6 | 7 | * `apps/` contains the application benchmarks evaluated with HeMem 8 | * `microbenchmarks/` contains the GUPS microbenchmark used to evaluate HeMem 9 | * `src/` contains the source code of HeMem 10 | * `src/policies` contains extra memory policies used for testing HeMem, such as a page-table based LRU policy 11 | * `Hoard/` contains the Hoard memory allocator that HeMem depends on 12 | * `linux/` contains the linux kernel version required to run HeMem 13 | 14 | ### Building and Running HeMem 15 | 16 | #### Setup 17 | 18 | You may set up HeMem to run on your own machine provided you have Intel Optane NVM. HeMem uses `/dev/dax` files to represent DRAM and NVM. Some additional setup is required for setting up the DRAM and NVM `/dev/dax` files to run HeMem. 19 | 20 | To set up the `/dev/dax` file representing DRAM, follow the instructions [here](https://pmem.io/2016/02/22/pm-emulation.html "here") in order to reserve a block of DRAM at machine startup to represent the DRAM `/dev/dax` file. HeMem reserves its 140GB of DRAM in this way (enough for its 128GB of reserved DRAM plus some metadata needed for `ndctl`). If your machine has multiple NUMA nodes, ensure that the block of DRAM you reserve is located on the same NUMA node that has NVM. **Do not follow the last set of instructions from pmem.io on setting up a file system on the reserved DRAM.** Instead, set up a `/dev/dax` file to represent it: 21 | 22 | 1. First, determine the name of the namespace representing the reserved DRAM: 23 | 24 | `ndctl list --human` 25 | 26 | 2. You should see your reserved DRAM. If multiple namespaces are listed, some represent NVM namespaces (described below). You should be able to differentiate the DRAM namespace based on size. Your DRAM namespace is likely in `fsdax` mode. 
Change the namespace over to `devdax` mode using the following command (in this example, the DRAM namespace is called `namespace0.0`): 27 | 28 | `sudo ndctl create-namespace -f -e namespace0.0 --mode=devdax --align 2M` 29 | 30 | 3. Make note of the `chardev` name of the DRAM `/dev/dax` file. This will be used to tell HeMem which `/dev/dax` file represents DRAM. If this is different from `dax0.0`, then you will need to edit the `src/hemem.h` file `DRAMPATH` macro to point it towards your actual DRAM `/dev/dax` file. 31 | 32 | To set up the `/dev/dax` file representing NVM, ensure that your machine has NVM in App Direct mode. If you do not already have namespaces representing NVM, then you will need to create them. Follow these steps: 33 | 34 | 1. List the regions available on your machine: 35 | 36 | `ndctl list --regions --human` 37 | 38 | 2. Note which regions represent NVM. You can differentiate them from the reserved DRAM region based on size or via the `persistence_domain` field, which, for NVM, will read `memory_controller`. Pick the region that is on the same NUMA node as your reserved DRAM. In this example, this is "region1". Create a namespace over this region: 39 | 40 | `ndctl create-namespace --region=1 --mode=devdax` 41 | 42 | 3. Make note of the `chardev` name of the NVM `/dev/dax` file. This will be used to tell HeMem which `/dev/dax` file represents NVM. If this is different from `dax1.0`, then you will need to edit the `src/hemem.h` file `NVMPATH` macro to point it towards your actual NVM `/dev/dax` file. 43 | 44 | 45 | #### Building 46 | 47 | To build HeMem, you must first build the linux kernel HeMem depends on. Build, install, and run the kernel located in the `linux/` directory. 48 | 49 | Next, HeMem depends on Hoard. Follow the instructions to build the Hoard library located in the `Hoard/` directory. 50 | 51 | HeMem also depends on libsyscall_intercept to intercept memory allocation system calls. 
Follow the instructions to build and install libsyscall_intercept [here](https://github.com/pmem/syscall_intercept). 52 | 53 | Once the proper kernel version is running, the `/dev/dax` files have been set up, and all dependencies have been installed, HeMem can be built with the supplied Makefile by typing `make` from the `src/` directory. 54 | 55 | #### Running 56 | 57 | You will likely need to add the paths to the build HeMem library and the Hoard library to your LD_LIBRARY_PATH variable: 58 | 59 | `export LD_LIBRARY_PATH=path/to/hemem/lib:/path/to/Hoard/lib:$LD_LIBRARY_PATH` 60 | 61 | You may also need to increase the number of allowed mmap ranges: 62 | 63 | `echo 1000000 > /proc/sys/vm/max_map_count` 64 | 65 | HeMem requires the user be root in order to run. Applications can either be linked with Hemem or run unmodified via the `LD_PRELOAD` environment variable: 66 | 67 | `LD_PRELOAD=/path/to/hemem/lib.so ./foo [args]` 68 | 69 | ### Microbenchmarks 70 | 71 | A Makefile is provided to build the GUPS microbenchmarks. 72 | 73 | To reproduce the Uniform GUPS results, run the `run-random.sh` script. Results will be printed to the `random.txt` file. The throughput results shown in the paper are the "GUPS" lines. 74 | 75 | To reproduce the Hotset GUPS results, run the `run.sh` script. Results will be printed to the `results.txt` file. The throughput results shown in the paper are the "GUPS" lines. 76 | 77 | To reproduce the Instantaneous GUPS results, run the `run-instantaneous.sh` script. Results will be printed to the `tot_gups.txt` file. 78 | 79 | ### Application Benchmarks 80 | 81 | Applications tested with HeMem are located in the `apps/` directory. 82 | 83 | #### Silo 84 | 85 | The Silo application can be found in the `apps/silo_hemem/silo` directory.. Run the provided `run_batch.sh` script. Results will be in the `batch/results.txt` file. The reported throughput numbers are numbers in the first column of the file. 
86 | 87 | #### FlexKVS 88 | 89 | The FlexKVS application can be found in the `apps/flexkvs` directory. These results require a separate machine for the clients. 90 | 91 | #### GapBS 92 | 93 | The GapBS application can be found in the `apps/gapbs` directory. To run the BC algorithm reported in the paper, you may run the following command: 94 | 95 | `LD_PRELOAD=/path/to/hemem/lib ./bc -g ` 96 | 97 | which will run the bc algorithm with HeMem on a graph with 2^scale vertices. 98 | 99 | -------------------------------------------------------------------------------- /apps/flexkvs/Makefile: -------------------------------------------------------------------------------- 1 | FLEXKVS_COMMON=hashtable.o ialloc.o jenkins_hash.o settings.o 2 | BENCH_COMMON=rng.o commandline.o workload.o 3 | 4 | TAS_CODE?=/home/tstamler/tas 5 | HEMEM_CODE=/home/amanda/hemem/ 6 | 7 | CFLAGS = -std=gnu99 -g -Wall -Icommon/include -O3 -march=native \ 8 | -I$(TAS_CODE)/include -I$(TAS_CODE)/lib/tas/include 9 | #CFLAGS += -DDEL_TEST 10 | LDFLAGS = -pthread -g -O3 11 | LDLIBS = -lrt -lm -lpthread -lrt -ldl 12 | 13 | include ../common/Makefile.mtcp 14 | 15 | 16 | all: flexkvs kvsbench 17 | all-sockets: flexkvs kvsbench 18 | all-mtcp: flexkvs-mtcp kvsbench-mtcp 19 | all-ll: flexkvs-ll 20 | 21 | flexkvs: $(addprefix server/,$(FLEXKVS_COMMON) main.o) 22 | $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS) -L$(HEMEM_CODE) -lhemem 23 | 24 | kvsbench: $(addprefix bench/,$(BENCH_COMMON) main.o) \ 25 | $(TAS_CODE)/lib/utils/timeout.o $(TAS_CODE)/lib/utils/utils.o 26 | $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS) 27 | 28 | flexkvs-ll: $(addprefix server/,$(FLEXKVS_COMMON) main-ll.o) \ 29 | $(TAS_CODE)/lib/libtas.so 30 | $(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS) 31 | 32 | flexkvs-mtcp: $(addprefix server/,$(FLEXKVS_COMMON:.o=.mtcp.o) main.mtcp.o) 33 | $(CC) $(LDFLAGS) $(MTCP_LDFLAGS) -o $@ $^ $(LDLIBS) $(MTCP_LDLIBS) 34 | 35 | kvsbench-mtcp: $(addprefix bench/,$(BENCH_COMMON:.o=.mtcp.o) main.mtcp.o) \ 36 | ../utils/utils.o 37 | $(CC) 
$(LDFLAGS) $(MTCP_LDFLAGS) -o $@ $^ $(LDLIBS) $(MTCP_LDLIBS) 38 | 39 | %.mtcp.o: %.c 40 | $(CC) $(CFLAGS) $(MTCP_CFLAGS) -c -o $@ $< 41 | 42 | clean: 43 | rm -f */*.o flexkvs kvsbench flexkvs-mtcp kvsbench-mtcp flexkvs-ll 44 | 45 | .PHONY: all all-sockets all-mtcp all-ll clean 46 | -------------------------------------------------------------------------------- /apps/flexkvs/bench/benchmark.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | */ 24 | 25 | #include 26 | 27 | #include "rng.h" 28 | 29 | #define DEL_RATIO 0.7 30 | 31 | enum key_dist { 32 | DIST_UNIFORM, 33 | DIST_ZIPF, 34 | }; 35 | 36 | struct settings { 37 | uint32_t dstip; 38 | uint16_t dstport; 39 | uint16_t threads; 40 | uint16_t conns; 41 | uint16_t pending; 42 | 43 | uint32_t keynum; 44 | union { 45 | struct { 46 | double s; 47 | } zipf; 48 | } keydistparams; 49 | double get_prob; 50 | enum key_dist keydist; 51 | uint64_t key_seed; 52 | uint64_t op_seed; 53 | uint32_t request_gap; 54 | uint32_t warmup_time; 55 | uint32_t cooldown_time; 56 | uint32_t run_time; 57 | uint16_t keysize; 58 | uint16_t valuesize; 59 | 60 | uint8_t batchsize; 61 | 62 | bool keybased; 63 | }; 64 | 65 | struct key { 66 | void *key; 67 | size_t keylen; 68 | double cdf; 69 | }; 70 | 71 | struct workload { 72 | struct rng op_rng; 73 | struct key *keys; 74 | size_t keys_num; 75 | }; 76 | 77 | struct workload_core { 78 | struct rng rng; 79 | }; 80 | 81 | enum workload_op { 82 | WL_OP_GET, 83 | WL_OP_SET, 84 | WL_OP_DELETE 85 | }; 86 | 87 | extern struct settings settings; 88 | 89 | enum error_ids { 90 | ERR_SUCCESS, 91 | ERR_KEY_ENOENT, 92 | ERR_KEY_EEXIST, 93 | ERR_E2BIG, 94 | ERR_EINVAL, 95 | ERR_NOT_STORED, 96 | ERR_DELTA_BADVAL, 97 | ERR_UNKNOWN_CMD, 98 | ERR_ENOMEM, 99 | ERR_OTHER, 100 | ERR_MAX, 101 | }; 102 | 103 | void print_usage(void); 104 | void init_settings(struct settings *s); 105 | int parse_settings(int argc, char *argv[], struct settings *s); 106 | 107 | bool trace_open(const char *path); 108 | bool trace_init(void); 109 | void trace_request_get(uint8_t thread, struct key *key, uint16_t id); 110 | void trace_request_set(uint8_t thread, struct key *key, uint32_t valsz, 111 | uint16_t id); 112 | void trace_response(uint8_t thread, uint16_t id, uint8_t err); 113 | void trace_flush(uint8_t thread); 114 | 115 | void workload_init(struct workload *wl); 116 | void workload_adjust(struct workload *wl, struct workload *wl2); 117 | void 
workload_core_init(struct workload *wl, struct workload_core *wc); 118 | void workload_op(struct workload *wl, struct workload_core *wc, struct key **k, 119 | enum workload_op *op); 120 | -------------------------------------------------------------------------------- /apps/flexkvs/bench/commandline.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "benchmark.h" 33 | 34 | void print_usage(void) 35 | { 36 | fprintf(stderr, 37 | "./benchmark [options] dst-ip:dst-port\n" 38 | "Options:\n" 39 | " -t, --threads=COUNT Number of sending threads [default 1].\n" 40 | " -C, --connns=COUNT # connections / thread [default 1].\n" 41 | " -p, --pending=NUM Number of pend. req/conn. [default 1].\n" 42 | " -k, --key-size=BYTES Key size in bytes [default 32].\n" 43 | " -n, --key-num=COUNT Number of keys [default 1000].\n" 44 | " -u, --key-uniform Uniform key distribution [default]\n" 45 | " -z, --key-zipf=S Zipf key distribution;\n" 46 | " S is the zipf parameter.\n" 47 | " -v, --val-size=BYTES Value size in bytes [default 1024].\n" 48 | " -g, --get-prob=PROB Probability of GET Reqs. [default .9].\n" 49 | " -T, --time=SECS Measurement time in [s]. [default 10].\n" 50 | " -w, --warmup=SECS Warmup time [s]. [default 5].\n" 51 | " -c, --cooldown=SECS Cooldown time [s]. 
[default 5].\n" 52 | " -s, --key-seed=SEED Seed for key PRG.\n" 53 | " -o, --op-seed=SEED Seed for operation PRG.\n" 54 | " -r, --trace=FILE Write operation trace to file.\n" 55 | " -K, --keysteer Key-based steering.\n"); 56 | } 57 | 58 | void init_settings(struct settings *s) 59 | { 60 | s->threads = 1; 61 | s->conns = 1; 62 | s->pending = 1; 63 | s->keysize = 32; 64 | s->keynum = 1000; 65 | s->keydist = DIST_UNIFORM; 66 | s->valuesize = 1024; 67 | s->get_prob = 0.9; 68 | s->warmup_time = 5; 69 | s->cooldown_time = 5; 70 | s->run_time = 10; 71 | s->request_gap = 100 * 1000; 72 | s->key_seed = 0x123457890123ULL; 73 | s->op_seed = 0x987654321098ULL; 74 | s->keybased = false; 75 | s->batchsize = 32; 76 | } 77 | 78 | int parse_settings(int argc, char *argv[], struct settings *s) 79 | { 80 | static struct option long_opts[] = { 81 | {"threads", required_argument, NULL, 't'}, 82 | {"conns", required_argument, NULL, 'C'}, 83 | {"pending", required_argument, NULL, 'p'}, 84 | {"key-size", required_argument, NULL, 'k'}, 85 | {"key-num", required_argument, NULL, 'n'}, 86 | {"key-uniform", no_argument, NULL, 'u'}, 87 | {"key-zipf", required_argument, NULL, 'z'}, 88 | {"val-size", required_argument, NULL, 'v'}, 89 | {"get-prob", required_argument, NULL, 'g'}, 90 | {"time", required_argument, NULL, 'T'}, 91 | {"warmup", required_argument, NULL, 'w'}, 92 | {"cooldown", required_argument, NULL, 'c'}, 93 | {"delay", required_argument, NULL, 'd'}, 94 | {"key-seed", required_argument, NULL, 's'}, 95 | {"op-seed", required_argument, NULL, 'o'}, 96 | {"keysteer", no_argument, NULL, 'K'}, 97 | }; 98 | static const char *short_opts = "t:C:p:k:n:uz:v:g:T:w:c:d:s:o:r:K"; 99 | int c, opt_idx, done = 0; 100 | char *end; 101 | 102 | while (!done) { 103 | c = getopt_long(argc, argv, short_opts, long_opts, &opt_idx); 104 | switch (c) { 105 | case 't': 106 | s->threads = strtoul(optarg, &end, 10); 107 | if (!*optarg || *end || s->threads < 1) { 108 | fprintf(stderr, "threads needs to be a 
positive " 109 | "integer\n"); 110 | return -1; 111 | } 112 | break; 113 | case 'C': 114 | s->conns = strtoul(optarg, &end, 10); 115 | if (!*optarg || *end || s->conns < 1) { 116 | fprintf(stderr, "conns needs to be a positive " 117 | "integer\n"); 118 | return -1; 119 | } 120 | break; 121 | case 'p': 122 | s->pending = strtoul(optarg, &end, 10); 123 | if (!*optarg || *end || s->pending < 1) { 124 | fprintf(stderr, "pending needs to be a positive " 125 | "integer\n"); 126 | return -1; 127 | } 128 | break; 129 | 130 | case 'k': 131 | s->keysize = strtoul(optarg, &end, 10); 132 | if (!*optarg || *end || s->keysize < 1) { 133 | fprintf(stderr, "Key size needs to be a positive " 134 | "integer\n"); 135 | return -1; 136 | } 137 | break; 138 | case 'n': 139 | s->keynum = strtoul(optarg, &end, 10); 140 | if (!*optarg || *end || s->keynum < 1) { 141 | fprintf(stderr, "Key count needs to be a positive " 142 | "integer\n"); 143 | return -1; 144 | } 145 | break; 146 | case 'v': 147 | s->valuesize = strtoul(optarg, &end, 10); 148 | if (!*optarg || *end || s->valuesize < 1) { 149 | fprintf(stderr, "Value size needs to be a positive " 150 | "integer\n"); 151 | return -1; 152 | } 153 | break; 154 | 155 | case 'u': 156 | s->keydist = DIST_UNIFORM; 157 | break; 158 | case 'z': 159 | s->keydist = DIST_ZIPF; 160 | s->keydistparams.zipf.s = strtod(optarg, &end); 161 | if (!*optarg || *end) { 162 | fprintf(stderr, "Zipf parameter needs to be a floating " 163 | "point number.\n"); 164 | return -1; 165 | } 166 | break; 167 | case'g': 168 | s->get_prob = strtod(optarg, &end); 169 | if (!*optarg || *end || s->get_prob < 0 || s->get_prob > 1) { 170 | fprintf(stderr, "GET probability needs to be a floating " 171 | "point number between 0 and 1.\n"); 172 | return -1; 173 | } 174 | break; 175 | case 'T': 176 | s->run_time = strtoul(optarg, &end, 10); 177 | if (!*optarg || *end || s->run_time < 1) { 178 | fprintf(stderr, "Run time needs to be a positive " 179 | "integer\n"); 180 | return -1; 
181 | } 182 | break; 183 | case 'w': 184 | s->warmup_time = strtoul(optarg, &end, 10); 185 | if (!*optarg || *end) { 186 | fprintf(stderr, "Warmup time needs to be a positive " 187 | "integer\n"); 188 | return -1; 189 | } 190 | break; 191 | case 'c': 192 | s->cooldown_time = strtoul(optarg, &end, 10); 193 | if (!*optarg || *end) { 194 | fprintf(stderr, "Cool down time needs to be a positive " 195 | "integer\n"); 196 | return -1; 197 | } 198 | break; 199 | case 'd': 200 | s->request_gap = strtoul(optarg, &end, 10); 201 | if (!*optarg || *end) { 202 | fprintf(stderr, "Delay needs to be a positive " 203 | "integer\n"); 204 | return -1; 205 | } 206 | break; 207 | case 's': 208 | s->key_seed = strtoull(optarg, &end, 0); 209 | if (!*optarg || *end) { 210 | fprintf(stderr, "Key seed needs to be an integer.\n"); 211 | return -1; 212 | } 213 | break; 214 | case 'o': 215 | s->op_seed = strtoull(optarg, &end, 0); 216 | if (!*optarg || *end) { 217 | fprintf(stderr, "Op seed needs to be an integer.\n"); 218 | return -1; 219 | } 220 | break; 221 | case 'K': 222 | settings.keybased = true; 223 | break; 224 | case -1: 225 | done = 1; 226 | break; 227 | case '?': 228 | return -1; 229 | default: 230 | abort(); 231 | } 232 | } 233 | 234 | if (optind + 1 != argc) { 235 | return -1; 236 | } 237 | 238 | /* separate ip and port at colon */ 239 | if ((end = strchr(argv[optind], ':')) == NULL) { 240 | fprintf(stderr, "Colon separating IP and port not found\n"); 241 | return -1; 242 | } 243 | *end = '\0'; 244 | end++; 245 | 246 | /* parse ip */ 247 | if (util_parse_ipv4(argv[optind], &s->dstip) != 0) { 248 | fprintf(stderr, "Parsing ip address failed\n"); 249 | return -1; 250 | } 251 | 252 | /* parse port */ 253 | s->dstport = strtoul(end, NULL, 10); 254 | 255 | // TODO: ensure key size / key num combination is valid 256 | 257 | return 0; 258 | } 259 | -------------------------------------------------------------------------------- /apps/flexkvs/bench/rng.c: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | */ 24 | 25 | #include "rng.h" 26 | 27 | static const uint64_t a = 0x5deece66dULL; 28 | static const uint64_t c = 0xb; 29 | static const uint64_t m = 1ULL << 48; 30 | 31 | void rng_init(struct rng *rng, uint64_t seed) 32 | { 33 | rng->seed = (seed ^ a) % m; 34 | } 35 | 36 | uint32_t rng_gen32(struct rng *rng) 37 | { 38 | uint64_t next; 39 | next = (a * rng->seed + c) % m; 40 | rng->seed = next; 41 | return next >> 16; 42 | } 43 | 44 | double rng_gend(struct rng *rng) 45 | { 46 | // This is what Java seems to do 47 | uint64_t x = 48 | (((uint64_t) rng_gen32(rng) >> 6) << 27) + (rng_gen32(rng) >> 5); 49 | return x / ((double) (1ULL << 53)); 50 | } 51 | 52 | void rng_gen(struct rng *rng, void *buf, size_t size) 53 | { 54 | uint32_t x; 55 | while (size >= 4) { 56 | * ((uint32_t *) buf) = rng_gen32(rng); 57 | buf = (void*) ((uintptr_t) buf + 4); 58 | size -= 4; 59 | } 60 | 61 | x = rng_gen32(rng); 62 | while (size > 0) { 63 | * ((uint8_t *) buf) = x >> 24; 64 | x <<= 8; 65 | buf = (void*) ((uintptr_t) buf + 1); 66 | size--; 67 | } 68 | 69 | } 70 | 71 | -------------------------------------------------------------------------------- /apps/flexkvs/bench/rng.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or 
substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | #ifndef RNG_H_ 26 | #define RNG_H_ 27 | 28 | #include 29 | #include 30 | 31 | struct rng { 32 | uint64_t seed; 33 | }; 34 | 35 | void rng_init(struct rng *rng, uint64_t seed); 36 | uint32_t rng_gen32(struct rng *rng); 37 | /** Double uniform between 0 and 1 */ 38 | double rng_gend(struct rng *rng); 39 | void rng_gen(struct rng *rng, void *buf, size_t size); 40 | 41 | #endif // ndef RNG_H_ 42 | -------------------------------------------------------------------------------- /apps/flexkvs/bench/workload.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include "benchmark.h" 31 | #include "rng.h" 32 | 33 | static struct key *generate_keys(struct rng *rng, size_t n, size_t ks); 34 | static void distribute_uniform(struct key *keys, size_t n); 35 | static void distribute_zipf(struct key *keys, size_t n, double s); 36 | static struct key *draw_key(struct key *keys, size_t n, struct rng *rng); 37 | 38 | 39 | void workload_init(struct workload *wl) 40 | { 41 | struct rng key_rng; 42 | 43 | /* prepare rngs and distributions for keys */ 44 | rng_init(&key_rng, settings.key_seed); 45 | rng_init(&wl->op_rng, settings.op_seed); 46 | wl->keys = generate_keys(&key_rng, settings.keynum, settings.keysize); 47 | wl->keys_num = settings.keynum; 48 | if (settings.keydist == DIST_UNIFORM) { 49 | distribute_uniform(wl->keys, wl->keys_num); 50 | } else { 51 | distribute_zipf(wl->keys, wl->keys_num, settings.keydistparams.zipf.s); 52 | } 53 | } 54 | 55 | void workload_adjust(struct workload *wl, struct workload *wl2) 56 | { 57 | struct rng key_rng; 58 | 59 | /* prepare rngs and distributions for keys */ 60 | wl->keys = generate_keys(&key_rng, settings.keynum * (1-DEL_RATIO), settings.keysize); 61 | wl2->keys = generate_keys(&key_rng, settings.keynum * (DEL_RATIO), settings.keysize); 62 | wl->keys_num = settings.keynum * (1-DEL_RATIO); 63 | wl2->keys_num = settings.keynum * DEL_RATIO; 64 | if (settings.keydist == DIST_UNIFORM) { 65 | 
distribute_uniform(wl->keys, wl->keys_num); 66 | distribute_uniform(wl2->keys, wl2->keys_num); 67 | } else { 68 | distribute_zipf(wl->keys, wl->keys_num, settings.keydistparams.zipf.s); 69 | distribute_zipf(wl2->keys, wl2->keys_num, settings.keydistparams.zipf.s); 70 | } 71 | } 72 | 73 | 74 | void workload_core_init(struct workload *wl, struct workload_core *wc) 75 | { 76 | rng_init(&wc->rng, 77 | ((uint64_t) rng_gen32(&wl->op_rng) << 16) ^ rng_gen32(&wl->op_rng)); 78 | } 79 | 80 | void workload_op(struct workload *wl, struct workload_core *wc, struct key **k, 81 | enum workload_op *op) 82 | { 83 | if (rng_gend(&wc->rng) <= settings.get_prob) { 84 | *op = WL_OP_GET; 85 | } else { 86 | *op = WL_OP_SET; 87 | } 88 | *k = draw_key(wl->keys, wl->keys_num, &wc->rng); 89 | } 90 | 91 | /** Generate n keys (no distribution set) */ 92 | static struct key *generate_keys(struct rng *rng, size_t n, size_t keysz) 93 | { 94 | size_t i; 95 | struct key *k = malloc(n * sizeof(*k) + n * keysz); 96 | uint8_t *keys = (uint8_t *) (k + n); 97 | if (k == NULL) { 98 | abort(); 99 | } 100 | for (i = 0; i < n; i++) { 101 | rng_gen(rng, keys, keysz); 102 | k[i].key = keys; 103 | k[i].keylen = keysz; 104 | keys += keysz; 105 | } 106 | 107 | // TODO: Fix duplicates 108 | return k; 109 | } 110 | 111 | /** Distribute keys uniformly */ 112 | static void distribute_uniform(struct key *keys, size_t n) 113 | { 114 | size_t i; 115 | double p = (double) 1 / (double) n; 116 | double sum = 0; 117 | for (i = 0; i < n; i++) { 118 | sum += p; 119 | keys[i].cdf = sum; 120 | } 121 | } 122 | 123 | /** Distribute keys according to zipf distribution with parameter s. 
*/ 124 | static void distribute_zipf(struct key *keys, size_t n, double s) 125 | { 126 | size_t i; 127 | double c = 0; 128 | double sum = 0; 129 | 130 | for (i = 0; i < n; i++) { 131 | c += 1 / pow(i + 1, s); 132 | } 133 | 134 | for (i = 0; i < n; i++) { 135 | sum += 1 / pow(i + 1, s) / c; 136 | keys[i].cdf = sum; 137 | } 138 | } 139 | 140 | /** Binary search helper (returns -1 to go left, 0 if found, 1 to go right). */ 141 | static inline int key_in_range(struct key *keys, size_t n, size_t i, double x) 142 | { 143 | double cdf = keys[i].cdf; 144 | //printf("key in range n=%lu i=%lu x=%lf cdf=%lf\n", n, i, x, cdf); 145 | if (x < cdf) { 146 | if (i == 0) { 147 | return 0; 148 | } else { 149 | return (x <= keys[i - 1].cdf ? -1 : 0); 150 | } 151 | } else if (x > cdf) { 152 | if (i == n - 1) { 153 | // Already at right-most value (could happen due to rounding errors 154 | // when generating the distribution) 155 | return 0; 156 | } else { 157 | return 1; 158 | } 159 | } else { 160 | return 0; 161 | } 162 | } 163 | 164 | /** Draw a key at random, according to the configured distribution. */ 165 | static struct key *draw_key(struct key *keys, size_t n, struct rng *rng) 166 | { 167 | double x = rng_gend(rng); 168 | size_t l, r, mid = 0; 169 | int res; 170 | //printf("draw_key(n=%lu)\n", n); 171 | 172 | l = 0; 173 | r = n - 1; 174 | while (l < r) { 175 | mid = (l + r) / 2; 176 | res = key_in_range(keys, n, mid, x); 177 | if (res < 0) { 178 | r = mid - 1; 179 | } else if (res > 0) { 180 | l = mid + 1; 181 | } else { 182 | break; 183 | } 184 | } 185 | 186 | return keys + mid; 187 | } 188 | -------------------------------------------------------------------------------- /apps/flexkvs/flexkvs.conf: -------------------------------------------------------------------------------- 1 | ############### mtcp configuration file ############### 2 | 3 | # The underlying I/O module you want to use. Please 4 | # enable only one out of the two. 
5 | #io = psio 6 | #io = netmap 7 | io = dpdk 8 | 9 | # No. of cores setting (enabling this option will override 10 | # the `cpu' config for those applications that accept 11 | # num_cores as command line arguments) 12 | # 13 | # e.g. in case ./epserver is executed with `-N 4', the 14 | # mtcp core will still invoke 8 mTCP threads if the 15 | # following line is uncommented. 16 | num_cores = 4 17 | 18 | # Number of memory channels per processor socket (dpdk-only) 19 | num_mem_ch = 2 20 | 21 | # Enable multi-process support (under development) 22 | #multiprocess = 0 master 23 | #multiprocess = 1 24 | 25 | # Used port (please adjust accordingly) 26 | #------ PSIO ports -------# 27 | #port = xge0 xge1 28 | #port = xge1 29 | #------ DPDK ports -------# 30 | port = dpdk0 dpdk1 ens2f1 31 | #port = dpdk0 dpdk1 32 | #port = dpdk0:0 33 | #port = dpdk0:1 34 | 35 | # Maximum concurrency per core 36 | max_concurrency = 512 37 | 38 | # Maximum number of socket buffers per core 39 | # Set this to small value if there are many idle connections 40 | max_num_buffers = 512 41 | #max_num_buffers = 32 42 | 43 | # Receive buffer size of sockets 44 | rcvbuf = 8192 45 | #rcvbuf = 1048576 46 | 47 | # Send buffer size of sockets 48 | sndbuf = 8192 49 | #sndbuf = 1048576 50 | 51 | # TCP timeout seconds 52 | # (tcp_timeout = -1 can disable the timeout check) 53 | tcp_timeout = 1 54 | 55 | # TCP timewait seconds 56 | tcp_timewait = 0 57 | 58 | # Interface to print stats (please adjust accordingly) 59 | # You can enable multiple ports in separate lines 60 | #------ PSIO ports -------# 61 | #stat_print = xge0 62 | #stat_print = xge1 63 | #------ DPDK ports -------# 64 | #stat_print = dpdk1 65 | #stat_print = dpdk0:0 66 | #stat_print = dpdk0:1 67 | #stat_print = dpdk1 68 | 69 | ####################################################### 70 | -------------------------------------------------------------------------------- /apps/flexkvs/kvsbench.conf: 
-------------------------------------------------------------------------------- 1 | ############### mtcp configuration file ############### 2 | 3 | # The underlying I/O module you want to use. Please 4 | # enable only one out of the two. 5 | #io = psio 6 | #io = netmap 7 | io = dpdk 8 | 9 | # No. of cores setting (enabling this option will override 10 | # the `cpu' config for those applications that accept 11 | # num_cores as command line arguments) 12 | # 13 | # e.g. in case ./epserver is executed with `-N 4', the 14 | # mtcp core will still invoke 8 mTCP threads if the 15 | # following line is uncommented. 16 | num_cores = 1 17 | 18 | # Number of memory channels per processor socket (dpdk-only) 19 | num_mem_ch = 2 20 | 21 | # Enable multi-process support (under development) 22 | #multiprocess = 0 master 23 | #multiprocess = 1 24 | 25 | # Used port (please adjust accordingly) 26 | #------ PSIO ports -------# 27 | #port = xge0 xge1 28 | #port = xge1 29 | #------ DPDK ports -------# 30 | port = dpdk0 dpdk1 ens2f1 31 | #port = dpdk0 dpdk1 32 | #port = dpdk0:0 33 | #port = dpdk0:1 34 | 35 | # Maximum concurrency per core 36 | max_concurrency = 512 37 | 38 | # Maximum number of socket buffers per core 39 | # Set this to small value if there are many idle connections 40 | max_num_buffers = 512 41 | #max_num_buffers = 32 42 | 43 | # Receive buffer size of sockets 44 | rcvbuf = 8192 45 | #rcvbuf = 1048576 46 | 47 | # Send buffer size of sockets 48 | sndbuf = 8192 49 | #sndbuf = 1048576 50 | 51 | # TCP timeout seconds 52 | # (tcp_timeout = -1 can disable the timeout check) 53 | tcp_timeout = 1 54 | 55 | # TCP timewait seconds 56 | tcp_timewait = 0 57 | 58 | # Interface to print stats (please adjust accordingly) 59 | # You can enable multiple ports in separate lines 60 | #------ PSIO ports -------# 61 | #stat_print = xge0 62 | #stat_print = xge1 63 | #------ DPDK ports -------# 64 | #stat_print = dpdk1 65 | #stat_print = dpdk0:0 66 | #stat_print = dpdk0:1 67 | #stat_print 
= dpdk1 68 | 69 | ####################################################### 70 | -------------------------------------------------------------------------------- /apps/flexkvs/server/hashtable.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #include "iokvs.h" 30 | 31 | #define HASHTABLE_POWER 31 32 | #define TABLESZ(p) (1ULL << (p)) 33 | 34 | _Static_assert(sizeof(pthread_spinlock_t) == 4, "Bad spinlock size"); 35 | 36 | #define BUCKET_NITEMS 5 37 | 38 | //#define NOHTLOCKS 1 39 | 40 | struct hash_bucket { 41 | struct item *items[BUCKET_NITEMS]; 42 | uint32_t hashes[BUCKET_NITEMS]; 43 | pthread_spinlock_t lock; 44 | } __attribute__((packed)); 45 | 46 | _Static_assert(sizeof(struct hash_bucket) == 64, "Bad hash bucket size"); 47 | 48 | /******************************************************************************/ 49 | /* Hashtable */ 50 | 51 | static size_t nbuckets; 52 | static struct hash_bucket *buckets; 53 | 54 | void hasht_init(void) 55 | { 56 | size_t i; 57 | 58 | nbuckets = TABLESZ(HASHTABLE_POWER); 59 | printf("allocing %zu buckets for %zu bytes\n", nbuckets, nbuckets * sizeof(*buckets)); 60 | buckets = calloc(nbuckets + 1, sizeof(*buckets)); 61 | buckets = (struct hash_bucket *) (((uintptr_t) buckets + 63) & ~63ULL); 62 | if (buckets == NULL) { 63 | perror("Allocating item hash table failed"); 64 | abort(); 65 | } 66 | 67 | for (i = 0; i < nbuckets; i++) { 68 | if (pthread_spin_init(&buckets[i].lock, 0) != 0) { 69 | perror("Initializing spin lock failed"); 70 | abort(); 71 | } 72 | } 73 | } 74 | 75 | 76 | static inline bool item_key_matches(struct item *it, const void *key, 77 | size_t klen) 78 | { 79 | return klen == it->keylen && !__builtin_memcmp(item_key(it), key, klen); 80 | } 81 | 82 | static inline bool item_hkey_matches(struct item *it, const void *key, 83 | size_t klen, uint32_t hv) 84 | { 85 | return it->hv == hv && item_key_matches(it, key, klen); 86 | } 87 | 88 | #if 0 89 | void hasht_prefetch1(uint32_t hv) 90 | { 91 | rte_prefetch0(buckets + (hv % nbuckets)); 92 | } 93 | 94 | void hasht_prefetch2(uint32_t hv) 95 | { 96 | struct hash_bucket *b; 97 | size_t i; 98 | 99 | b = buckets + (hv % nbuckets); 100 | for (i = 
0; i < BUCKET_NITEMS; i++) { 101 | if (b->items[i] != NULL && b->hashes[i] == hv) { 102 | rte_prefetch0(b->items[i]); 103 | } 104 | } 105 | } 106 | #endif 107 | 108 | 109 | struct item *hasht_get(const void *key, size_t klen, uint32_t hv) 110 | { 111 | struct hash_bucket *b; 112 | struct item *it; 113 | size_t i; 114 | 115 | b = buckets + (hv % nbuckets); 116 | #ifndef NOHTLOCKS 117 | pthread_spin_lock(&b->lock); 118 | #endif 119 | 120 | for (i = 0; i < BUCKET_NITEMS; i++) { 121 | if (b->items[i] != NULL && b->hashes[i] == hv) { 122 | it = b->items[i]; 123 | if (item_key_matches(it, key, klen)) { 124 | goto done; 125 | } 126 | } 127 | } 128 | it = b->items[BUCKET_NITEMS - 1]; 129 | if (it != NULL) { 130 | it = it->next; 131 | while (it != NULL && !item_hkey_matches(it, key, klen, hv)) { 132 | it = it->next; 133 | } 134 | } 135 | done: 136 | if (it != NULL) { 137 | item_ref(it); 138 | } 139 | #ifndef NOHTLOCKS 140 | pthread_spin_unlock(&b->lock); 141 | #endif 142 | return it; 143 | } 144 | 145 | 146 | void hasht_put(struct item *nit, struct item *cas) 147 | { 148 | struct hash_bucket *b; 149 | struct item *it, *prev; 150 | size_t i, di; 151 | bool has_direct = false; 152 | uint32_t hv = nit->hv; 153 | void *key = item_key(nit); 154 | size_t klen = nit->keylen; 155 | 156 | 157 | b = buckets + (hv % nbuckets); 158 | #ifndef NOHTLOCKS 159 | pthread_spin_lock(&b->lock); 160 | #endif 161 | 162 | // Check if we need to replace an existing item 163 | for (i = 0; i < BUCKET_NITEMS; i++) { 164 | if (b->items[i] == NULL) { 165 | has_direct = true; 166 | di = i; 167 | } else if (b->hashes[i] == hv) { 168 | it = b->items[i]; 169 | if (item_key_matches(it, key, klen)) { 170 | // Were doing a compare and set 171 | if (cas != NULL && cas != it) { 172 | goto done; 173 | } 174 | assert(nit != it); 175 | item_ref(nit); 176 | nit->next = it->next; 177 | b->items[i] = nit; 178 | item_unref(it); 179 | goto done; 180 | } 181 | } 182 | } 183 | 184 | if (cas != NULL) { 185 | goto done; 186 
| } 187 | 188 | item_ref(nit); 189 | 190 | // Note it does not match, otherwise we would have already bailed in the for 191 | // loop 192 | it = b->items[BUCKET_NITEMS - 1]; 193 | if (it != NULL) { 194 | prev = it; 195 | it = it->next; 196 | while (it != NULL && !item_hkey_matches(it, key, klen, hv)) { 197 | prev = it; 198 | it = it->next; 199 | } 200 | 201 | if (it != NULL) { 202 | nit->next = it->next; 203 | prev->next = nit; 204 | item_unref(it); 205 | goto done; 206 | } 207 | } 208 | 209 | // We did not find an existing entry to replace, just stick it in wherever 210 | // we find room 211 | if (!has_direct) { 212 | di = BUCKET_NITEMS - 1; 213 | } 214 | nit->next = b->items[di]; 215 | b->hashes[di] = hv; 216 | b->items[di] = nit; 217 | 218 | done: 219 | #ifndef NOHTLOCKS 220 | pthread_spin_unlock(&b->lock); 221 | #endif 222 | return; 223 | } 224 | 225 | 226 | -------------------------------------------------------------------------------- /apps/flexkvs/server/ialloc.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 
15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | #include "iokvs.h" 33 | 34 | 35 | #define SF_INACTIVE 1 36 | #define SF_CLEANED 4 37 | 38 | struct segment_header { 39 | void *data; 40 | struct segment_header *next; 41 | struct segment_header *prev; 42 | uint32_t offset; 43 | uint32_t freed; 44 | uint32_t size; 45 | uint32_t flags; 46 | }; 47 | 48 | static struct segment_header *free_segments; 49 | static pthread_spinlock_t segalloc_lock; 50 | static void *seg_base; 51 | static struct segment_header **seg_headers; 52 | static size_t seg_alloced; 53 | 54 | #ifdef BARRELFISH 55 | void *mem_base; 56 | uint64_t mem_base_phys; 57 | #endif 58 | 59 | void ialloc_init(void) 60 | { 61 | pthread_spin_init(&segalloc_lock, 0); 62 | free_segments = NULL; 63 | size_t total; 64 | 65 | seg_alloced = 0; 66 | total = settings.segsize * settings.segmaxnum; 67 | printf("Allocating %lu bytes\n", (long unsigned int) total); 68 | 69 | #ifdef BARRELFISH 70 | { 71 | errval_t r; 72 | struct capref cap; 73 | struct frame_identity id; 74 | 75 | r = myt_alloc_map(VREGION_FLAGS_READ_WRITE, total, &seg_base, &cap); 76 | if (err_is_fail(r)) { 77 | USER_PANIC_ERR(r, "Preallocating failed"); 78 | } 79 | 80 | r = invoke_frame_identify(cap, &id); 81 | if (err_is_fail(r)) { 82 | USER_PANIC_ERR(r, "identify failed"); 83 | } 84 | 85 | mem_base = seg_base; 86 | mem_base_phys = id.base; 87 | } 88 | #else 89 | //if ((seg_base = mmap(NULL, 
total, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 90 | //if ((seg_base = mmap(NULL, total, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, 91 | // -1, 0)) == MAP_FAILED) 92 | if ((seg_base = mmap(NULL, total, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 93 | -1, 0)) == MAP_FAILED) 94 | { 95 | perror("mmap() of segments base failed"); 96 | abort(); 97 | } 98 | printf("seg base %p through %p\n", seg_base, seg_base + total); 99 | #endif 100 | if ((seg_headers = calloc(settings.segmaxnum, sizeof(*seg_headers))) == 101 | NULL) 102 | { 103 | perror("Allocating segment header array failed"); 104 | abort(); 105 | } 106 | } 107 | 108 | static struct segment_header *segment_alloc(void) 109 | { 110 | struct segment_header *h = NULL; 111 | void *data; 112 | size_t i, segsz; 113 | 114 | /* Try to get a segment from the freelist */ 115 | if (free_segments != NULL) { 116 | pthread_spin_lock(&segalloc_lock); 117 | if (free_segments != NULL) { 118 | h = free_segments; 119 | free_segments = h->next; 120 | } 121 | pthread_spin_unlock(&segalloc_lock); 122 | 123 | if (h != NULL) { 124 | goto init_h; 125 | } 126 | } 127 | 128 | /* Check if there are still unallocated segments (note: unlocked) */ 129 | i = seg_alloced; 130 | if (i >= settings.segmaxnum) { 131 | pthread_spin_unlock(&segalloc_lock); 132 | return NULL; 133 | } 134 | 135 | /* If there is a possiblity that there are still unallocated segments, let's 136 | * go for it. 
*/ 137 | pthread_spin_lock(&segalloc_lock); 138 | i = seg_alloced; 139 | if (i >= settings.segmaxnum) { 140 | pthread_spin_unlock(&segalloc_lock); 141 | return NULL; 142 | } 143 | 144 | seg_alloced++; 145 | pthread_spin_unlock(&segalloc_lock); 146 | 147 | segsz = settings.segsize; 148 | data = (void *) ((uintptr_t) seg_base + segsz * i); 149 | #ifndef BARRELFISH 150 | if (mprotect(data, settings.segsize, PROT_READ | PROT_WRITE) != 0) { 151 | perror("mprotect failed"); 152 | /* TODO: check what to do here */ 153 | return NULL; 154 | } 155 | #endif 156 | 157 | h = malloc(sizeof(*h)); 158 | if (h == NULL) { 159 | /* TODO: check what to do here */ 160 | return NULL; 161 | } 162 | seg_headers[i] = h; 163 | 164 | h->size = segsz; 165 | h->data = data; 166 | //printf("allocating segment with size %zu and data at %p\n", segsz, data); 167 | init_h: 168 | h->offset = 0; 169 | h->flags = 0; 170 | h->freed = 0; 171 | return h; 172 | } 173 | 174 | static inline struct segment_header *segment_from_part(void *data) 175 | { 176 | size_t i = ((uintptr_t) data - (uintptr_t) seg_base) / settings.segsize; 177 | assert(i < settings.segmaxnum); 178 | return seg_headers[i]; 179 | } 180 | 181 | static void segment_free(struct segment_header *h) 182 | { 183 | pthread_spin_lock(&segalloc_lock); 184 | h->offset = 0; 185 | h->next = free_segments; 186 | free_segments = h; 187 | pthread_spin_unlock(&segalloc_lock); 188 | } 189 | 190 | static void segment_item_free(struct segment_header *h, size_t total) 191 | { 192 | if (h->size != __sync_add_and_fetch(&h->freed, total)) { 193 | return; 194 | } 195 | } 196 | 197 | static struct item *segment_item_alloc(struct segment_header *h, size_t total) 198 | { 199 | struct item *it = (struct item *) ((uintptr_t) h->data + h->offset); 200 | size_t avail; 201 | 202 | /* Not enough room in this segment */ 203 | avail = h->size - h->offset; 204 | if (avail == 0) { 205 | return NULL; 206 | } else if (avail < total) { 207 | if (avail >= sizeof(struct item)) { 
208 | it->refcount = 0; 209 | /* needed for log scan */ 210 | it->keylen = avail - sizeof(struct item); 211 | it->vallen = 0; 212 | } 213 | segment_item_free(h, avail); 214 | h->offset += avail; 215 | return NULL; 216 | } 217 | 218 | /* Ordering here is important */ 219 | it->refcount = 1; 220 | 221 | h->offset += total; 222 | 223 | return it; 224 | } 225 | 226 | 227 | void ialloc_init_allocator(struct item_allocator *ia) 228 | { 229 | struct segment_header *h; 230 | 231 | memset(ia, 0, sizeof(*ia)); 232 | 233 | if ((h = segment_alloc()) == NULL) { 234 | fprintf(stderr, "Allocating segment failed\n"); 235 | abort(); 236 | } 237 | h->next = NULL; 238 | ia->cur = h; 239 | ia->oldest = h; 240 | 241 | if ((h = segment_alloc()) == NULL) { 242 | fprintf(stderr, "Allocating reserved segment failed\n"); 243 | abort(); 244 | } 245 | h->next = NULL; 246 | ia->reserved = h; 247 | 248 | printf("Initializing allocator: %lu\n", (unsigned long) (settings.segcqsize * 249 | sizeof(*ia->cleanup_queue))); 250 | ia->cleanup_queue = calloc(settings.segcqsize, sizeof(*ia->cleanup_queue)); 251 | ia->cq_head = ia->cq_tail = 0; 252 | ia->cleaning = NULL; 253 | } 254 | 255 | struct item *ialloc_alloc(struct item_allocator *ia, size_t total, bool cleanup) 256 | { 257 | struct segment_header *h, *old; 258 | struct item *it; 259 | assert(total < settings.segsize); 260 | 261 | /* If the reserved segment is currently active, only allocations for cleanup 262 | * are allowed */ 263 | if (ia->reserved == NULL && !cleanup) { 264 | printf("Only cleanup!\n"); 265 | return NULL; 266 | } 267 | 268 | old = ia->cur; 269 | if ((it = segment_item_alloc(old, total)) != NULL) { 270 | return it; 271 | } 272 | 273 | if ((h = segment_alloc()) == NULL) { 274 | /* We're currently doing cleanup, and still have the reserved segment 275 | * then that can be used now */ 276 | if (cleanup && ia->reserved != NULL) { 277 | h = ia->reserved; 278 | ia->reserved = NULL; 279 | } else { 280 | printf("Fail 2!\n"); 281 | return 
NULL; 282 | } 283 | } 284 | old->next = h; 285 | h->next = NULL; 286 | /* Mark old segment as GC-able */ 287 | old->flags |= SF_INACTIVE; 288 | ia->cur = h; 289 | 290 | it = segment_item_alloc(h, total); 291 | if (it == NULL) { 292 | printf("Fail 3!\n"); 293 | return NULL; 294 | } 295 | return it; 296 | } 297 | 298 | void ialloc_free(struct item *it, size_t total) 299 | { 300 | struct segment_header *h = segment_from_part(it); 301 | segment_item_free(h, total); 302 | } 303 | 304 | void ialloc_free_dont_need(struct item *it, size_t total) 305 | { 306 | struct segment_header *h = segment_from_part(it); 307 | segment_item_free(h, total); 308 | if (madvise(it, item_totalsz(it), MADV_DONTNEED) != 0){ 309 | perror("madvise"); 310 | } 311 | } 312 | 313 | struct item *ialloc_cleanup_item(struct item_allocator *ia, bool idle) 314 | { 315 | size_t i; 316 | struct item *it; 317 | 318 | if (!idle) { 319 | if (ia->cleanup_count >= 32) { 320 | return NULL; 321 | } 322 | ia->cleanup_count++; 323 | } 324 | 325 | i = ia->cq_head; 326 | it = ia->cleanup_queue[i]; 327 | if (it != NULL) { 328 | ia->cleanup_queue[i] = NULL; 329 | ia->cq_head = (i + 1) % settings.segcqsize; 330 | } 331 | if (ia->reserved == NULL) { 332 | ia->reserved = segment_alloc(); 333 | } 334 | return it; 335 | } 336 | 337 | void ialloc_cleanup_nextrequest(struct item_allocator *ia) 338 | { 339 | ia->cleanup_count = 0; 340 | } 341 | 342 | void ialloc_maintenance(struct item_allocator *ia) 343 | { 344 | #if 0 345 | struct segment_header *h, *prev, *next, *cand; 346 | struct item *it, **cq = ia->cleanup_queue; 347 | size_t off, size, idx; 348 | double cand_ratio, ratio; 349 | void *data; 350 | 351 | /* Check if we can now free some segments? 
While we're at it, we can also 352 | * look for a candidate to be cleaned */ 353 | h = ia->oldest; 354 | prev = NULL; 355 | cand = NULL; 356 | cand_ratio = 0; 357 | while (h != NULL && (h->flags & SF_INACTIVE) == SF_INACTIVE) { 358 | next = h->next; 359 | /* Done with this segment? */ 360 | if (h->freed == h->size) { 361 | if (prev == NULL) { 362 | ia->oldest = h->next; 363 | } else { 364 | prev->next = h->next; 365 | } 366 | segment_free(h); 367 | h = prev; 368 | } else { 369 | /* Otherwise we also look for the next cleanup candidate if 370 | * necessary */ 371 | ratio = (double) h->freed / h->size; 372 | if (ratio >= 0.5 && ratio > cand_ratio) { 373 | cand_ratio = ratio; 374 | cand = h; 375 | } 376 | } 377 | prev = h; 378 | h = next; 379 | } 380 | 381 | /* Check if we're currently working on cleaning a segment */ 382 | h = ia->cleaning; 383 | off = ia->clean_offset; 384 | size = (h == NULL ? 0 : h->size); 385 | if (h == NULL || off == size) { 386 | h = cand; 387 | ia->cleaning = h; 388 | off = ia->clean_offset = 0; 389 | } 390 | 391 | /* No segments to clean, that's great! */ 392 | if (h == NULL) { 393 | return; 394 | } 395 | 396 | /* Enqueue clean requests to worker untill we run out or the queue is filled 397 | * up */ 398 | idx = ia->cq_tail; 399 | data = h->data; 400 | while (off < size && cq[idx] == NULL) { 401 | it = (struct item *) ((uintptr_t) data + off); 402 | if (size - off < sizeof(struct item)) { 403 | off = size; 404 | break; 405 | } 406 | if (item_tryref(it)) { 407 | cq[idx] = it; 408 | idx = (idx + 1) % settings.segcqsize; 409 | } 410 | off += item_totalsz(it); 411 | } 412 | ia->cq_tail = idx; 413 | ia->clean_offset = off; 414 | #endif 415 | struct segment_header *h, *prev, *next, *cand; 416 | struct item *it, **cq = ia->cleanup_queue; 417 | size_t off, size, idx; 418 | double cand_ratio, ratio; 419 | void *data; 420 | 421 | /* Check if we can now free some segments? 
While we're at it, we can also 422 | * look for a candidate to be cleaned */ 423 | cand = NULL; 424 | cand_ratio = 0; 425 | h = ia->oldest; 426 | prev = NULL; 427 | /* We stop before the last segment in the list, and if we hit any 428 | * non-inactive segments. This prevents us from having to touch the cur 429 | * pointers. */ 430 | while (h != NULL && h->next != NULL && 431 | (h->flags & SF_INACTIVE) == SF_INACTIVE) 432 | { 433 | next = h->next; 434 | ratio = (double) h->freed / h->size; 435 | /* Done with this segment? */ 436 | if (h->freed == h->size) { 437 | if (prev == NULL) { 438 | ia->oldest = h->next; 439 | } else { 440 | prev->next = h->next; 441 | } 442 | segment_free(h); 443 | h = prev; 444 | } else if ((h->flags & SF_CLEANED) != SF_CLEANED) { 445 | /* Otherwise we also look for the next cleanup candidate if 446 | * necessary */ 447 | ratio = (double) h->freed / h->size; 448 | if (ratio >= settings.clean_ratio && ratio > cand_ratio) { 449 | cand_ratio = ratio; 450 | cand = h; 451 | } 452 | } 453 | prev = h; 454 | h = next; 455 | } 456 | 457 | /* Check if we're currently working on cleaning a segment */ 458 | h = ia->cleaning; 459 | off = ia->clean_offset; 460 | size = (h == NULL ? 0 : h->size); 461 | if (h == NULL || off == size) { 462 | h = cand; 463 | ia->cleaning = h; 464 | off = ia->clean_offset = 0; 465 | if (h != NULL) { 466 | h->flags |= SF_CLEANED; 467 | } 468 | } 469 | 470 | /* No segments to clean, that's great! 
*/ 471 | if (h == NULL) { 472 | return; 473 | } 474 | 475 | /* Enqueue clean requests to worker untill we run out or the queue is filled 476 | * up */ 477 | idx = ia->cq_tail; 478 | data = h->data; 479 | while (off < size && cq[idx] == NULL) { 480 | it = (struct item *) ((uintptr_t) data + off); 481 | if (size - off < sizeof(struct item)) { 482 | off = size; 483 | break; 484 | } 485 | if (item_tryref(it)) { 486 | cq[idx] = it; 487 | idx = (idx + 1) % settings.segcqsize; 488 | } 489 | off += item_totalsz(it); 490 | } 491 | ia->cq_tail = idx; 492 | ia->clean_offset = off; 493 | 494 | } 495 | -------------------------------------------------------------------------------- /apps/flexkvs/server/iokvs.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 | * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | #ifndef IOKVS_H_ 26 | #define IOKVS_H_ 27 | 28 | #define GNU_SOURCE_ 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | 35 | /******************************************************************************/ 36 | /* Settings */ 37 | 38 | /** Configurable settings */ 39 | struct settings { 40 | /** Size of log segments in bytes */ 41 | size_t segsize; 42 | /** Maximal number of segments to use */ 43 | size_t segmaxnum; 44 | /** Size of seqment clean queue */ 45 | size_t segcqsize; 46 | /** Segment cleaning ratio */ 47 | double clean_ratio; 48 | /** UDP port to listen on */ 49 | uint16_t udpport; 50 | /** Verbosity for log messages. */ 51 | uint8_t verbose; 52 | /** Number of cores */ 53 | uint8_t numcores; 54 | /** Config file */ 55 | char *config_file; 56 | }; 57 | 58 | /** Global settings */ 59 | extern struct settings settings; 60 | 61 | /** Initialize global settings from command-line. */ 62 | int settings_init(int argc, char *argv[]); 63 | 64 | 65 | /******************************************************************************/ 66 | /* Hash table operations */ 67 | 68 | /** Initialize hash table. */ 69 | void hasht_init(void); 70 | 71 | /** Prefetch hash table slot */ 72 | void hasht_prefetch1(uint32_t hv); 73 | 74 | /** Prefetch matching items */ 75 | void hasht_prefetch2(uint32_t hv); 76 | 77 | /** 78 | * Lookup key in hash table. 
79 | * @param key Key 80 | * @param klen Length of key in bytes 81 | * @param hv Hash of key 82 | * @return Pointer to item or NULL 83 | */ 84 | struct item *hasht_get(const void *key, size_t klen, uint32_t hv); 85 | 86 | /** 87 | * Insert item into hash table 88 | * @param it Item 89 | * @param cas If != NULL, will only store `it' if cas is the object currently 90 | * stored for the key (compare and set). 91 | */ 92 | void hasht_put(struct item *it, struct item *cas); 93 | 94 | 95 | /******************************************************************************/ 96 | /* Item Allocation */ 97 | struct segment_header; 98 | struct item; 99 | /** 100 | * Item allocator struct. Should be considered to be opaque outside ialloc.c 101 | * 102 | * This struct is slightly ugly, as it is split up into 3 parts to reduce false 103 | * sharing as much as possible. 104 | */ 105 | struct item_allocator { 106 | /***********************************************************/ 107 | /* Part 1: mostly read-only for maintenance and worker */ 108 | 109 | /* Reserved segment for log cleaning in case we run out */ 110 | struct segment_header *reserved; 111 | /* Queue for communication */ 112 | struct item **cleanup_queue; 113 | 114 | uint8_t pad_0[48]; 115 | /***********************************************************/ 116 | /* Part 2: Only accessed by worker threads */ 117 | 118 | /* Current segment */ 119 | struct segment_header *cur; 120 | /* Head pointer in cleanup queue */ 121 | size_t cq_head; 122 | /* Clenanup counter, limits mandatory cleanup per request */ 123 | size_t cleanup_count; 124 | 125 | uint8_t pad_1[40]; 126 | /***********************************************************/ 127 | /* Part 3: Only accessed by maintenance threads */ 128 | 129 | /* Oldest segment */ 130 | struct segment_header *oldest; 131 | /* Tail pointer for cleanup queue */ 132 | size_t cq_tail; 133 | /* */ 134 | struct segment_header *cleaning; 135 | /* */ 136 | size_t clean_offset; 137 | }; 138 | 139 | 
_Static_assert(offsetof(struct item_allocator, cur) % 64 == 0, 140 | "Alignment in struct item_allocator broken 1"); 141 | _Static_assert(offsetof(struct item_allocator, oldest) % 64 == 0, 142 | "Alignment in struct item_allocator broken 2"); 143 | 144 | /** Initialize item allocation. Prepares memory regions etc. */ 145 | void ialloc_init(void); 146 | 147 | /** Initialize an item allocator instance. */ 148 | void ialloc_init_allocator(struct item_allocator *ia); 149 | 150 | /** 151 | * Allocate an item. 152 | * 153 | * Note this function has two modes: cleanup and non-cleanup. In cleanup mode, 154 | * the allocator will use the segment reserved for log cleanup if no other 155 | * allocation is possible, otherwise it will just return NULL and leave the 156 | * reserved segment untouched. 157 | * 158 | * @param ia Allocator instance 159 | * @param total Total number of bytes (includes item struct) 160 | * @param cleanup true if this allocation is for a cleanup operation 161 | * @return Allocated item or NULL. 162 | */ 163 | struct item *ialloc_alloc(struct item_allocator *ia, size_t total, 164 | bool cleanup); 165 | 166 | /** 167 | * Free an item. 168 | * @param it Item 169 | * @param total Total number of bytes (includes item struct) 170 | */ 171 | void ialloc_free(struct item *it, size_t total); 172 | void ialloc_free_dont_need(struct item *it, size_t total); 173 | 174 | /** 175 | * Get item from cleanup queue for this allocator. 176 | * @param ia Allocator instance 177 | * @param idle true if there are currently no pending requests, false otherwise 178 | * @return Item or NULL 179 | */ 180 | struct item *ialloc_cleanup_item(struct item_allocator *ia, bool idle); 181 | 182 | /** 183 | * Resets per-request cleanup counters. Should be called when a new request is 184 | * ready to be processed before calling ialloc_cleanup_item. 
185 | */ 186 | void ialloc_cleanup_nextrequest(struct item_allocator *ia); 187 | 188 | /** 189 | * Dispatch log cleanup operations for this instance, if required. To be called 190 | * from maintenance thread. 191 | */ 192 | void ialloc_maintenance(struct item_allocator *ia); 193 | 194 | 195 | /******************************************************************************/ 196 | /* Items */ 197 | 198 | /** 199 | * Item. 200 | * The item struct is immediately followed by first the key, and then the 201 | * associated value. 202 | */ 203 | struct item { 204 | /** Next item in the hash chain. */ 205 | struct item *next; 206 | /** Hash value for this item */ 207 | uint32_t hv; 208 | /** Length of value in bytes */ 209 | uint32_t vallen; 210 | /** Reference count */ 211 | volatile uint16_t refcount; 212 | /** Length of key in bytes */ 213 | uint16_t keylen; 214 | /** Flags (currently unused, but provides padding) */ 215 | uint32_t flags; 216 | }; 217 | 218 | /** Get pointer to the item's key */ 219 | static inline void *item_key(struct item *it) 220 | { 221 | return it + 1; 222 | } 223 | 224 | /** Get pointer to the item's value */ 225 | static inline void *item_value(struct item *it) 226 | { 227 | return (void *) ((uintptr_t) (it + 1) + it->keylen); 228 | } 229 | 230 | /** Total number of bytes for this item (includes item struct) */ 231 | static inline size_t item_totalsz(struct item *it) 232 | { 233 | return sizeof(*it) + it->vallen + it->keylen; 234 | } 235 | 236 | /** Increment item's refcount (original refcount must not be 0). */ 237 | static inline void item_ref(struct item *it) 238 | { 239 | uint16_t old; 240 | old = __sync_add_and_fetch(&it->refcount, 1); 241 | assert(old != 1); 242 | } 243 | 244 | /** 245 | * Increment item's refcount if it is not zero. 246 | * @return true if the refcount was increased, false otherwise. 
247 | */ 248 | static inline bool item_tryref(struct item *it) 249 | { 250 | uint16_t c; 251 | do { 252 | c = it->refcount; 253 | if (c == 0) { 254 | return false; 255 | } 256 | } while (!__sync_bool_compare_and_swap(&it->refcount, c, c + 1)); 257 | return true; 258 | } 259 | 260 | /** 261 | * Decrement item's refcount, and free item if refcount = 0. 262 | * The original refcount must be > 0. 263 | */ 264 | static inline void item_unref(struct item *it) 265 | { 266 | uint16_t c; 267 | assert(it->refcount > 0); 268 | if ((c = __sync_sub_and_fetch(&it->refcount, 1)) == 0) { 269 | ialloc_free(it, item_totalsz(it)); 270 | } 271 | } 272 | 273 | /** Wrapper for transport code */ 274 | static inline void myt_item_release(void *it) 275 | { 276 | item_unref(it); 277 | } 278 | 279 | 280 | 281 | 282 | uint32_t jenkins_hash(const void *key, size_t length); 283 | 284 | #endif // ndef IOKVS_H_ 285 | -------------------------------------------------------------------------------- /apps/flexkvs/server/jenkins_hash.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 | /* 3 | * Hash table 4 | * 5 | * The hash function used here is by Bob Jenkins, 1996: 6 | * 7 | * "By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. 8 | * You may use this code any way you wish, private, educational, 9 | * or commercial. It's free." 10 | * 11 | */ 12 | #include "iokvs.h" 13 | 14 | #define ENDIAN_LITTLE 1 15 | 16 | /* 17 | * Since the hash function does bit manipulation, it needs to know 18 | * whether it's big or little-endian. ENDIAN_LITTLE and ENDIAN_BIG 19 | * are set in the configure script. 
20 | */ 21 | #if ENDIAN_BIG == 1 22 | # define HASH_LITTLE_ENDIAN 0 23 | # define HASH_BIG_ENDIAN 1 24 | #else 25 | # if ENDIAN_LITTLE == 1 26 | # define HASH_LITTLE_ENDIAN 1 27 | # define HASH_BIG_ENDIAN 0 28 | # else 29 | # define HASH_LITTLE_ENDIAN 0 30 | # define HASH_BIG_ENDIAN 0 31 | # endif 32 | #endif 33 | 34 | #define rot(x,k) (((x)<<(k)) ^ ((x)>>(32-(k)))) 35 | 36 | /* 37 | ------------------------------------------------------------------------------- 38 | mix -- mix 3 32-bit values reversibly. 39 | 40 | This is reversible, so any information in (a,b,c) before mix() is 41 | still in (a,b,c) after mix(). 42 | 43 | If four pairs of (a,b,c) inputs are run through mix(), or through 44 | mix() in reverse, there are at least 32 bits of the output that 45 | are sometimes the same for one pair and different for another pair. 46 | This was tested for: 47 | * pairs that differed by one bit, by two bits, in any combination 48 | of top bits of (a,b,c), or in any combination of bottom bits of 49 | (a,b,c). 50 | * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed 51 | the output delta to a Gray code (a^(a>>1)) so a string of 1's (as 52 | is commonly produced by subtraction) look like a single 1-bit 53 | difference. 54 | * the base values were pseudorandom, all zero but one bit set, or 55 | all zero plus a counter that starts at zero. 56 | 57 | Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that 58 | satisfy this are 59 | 4 6 8 16 19 4 60 | 9 15 3 18 27 15 61 | 14 9 3 7 17 3 62 | Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing 63 | for "differ" defined as + with a one-bit base and a two-bit delta. I 64 | used http://burtleburtle.net/bob/hash/avalanche.html to choose 65 | the operations, constants, and arrangements of the variables. 66 | 67 | This does not achieve avalanche. There are input bits of (a,b,c) 68 | that fail to affect some output bits of (a,b,c), especially of a. 
The 69 | most thoroughly mixed value is c, but it doesn't really even achieve 70 | avalanche in c. 71 | 72 | This allows some parallelism. Read-after-writes are good at doubling 73 | the number of bits affected, so the goal of mixing pulls in the opposite 74 | direction as the goal of parallelism. I did what I could. Rotates 75 | seem to cost as much as shifts on every machine I could lay my hands 76 | on, and rotates are much kinder to the top and bottom bits, so I used 77 | rotates. 78 | ------------------------------------------------------------------------------- 79 | */ 80 | #define mix(a,b,c) \ 81 | { \ 82 | a -= c; a ^= rot(c, 4); c += b; \ 83 | b -= a; b ^= rot(a, 6); a += c; \ 84 | c -= b; c ^= rot(b, 8); b += a; \ 85 | a -= c; a ^= rot(c,16); c += b; \ 86 | b -= a; b ^= rot(a,19); a += c; \ 87 | c -= b; c ^= rot(b, 4); b += a; \ 88 | } 89 | 90 | /* 91 | ------------------------------------------------------------------------------- 92 | final -- final mixing of 3 32-bit values (a,b,c) into c 93 | 94 | Pairs of (a,b,c) values differing in only a few bits will usually 95 | produce values of c that look totally different. This was tested for 96 | * pairs that differed by one bit, by two bits, in any combination 97 | of top bits of (a,b,c), or in any combination of bottom bits of 98 | (a,b,c). 99 | * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed 100 | the output delta to a Gray code (a^(a>>1)) so a string of 1's (as 101 | is commonly produced by subtraction) look like a single 1-bit 102 | difference. 103 | * the base values were pseudorandom, all zero but one bit set, or 104 | all zero plus a counter that starts at zero. 
105 | 106 | These constants passed: 107 | 14 11 25 16 4 14 24 108 | 12 14 25 16 4 14 24 109 | and these came close: 110 | 4 8 15 26 3 22 24 111 | 10 8 15 26 3 22 24 112 | 11 8 15 26 3 22 24 113 | ------------------------------------------------------------------------------- 114 | */ 115 | #define final(a,b,c) \ 116 | { \ 117 | c ^= b; c -= rot(b,14); \ 118 | a ^= c; a -= rot(c,11); \ 119 | b ^= a; b -= rot(a,25); \ 120 | c ^= b; c -= rot(b,16); \ 121 | a ^= c; a -= rot(c,4); \ 122 | b ^= a; b -= rot(a,14); \ 123 | c ^= b; c -= rot(b,24); \ 124 | } 125 | 126 | #if HASH_LITTLE_ENDIAN == 1 127 | uint32_t jenkins_hash( 128 | const void *key, /* the key to hash */ 129 | size_t length) /* length of the key */ 130 | { 131 | uint32_t a,b,c; /* internal state */ 132 | union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */ 133 | 134 | /* Set up the internal state */ 135 | a = b = c = 0xdeadbeef + ((uint32_t)length) + 0; 136 | 137 | u.ptr = key; 138 | if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) { 139 | const uint32_t *k = key; /* read 32-bit chunks */ 140 | #ifdef VALGRIND 141 | const uint8_t *k8; 142 | #endif /* ifdef VALGRIND */ 143 | 144 | /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ 145 | while (length > 12) 146 | { 147 | a += k[0]; 148 | b += k[1]; 149 | c += k[2]; 150 | mix(a,b,c); 151 | length -= 12; 152 | k += 3; 153 | } 154 | 155 | /*----------------------------- handle the last (probably partial) block */ 156 | /* 157 | * "k[2]&0xffffff" actually reads beyond the end of the string, but 158 | * then masks off the part it's not allowed to read. Because the 159 | * string is aligned, the masked-off tail is in the same word as the 160 | * rest of the string. Every machine with memory protection I've seen 161 | * does it on word boundaries, so is OK with this. But VALGRIND will 162 | * still catch it and complain. The masking trick does make the hash 163 | * noticeably faster for short strings (like English words). 
164 | */ 165 | #ifndef VALGRIND 166 | 167 | switch(length) 168 | { 169 | case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; 170 | case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; 171 | case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; 172 | case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; 173 | case 8 : b+=k[1]; a+=k[0]; break; 174 | case 7 : b+=k[1]&0xffffff; a+=k[0]; break; 175 | case 6 : b+=k[1]&0xffff; a+=k[0]; break; 176 | case 5 : b+=k[1]&0xff; a+=k[0]; break; 177 | case 4 : a+=k[0]; break; 178 | case 3 : a+=k[0]&0xffffff; break; 179 | case 2 : a+=k[0]&0xffff; break; 180 | case 1 : a+=k[0]&0xff; break; 181 | case 0 : return c; /* zero length strings require no mixing */ 182 | } 183 | 184 | #else /* make valgrind happy */ 185 | 186 | k8 = (const uint8_t *)k; 187 | switch(length) 188 | { 189 | case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; 190 | case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ 191 | case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ 192 | case 9 : c+=k8[8]; /* fall through */ 193 | case 8 : b+=k[1]; a+=k[0]; break; 194 | case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ 195 | case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ 196 | case 5 : b+=k8[4]; /* fall through */ 197 | case 4 : a+=k[0]; break; 198 | case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ 199 | case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ 200 | case 1 : a+=k8[0]; break; 201 | case 0 : return c; /* zero length strings require no mixing */ 202 | } 203 | 204 | #endif /* !valgrind */ 205 | 206 | } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) { 207 | const uint16_t *k = key; /* read 16-bit chunks */ 208 | const uint8_t *k8; 209 | 210 | /*--------------- all but last block: aligned reads and different mixing */ 211 | while (length > 12) 212 | { 213 | a += k[0] + (((uint32_t)k[1])<<16); 214 | b += k[2] + (((uint32_t)k[3])<<16); 215 | c += k[4] + (((uint32_t)k[5])<<16); 216 | mix(a,b,c); 217 | length -= 12; 218 | k += 6; 219 | } 220 | 221 | 
/*----------------------------- handle the last (probably partial) block */ 222 | k8 = (const uint8_t *)k; 223 | switch(length) 224 | { 225 | case 12: c+=k[4]+(((uint32_t)k[5])<<16); 226 | b+=k[2]+(((uint32_t)k[3])<<16); 227 | a+=k[0]+(((uint32_t)k[1])<<16); 228 | break; 229 | case 11: c+=((uint32_t)k8[10])<<16; /* @fallthrough */ 230 | case 10: c+=k[4]; /* @fallthrough@ */ 231 | b+=k[2]+(((uint32_t)k[3])<<16); 232 | a+=k[0]+(((uint32_t)k[1])<<16); 233 | break; 234 | case 9 : c+=k8[8]; /* @fallthrough */ 235 | case 8 : b+=k[2]+(((uint32_t)k[3])<<16); 236 | a+=k[0]+(((uint32_t)k[1])<<16); 237 | break; 238 | case 7 : b+=((uint32_t)k8[6])<<16; /* @fallthrough */ 239 | case 6 : b+=k[2]; 240 | a+=k[0]+(((uint32_t)k[1])<<16); 241 | break; 242 | case 5 : b+=k8[4]; /* @fallthrough */ 243 | case 4 : a+=k[0]+(((uint32_t)k[1])<<16); 244 | break; 245 | case 3 : a+=((uint32_t)k8[2])<<16; /* @fallthrough */ 246 | case 2 : a+=k[0]; 247 | break; 248 | case 1 : a+=k8[0]; 249 | break; 250 | case 0 : return c; /* zero length strings require no mixing */ 251 | } 252 | 253 | } else { /* need to read the key one byte at a time */ 254 | const uint8_t *k = key; 255 | 256 | /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ 257 | while (length > 12) 258 | { 259 | a += k[0]; 260 | a += ((uint32_t)k[1])<<8; 261 | a += ((uint32_t)k[2])<<16; 262 | a += ((uint32_t)k[3])<<24; 263 | b += k[4]; 264 | b += ((uint32_t)k[5])<<8; 265 | b += ((uint32_t)k[6])<<16; 266 | b += ((uint32_t)k[7])<<24; 267 | c += k[8]; 268 | c += ((uint32_t)k[9])<<8; 269 | c += ((uint32_t)k[10])<<16; 270 | c += ((uint32_t)k[11])<<24; 271 | mix(a,b,c); 272 | length -= 12; 273 | k += 12; 274 | } 275 | 276 | /*-------------------------------- last block: affect all 32 bits of (c) */ 277 | switch(length) /* all the case statements fall through */ 278 | { 279 | case 12: c+=((uint32_t)k[11])<<24; 280 | case 11: c+=((uint32_t)k[10])<<16; 281 | case 10: c+=((uint32_t)k[9])<<8; 282 | case 9 : c+=k[8]; 283 | 
case 8 : b+=((uint32_t)k[7])<<24; 284 | case 7 : b+=((uint32_t)k[6])<<16; 285 | case 6 : b+=((uint32_t)k[5])<<8; 286 | case 5 : b+=k[4]; 287 | case 4 : a+=((uint32_t)k[3])<<24; 288 | case 3 : a+=((uint32_t)k[2])<<16; 289 | case 2 : a+=((uint32_t)k[1])<<8; 290 | case 1 : a+=k[0]; 291 | break; 292 | case 0 : return c; /* zero length strings require no mixing */ 293 | } 294 | } 295 | 296 | final(a,b,c); 297 | return c; /* zero length strings require no mixing */ 298 | } 299 | 300 | #elif HASH_BIG_ENDIAN == 1 301 | /* 302 | * hashbig(): 303 | * This is the same as hashword() on big-endian machines. It is different 304 | * from hashlittle() on all machines. hashbig() takes advantage of 305 | * big-endian byte ordering. 306 | */ 307 | uint32_t jenkins_hash( const void *key, size_t length) 308 | { 309 | uint32_t a,b,c; 310 | union { const void *ptr; size_t i; } u; /* to cast key to (size_t) happily */ 311 | 312 | /* Set up the internal state */ 313 | a = b = c = 0xdeadbeef + ((uint32_t)length) + 0; 314 | 315 | u.ptr = key; 316 | if (HASH_BIG_ENDIAN && ((u.i & 0x3) == 0)) { 317 | const uint32_t *k = key; /* read 32-bit chunks */ 318 | #ifdef VALGRIND 319 | const uint8_t *k8; 320 | #endif /* ifdef VALGRIND */ 321 | 322 | /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */ 323 | while (length > 12) 324 | { 325 | a += k[0]; 326 | b += k[1]; 327 | c += k[2]; 328 | mix(a,b,c); 329 | length -= 12; 330 | k += 3; 331 | } 332 | 333 | /*----------------------------- handle the last (probably partial) block */ 334 | /* 335 | * "k[2]<<8" actually reads beyond the end of the string, but 336 | * then shifts out the part it's not allowed to read. Because the 337 | * string is aligned, the illegal read is in the same word as the 338 | * rest of the string. Every machine with memory protection I've seen 339 | * does it on word boundaries, so is OK with this. But VALGRIND will 340 | * still catch it and complain. 
The masking trick does make the hash 341 | * noticeably faster for short strings (like English words). 342 | */ 343 | #ifndef VALGRIND 344 | 345 | switch(length) 346 | { 347 | case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; 348 | case 11: c+=k[2]&0xffffff00; b+=k[1]; a+=k[0]; break; 349 | case 10: c+=k[2]&0xffff0000; b+=k[1]; a+=k[0]; break; 350 | case 9 : c+=k[2]&0xff000000; b+=k[1]; a+=k[0]; break; 351 | case 8 : b+=k[1]; a+=k[0]; break; 352 | case 7 : b+=k[1]&0xffffff00; a+=k[0]; break; 353 | case 6 : b+=k[1]&0xffff0000; a+=k[0]; break; 354 | case 5 : b+=k[1]&0xff000000; a+=k[0]; break; 355 | case 4 : a+=k[0]; break; 356 | case 3 : a+=k[0]&0xffffff00; break; 357 | case 2 : a+=k[0]&0xffff0000; break; 358 | case 1 : a+=k[0]&0xff000000; break; 359 | case 0 : return c; /* zero length strings require no mixing */ 360 | } 361 | 362 | #else /* make valgrind happy */ 363 | 364 | k8 = (const uint8_t *)k; 365 | switch(length) /* all the case statements fall through */ 366 | { 367 | case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; 368 | case 11: c+=((uint32_t)k8[10])<<8; /* fall through */ 369 | case 10: c+=((uint32_t)k8[9])<<16; /* fall through */ 370 | case 9 : c+=((uint32_t)k8[8])<<24; /* fall through */ 371 | case 8 : b+=k[1]; a+=k[0]; break; 372 | case 7 : b+=((uint32_t)k8[6])<<8; /* fall through */ 373 | case 6 : b+=((uint32_t)k8[5])<<16; /* fall through */ 374 | case 5 : b+=((uint32_t)k8[4])<<24; /* fall through */ 375 | case 4 : a+=k[0]; break; 376 | case 3 : a+=((uint32_t)k8[2])<<8; /* fall through */ 377 | case 2 : a+=((uint32_t)k8[1])<<16; /* fall through */ 378 | case 1 : a+=((uint32_t)k8[0])<<24; break; 379 | case 0 : return c; 380 | } 381 | 382 | #endif /* !VALGRIND */ 383 | 384 | } else { /* need to read the key one byte at a time */ 385 | const uint8_t *k = key; 386 | 387 | /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ 388 | while (length > 12) 389 | { 390 | a += ((uint32_t)k[0])<<24; 391 | a += ((uint32_t)k[1])<<16; 392 | a += 
((uint32_t)k[2])<<8; 393 | a += ((uint32_t)k[3]); 394 | b += ((uint32_t)k[4])<<24; 395 | b += ((uint32_t)k[5])<<16; 396 | b += ((uint32_t)k[6])<<8; 397 | b += ((uint32_t)k[7]); 398 | c += ((uint32_t)k[8])<<24; 399 | c += ((uint32_t)k[9])<<16; 400 | c += ((uint32_t)k[10])<<8; 401 | c += ((uint32_t)k[11]); 402 | mix(a,b,c); 403 | length -= 12; 404 | k += 12; 405 | } 406 | 407 | /*-------------------------------- last block: affect all 32 bits of (c) */ 408 | switch(length) /* all the case statements fall through */ 409 | { 410 | case 12: c+=k[11]; 411 | case 11: c+=((uint32_t)k[10])<<8; 412 | case 10: c+=((uint32_t)k[9])<<16; 413 | case 9 : c+=((uint32_t)k[8])<<24; 414 | case 8 : b+=k[7]; 415 | case 7 : b+=((uint32_t)k[6])<<8; 416 | case 6 : b+=((uint32_t)k[5])<<16; 417 | case 5 : b+=((uint32_t)k[4])<<24; 418 | case 4 : a+=k[3]; 419 | case 3 : a+=((uint32_t)k[2])<<8; 420 | case 2 : a+=((uint32_t)k[1])<<16; 421 | case 1 : a+=((uint32_t)k[0])<<24; 422 | break; 423 | case 0 : return c; 424 | } 425 | } 426 | 427 | final(a,b,c); 428 | return c; 429 | } 430 | #else /* HASH_XXX_ENDIAN == 1 */ 431 | #error Must define HASH_BIG_ENDIAN or HASH_LITTLE_ENDIAN 432 | #endif /* HASH_XXX_ENDIAN == 1 */ 433 | -------------------------------------------------------------------------------- /apps/flexkvs/server/settings.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 University of Washington, Max Planck Institute for 3 | * Software Systems, and The University of Texas at Austin 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining 6 | * a copy of this software and associated documentation files (the 7 | * "Software"), to deal in the Software without restriction, including 8 | * without limitation the rights to use, copy, modify, merge, publish, 9 | * distribute, sublicense, and/or sell copies of the Software, and to 10 | * permit persons to whom the Software is furnished to do so, subject to 11 
| * the following conditions: 12 | * 13 | * The above copyright notice and this permission notice shall be 14 | * included in all copies or substantial portions of the Software. 15 | * 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | */ 24 | 25 | #include 26 | #include 27 | 28 | #include "iokvs.h" 29 | 30 | struct settings settings; 31 | 32 | int settings_init(int argc, char *argv[]) 33 | { 34 | settings.udpport = 11211; 35 | settings.verbose = 1; 36 | //settings.segsize = 256 * 1024; 37 | //settings.segsize = 128 * 65536; 38 | settings.segsize = 1024 * 1024 * 1024; 39 | //settings.segmaxnum = 4096; 40 | //settings.segmaxnum = 64 * 4096; 41 | settings.segmaxnum = 700; 42 | //settings.segcqsize = 32 * 1024; 43 | settings.segcqsize = 1500; 44 | settings.clean_ratio = 0.8; 45 | //settings.clean_ratio = 1.1; 46 | 47 | if (argc != 3) { 48 | fprintf(stderr, "Usage: flexkvs CONFIG THREADS\n"); 49 | return -1; 50 | } 51 | 52 | settings.numcores = atoi(argv[2]); 53 | settings.config_file = argv[1]; 54 | return 0; 55 | } 56 | -------------------------------------------------------------------------------- /apps/flexkvs/unlink_socks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | for i in {0..15}; do unlink kvs_sock$i; done 4 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | 4 | SCRIPTDIR="$(dirname 
"$(readlink -f "$0")")" 5 | PREFIX="$HOME"/.local 6 | 7 | export PKG_CONFIG_PATH="$PREFIX"/lib64/pkgconfig C_INCLUDE_PATH="$PREFIX"/include LIBRARY_PATH="$PREFIX"/lib64 8 | 9 | # capstone 10 | cd "$SCRIPTDIR"/capstone 11 | # rm -rf build 12 | cmake -S . -B build -DCMAKE_INSTALL_PREFIX="$PREFIX" -DCMAKE_BUILD_TYPE=Release && cmake --build build --config Release --parallel --target=install 13 | 14 | # syscall_intercept 15 | cd "$SCRIPTDIR"/syscall_intercept 16 | # rm -rf build 17 | cmake -S . -B build -DCMAKE_INSTALL_PREFIX="$PREFIX" -DCMAKE_BUILD_TYPE=Release && cmake --build build --config Release --parallel --target=install 18 | 19 | # hoard 20 | cd "$SCRIPTDIR"/Hoard/src 21 | make clean && make -j 22 | cp libhoard.so "$PREFIX"/lib64 23 | 24 | # hemem 25 | cd "$SCRIPTDIR"/src 26 | make clean && make -j 27 | cp libhemem.so "$PREFIX"/lib64 28 | 29 | -------------------------------------------------------------------------------- /microbenchmarks/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -g -Wall -O3 3 | #CFLAGS = -g3 -Wall -O0 4 | INCLUDES = -I../linux/usr/include 5 | LIBS = -lm -lpthread 6 | 7 | default: gups-pebs gups-random gups-hotset-move 8 | 9 | all: gups-pebs gups-random gups-simple gups-lru gups-lru-swap gups-hotset-move #gups 10 | 11 | gups-random: gups-random.o 12 | $(CC) $(CFLAGS) $(INCLUDES) -o gups-random gups-random.o zipf.o $(LIBS) -L../src/ -lhemem 13 | 14 | gups-small: gups-small.o 15 | $(CC) $(CFLAGS) $(INCLUDES) -o gups-small gups-small.o ../timer.o $(LIBS) 16 | 17 | gups: gups.o 18 | $(CC) $(CFLAGS) $(INCLUDES) -o gups gups.o zipf.o ../timer.o $(LIBS) 19 | 20 | gups-pebs: gups.o 21 | $(CC) $(CFLAGS) $(INCLUDES) -o gups-pebs gups.o zipf.o $(LIBS) -L../src/ -lhemem 22 | 23 | gups-hotset-move: gups-hotset-move.o 24 | $(CC) $(CFLAGS) $(INCLUDES) -o gups-hotset-move gups-hotset-move.o zipf.o $(LIBS) -L../src/ -lhemem 25 | 26 | gups-simple: gups.o 27 | $(CC) $(CFLAGS) 
$(INCLUDES) -o gups-simple gups.o zipf.o $(LIBS) -L../src/ -lhemem-simple 28 | 29 | gups-lru: gups.o 30 | $(CC) $(CFLAGS) $(INCLUDES) -o gups-lru gups.o zipf.o $(LIBS) -L../src/ -lhemem-lru 31 | 32 | gups-lru-swap: gups.o 33 | $(CC) $(CFLAGS) $(INCLUDES) -o gups-lru-swap gups.o zipf.o $(LIBS) -L../src/ -lhemem-lru-swap 34 | 35 | gups-random.o: gups-random.c zipf.c gups.h 36 | $(CC) $(CFLAGS) $(INCLUDES) -c gups-random.c zipf.c 37 | 38 | gups.o: gups.c zipf.c gups.h 39 | $(CC) $(CFLAGS) $(INCLUDES) -c gups.c zipf.c 40 | 41 | gups-hotset-move.o: gups-hotset-move.c zipf.c gups.h 42 | $(CC) $(CFLAGS) $(INCLUDES) -c gups-hotset-move.c zipf.c 43 | 44 | clean: 45 | $(RM) *.o gups gups-hotset-move gups-lru-swap gups-lru gups-simple gups-random gups-pebs 46 | -------------------------------------------------------------------------------- /microbenchmarks/gups-hotset-move.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: gups.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 02/21/2018 02:36:27 PM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #define _GNU_SOURCE 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "../src/timer.h" 38 | #include "../src/hemem.h" 39 | 40 | 41 | #include "gups.h" 42 | 43 | #define MAX_THREADS 64 44 | 45 | #define GUPS_PAGE_SIZE (4 * 1024) 46 | #define PAGE_NUM 3 47 | #define PAGES 2048 48 | 49 | #ifdef HOTSPOT 50 | extern uint64_t hotset_start; 51 | extern double hotset_fraction; 52 | #endif 53 | 54 | int threads; 55 | 56 | bool 
move_hotset1 = false; 57 | 58 | uint64_t hot_start = 0; 59 | uint64_t hotsize = 0; 60 | 61 | struct gups_args { 62 | int tid; // thread id 63 | uint64_t *indices; // array of indices to access 64 | void* field; // pointer to start of thread's region 65 | uint64_t iters; // iterations to perform 66 | uint64_t size; // size of region 67 | uint64_t elt_size; // size of elements 68 | uint64_t hot_start; // start of hot set 69 | uint64_t hotsize; // size of hot set 70 | }; 71 | 72 | 73 | static inline uint64_t rdtscp(void) 74 | { 75 | uint32_t eax, edx; 76 | // why is "ecx" in clobber list here, anyway? -SG&MH,2017-10-05 77 | __asm volatile ("rdtscp" : "=a" (eax), "=d" (edx) :: "ecx", "memory"); 78 | return ((uint64_t)edx << 32) | eax; 79 | } 80 | 81 | uint64_t thread_gups[MAX_THREADS]; 82 | 83 | static unsigned long updates, nelems; 84 | 85 | bool stop = false; 86 | 87 | static void *timing_thread() 88 | { 89 | uint64_t tic = -1; 90 | bool printed1 = false; 91 | for (;;) { 92 | tic++; 93 | if (tic >= 150 && tic < 300) { 94 | if (!printed1) { 95 | move_hotset1 = true; 96 | fprintf(stderr, "moved hotset1\n"); 97 | printed1 = true; 98 | } 99 | } 100 | if (tic >= 250) { 101 | stop = true; 102 | } 103 | sleep(1); 104 | } 105 | return 0; 106 | } 107 | 108 | uint64_t tot_updates = 0; 109 | 110 | static void *print_instantaneous_gups() 111 | { 112 | FILE *tot; 113 | uint64_t tot_gups, tot_last_second_gups = 0; 114 | 115 | 116 | tot = fopen("tot_gups.txt", "w"); 117 | if (tot == NULL) { 118 | perror("fopen"); 119 | } 120 | 121 | for (;;) { 122 | tot_gups = 0; 123 | for (int i = 0; i < threads; i++) { 124 | tot_gups += thread_gups[i]; 125 | } 126 | fprintf(tot, "%.10f\n", (1.0 * (abs(tot_gups - tot_last_second_gups))) / (1.0e9)); 127 | tot_updates += abs(tot_gups - tot_last_second_gups); 128 | tot_last_second_gups = tot_gups; 129 | sleep(1); 130 | } 131 | 132 | return NULL; 133 | } 134 | 135 | 136 | static uint64_t lfsr_fast(uint64_t lfsr) 137 | { 138 | lfsr ^= lfsr >> 7; 139 | 
lfsr ^= lfsr << 9; 140 | lfsr ^= lfsr >> 13; 141 | return lfsr; 142 | } 143 | 144 | char *filename = "indices1.txt"; 145 | 146 | FILE *hotsetfile = NULL; 147 | 148 | static void *do_gups(void *arguments) 149 | { 150 | //printf("do_gups entered\n"); 151 | struct gups_args *args = (struct gups_args*)arguments; 152 | uint64_t *field = (uint64_t*)(args->field); 153 | uint64_t i; 154 | uint64_t index1, index2; 155 | uint64_t elt_size = args->elt_size; 156 | char data[elt_size]; 157 | uint64_t lfsr; 158 | uint64_t hot_num; 159 | uint64_t tmp; 160 | uint64_t start, end; 161 | uint64_t before_accesses = 0; 162 | 163 | srand(args->tid); 164 | lfsr = rand(); 165 | 166 | index1 = 0; 167 | index2 = 0; 168 | 169 | fprintf(hotsetfile, "Thread %d region: %p - %p\thot set: %p - %p\n", args->tid, field, field + (args->size * elt_size), field + args->hot_start, field + args->hot_start + (args->hotsize * elt_size)); 170 | 171 | for (i = 0; i < args->iters; i++) { 172 | hot_num = lfsr_fast(lfsr) % 100; 173 | if (hot_num < 90) { 174 | lfsr = lfsr_fast(lfsr); 175 | index1 = args->hot_start + (lfsr % args->hotsize); 176 | if (move_hotset1) { 177 | if ((index1 < (args->hotsize / 4))) { 178 | index1 += args->hotsize; 179 | } 180 | } 181 | else { 182 | if ((index1 < (args->hotsize / 4))) { 183 | before_accesses++; 184 | } 185 | } 186 | start = rdtscp(); 187 | if (elt_size == 8) { 188 | uint64_t tmp = field[index1]; 189 | tmp = tmp + i; 190 | field[index1] = tmp; 191 | } 192 | else { 193 | memcpy(data, &field[index1 * elt_size], elt_size); 194 | memset(data, data[0] + i, elt_size); 195 | memcpy(&field[index1 * elt_size], data, elt_size); 196 | } 197 | end = rdtscp(); 198 | } 199 | else { 200 | lfsr = lfsr_fast(lfsr); 201 | index2 = lfsr % (args->size); 202 | start = rdtscp(); 203 | if (elt_size == 8) { 204 | uint64_t tmp = field[index2]; 205 | tmp = tmp + i; 206 | field[index2] = tmp; 207 | } 208 | else { 209 | memcpy(data, &field[index2 * elt_size], elt_size); 210 | memset(data, data[0] + 
i, elt_size); 211 | memcpy(&field[index2 * elt_size], data, elt_size); 212 | } 213 | end = rdtscp(); 214 | } 215 | 216 | if (i % 10000 == 0) { 217 | thread_gups[args->tid] += 10000; 218 | } 219 | 220 | if (stop) { 221 | break; 222 | } 223 | } 224 | 225 | fprintf(stderr, "before_accesses: %lu\n", before_accesses); 226 | 227 | //fclose(timefile); 228 | return 0; 229 | } 230 | 231 | int main(int argc, char **argv) 232 | { 233 | unsigned long expt; 234 | unsigned long size, elt_size; 235 | unsigned long tot_hot_size; 236 | int log_hot_size; 237 | struct timeval starttime, stoptime; 238 | double secs, gups; 239 | int i; 240 | void *p; 241 | struct gups_args** ga; 242 | pthread_t t[MAX_THREADS]; 243 | 244 | if (argc != 6) { 245 | fprintf(stderr, "Usage: %s [threads] [updates per thread] [exponent] [data size (bytes)] [noremap/remap]\n", argv[0]); 246 | fprintf(stderr, " threads\t\t\tnumber of threads to launch\n"); 247 | fprintf(stderr, " updates per thread\t\tnumber of updates per thread\n"); 248 | fprintf(stderr, " exponent\t\t\tlog size of region\n"); 249 | fprintf(stderr, " data size\t\t\tsize of data in array (in bytes)\n"); 250 | fprintf(stderr, " hot size\t\t\tlog size of hot set\n"); 251 | return 0; 252 | } 253 | 254 | gettimeofday(&starttime, NULL); 255 | 256 | threads = atoi(argv[1]); 257 | assert(threads <= MAX_THREADS); 258 | ga = (struct gups_args**)malloc(threads * sizeof(struct gups_args*)); 259 | 260 | updates = atol(argv[2]); 261 | updates -= updates % 256; 262 | expt = atoi(argv[3]); 263 | assert(expt > 8); 264 | assert(updates > 0 && (updates % 256 == 0)); 265 | size = (unsigned long)(1) << expt; 266 | size -= (size % 256); 267 | assert(size > 0 && (size % 256 == 0)); 268 | elt_size = atoi(argv[4]); 269 | log_hot_size = atof(argv[5]); 270 | tot_hot_size = (unsigned long)(1) << log_hot_size; 271 | 272 | fprintf(stderr, "%lu updates per thread (%d threads)\n", updates, threads); 273 | fprintf(stderr, "field of 2^%lu (%lu) bytes\n", expt, size); 274 | 
fprintf(stderr, "%ld byte element size (%ld elements total)\n", elt_size, size / elt_size); 275 | 276 | p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); 277 | if (p == MAP_FAILED) { 278 | perror("mmap"); 279 | assert(0); 280 | } 281 | 282 | gettimeofday(&stoptime, NULL); 283 | fprintf(stderr, "Init took %.4f seconds\n", elapsed(&starttime, &stoptime)); 284 | fprintf(stderr, "Region address: %p - %p\t size: %ld\n", p, (p + size), size); 285 | 286 | nelems = (size / threads) / elt_size; // number of elements per thread 287 | fprintf(stderr, "Elements per thread: %lu\n", nelems); 288 | 289 | memset(thread_gups, 0, sizeof(thread_gups)); 290 | 291 | hotsetfile = fopen("hotsets.txt", "w"); 292 | if (hotsetfile == NULL) { 293 | perror("fopen"); 294 | assert(0); 295 | } 296 | 297 | gettimeofday(&stoptime, NULL); 298 | secs = elapsed(&starttime, &stoptime); 299 | fprintf(stderr, "Initialization time: %.4f seconds.\n", secs); 300 | 301 | //hemem_start_timing(); 302 | 303 | hot_start = 0; 304 | hotsize = (tot_hot_size / threads) / elt_size; 305 | //printf("hot_start: %p\thot_end: %p\thot_size: %lu\n", p + hot_start, p + hot_start + (hotsize * elt_size), hotsize); 306 | 307 | gettimeofday(&starttime, NULL); 308 | for (i = 0; i < threads; i++) { 309 | //printf("starting thread [%d]\n", i); 310 | ga[i] = (struct gups_args*)malloc(sizeof(struct gups_args)); 311 | ga[i]->tid = i; 312 | ga[i]->field = p + (i * nelems * elt_size); 313 | ga[i]->iters = updates; 314 | ga[i]->size = nelems; 315 | ga[i]->elt_size = elt_size; 316 | ga[i]->hot_start = 0; // hot set at start of thread's region 317 | ga[i]->hotsize = hotsize; 318 | } 319 | 320 | // run through gups once to touch all memory 321 | // spawn gups worker threads 322 | for (i = 0; i < threads; i++) { 323 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 324 | assert(r == 0); 325 | } 326 | 327 | // wait for worker threads 328 | for (i = 0; i < threads; i++) { 
329 | int r = pthread_join(t[i], NULL); 330 | assert(r == 0); 331 | } 332 | //hemem_print_stats(); 333 | 334 | gettimeofday(&stoptime, NULL); 335 | 336 | secs = elapsed(&starttime, &stoptime); 337 | //printf("Elapsed time: %.4f seconds.\n", secs); 338 | gups = threads * ((double)updates) / (secs * 1.0e9); 339 | //printf("GUPS = %.10f\n", gups); 340 | memset(thread_gups, 0, sizeof(thread_gups)); 341 | 342 | filename = "indices2.txt"; 343 | 344 | pthread_t print_thread; 345 | int pt = pthread_create(&print_thread, NULL, print_instantaneous_gups, NULL); 346 | assert(pt == 0); 347 | 348 | 349 | pthread_t timer_thread; 350 | int tt = pthread_create(&timer_thread, NULL, timing_thread, NULL); 351 | assert (tt == 0); 352 | 353 | fprintf(stderr, "Timing.\n"); 354 | gettimeofday(&starttime, NULL); 355 | 356 | //hemem_clear_stats(); 357 | // spawn gups worker threads 358 | for (i = 0; i < threads; i++) { 359 | ga[i]->iters = updates * 2; 360 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 361 | assert(r == 0); 362 | } 363 | 364 | // wait for worker threads 365 | for (i = 0; i < threads; i++) { 366 | int r = pthread_join(t[i], NULL); 367 | assert(r == 0); 368 | } 369 | gettimeofday(&stoptime, NULL); 370 | //hemem_print_stats(); 371 | //hemem_clear_stats(); 372 | 373 | secs = elapsed(&starttime, &stoptime); 374 | printf("Elapsed time: %.4f seconds.\n", secs); 375 | gups = ((double)tot_updates) / (secs * 1.0e9); 376 | printf("GUPS = %.10f\n", gups); 377 | 378 | memset(thread_gups, 0, sizeof(thread_gups)); 379 | 380 | #if 0 381 | #ifdef HOTSPOT 382 | filename = "indices3.txt"; 383 | move_hotset = true; 384 | 385 | printf("Timing.\n"); 386 | gettimeofday(&starttime, NULL); 387 | 388 | // spawn gups worker threads 389 | for (i = 0; i < threads; i++) { 390 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 391 | assert(r == 0); 392 | } 393 | 394 | // wait for worker threads 395 | for (i = 0; i < threads; i++) { 396 | int r = pthread_join(t[i], NULL); 397 | 
assert(r == 0); 398 | } 399 | 400 | gettimeofday(&stoptime, NULL); 401 | 402 | secs = elapsed(&starttime, &stoptime); 403 | printf("Elapsed time: %.4f seconds.\n", secs); 404 | gups = threads * ((double)updates) / (secs * 1.0e9); 405 | printf("GUPS = %.10f\n", gups); 406 | 407 | //hemem_print_stats(); 408 | #endif 409 | #endif 410 | 411 | //hemem_stop_timing(); 412 | 413 | for (i = 0; i < threads; i++) { 414 | //free(ga[i]->indices); 415 | free(ga[i]); 416 | } 417 | free(ga); 418 | 419 | //getchar(); 420 | 421 | munmap(p, size); 422 | 423 | return 0; 424 | } 425 | 426 | -------------------------------------------------------------------------------- /microbenchmarks/gups-random.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: gups.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 02/21/2018 02:36:27 PM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #define _GNU_SOURCE 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "../src/timer.h" 38 | #include "../src/hemem.h" 39 | 40 | 41 | #include "gups.h" 42 | 43 | #define MAX_THREADS 64 44 | 45 | #define GUPS_PAGE_SIZE (4 * 1024) 46 | #define PAGE_NUM 3 47 | #define PAGES 2048 48 | 49 | #ifdef HOTSPOT 50 | extern uint64_t hotset_start; 51 | extern double hotset_fraction; 52 | #endif 53 | 54 | int threads; 55 | 56 | uint64_t hot_start = 0; 57 | uint64_t hotsize = 0; 58 | uint64_t hot_offset_page = 0; 59 | bool move_hotset = false; 60 | 61 | struct gups_args { 62 | int tid; // thread id 63 | uint64_t 
*indices; // array of indices to access 64 | void* field; // pointer to start of thread's region 65 | uint64_t iters; // iterations to perform 66 | uint64_t size; // size of region 67 | uint64_t elt_size; // size of elements 68 | uint64_t hot_start; // start of hot set 69 | uint64_t hotsize; // size of hot set 70 | }; 71 | 72 | 73 | static inline uint64_t rdtscp(void) 74 | { 75 | uint32_t eax, edx; 76 | // why is "ecx" in clobber list here, anyway? -SG&MH,2017-10-05 77 | __asm volatile ("rdtscp" : "=a" (eax), "=d" (edx) :: "ecx", "memory"); 78 | return ((uint64_t)edx << 32) | eax; 79 | } 80 | 81 | //uint64_t thread_gups[MAX_THREADS]; 82 | 83 | static unsigned long updates, nelems; 84 | 85 | static void *print_instantaneous_gups() 86 | { 87 | uint64_t last_second_gups[threads]; 88 | FILE *f[threads]; 89 | char fname[20]; 90 | 91 | for (int i = 0; i < threads; i++) { 92 | last_second_gups[i] = 0; 93 | snprintf(fname, 20, "gups_%d.txt", i); 94 | //printf("file name: %s\n", fname); 95 | f[i] = fopen(fname, "w"); 96 | if (f[i] == NULL) { 97 | perror("fopen"); 98 | assert(0); 99 | } 100 | } 101 | 102 | for (;;) { 103 | for (int i = 0; i < threads; i++) { 104 | //fprintf(f[i], "%.10f\n", (1.0 * (abs(thread_gups[i] - last_second_gups[i]))) / (1.0e9)); 105 | //last_second_gups[i] = thread_gups[i]; 106 | } 107 | sleep(1); 108 | //printf("GUPS: %.10f\n", (1.0 * (abs(thread_gups[0]- last_second_gups))) / (1.0e9)); 109 | //last_second_gups = thread_gups[0]; 110 | //sleep(1); 111 | } 112 | 113 | return NULL; 114 | } 115 | 116 | 117 | static uint64_t lfsr_fast(uint64_t lfsr) 118 | { 119 | lfsr ^= lfsr >> 7; 120 | lfsr ^= lfsr << 9; 121 | lfsr ^= lfsr >> 13; 122 | return lfsr; 123 | } 124 | 125 | char *filename = "indices1.txt"; 126 | 127 | FILE *hotsetfile = NULL; 128 | 129 | bool hotset_only = false; 130 | 131 | static void *prefill_hotset(void* arguments) 132 | { 133 | struct gups_args *args = (struct gups_args*)arguments; 134 | uint64_t *field = (uint64_t*)(args->field); 135 | 
uint64_t i; 136 | uint64_t index1; 137 | uint64_t elt_size = args->elt_size; 138 | char data[elt_size]; 139 | 140 | index1 = 0; 141 | 142 | for (i = 0; i < args->hotsize; i++) { 143 | index1 = i; 144 | if (elt_size == 8) { 145 | uint64_t tmp = field[index1]; 146 | tmp = tmp + i; 147 | field[index1] = tmp; 148 | } 149 | else { 150 | memcpy(data, &field[index1 * elt_size], elt_size); 151 | memset(data, data[0] + i, elt_size); 152 | memcpy(&field[index1 * elt_size], data, elt_size); 153 | } 154 | } 155 | return 0; 156 | 157 | } 158 | 159 | static void *do_gups(void *arguments) 160 | { 161 | //printf("do_gups entered\n"); 162 | struct gups_args *args = (struct gups_args*)arguments; 163 | uint64_t *field = (uint64_t*)(args->field); 164 | uint64_t i; 165 | uint64_t index1, index2; 166 | uint64_t elt_size = args->elt_size; 167 | char data[elt_size]; 168 | uint64_t lfsr; 169 | uint64_t hot_num; 170 | uint64_t offset; 171 | uint64_t start, end; 172 | 173 | srand(args->tid); 174 | lfsr = rand(); 175 | 176 | index1 = 0; 177 | index2 = 0; 178 | 179 | fprintf(hotsetfile, "Thread %d region: %p - %p\thot set: %p - %p\n", args->tid, field, field + (args->size * elt_size), field + args->hot_start, field + args->hot_start + (args->hotsize * elt_size)); 180 | 181 | for (i = 0; i < args->iters; i++) { 182 | hot_num = lfsr_fast(lfsr) % 100; 183 | lfsr = lfsr_fast(lfsr); 184 | index2 = lfsr % (args->size); 185 | start = rdtscp(); 186 | if (elt_size == 8) { 187 | uint64_t tmp = field[index2]; 188 | tmp = tmp + i; 189 | field[index2] = tmp; 190 | } 191 | else { 192 | memcpy(data, &field[index2 * elt_size], elt_size); 193 | memset(data, data[0] + i, elt_size); 194 | memcpy(&field[index2 * elt_size], data, elt_size); 195 | } 196 | end = rdtscp(); 197 | } 198 | 199 | return 0; 200 | } 201 | 202 | int main(int argc, char **argv) 203 | { 204 | unsigned long expt; 205 | unsigned long size, elt_size; 206 | unsigned long tot_hot_size; 207 | int log_hot_size; 208 | struct timeval starttime, 
stoptime; 209 | double secs, gups; 210 | int i; 211 | void *p; 212 | struct gups_args** ga; 213 | pthread_t t[MAX_THREADS]; 214 | 215 | if (argc != 6) { 216 | fprintf(stderr, "Usage: %s [threads] [updates per thread] [exponent] [data size (bytes)] [noremap/remap]\n", argv[0]); 217 | fprintf(stderr, " threads\t\t\tnumber of threads to launch\n"); 218 | fprintf(stderr, " updates per thread\t\tnumber of updates per thread\n"); 219 | fprintf(stderr, " exponent\t\t\tlog size of region\n"); 220 | fprintf(stderr, " data size\t\t\tsize of data in array (in bytes)\n"); 221 | fprintf(stderr, " hot size\t\t\tlog size of hot set\n"); 222 | return 0; 223 | } 224 | 225 | gettimeofday(&starttime, NULL); 226 | 227 | threads = atoi(argv[1]); 228 | assert(threads <= MAX_THREADS); 229 | ga = (struct gups_args**)malloc(threads * sizeof(struct gups_args*)); 230 | 231 | updates = atol(argv[2]); 232 | updates -= updates % 256; 233 | expt = atoi(argv[3]); 234 | assert(expt > 8); 235 | assert(updates > 0 && (updates % 256 == 0)); 236 | size = (unsigned long)(1) << expt; 237 | size -= (size % 256); 238 | assert(size > 0 && (size % 256 == 0)); 239 | elt_size = atoi(argv[4]); 240 | log_hot_size = atof(argv[5]); 241 | tot_hot_size = (unsigned long)(1) << log_hot_size; 242 | 243 | fprintf(stderr, "%lu updates per thread (%d threads)\n", updates, threads); 244 | fprintf(stderr, "field of 2^%lu (%lu) bytes\n", expt, size); 245 | fprintf(stderr, "%ld byte element size (%ld elements total)\n", elt_size, size / elt_size); 246 | 247 | p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); 248 | if (p == MAP_FAILED) { 249 | perror("mmap"); 250 | assert(0); 251 | } 252 | 253 | gettimeofday(&stoptime, NULL); 254 | fprintf(stderr, "Init took %.4f seconds\n", elapsed(&starttime, &stoptime)); 255 | fprintf(stderr, "Region address: %p - %p\t size: %ld\n", p, (p + size), size); 256 | 257 | nelems = (size / threads) / elt_size; // number of elements per 
thread 258 | fprintf(stderr, "Elements per thread: %lu\n", nelems); 259 | 260 | //memset(thread_gups, 0, sizeof(thread_gups)); 261 | 262 | hotsetfile = fopen("hotsets.txt", "w"); 263 | if (hotsetfile == NULL) { 264 | perror("fopen"); 265 | assert(0); 266 | } 267 | 268 | gettimeofday(&stoptime, NULL); 269 | secs = elapsed(&starttime, &stoptime); 270 | fprintf(stderr, "Initialization time: %.4f seconds.\n", secs); 271 | 272 | //hemem_start_timing(); 273 | 274 | //pthread_t print_thread; 275 | //int pt = pthread_create(&print_thread, NULL, print_instantaneous_gups, NULL); 276 | //assert(pt == 0); 277 | 278 | 279 | hot_start = 0; 280 | hotsize = (tot_hot_size / threads) / elt_size; 281 | //printf("hot_start: %p\thot_end: %p\thot_size: %lu\n", p + hot_start, p + hot_start + (hotsize * elt_size), hotsize); 282 | 283 | gettimeofday(&starttime, NULL); 284 | for (i = 0; i < threads; i++) { 285 | //printf("starting thread [%d]\n", i); 286 | ga[i] = (struct gups_args*)malloc(sizeof(struct gups_args)); 287 | ga[i]->tid = i; 288 | ga[i]->field = p + (i * nelems * elt_size); 289 | ga[i]->iters = updates; 290 | ga[i]->size = nelems; 291 | ga[i]->elt_size = elt_size; 292 | ga[i]->hot_start = 0; // hot set at start of thread's region 293 | ga[i]->hotsize = hotsize; 294 | } 295 | 296 | if (!hotset_only) { 297 | for (i = 0; i < threads; i++) { 298 | int r = pthread_create(&t[i], NULL, prefill_hotset, (void*)ga[i]); 299 | assert(r == 0); 300 | } 301 | // wait for worker threads 302 | for (i = 0; i < threads; i++) { 303 | int r = pthread_join(t[i], NULL); 304 | assert(r == 0); 305 | } 306 | } 307 | 308 | // run through gups once to touch all memory 309 | // spawn gups worker threads 310 | for (i = 0; i < threads; i++) { 311 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 312 | assert(r == 0); 313 | } 314 | 315 | // wait for worker threads 316 | for (i = 0; i < threads; i++) { 317 | int r = pthread_join(t[i], NULL); 318 | assert(r == 0); 319 | } 320 | 
//hemem_print_stats(); 321 | 322 | gettimeofday(&stoptime, NULL); 323 | 324 | secs = elapsed(&starttime, &stoptime); 325 | //printf("Elapsed time: %.4f seconds.\n", secs); 326 | gups = threads * ((double)updates) / (secs * 1.0e9); 327 | //printf("GUPS = %.10f\n", gups); 328 | //memset(thread_gups, 0, sizeof(thread_gups)); 329 | 330 | filename = "indices2.txt"; 331 | 332 | fprintf(stderr, "Timing.\n"); 333 | gettimeofday(&starttime, NULL); 334 | 335 | //hemem_clear_stats(); 336 | // spawn gups worker threads 337 | for (i = 0; i < threads; i++) { 338 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 339 | assert(r == 0); 340 | } 341 | 342 | // wait for worker threads 343 | for (i = 0; i < threads; i++) { 344 | int r = pthread_join(t[i], NULL); 345 | assert(r == 0); 346 | } 347 | gettimeofday(&stoptime, NULL); 348 | //hemem_print_stats(); 349 | //hemem_clear_stats(); 350 | 351 | secs = elapsed(&starttime, &stoptime); 352 | printf("Elapsed time: %.4f seconds.\n", secs); 353 | gups = threads * ((double)updates) / (secs * 1.0e9); 354 | printf("GUPS = %.10f\n", gups); 355 | 356 | //memset(thread_gups, 0, sizeof(thread_gups)); 357 | #if 0 358 | #ifdef HOTSPOT 359 | move_hotset = true; 360 | hot_offset_page = hotsize / GUPS_PAGE_SIZE; 361 | //hot_start = (16UL * 1024UL * 1024UL * 1024UL) / elt_size; // 16GB to the right; 362 | printf("hot_start: %lu\thot_size: %lu\n", hot_start, hotsize); 363 | printf("hot_offset_page: %lu\n", hot_offset_page); 364 | 365 | filename = "indices3.txt"; 366 | 367 | printf("Timing.\n"); 368 | gettimeofday(&starttime, NULL); 369 | 370 | // spawn gups worker threads 371 | for (i = 0; i < threads; i++) { 372 | ga[i]->tid = i; 373 | ga[i]->iters = updates; 374 | ga[i]->size = nelems; 375 | ga[i]->elt_size = elt_size; 376 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 377 | assert(r == 0); 378 | } 379 | 380 | // wait for worker threads 381 | for (i = 0; i < threads; i++) { 382 | int r = pthread_join(t[i], NULL); 383 | 
assert(r == 0); 384 | } 385 | 386 | gettimeofday(&stoptime, NULL); 387 | 388 | secs = elapsed(&starttime, &stoptime); 389 | printf("Elapsed time: %.4f seconds.\n", secs); 390 | gups = threads * ((double)updates) / (secs * 1.0e9); 391 | printf("GUPS = %.10f\n", gups); 392 | 393 | //hemem_print_stats(); 394 | #endif 395 | #endif 396 | 397 | //hemem_stop_timing(); 398 | 399 | for (i = 0; i < threads; i++) { 400 | //free(ga[i]->indices); 401 | free(ga[i]); 402 | } 403 | free(ga); 404 | 405 | //getchar(); 406 | 407 | munmap(p, size); 408 | 409 | return 0; 410 | } 411 | 412 | 413 | -------------------------------------------------------------------------------- /microbenchmarks/gups-small.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: gups.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 02/21/2018 02:36:27 PM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #define _GNU_SOURCE 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #include "../timer.h" 43 | 44 | uint64_t size = 2 * 1024 * 1024; 45 | uint64_t elt_size = 8; 46 | uint64_t iters = 1000000000; 47 | 48 | int dramfd, nvmfd, uffd; 49 | void *dram_devdax_mmap, *nvm_devdax_mmap; 50 | 51 | bool migrating = false; 52 | 53 | void wp_page(uint64_t addr) 54 | { 55 | struct uffdio_writeprotect wp; 56 | int ret; 57 | 58 | assert(addr != 0); 59 | 60 | wp.range.start = addr; 61 | wp.range.len = 2 * 1024 * 1024; 62 | wp.mode = 
UFFDIO_WRITEPROTECT_MODE_WP; 63 | ret = ioctl(uffd, UFFDIO_WRITEPROTECT, &wp); 64 | 65 | if (ret < 0) { 66 | perror("uffdio writeprotect"); 67 | assert(0); 68 | } 69 | } 70 | 71 | void migrate(uint64_t va, bool migrate_down) 72 | { 73 | void *newptr; 74 | uint64_t pagesize; 75 | void *old_addr, *new_addr; 76 | 77 | migrating = true; 78 | 79 | pagesize = 2 * 1024 * 1024; 80 | 81 | if (migrate_down) { 82 | old_addr = dram_devdax_mmap; 83 | new_addr = nvm_devdax_mmap; 84 | } 85 | else { 86 | old_addr = nvm_devdax_mmap; 87 | new_addr = dram_devdax_mmap; 88 | } 89 | 90 | memcpy(new_addr, old_addr, pagesize); 91 | 92 | if (migrate_down) { 93 | newptr = mmap((void*)va, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE | MAP_FIXED, nvmfd, 0); 94 | } 95 | else { 96 | newptr = mmap((void*)va, pagesize, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE | MAP_FIXED, dramfd, 0); 97 | } 98 | 99 | if (newptr == MAP_FAILED) { 100 | perror("newptr mmap"); 101 | assert(0); 102 | } 103 | if (newptr != (void*)va) { 104 | fprintf(stderr, "mapped address is not same as faulting address\n"); 105 | } 106 | 107 | // re-register new mmap region with userfaultfd 108 | struct uffdio_register uffdio_register; 109 | uffdio_register.range.start = (uint64_t)newptr; 110 | uffdio_register.range.len = pagesize; 111 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP; 112 | uffdio_register.ioctls = 0; 113 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { 114 | perror("ioctl uffdio_register"); 115 | assert(0); 116 | } 117 | 118 | migrating = false; 119 | } 120 | 121 | void *do_migration(void *addr) 122 | { 123 | for (;;) { 124 | wp_page((uint64_t)addr); 125 | migrate((uint64_t)addr, true); 126 | //usleep(50000); 127 | 128 | wp_page((uint64_t)addr); 129 | migrate((uint64_t)addr, false); 130 | //usleep(50000); 131 | } 132 | 133 | return NULL; 134 | } 135 | 136 | void handle_wp_fault(uint64_t page_boundry) 137 | { 138 | fprintf(stderr, "encountered a wp 
fault, waiting for migration..."); 139 | while (migrating) { 140 | // just wait for migrating to be done 141 | } 142 | fprintf(stderr, "done\n"); 143 | } 144 | 145 | void *handle_fault() 146 | { 147 | static struct uffd_msg *msg; 148 | ssize_t nread; 149 | uint64_t fault_addr; 150 | uint64_t fault_flags; 151 | uint64_t page_boundry; 152 | struct uffdio_range range; 153 | int ret; 154 | 155 | for (;;) { 156 | struct pollfd pollfd; 157 | int pollres; 158 | pollfd.fd = uffd; 159 | pollfd.events = POLLIN; 160 | 161 | pollres = poll(&pollfd, 1, -1); 162 | 163 | switch (pollres) { 164 | case -1: 165 | perror("poll"); 166 | assert(0); 167 | case 0: 168 | fprintf(stderr, "poll read 0\n"); 169 | continue; 170 | case 1: 171 | break; 172 | default: 173 | fprintf(stderr, "unexpected poll result\n"); 174 | assert(0); 175 | } 176 | 177 | if (pollfd.revents & POLLERR) { 178 | fprintf(stderr, "pollerr\n"); 179 | assert(0); 180 | } 181 | 182 | if (!pollfd.revents & POLLIN) { 183 | continue; 184 | } 185 | 186 | nread = read(uffd, msg, sizeof(struct uffd_msg)); 187 | if (nread == 0) { 188 | fprintf(stderr, "EOF on userfaultfd\n"); 189 | assert(0); 190 | } 191 | 192 | if (nread < 0) { 193 | if (errno == EAGAIN) { 194 | continue; 195 | } 196 | perror("read"); 197 | assert(0); 198 | } 199 | 200 | if ((nread != sizeof(struct uffd_msg))) { 201 | fprintf(stderr, "invalid msg size: [%ld]\n", nread); 202 | assert(0); 203 | } 204 | 205 | if (msg->event & UFFD_EVENT_PAGEFAULT) { 206 | fault_addr = (uint64_t)msg->arg.pagefault.address; 207 | fault_flags = msg->arg.pagefault.flags; 208 | 209 | // allign faulting address to page boundry 210 | // huge page boundry in this case due to dax allignment 211 | page_boundry = fault_addr & ~((2 * 1024 * 1024) - 1); 212 | 213 | if (fault_flags & UFFD_PAGEFAULT_FLAG_WP) { 214 | handle_wp_fault(page_boundry); 215 | } 216 | else { 217 | assert(!"page faults with MAP_POPULATE should not happen\n"); 218 | } 219 | 220 | // wake the faulting thread 221 | 
range.start = (uint64_t)page_boundry; 222 | range.len = 2 * 1024 * 1024; 223 | 224 | ret = ioctl(uffd, UFFDIO_WAKE, &range); 225 | 226 | if (ret < 0) { 227 | perror("uffdio wake"); 228 | assert(0); 229 | } 230 | } 231 | else if (msg->event & UFFD_EVENT_UNMAP){ 232 | fprintf(stderr, "Received an unmap event\n"); 233 | assert(0); 234 | } 235 | else if (msg->event & UFFD_EVENT_REMOVE) { 236 | fprintf(stderr, "received a remove event\n"); 237 | assert(0); 238 | } 239 | else { 240 | fprintf(stderr, "received a non page fault event\n"); 241 | assert(0); 242 | } 243 | } 244 | } 245 | 246 | 247 | static uint64_t lfsr_fast(uint64_t lfsr) 248 | { 249 | lfsr ^= lfsr >> 7; 250 | lfsr ^= lfsr << 9; 251 | lfsr ^= lfsr >> 13; 252 | return lfsr; 253 | } 254 | 255 | static void *do_gups(void *argument) 256 | { 257 | //printf("do_gups entered\n"); 258 | char *field = (char*)(argument); 259 | uint64_t i; 260 | uint64_t index; 261 | char data[elt_size]; 262 | uint64_t lfsr; 263 | 264 | srand(0); 265 | lfsr = rand(); 266 | 267 | for (i = 0; i < iters; i++) { 268 | lfsr = lfsr_fast(lfsr); 269 | index = lfsr % (size / elt_size); 270 | memcpy(data, &field[index * elt_size], elt_size); 271 | memset(data, data[0] + i, elt_size); 272 | } 273 | return 0; 274 | } 275 | 276 | int main(int argc, char **argv) 277 | { 278 | struct timeval starttime, stoptime; 279 | double secs, gups; 280 | void *p; 281 | uint64_t nelems; 282 | pthread_t gups_thread, fault_thread, migrate_thread; 283 | 284 | uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); 285 | if (uffd == -1) { 286 | perror("uffd"); 287 | assert(0); 288 | } 289 | 290 | struct uffdio_api uffdio_api; 291 | uffdio_api.api = UFFD_API; 292 | uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_MISSING_SHMEM | UFFD_FEATURE_MISSING_HUGETLBFS;// | UFFD_FEATURE_EVENT_UNMAP | UFFD_FEATURE_EVENT_REMOVE; 293 | uffdio_api.ioctls = 0; 294 | if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { 295 | perror("ioctl uffdio_api"); 296 | 
assert(0); 297 | } 298 | 299 | dramfd = open("/dev/dax0.0", O_RDWR); 300 | if (dramfd < 0) { 301 | perror("dram open"); 302 | } 303 | assert(dramfd >= 0); 304 | 305 | nvmfd = open("/dev/dax1.0", O_RDWR); 306 | if (nvmfd < 0) { 307 | perror("nvm open"); 308 | } 309 | assert(nvmfd >= 0); 310 | 311 | dram_devdax_mmap = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, dramfd, 0); 312 | if (dram_devdax_mmap == MAP_FAILED) { 313 | perror("dram devdax mmap"); 314 | assert(0); 315 | } 316 | 317 | nvm_devdax_mmap = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, nvmfd, 0); 318 | if (nvm_devdax_mmap == MAP_FAILED) { 319 | perror("nvm devdax mmap"); 320 | assert(0); 321 | } 322 | 323 | gettimeofday(&starttime, NULL); 324 | 325 | fprintf(stderr, "%ld updates per thread (1 threads)\n", iters); 326 | fprintf(stderr, "field of 2^21 (%lu) bytes\n", size); 327 | fprintf(stderr, "8 byte element size (%ld elements total)\n", size / elt_size); 328 | 329 | p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, dramfd, 0); 330 | if (p == MAP_FAILED) { 331 | perror("mmap"); 332 | assert(0); 333 | } 334 | 335 | struct uffdio_register uffdio_register; 336 | uffdio_register.range.start = (uint64_t)p; 337 | uffdio_register.range.len = size; 338 | uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_WP; 339 | uffdio_register.ioctls = 0; 340 | if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { 341 | perror("ioctl uffdio_register"); 342 | assert(0); 343 | } 344 | 345 | int r = pthread_create(&fault_thread, NULL, handle_fault, NULL); 346 | assert(r == 0); 347 | 348 | r = pthread_create(&migrate_thread, NULL, do_migration, p); 349 | assert(r == 0); 350 | 351 | gettimeofday(&stoptime, NULL); 352 | fprintf(stderr, "Init took %.4f seconds\n", elapsed(&starttime, &stoptime)); 353 | fprintf(stderr, "Region address: %p\t size: %ld\n", p, size); 354 | 355 | nelems = size / elt_size; // number of elements per thread 
356 | fprintf(stderr, "Elements per thread: %lu\n", nelems); 357 | 358 | gettimeofday(&starttime, NULL); 359 | 360 | // run through gups once to touch all memory 361 | // spawn gups worker thread 362 | r = pthread_create(&gups_thread, NULL, do_gups, (void*)p); 363 | assert(r == 0); 364 | 365 | r = pthread_join(gups_thread, NULL); 366 | assert(r == 0); 367 | 368 | gettimeofday(&stoptime, NULL); 369 | 370 | secs = elapsed(&starttime, &stoptime); 371 | printf("Elapsed time: %.4f seconds.\n", secs); 372 | gups = ((double)iters) / (secs * 1.0e9); 373 | printf("GUPS = %.10f\n", gups); 374 | 375 | fprintf(stderr, "Timing.\n"); 376 | gettimeofday(&starttime, NULL); 377 | 378 | r = pthread_create(&gups_thread, NULL, do_gups, (void*)p); 379 | assert(r == 0); 380 | 381 | r = pthread_join(gups_thread, NULL); 382 | assert(r == 0); 383 | 384 | gettimeofday(&stoptime, NULL); 385 | 386 | secs = elapsed(&starttime, &stoptime); 387 | printf("Elapsed time: %.4f seconds.\n", secs); 388 | gups = ((double)iters) / (secs * 1.0e9); 389 | printf("GUPS = %.10f\n", gups); 390 | 391 | printf("Timing.\n"); 392 | gettimeofday(&starttime, NULL); 393 | 394 | // spawn gups worker threads 395 | r = pthread_create(&gups_thread, NULL, do_gups, (void*)p); 396 | assert(r == 0); 397 | 398 | r = pthread_join(gups_thread, NULL); 399 | assert(r == 0); 400 | 401 | gettimeofday(&stoptime, NULL); 402 | 403 | secs = elapsed(&starttime, &stoptime); 404 | printf("Elapsed time: %.4f seconds.\n", secs); 405 | gups = ((double)iters) / (secs * 1.0e9); 406 | printf("GUPS = %.10f\n", gups); 407 | 408 | munmap(p, size); 409 | 410 | return 0; 411 | } 412 | 413 | -------------------------------------------------------------------------------- /microbenchmarks/gups.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: gups.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 
9 | * Created: 02/21/2018 02:36:27 PM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | 19 | #define _GNU_SOURCE 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "../src/timer.h" 38 | #include "../src/hemem.h" 39 | 40 | 41 | #include "gups.h" 42 | 43 | #define MAX_THREADS 64 44 | 45 | #define GUPS_PAGE_SIZE (4 * 1024) 46 | #define PAGE_NUM 3 47 | #define PAGES 2048 48 | 49 | #ifdef HOTSPOT 50 | extern uint64_t hotset_start; 51 | extern double hotset_fraction; 52 | #endif 53 | 54 | int threads; 55 | 56 | uint64_t hot_start = 0; 57 | uint64_t hotsize = 0; 58 | uint64_t hot_offset_page = 0; 59 | bool move_hotset = false; 60 | 61 | struct gups_args { 62 | int tid; // thread id 63 | uint64_t *indices; // array of indices to access 64 | void* field; // pointer to start of thread's region 65 | uint64_t iters; // iterations to perform 66 | uint64_t size; // size of region 67 | uint64_t elt_size; // size of elements 68 | uint64_t hot_start; // start of hot set 69 | uint64_t hotsize; // size of hot set 70 | }; 71 | 72 | 73 | static inline uint64_t rdtscp(void) 74 | { 75 | uint32_t eax, edx; 76 | // why is "ecx" in clobber list here, anyway? 
-SG&MH,2017-10-05 77 | __asm volatile ("rdtscp" : "=a" (eax), "=d" (edx) :: "ecx", "memory"); 78 | return ((uint64_t)edx << 32) | eax; 79 | } 80 | 81 | //uint64_t thread_gups[MAX_THREADS]; 82 | 83 | static unsigned long updates, nelems; 84 | 85 | static void *print_instantaneous_gups() 86 | { 87 | uint64_t last_second_gups[threads]; 88 | FILE *f[threads]; 89 | char fname[20]; 90 | 91 | for (int i = 0; i < threads; i++) { 92 | last_second_gups[i] = 0; 93 | snprintf(fname, 20, "gups_%d.txt", i); 94 | //printf("file name: %s\n", fname); 95 | f[i] = fopen(fname, "w"); 96 | if (f[i] == NULL) { 97 | perror("fopen"); 98 | assert(0); 99 | } 100 | } 101 | 102 | for (;;) { 103 | for (int i = 0; i < threads; i++) { 104 | //fprintf(f[i], "%.10f\n", (1.0 * (abs(thread_gups[i] - last_second_gups[i]))) / (1.0e9)); 105 | //last_second_gups[i] = thread_gups[i]; 106 | } 107 | sleep(1); 108 | //printf("GUPS: %.10f\n", (1.0 * (abs(thread_gups[0]- last_second_gups))) / (1.0e9)); 109 | //last_second_gups = thread_gups[0]; 110 | //sleep(1); 111 | } 112 | 113 | return NULL; 114 | } 115 | 116 | 117 | static uint64_t lfsr_fast(uint64_t lfsr) 118 | { 119 | lfsr ^= lfsr >> 7; 120 | lfsr ^= lfsr << 9; 121 | lfsr ^= lfsr >> 13; 122 | return lfsr; 123 | } 124 | 125 | char *filename = "indices1.txt"; 126 | 127 | FILE *hotsetfile = NULL; 128 | 129 | bool hotset_only = false; 130 | 131 | static void *prefill_hotset(void* arguments) 132 | { 133 | struct gups_args *args = (struct gups_args*)arguments; 134 | uint64_t *field = (uint64_t*)(args->field); 135 | uint64_t i; 136 | uint64_t index1; 137 | uint64_t elt_size = args->elt_size; 138 | char data[elt_size]; 139 | 140 | index1 = 0; 141 | 142 | for (i = 0; i < args->hotsize; i++) { 143 | index1 = i; 144 | if (elt_size == 8) { 145 | uint64_t tmp = field[index1]; 146 | tmp = tmp + i; 147 | field[index1] = tmp; 148 | } 149 | else { 150 | memcpy(data, &field[index1 * elt_size], elt_size); 151 | memset(data, data[0] + i, elt_size); 152 | 
memcpy(&field[index1 * elt_size], data, elt_size); 153 | } 154 | } 155 | return 0; 156 | 157 | } 158 | 159 | static void *do_gups(void *arguments) 160 | { 161 | //printf("do_gups entered\n"); 162 | struct gups_args *args = (struct gups_args*)arguments; 163 | uint64_t *field = (uint64_t*)(args->field); 164 | uint64_t i; 165 | uint64_t index1, index2; 166 | uint64_t elt_size = args->elt_size; 167 | char data[elt_size]; 168 | uint64_t lfsr; 169 | uint64_t hot_num; 170 | uint64_t offset; 171 | uint64_t start, end; 172 | 173 | srand(args->tid); 174 | lfsr = rand(); 175 | 176 | index1 = 0; 177 | index2 = 0; 178 | 179 | fprintf(hotsetfile, "Thread %d region: %p - %p\thot set: %p - %p\n", args->tid, field, field + (args->size * elt_size), field + args->hot_start, field + args->hot_start + (args->hotsize * elt_size)); 180 | 181 | for (i = 0; i < args->iters; i++) { 182 | hot_num = lfsr_fast(lfsr) % 100; 183 | if (hot_num < 90) { 184 | lfsr = lfsr_fast(lfsr); 185 | index1 = args->hot_start + (lfsr % args->hotsize); 186 | start = rdtscp(); 187 | if (elt_size == 8) { 188 | uint64_t tmp = field[index1]; 189 | tmp = tmp + i; 190 | field[index1] = tmp; 191 | } 192 | else { 193 | memcpy(data, &field[index1 * elt_size], elt_size); 194 | memset(data, data[0] + i, elt_size); 195 | memcpy(&field[index1 * elt_size], data, elt_size); 196 | } 197 | end = rdtscp(); 198 | } 199 | else { 200 | lfsr = lfsr_fast(lfsr); 201 | index2 = lfsr % (args->size); 202 | start = rdtscp(); 203 | if (elt_size == 8) { 204 | uint64_t tmp = field[index2]; 205 | tmp = tmp + i; 206 | field[index2] = tmp; 207 | } 208 | else { 209 | memcpy(data, &field[index2 * elt_size], elt_size); 210 | memset(data, data[0] + i, elt_size); 211 | memcpy(&field[index2 * elt_size], data, elt_size); 212 | } 213 | end = rdtscp(); 214 | } 215 | } 216 | 217 | return 0; 218 | } 219 | 220 | int main(int argc, char **argv) 221 | { 222 | unsigned long expt; 223 | unsigned long size, elt_size; 224 | unsigned long tot_hot_size; 225 | int 
log_hot_size; 226 | struct timeval starttime, stoptime; 227 | double secs, gups; 228 | int i; 229 | void *p; 230 | struct gups_args** ga; 231 | pthread_t t[MAX_THREADS]; 232 | 233 | if (argc != 6) { 234 | fprintf(stderr, "Usage: %s [threads] [updates per thread] [exponent] [data size (bytes)] [noremap/remap]\n", argv[0]); 235 | fprintf(stderr, " threads\t\t\tnumber of threads to launch\n"); 236 | fprintf(stderr, " updates per thread\t\tnumber of updates per thread\n"); 237 | fprintf(stderr, " exponent\t\t\tlog size of region\n"); 238 | fprintf(stderr, " data size\t\t\tsize of data in array (in bytes)\n"); 239 | fprintf(stderr, " hot size\t\t\tlog size of hot set\n"); 240 | return 0; 241 | } 242 | 243 | gettimeofday(&starttime, NULL); 244 | 245 | threads = atoi(argv[1]); 246 | assert(threads <= MAX_THREADS); 247 | ga = (struct gups_args**)malloc(threads * sizeof(struct gups_args*)); 248 | 249 | updates = atol(argv[2]); 250 | updates -= updates % 256; 251 | expt = atoi(argv[3]); 252 | assert(expt > 8); 253 | assert(updates > 0 && (updates % 256 == 0)); 254 | size = (unsigned long)(1) << expt; 255 | size -= (size % 256); 256 | assert(size > 0 && (size % 256 == 0)); 257 | elt_size = atoi(argv[4]); 258 | log_hot_size = atof(argv[5]); 259 | tot_hot_size = (unsigned long)(1) << log_hot_size; 260 | 261 | fprintf(stderr, "%lu updates per thread (%d threads)\n", updates, threads); 262 | fprintf(stderr, "field of 2^%lu (%lu) bytes\n", expt, size); 263 | fprintf(stderr, "%ld byte element size (%ld elements total)\n", elt_size, size / elt_size); 264 | 265 | p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, -1, 0); 266 | if (p == MAP_FAILED) { 267 | perror("mmap"); 268 | assert(0); 269 | } 270 | 271 | gettimeofday(&stoptime, NULL); 272 | fprintf(stderr, "Init took %.4f seconds\n", elapsed(&starttime, &stoptime)); 273 | fprintf(stderr, "Region address: %p - %p\t size: %ld\n", p, (p + size), size); 274 | 275 | nelems = (size / 
threads) / elt_size; // number of elements per thread 276 | fprintf(stderr, "Elements per thread: %lu\n", nelems); 277 | 278 | //memset(thread_gups, 0, sizeof(thread_gups)); 279 | 280 | hotsetfile = fopen("hotsets.txt", "w"); 281 | if (hotsetfile == NULL) { 282 | perror("fopen"); 283 | assert(0); 284 | } 285 | 286 | gettimeofday(&stoptime, NULL); 287 | secs = elapsed(&starttime, &stoptime); 288 | fprintf(stderr, "Initialization time: %.4f seconds.\n", secs); 289 | 290 | //hemem_start_timing(); 291 | 292 | //pthread_t print_thread; 293 | //int pt = pthread_create(&print_thread, NULL, print_instantaneous_gups, NULL); 294 | //assert(pt == 0); 295 | 296 | 297 | hot_start = 0; 298 | hotsize = (tot_hot_size / threads) / elt_size; 299 | //printf("hot_start: %p\thot_end: %p\thot_size: %lu\n", p + hot_start, p + hot_start + (hotsize * elt_size), hotsize); 300 | 301 | gettimeofday(&starttime, NULL); 302 | for (i = 0; i < threads; i++) { 303 | //printf("starting thread [%d]\n", i); 304 | ga[i] = (struct gups_args*)malloc(sizeof(struct gups_args)); 305 | ga[i]->tid = i; 306 | ga[i]->field = p + (i * nelems * elt_size); 307 | ga[i]->iters = updates; 308 | ga[i]->size = nelems; 309 | ga[i]->elt_size = elt_size; 310 | ga[i]->hot_start = 0; // hot set at start of thread's region 311 | ga[i]->hotsize = hotsize; 312 | } 313 | 314 | if (hotset_only) { 315 | for (i = 0; i < threads; i++) { 316 | int r = pthread_create(&t[i], NULL, prefill_hotset, (void*)ga[i]); 317 | assert(r == 0); 318 | } 319 | // wait for worker threads 320 | for (i = 0; i < threads; i++) { 321 | int r = pthread_join(t[i], NULL); 322 | assert(r == 0); 323 | } 324 | } 325 | 326 | // run through gups once to touch all memory 327 | // spawn gups worker threads 328 | for (i = 0; i < threads; i++) { 329 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 330 | assert(r == 0); 331 | } 332 | 333 | // wait for worker threads 334 | for (i = 0; i < threads; i++) { 335 | int r = pthread_join(t[i], NULL); 336 | 
assert(r == 0); 337 | } 338 | //hemem_print_stats(); 339 | 340 | gettimeofday(&stoptime, NULL); 341 | 342 | secs = elapsed(&starttime, &stoptime); 343 | printf("Elapsed time: %.4f seconds.\n", secs); 344 | gups = threads * ((double)updates) / (secs * 1.0e9); 345 | printf("GUPS = %.10f\n", gups); 346 | //memset(thread_gups, 0, sizeof(thread_gups)); 347 | 348 | filename = "indices2.txt"; 349 | 350 | fprintf(stderr, "Timing.\n"); 351 | gettimeofday(&starttime, NULL); 352 | 353 | //hemem_clear_stats(); 354 | // spawn gups worker threads 355 | for (i = 0; i < threads; i++) { 356 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 357 | assert(r == 0); 358 | } 359 | 360 | // wait for worker threads 361 | for (i = 0; i < threads; i++) { 362 | int r = pthread_join(t[i], NULL); 363 | assert(r == 0); 364 | } 365 | gettimeofday(&stoptime, NULL); 366 | //hemem_print_stats(); 367 | //hemem_clear_stats(); 368 | 369 | secs = elapsed(&starttime, &stoptime); 370 | printf("Elapsed time: %.4f seconds.\n", secs); 371 | gups = threads * ((double)updates) / (secs * 1.0e9); 372 | printf("GUPS = %.10f\n", gups); 373 | 374 | //memset(thread_gups, 0, sizeof(thread_gups)); 375 | #if 0 376 | #ifdef HOTSPOT 377 | move_hotset = true; 378 | hot_offset_page = hotsize / GUPS_PAGE_SIZE; 379 | //hot_start = (16UL * 1024UL * 1024UL * 1024UL) / elt_size; // 16GB to the right; 380 | printf("hot_start: %lu\thot_size: %lu\n", hot_start, hotsize); 381 | printf("hot_offset_page: %lu\n", hot_offset_page); 382 | 383 | filename = "indices3.txt"; 384 | 385 | printf("Timing.\n"); 386 | gettimeofday(&starttime, NULL); 387 | 388 | // spawn gups worker threads 389 | for (i = 0; i < threads; i++) { 390 | ga[i]->tid = i; 391 | ga[i]->iters = updates; 392 | ga[i]->size = nelems; 393 | ga[i]->elt_size = elt_size; 394 | int r = pthread_create(&t[i], NULL, do_gups, (void*)ga[i]); 395 | assert(r == 0); 396 | } 397 | 398 | // wait for worker threads 399 | for (i = 0; i < threads; i++) { 400 | int r = 
pthread_join(t[i], NULL); 401 | assert(r == 0); 402 | } 403 | 404 | gettimeofday(&stoptime, NULL); 405 | 406 | secs = elapsed(&starttime, &stoptime); 407 | printf("Elapsed time: %.4f seconds.\n", secs); 408 | gups = threads * ((double)updates) / (secs * 1.0e9); 409 | printf("GUPS = %.10f\n", gups); 410 | 411 | //hemem_print_stats(); 412 | #endif 413 | #endif 414 | 415 | FILE* pebsfile = fopen("pebs.txt", "w+"); 416 | assert(pebsfile != NULL); 417 | for (uint64_t addr = (uint64_t)p; addr < (uint64_t)p + size; addr += (2*1024*1024)) { 418 | struct hemem_page *pg = get_hemem_page(addr); 419 | assert(pg != NULL); 420 | if (pg != NULL) { 421 | fprintf(pebsfile, "0x%lx:\t%lu\t%lu\t%lu\n", pg->va, pg->tot_accesses[DRAMREAD], pg->tot_accesses[NVMREAD], pg->tot_accesses[WRITE]); 422 | } 423 | } 424 | 425 | //hemem_stop_timing(); 426 | 427 | for (i = 0; i < threads; i++) { 428 | //free(ga[i]->indices); 429 | free(ga[i]); 430 | } 431 | free(ga); 432 | 433 | //getchar(); 434 | 435 | munmap(p, size); 436 | 437 | return 0; 438 | } 439 | 440 | 441 | -------------------------------------------------------------------------------- /microbenchmarks/gups.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: gups.h 5 | * 6 | * Description: i 7 | * 8 | * Version: 1.0 9 | * Created: 02/17/2020 09:13:33 AM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | #ifndef GUPS_H 19 | #define GUPS_H 20 | 21 | #define INDEX_FILE "logs/indices.txt" 22 | 23 | //#define ZIPFIAN 24 | #define HOTSPOT 25 | //#define UNIFORM_RANDOM 26 | 27 | void calc_indices(unsigned long* indices, unsigned long updates, unsigned long nelems); 28 | 29 | #ifdef HOTSPOT 30 | extern uint64_t hotset_start; 31 | 
extern double hotset_fraction; 32 | #endif 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /microbenchmarks/run-instantaneous.sh: -------------------------------------------------------------------------------- 1 | numactl -N0 -m0 -- ./gups-hotset-move 16 1000000000 39 8 34 2 | -------------------------------------------------------------------------------- /microbenchmarks/run-random.sh: -------------------------------------------------------------------------------- 1 | #clear-caches 2 | echo "=== 30 ===" > random.txt 3 | numactl -N0 -m0 -- ./gups-random 16 1000000000 30 8 30 >> random.txt 4 | #clear-caches 5 | echo "=== 31 ===" >> random.txt 6 | numactl -N0 -m0 -- ./gups-random 16 1000000000 31 8 31 >> random.txt 7 | #clear-caches 8 | echo "=== 32 ===" >> random.txt 9 | numactl -N0 -m0 -- ./gups-random 16 1000000000 32 8 32 >> random.txt 10 | #clear-caches 11 | echo "=== 33 ===" >> random.txt 12 | numactl -N0 -m0 -- ./gups-random 16 1000000000 33 8 33 >> random.txt 13 | #clear-caches 14 | echo "=== 34 ===" >> random.txt 15 | numactl -N0 -m0 -- ./gups-random 16 1000000000 34 8 34 >> random.txt 16 | #clear-caches 17 | echo "=== 35 ===" >> random.txt 18 | numactl -N0 -m0 -- ./gups-random 16 1000000000 35 8 35 >> random.txt 19 | #clear-caches 20 | echo "=== 36 ===" >> random.txt 21 | numactl -N0 -m0 -- ./gups-random 16 1000000000 36 8 36 >> random.txt 22 | #clear-caches 23 | echo "=== 37 ===" >> random.txt 24 | numactl -N0 -m0 -- ./gups-random 16 1000000000 37 8 37 >> random.txt 25 | #clear-caches 26 | echo "=== 38 ===" >> random.txt 27 | numactl -N0 -m0 -- ./gups-random 16 1000000000 38 8 38 >> random.txt 28 | #clear-caches 29 | echo "=== 39 ===" >> random.txt 30 | numactl -N0 -m0 -- ./gups-random 16 1000000000 39 8 39 >> random.txt 31 | -------------------------------------------------------------------------------- /microbenchmarks/run-threads.sh: 
-------------------------------------------------------------------------------- 1 | #clear-caches 2 | echo "=== 1 ===" > threads.txt 3 | numactl -N0 --preferred=0 -- ./gups-hotset-move 1 1000000000 39 8 34 >> threads.txt 4 | #clear-caches 5 | echo "=== 2 ===" >> threads.txt 6 | numactl -N0 --preferred=0 -- ./gups-hotset-move 2 1000000000 39 8 34 >> threads.txt 7 | #clear-caches 8 | echo "=== 4 ===" >> threads.txt 9 | numactl -N0 --preferred=0 -- ./gups-hotset-move 4 1000000000 39 8 34 >> threads.txt 10 | #clear-caches 11 | echo "=== 8 ===" >> threads.txt 12 | numactl -N0 --preferred=0 -- ./gups-hotset-move 8 1000000000 39 8 34 >> threads.txt 13 | #clear-caches 14 | echo "=== 16 ===" >> threads.txt 15 | numactl -N0 --preferred=0 -- ./gups-hotset-move 16 1000000000 39 8 34 >> threads.txt 16 | #clear-caches 17 | echo "=== 20 ===" >> threads.txt 18 | numactl -N0 --preferred=0 -- ./gups-hotset-move 16 1000000000 39 8 34 >> threads.txt 19 | #clear-caches 20 | echo "=== 21 ===" >> threads.txt 21 | numactl -N0 --preferred=0 -- ./gups-hotset-move 16 1000000000 39 8 34 >> threads.txt 22 | #clear-caches 23 | echo "=== 22 ===" >> threads.txt 24 | numactl -N0 --preferred=0 -- ./gups-hotset-move 16 1000000000 39 8 34 >> threads.txt 25 | #clear-caches 26 | echo "=== 23 ===" >> threads.txt 27 | numactl -N0 --preferred=0 -- ./gups-hotset-move 16 1000000000 39 8 34 >> threads.txt 28 | #clear-caches 29 | echo "=== 24 ===" >> threads.txt 30 | numactl -N0 --preferred=0 -- ./gups-hotset-move 16 1000000000 39 8 34 >> threads.txt 31 | #clear-caches 32 | -------------------------------------------------------------------------------- /microbenchmarks/run.sh: -------------------------------------------------------------------------------- 1 | #clear-caches 2 | echo "=== 30 ===" >> results.txt 3 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 30 >> results.txt 4 | #clear-caches 5 | echo "=== 31 ===" >> results.txt 6 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 31 >> results.txt 7 
| #clear-caches 8 | echo "=== 32 ===" >> results.txt 9 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 32 >> results.txt 10 | #clear-caches 11 | echo "=== 33 ===" >> results.txt 12 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 33 >> results.txt 13 | #clear-caches 14 | echo "=== 34 ===" >> results.txt 15 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 34 >> results.txt 16 | #clear-caches 17 | echo "=== 35 ===" >> results.txt 18 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 35 >> results.txt 19 | #clear-caches 20 | echo "=== 36 ===" >> results.txt 21 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 36 >> results.txt 22 | #clear-caches 23 | echo "=== 37 ===" >> results.txt 24 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 37 >> results.txt 25 | #clear-caches 26 | echo "=== 38 ===" >> results.txt 27 | numactl -N0 -m0 -- ./gups-pebs 16 1000000000 39 8 38 >> results.txt 28 | -------------------------------------------------------------------------------- /microbenchmarks/test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: test.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 03/17/2020 06:24:25 AM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | #define _GNU_SOURCE 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include "../timer.h" 36 | #include "../hemem.h" 37 | 38 | #define KB(x) ((uint64_t)x * 1024) 39 | #define MB(x) (KB(x) * 1024) 40 | #define GB(x) (MB(x) * 1024) 41 | 42 | #define SIZE (GB(256)) 43 | 44 | int main(int argc, char **argv) 45 | 
{ 46 | void *p; 47 | uint64_t i; 48 | uint64_t *region; 49 | uint64_t nelems; 50 | struct timeval start, end; 51 | uint64_t startval; 52 | 53 | if (argc != 2) { 54 | printf("usage: %s val\n", argv[0]); 55 | return 0; 56 | } 57 | 58 | startval = atoi(argv[1]); 59 | 60 | p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); 61 | if (p == MAP_FAILED) { 62 | perror("mmap"); 63 | assert(0); 64 | } 65 | 66 | region = (uint64_t*)p; 67 | nelems = (SIZE / sizeof(uint64_t)); 68 | printf("there are %lu elements\n", nelems); 69 | 70 | gettimeofday(&start, NULL); 71 | for (i = 0; i < nelems; i++) { 72 | region[i] = startval; 73 | if (region[i] != startval) { 74 | assert(region[i] == startval); 75 | } 76 | } 77 | gettimeofday(&end, NULL); 78 | printf("init region took %.4f seconds\n", elapsed(&start, &end)); 79 | hemem_print_stats(); 80 | 81 | for (i = 0; i < nelems; i++) { 82 | if (region[i] != startval) { 83 | assert(region[i] == startval); 84 | } 85 | } 86 | hemem_print_stats(); 87 | 88 | gettimeofday(&start, NULL); 89 | for (i = 0; i < nelems; i++) { 90 | region[i] = region[i] + 2; 91 | if (region[i] != startval + 2) { 92 | assert(region[i] == startval + 2); 93 | } 94 | } 95 | gettimeofday(&end, NULL); 96 | printf("calc region took %.4f seconds\n", elapsed(&start, &end)); 97 | hemem_print_stats(); 98 | 99 | for (i = 0; i < nelems; i++) { 100 | if (region[i] != startval + 2) { 101 | assert(region[i] == startval + 2); 102 | } 103 | } 104 | hemem_print_stats(); 105 | 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /microbenchmarks/zipf.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: zipf.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 05/06/2019 11:24:53 AM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * 
Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "gups.h" 25 | 26 | #ifdef ZIPFIAN 27 | 28 | static const double ZETAN = 26.46902820178302; 29 | static const double ZIPFIAN_CONSTANT = 0.99; 30 | static unsigned long min, max, itemcount; 31 | static unsigned long items, base, countforzeta; 32 | static double zipfianconstant, alpha, zetan, eta, theta, zeta2theta; 33 | static unsigned long lastVal; 34 | static int allowitemdecrease = 0; 35 | static const long FNV_OFFSET_BASIS_64 = 0xCBF29CE484222325L; 36 | static const long FNV_PRIME_64 = 1099511628211L; 37 | 38 | static unsigned long fnvhash64(unsigned long val) { 39 | long hashval = FNV_OFFSET_BASIS_64; 40 | 41 | for (int i = 0; i < 8; i++) { 42 | long octet = val & 0x00ff; 43 | val = val >> 8; 44 | 45 | hashval = hashval ^ octet; 46 | hashval = hashval * FNV_PRIME_64; 47 | } 48 | 49 | return (unsigned long)abs(hashval); 50 | } 51 | 52 | static double _zetastatic(unsigned long st, unsigned long n, double theta, double initialsum) 53 | { 54 | double sum = initialsum; 55 | for (unsigned long i = st; i < n; i++) { 56 | sum += 1 / (pow(i + 1, theta)); 57 | } 58 | return sum; 59 | } 60 | 61 | static double _zeta(unsigned long st, unsigned long n, double thetaVal, double initialsum) 62 | { 63 | countforzeta = n; 64 | return _zetastatic(st, n, thetaVal, initialsum); 65 | } 66 | 67 | static double zetastatic(unsigned long n, double theta) 68 | { 69 | return _zetastatic(0, n, theta, 0); 70 | } 71 | 72 | static double zeta(unsigned long n, double thetaVal) 73 | { 74 | countforzeta = n; 75 | return zetastatic(n, thetaVal); 76 | 77 | } 78 | 79 | static unsigned long nextValue(unsigned long itemcount) 80 | { 81 | if (itemcount != countforzeta) { 82 | if (itemcount > countforzeta) { 83 | printf("recomputing zeta due to item increase\n"); 84 | 
zetan = _zeta(countforzeta, itemcount, theta, zetan); 85 | eta = (1 - pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); 86 | } else if (itemcount < countforzeta) { /* FIX: was '>', duplicating the first branch's condition and making this item-decrease recompute path (see the printf below) unreachable; YCSB's ZipfianGenerator uses '<' here */ 87 | printf("recomputing zeta due to item decrease (warning: slow)\n"); 88 | zetan = zeta(itemcount, theta); 89 | eta = (1 - pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); 90 | } 91 | } 92 | 93 | double u = (double)rand() / RAND_MAX; 94 | double uz = u * zetan; 95 | 96 | if (uz < 1.0) { 97 | return base; 98 | } 99 | 100 | if (uz < 1.0 + pow(0.5, theta)) { 101 | return base + 1; 102 | } 103 | 104 | unsigned long ret = base + (unsigned long)((itemcount) * pow(eta * u - eta + 1, alpha)); 105 | lastVal = ret; 106 | return ret; 107 | } 108 | 109 | void calc_indices(unsigned long* indices, unsigned long updates, unsigned long nelems) 110 | { 111 | FILE* f; 112 | unsigned long i; /* FIX: was unsigned int; must cover the full range of 'updates' (unsigned long) */ 113 | 114 | assert(!"Not thread-safe"); 115 | 116 | f = fopen(INDEX_FILE, "w"); 117 | if (f == NULL) { 118 | perror("fopen"); 119 | assert(0); 120 | } 121 | assert(indices != NULL); 122 | 123 | // init zipfian distribution variables 124 | min = 0; 125 | max = nelems - 1; 126 | itemcount = max - min + 1; 127 | items = max - min + 1; 128 | base = min; 129 | zipfianconstant = ZIPFIAN_CONSTANT; 130 | theta = zipfianconstant; 131 | zeta2theta = zeta(2, theta); 132 | 133 | alpha = 1.0 / (1.0 - theta); 134 | zetan = ZETAN; 135 | countforzeta = items; 136 | eta = (1 - pow(2.0 / items, 1 - theta)) / (1 - zeta2theta / zetan); 137 | nextValue(nelems); 138 | 139 | for (i = 0; i < updates; i++) { 140 | unsigned long ret = nextValue(nelems); 141 | ret = min + fnvhash64(ret) % itemcount; 142 | lastVal = ret; 143 | indices[i] = ret; 144 | //fprintf(f, "%d\n", indices[i]); 145 | } 146 | 147 | fclose(f); 148 | } 149 | 150 | #elif defined HOTSPOT 151 | 152 | #define RAND_WITHIN(x) (((double)rand_r(&seed) / RAND_MAX) * (x)) 153 | 154 | uint64_t hotset_start = 0; 155 | double hotset_fraction = 0.1; 156 | static double hotset_prob = 0.9; 157 
| 158 | void calc_indices(unsigned long* indices, unsigned long updates, unsigned long nelems) 159 | { 160 | unsigned long i; /* FIX: was int; comparing a signed int against 'updates' (unsigned long) wraps/overflows before reaching bounds > INT_MAX */ 161 | uint64_t hotset_size = (uint64_t)(hotset_fraction * nelems); 162 | unsigned int seed = 0; 163 | 164 | assert(hotset_start + hotset_size <= nelems); 165 | 166 | printf("hotset start: %lu\thotset size: %lu\thotset probability: %f\n", hotset_start, hotset_size, hotset_prob); 167 | 168 | /* srand(0); */ 169 | 170 | for (i = 0; i < updates; i++) { 171 | if (RAND_WITHIN(1) < hotset_prob) { 172 | indices[i] = hotset_start + (uint64_t)RAND_WITHIN(hotset_size); 173 | } 174 | else { 175 | indices[i] = (uint64_t)RAND_WITHIN(nelems); 176 | } 177 | } 178 | } 179 | 180 | #else // UNIFORM_RANDOM 181 | 182 | void calc_indices(unsigned long* indices, unsigned long updates, unsigned long nelems) 183 | { 184 | unsigned long i; /* FIX: was unsigned int; must cover the full range of 'updates' (unsigned long) */ 185 | assert(indices != NULL); 186 | unsigned int seed = 0; 187 | 188 | /* srand(0); */ 189 | 190 | for (i = 0; i < updates; i++) { 191 | indices[i] = rand_r(&seed) % nelems; 192 | } 193 | } 194 | 195 | #endif 196 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | CFLAGS = -g -Wall -O3 -fPIC 3 | #CFLAGS = -g3 -Wall -O0 -fPIC 4 | LDFLAGS = -shared -Wl,--allow-multiple-definition -Wl,-rpath,'$$ORIGIN' 5 | INCLUDES = 6 | LIBS = -lm -lpthread 7 | HEMEM_LIBS = $(LIBS) -ldl -lsyscall_intercept -lcapstone -lhoard 8 | 9 | default: libhemem.so 10 | 11 | all: hemem-libs 12 | 13 | hemem-libs: libhemem-lru.so libhemem-simple.so libhemem-lru-swap.so libhemem.so 14 | 15 | libhemem.so: hemem.o pebs.o timer.o interpose.o fifo.o spsc-ring.o 16 | $(CC) $(LDFLAGS) -o libhemem.so hemem.o timer.o interpose.o pebs.o fifo.o spsc-ring.o $(HEMEM_LIBS) 17 | 18 | libhemem-lru.so: policies/hemem-lru.o policies/lru.o timer.o interpose.o policies/paging.o fifo.o spsc-ring.o 19 | $(CC) $(LDFLAGS) -o libhemem-lru.so 
policies/hemem-lru.o timer.o policies/lru.o interpose.o policies/paging.o fifo.o spsc-ring.o $(HEMEM_LIBS) 20 | 21 | libhemem-simple.so: policies/hemem-simple.o policies/simple.o timer.o interpose.o policies/paging.o fifo.o spsc-ring.o 22 | $(CC) $(LDFLAGS) -o libhemem-simple.so policies/hemem-simple.o timer.o policies/simple.o interpose.o policies/paging.o fifo.o spsc-ring.o $(HEMEM_LIBS) 23 | 24 | libhemem-lru-swap.so: policies/hemem-lru-swap.o policies/lru_swap.o timer.o interpose.o policies/paging.o fifo.o spsc-ring.o 25 | $(CC) $(LDFLAGS) -o libhemem-lru-swap.so policies/hemem-lru-swap.o timer.o policies/lru_swap.o interpose.o policies/paging.o fifo.o spsc-ring.o $(HEMEM_LIBS) 26 | 27 | hemem.o: hemem.c hemem.h pebs.h interpose.h fifo.h spsc-ring.h 28 | $(CC) $(CFLAGS) $(INCLUDES) -D ALLOC_HEMEM -c hemem.c -o hemem.o 29 | 30 | policies/hemem-lru.o: hemem.c hemem.h policies/lru.h interpose.h policies/paging.h fifo.h spsc-ring.h 31 | $(CC) $(CFLAGS) $(INCLUDES) -D ALLOC_LRU -c hemem.c -o policies/hemem-lru.o 32 | 33 | policies/hemem-simple.o: hemem.c hemem.h policies/simple.h interpose.h policies/paging.h fifo.h spsc-ring.h 34 | $(CC) $(CFLAGS) $(INCLUDES) -D ALLOC_SIMPLE -c hemem.c -o policies/hemem-simple.o 35 | 36 | policies/hemem-lru-swap.o: hemem.c hemem.h policies/lru.h interpose.h policies/paging.h fifo.h spsc-ring.h 37 | $(CC) $(CFLAGS) $(INCLUDES) -D ALLOC_LRU -D LRU_SWAP -c hemem.c -o policies/hemem-lru-swap.o 38 | 39 | interpose.o: interpose.c interpose.h hemem.h 40 | $(CC) $(CFLAGS) $(INCLUDES) -c interpose.c 41 | 42 | timer.o: timer.c timer.h 43 | $(CC) $(CFLAGS) $(INCLUDES) -c timer.c 44 | 45 | policies/lru.o: policies/lru.c policies/lru.h hemem.h fifo.h spsc-ring.h 46 | $(CC) $(CFLAGS) $(INCLUDES) -D ALLOC_LRU -c policies/lru.c -o policies/lru.o 47 | 48 | policies/simple.o: policies/simple.c policies/simple.h hemem.h spsc-ring.h 49 | $(CC) $(CFLAGS) $(INCLUDES) -c policies/simple.c -o policies/simple.o 50 | 51 | policies/lru_swap.o: policies/lru.c 
policies/lru.h hemem.h fifo.h spsc-ring.h 52 | $(CC) $(CFLAGS) $(INCLUDES) -D LRU_SWAP -D ALLOC_LRU -c policies/lru.c -o policies/lru_swap.o 53 | 54 | policies/paging.o: policies/paging.c policies/paging.h 55 | $(CC) $(CFLAGS) $(INCLUDES) -c policies/paging.c -o policies/paging.o 56 | 57 | pebs.o: pebs.c pebs.h hemem.h fifo.h 58 | $(CC) $(CFLAGS) $(INCLUDES) -c pebs.c 59 | 60 | fifo.o: fifo.c fifo.h hemem.h 61 | $(CC) $(CFLAGS) $(INCLUDES) -c fifo.c 62 | 63 | spsc-ring.o: spsc-ring.c spsc-ring.h 64 | $(CC) $(CFLAGS) $(INCLUDES) -c spsc-ring.c 65 | 66 | clean: 67 | $(RM) *.o *.so policies/*.o 68 | -------------------------------------------------------------------------------- /src/fifo.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "hemem.h" 5 | #include "fifo.h" 6 | 7 | void enqueue_fifo(struct fifo_list *queue, struct hemem_page *entry) 8 | { 9 | pthread_mutex_lock(&(queue->list_lock)); 10 | assert(entry->prev == NULL); 11 | entry->next = queue->first; 12 | if(queue->first != NULL) { 13 | assert(queue->first->prev == NULL); 14 | queue->first->prev = entry; 15 | } else { 16 | assert(queue->last == NULL); 17 | assert(queue->numentries == 0); 18 | queue->last = entry; 19 | } 20 | 21 | queue->first = entry; 22 | entry->list = queue; 23 | queue->numentries++; 24 | pthread_mutex_unlock(&(queue->list_lock)); 25 | } 26 | 27 | struct hemem_page *dequeue_fifo(struct fifo_list *queue) 28 | { 29 | pthread_mutex_lock(&(queue->list_lock)); 30 | struct hemem_page *ret = queue->last; 31 | 32 | if(ret == NULL) { 33 | //assert(queue->numentries == 0); 34 | pthread_mutex_unlock(&(queue->list_lock)); 35 | return ret; 36 | } 37 | 38 | queue->last = ret->prev; 39 | if(queue->last != NULL) { 40 | queue->last->next = NULL; 41 | } else { 42 | queue->first = NULL; 43 | } 44 | 45 | ret->prev = ret->next = NULL; 46 | ret->list = NULL; 47 | assert(queue->numentries > 0); 48 | queue->numentries--; 49 | 
pthread_mutex_unlock(&(queue->list_lock)); 50 | 51 | return ret; 52 | } 53 | 54 | void page_list_remove_page(struct fifo_list *list, struct hemem_page *page) 55 | { 56 | pthread_mutex_lock(&(list->list_lock)); 57 | if (list->first == NULL) { 58 | assert(list->last == NULL); 59 | assert(list->numentries == 0); 60 | pthread_mutex_unlock(&(list->list_lock)); 61 | LOG("page_list_remove_page: list was empty!\n"); 62 | return; 63 | } 64 | 65 | if (list->first == page) { 66 | list->first = page->next; 67 | } 68 | 69 | if (list->last == page) { 70 | list->last = page->prev; 71 | } 72 | 73 | if (page->next != NULL) { 74 | page->next->prev = page->prev; 75 | } 76 | 77 | if (page->prev != NULL) { 78 | page->prev->next = page->next; 79 | } 80 | 81 | assert(list->numentries > 0); 82 | list->numentries--; 83 | page->next = NULL; 84 | page->prev = NULL; 85 | page->list = NULL; 86 | pthread_mutex_unlock(&(list->list_lock)); 87 | } 88 | 89 | struct hemem_page* next_page(struct fifo_list *list, struct hemem_page *page) 90 | { 91 | struct hemem_page* next_page = NULL; 92 | 93 | pthread_mutex_lock(&(list->list_lock)); 94 | if (page == NULL) { 95 | next_page = list->last; 96 | } 97 | else { 98 | next_page = page->prev; 99 | assert(page->list == list); 100 | } 101 | pthread_mutex_unlock(&(list->list_lock)); 102 | 103 | return next_page; 104 | } 105 | -------------------------------------------------------------------------------- /src/fifo.h: -------------------------------------------------------------------------------- 1 | #ifndef HEMEM_FIFO_H 2 | #define HEMEM_FIFO_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "hemem.h" 11 | 12 | struct fifo_list { 13 | struct hemem_page *first, *last; 14 | pthread_mutex_t list_lock; 15 | size_t numentries; 16 | }; 17 | 18 | 19 | void enqueue_fifo(struct fifo_list *list, struct hemem_page *page); 20 | struct hemem_page* dequeue_fifo(struct fifo_list *list); 21 | void page_list_remove_page(struct fifo_list 
*list, struct hemem_page *page); 22 | struct hemem_page* next_page(struct fifo_list *list, struct hemem_page *page); 23 | 24 | #endif 25 | 26 | -------------------------------------------------------------------------------- /src/hemem.h: -------------------------------------------------------------------------------- 1 | #ifndef HEMEM_H 2 | 3 | #define HEMEM_H 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #ifndef __cplusplus 15 | #include 16 | #else 17 | #include 18 | #define _Atomic(X) std::atomic< X > 19 | #endif 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | #ifdef ALLOC_LRU 26 | #include "policies/lru.h" 27 | #endif 28 | 29 | #ifdef ALLOC_SIMPLE 30 | #include "policies/simple.h" 31 | #endif 32 | 33 | #include "pebs.h" 34 | #include "timer.h" 35 | #include "interpose.h" 36 | #include "uthash.h" 37 | #include "fifo.h" 38 | 39 | //#define HEMEM_DEBUG 40 | //#define USE_PEBS 41 | //#define STATS_THREAD 42 | 43 | #define NUM_CHANNS 2 44 | // #define USE_DMA 45 | #define SIZE_PER_DMA_REQUEST (1024*1024) 46 | 47 | #define MEM_BARRIER() __sync_synchronize() 48 | 49 | #define NVMSIZE (3L* (1024L * 1024L * 1024L)) 50 | #define DRAMSIZE (5L* (1024L * 1024L * 1024L)) 51 | #define SMALLALLOCSIZE (1L << 30) 52 | 53 | #define DRAMPATH "/dev/dax0.0" 54 | #define NVMPATH "/dev/dax1.0" 55 | 56 | //#define PAGE_SIZE (1024 * 1024 * 1024) 57 | //#define PAGE_SIZE (2 * (1024 * 1024)) 58 | #define BASEPAGE_SIZE (4UL * 1024UL) 59 | #define HUGEPAGE_SIZE (2UL * 1024UL * 1024UL) 60 | #define GIGAPAGE_SIZE (1024UL * 1024UL * 1024UL) 61 | #define PAGE_SIZE HUGEPAGE_SIZE 62 | 63 | #define FASTMEM_PAGES ((DRAMSIZE) / (PAGE_SIZE)) 64 | #define SLOWMEM_PAGES ((NVMSIZE) / (PAGE_SIZE)) 65 | 66 | #define BASEPAGE_MASK (BASEPAGE_SIZE - 1) 67 | #define HUGEPAGE_MASK (HUGEPAGE_SIZE - 1) 68 | #define GIGAPAGE_MASK (GIGAPAGE_SIZE - 1) 69 | 70 | #define BASE_PFN_MASK (BASEPAGE_MASK ^ UINT64_MAX) 71 | 
#define HUGE_PFN_MASK (HUGEPAGE_MASK ^ UINT64_MAX) 72 | #define GIGA_PFN_MASK (GIGAPAGE_MASK ^ UINT64_MAX) 73 | 74 | #define FAULT_THREAD_CPU (0) 75 | #define STATS_THREAD_CPU (3) 76 | 77 | FILE *hememlogf; 78 | //#define LOG(...) fprintf(stderr, __VA_ARGS__) 79 | //#define LOG(...) fprintf(hememlogf, __VA_ARGS__) 80 | #define LOG(str, ...) while(0) {} 81 | 82 | FILE *timef; 83 | extern bool timing; 84 | 85 | static inline void log_time(const char* fmt, ...) 86 | { 87 | if (timing) { 88 | va_list args; 89 | va_start(args, fmt); 90 | vfprintf(timef, fmt, args); 91 | va_end(args); 92 | } 93 | } 94 | 95 | 96 | //#define LOG_TIME(str, ...) log_time(str, __VA_ARGS__) 97 | //#define LOG_TIME(str, ...) fprintf(timef, str, __VA_ARGS__) 98 | #define LOG_TIME(str, ...) while(0) {} 99 | 100 | FILE *statsf; 101 | #define LOG_STATS(str, ...) fprintf(stderr, str, __VA_ARGS__) 102 | //#define LOG_STATS(str, ...) fprintf(statsf, str, __VA_ARGS__) 103 | //#define LOG_STATS(str, ...) while (0) {} 104 | 105 | #if defined (ALLOC_HEMEM) 106 | #define pagefault(...) pebs_pagefault(__VA_ARGS__) 107 | #define paging_init(...) pebs_init(__VA_ARGS__) 108 | #define mmgr_remove(...) pebs_remove_page(__VA_ARGS__) 109 | #define mmgr_stats(...) pebs_stats(__VA_ARGS__) 110 | #define policy_shutdown(...) pebs_shutdown(__VA_ARGS__) 111 | #elif defined (ALLOC_LRU) 112 | #define pagefault(...) lru_pagefault(__VA_ARGS__) 113 | #define paging_init(...) lru_init(__VA_ARGS__) 114 | #define mmgr_remove(...) lru_remove_page(__VA_ARGS__) 115 | #define mmgr_stats(...) lru_stats(__VA_ARGS__) 116 | #define policy_shutdown(...) while(0) {} 117 | #elif defined (ALLOC_SIMPLE) 118 | #define pagefault(...) simple_pagefault(__VA_ARGS__) 119 | #define paging_init(...) simple_init(__VA_ARGS__) 120 | #define mmgr_remove(...) simple_remove_page(__VA_ARGS__) 121 | #define mmgr_stats(...) simple_stats(__VA_ARGS__) 122 | #define policy_shutdown(...) 
while(0) {} 123 | #endif 124 | 125 | 126 | #define MAX_UFFD_MSGS (1) 127 | #define MAX_COPY_THREADS (4) 128 | 129 | extern uint64_t cr3; 130 | extern int dramfd; 131 | extern int nvmfd; 132 | extern int devmemfd; 133 | extern bool is_init; 134 | extern uint64_t missing_faults_handled; 135 | extern uint64_t migrations_up; 136 | extern uint64_t migrations_down; 137 | extern __thread bool internal_malloc; 138 | extern __thread bool old_internal_call; 139 | extern __thread bool internal_call; 140 | extern __thread bool internal_munmap; 141 | extern void* devmem_mmap; 142 | 143 | enum memtypes { 144 | FASTMEM = 0, 145 | SLOWMEM = 1, 146 | NMEMTYPES, 147 | }; 148 | 149 | enum pagetypes { 150 | HUGEP = 0, 151 | BASEP = 1, 152 | NPAGETYPES 153 | }; 154 | 155 | struct hemem_page { 156 | uint64_t va; 157 | uint64_t devdax_offset; 158 | bool in_dram; 159 | enum pagetypes pt; 160 | volatile bool migrating; 161 | bool present; 162 | bool written; 163 | bool hot; 164 | uint64_t naccesses; 165 | uint64_t migrations_up, migrations_down; 166 | uint64_t local_clock; 167 | bool ring_present; 168 | uint64_t accesses[NPBUFTYPES]; 169 | uint64_t tot_accesses[NPBUFTYPES]; 170 | pthread_mutex_t page_lock; 171 | 172 | UT_hash_handle hh; 173 | struct hemem_page *next, *prev; 174 | struct fifo_list *list; 175 | }; 176 | 177 | static inline uint64_t pt_to_pagesize(enum pagetypes pt) 178 | { 179 | switch(pt) { 180 | case HUGEP: return HUGEPAGE_SIZE; 181 | case BASEP: return BASEPAGE_SIZE; 182 | default: assert(!"Unknown page type"); 183 | } 184 | } 185 | 186 | static inline enum pagetypes pagesize_to_pt(uint64_t pagesize) 187 | { 188 | switch (pagesize) { 189 | case BASEPAGE_SIZE: return BASEP; 190 | case HUGEPAGE_SIZE: return HUGEP; 191 | default: assert(!"Unknown page ssize"); 192 | } 193 | } 194 | 195 | void log_init(); 196 | void hemem_init(); 197 | void hemem_stop(); 198 | void* hemem_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset); 199 | int hemem_munmap(void* 
addr, size_t length); 200 | void *handle_fault(); 201 | void hemem_migrate_up(struct hemem_page *page, uint64_t dram_offset); 202 | void hemem_migrate_down(struct hemem_page *page, uint64_t nvm_offset); 203 | void hemem_wp_page(struct hemem_page *page, bool protect); 204 | void hemem_promote_pages(uint64_t addr); 205 | void hemem_demote_pages(uint64_t addr); 206 | 207 | #ifdef ALLOC_LRU 208 | void hemem_clear_bits(struct hemem_page *page); 209 | uint64_t hemem_get_bits(struct hemem_page *page); 210 | void hemem_tlb_shootdown(uint64_t va); 211 | #endif 212 | 213 | struct hemem_page* get_hemem_page(uint64_t va); 214 | 215 | void hemem_print_stats(); 216 | void hemem_clear_stats(); 217 | 218 | void hemem_start_timing(void); 219 | void hemem_stop_timing(void); 220 | 221 | #ifdef __cplusplus 222 | } 223 | #endif 224 | 225 | #define max(a,b) ({ __typeof__ (a) _a = (a); __typeof__ (b) _b = (b); _a > _b ? _a : _b; }) 226 | #define min(a,b) ({ __typeof__ (a) _a = (a); __typeof__ (b) _b = (b); _a < _b ? 
_a : _b; }) 227 | 228 | #endif /* HEMEM_H */ 229 | -------------------------------------------------------------------------------- /src/interpose.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #define __USE_GNU 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include "hemem.h" 14 | #include "interpose.h" 15 | 16 | void* (*libc_mmap)(void *addr, size_t length, int prot, int flags, int fd, off_t offset) = NULL; 17 | int (*libc_munmap)(void *addr, size_t length) = NULL; 18 | void* (*libc_malloc)(size_t size) = NULL; 19 | void (*libc_free)(void* ptr) = NULL; 20 | 21 | static int mmap_filter(void *addr, size_t length, int prot, int flags, int fd, off_t offset, uint64_t *result) 22 | { 23 | //ensure_init(); 24 | 25 | //TODO: figure out which mmap calls should go to libc vs hemem 26 | // non-anonymous mappings should probably go to libc (e.g., file mappings) 27 | if (((flags & MAP_ANONYMOUS) != MAP_ANONYMOUS) && !((fd == dramfd) || (fd == nvmfd) || (fd == devmemfd))) { 28 | LOG("hemem interpose: calling libc mmap due to non-anonymous, non-devdax mapping: mmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 29 | return 1; 30 | } 31 | 32 | if ((flags & MAP_STACK) == MAP_STACK) { 33 | // pthread mmaps are called with MAP_STACK 34 | LOG("hemem interpose: calling libc mmap due to stack mapping: mmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 35 | return 1; 36 | } 37 | 38 | //if (((flags & MAP_NORESERVE) == MAP_NORESERVE)) { 39 | // thread stack is called without swap space reserved, so we can probably ignore these 40 | //fprintf(stderr, "hemem interpose: calling libc mmap due to non-swap space reserved mapping: mmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 41 | //return 1; 42 | //} 43 | 44 | if ((fd == dramfd) || (fd == nvmfd) 
|| (fd == devmemfd)) { 45 | //LOG("hemem interpose: calling libc mmap due to hemem devdax mapping\n"); 46 | return 1; 47 | } 48 | 49 | if (internal_call) { 50 | LOG("hemem interpose: calling libc mmap due to internal memory call: mmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 51 | return 1; 52 | } 53 | 54 | if (!is_init) { 55 | //LOG("hemem interpose: calling libc mmap due to hemem init in progress\n"); 56 | return 1; 57 | } 58 | 59 | LOG("hemem interpose calling libc mmap due to small allocation size: mmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 60 | if (length < SMALLALLOCSIZE) { 61 | return 1; 62 | } 63 | 64 | LOG("hemem interpose: calling hemem mmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 65 | if ((*result = (uint64_t)hemem_mmap(addr, length, prot, flags, fd, offset)) == (uint64_t)MAP_FAILED) { 66 | // hemem failed for some reason, try libc 67 | LOG("hemem mmap failed\n\tmmap(0x%lx, %ld, %x, %x, %d, %ld)\n", (uint64_t)addr, length, prot, flags, fd, offset); 68 | } 69 | return 0; 70 | } 71 | 72 | 73 | static int munmap_filter(void *addr, size_t length, uint64_t* result) 74 | { 75 | //ensure_init(); 76 | 77 | //TODO: figure out which munmap calls should go to libc vs hemem 78 | 79 | if (internal_call) { 80 | return 1; 81 | } 82 | 83 | if ((*result = hemem_munmap(addr, length)) == -1) { 84 | LOG("hemem munmap failed\n\tmunmap(0x%lx, %ld)\n", (uint64_t)addr, length); 85 | } 86 | return 0; 87 | } 88 | 89 | 90 | static void* bind_symbol(const char *sym) 91 | { 92 | void *ptr; 93 | if ((ptr = dlsym(RTLD_NEXT, sym)) == NULL) { 94 | fprintf(stderr, "hemem memory manager interpose: dlsym failed (%s)\n", sym); 95 | abort(); 96 | } 97 | return ptr; 98 | } 99 | 100 | static int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5, long *result) 101 | { 102 | if (syscall_number == SYS_mmap) { 103 | return 
mmap_filter((void*)arg0, (size_t)arg1, (int)arg2, (int)arg3, (int)arg4, (off_t)arg5, (uint64_t*)result); 104 | } else if (syscall_number == SYS_munmap){ 105 | return munmap_filter((void*)arg0, (size_t)arg1, (uint64_t*)result); 106 | } else { 107 | // ignore non-mmap system calls 108 | return 1; 109 | } 110 | } 111 | 112 | static __attribute__((constructor)) void init(void) 113 | { 114 | log_init(); 115 | 116 | libc_mmap = bind_symbol("mmap"); 117 | libc_munmap = bind_symbol("munmap"); 118 | libc_malloc = bind_symbol("malloc"); 119 | libc_free = bind_symbol("free"); 120 | intercept_hook_point = hook; 121 | 122 | hemem_init(); 123 | } 124 | 125 | static __attribute__((destructor)) void hemem_shutdown(void) 126 | { 127 | hemem_stop(); 128 | } 129 | 130 | /* 131 | void* malloc(size_t size) 132 | { 133 | void* ret; 134 | if(libc_malloc == NULL) { 135 | libc_malloc = bind_symbol("malloc"); 136 | } 137 | assert(libc_malloc != NULL); 138 | ret = libc_malloc(size); 139 | return ret; 140 | } 141 | 142 | void free(void* ptr) 143 | { 144 | if(libc_free == NULL) { 145 | libc_free = bind_symbol("free"); 146 | } 147 | assert(libc_free != NULL); 148 | libc_free(ptr); 149 | } 150 | */ 151 | -------------------------------------------------------------------------------- /src/interpose.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // function pointers to libc functions 4 | extern void* (*libc_mmap)(void *addr, size_t length, int prot, int flags, int fd, off_t offset); 5 | extern int (*libc_munmap)(void *addr, size_t length); 6 | extern void* (*libc_malloc)(size_t size); 7 | extern void (*libc_free)(void* p); 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/pebs.h: -------------------------------------------------------------------------------- 1 | #ifndef HEMEM_PEBS_H 2 | #define HEMEM_PEBS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 
#include "hemem.h" 11 | 12 | #define PEBS_KSWAPD_INTERVAL (10000) // in us (10ms) 13 | #define PEBS_KSWAPD_MIGRATE_RATE (10UL * 1024UL * 1024UL * 1024UL) // 10GB 14 | #define HOT_READ_THRESHOLD (4) 15 | #define HOT_WRITE_THRESHOLD (4) 16 | #define PEBS_COOLING_THRESHOLD (10) 17 | 18 | #define HOT_RING_REQS_THRESHOLD (1024*1024) 19 | #define COLD_RING_REQS_THRESHOLD (128) 20 | #define CAPACITY (16*1024*1024) 21 | #define COOLING_PAGES (8192) 22 | 23 | #define PEBS_NPROCS 4 24 | #define PERF_PAGES (1 + (1 << 14)) // Has to be == 1+2^n 25 | //#define SAMPLE_PERIOD 10007 26 | #define SAMPLE_PERIOD 19997 27 | //#define SAMPLE_FREQ 100 28 | 29 | 30 | #define SCANNING_THREAD_CPU (FAULT_THREAD_CPU + 1) 31 | #define MIGRATION_THREAD_CPU (SCANNING_THREAD_CPU + 1) 32 | 33 | struct perf_sample { 34 | struct perf_event_header header; 35 | __u64 ip; 36 | __u32 pid, tid; /* if PERF_SAMPLE_TID */ 37 | __u64 addr; /* if PERF_SAMPLE_ADDR */ 38 | __u64 weight; /* if PERF_SAMPLE_WEIGHT */ 39 | /* __u64 data_src; /\* if PERF_SAMPLE_DATA_SRC *\/ */ 40 | }; 41 | 42 | enum pbuftype { 43 | DRAMREAD = 0, 44 | NVMREAD = 1, 45 | WRITE = 2, 46 | NPBUFTYPES 47 | }; 48 | 49 | void *pebs_kswapd(); 50 | struct hemem_page* pebs_pagefault(void); 51 | struct hemem_page* pebs_pagefault_unlocked(void); 52 | void pebs_init(void); 53 | void pebs_remove_page(struct hemem_page *page); 54 | void pebs_stats(); 55 | void pebs_shutdown(); 56 | 57 | #endif /* HEMEM_LRU_MODIFIED_H */ 58 | -------------------------------------------------------------------------------- /src/policies/hemem-mmgr.h: -------------------------------------------------------------------------------- 1 | #ifndef HEMEM_MMGR_H 2 | #define HEMEM_MMGR_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "hemem.h" 9 | #include "paging.h" 10 | 11 | #define HEMEM_INTERVAL 10000ULL // in us 12 | 13 | #define HEMEM_FASTFREE (DRAMSIZE / 10) 14 | #define HEMEM_COOL_RATE (10ULL * 1024ULL * 1024ULL * 1024ULL) 15 | #define HEMEM_THAW_RATE 
(NVMSIZE + DRAMSIZE) 16 | 17 | #define FASTMEM_HUGE_PAGES ((DRAMSIZE) / (HUGEPAGE_SIZE)) 18 | #define FASTMEM_BASE_PAGES ((DRAMSIZE) / (BASEPAGE_SIZE)) 19 | 20 | #define SLOWMEM_HUGE_PAGES ((NVMSIZE) / (HUGEPAGE_SIZE)) 21 | #define SLOWMEM_BASE_PAGES ((NVMSIZE) / (BASEPAGE_SIZE)) 22 | 23 | struct mmgr_node { 24 | struct hemem_page *page; 25 | uint64_t accesses, tot_accesses; 26 | uint64_t offset; 27 | struct mmgr_node *next, *prev; 28 | struct mmgr_list *list; 29 | }; 30 | 31 | struct mmgr_list { 32 | struct mmgr_node *first; 33 | struct mmgr_node *last; 34 | size_t numentries; 35 | pthread_mutex_t list_lock; 36 | }; 37 | 38 | void *mmgr_kswapd(void); 39 | struct hemem_page* hemem_mmgr_pagefault(); 40 | struct hemem_page* hemem_mmgr_pagefault_unlocked(); 41 | void hemem_mmgr_init(void); 42 | void hemem_mmgr_remove_page(struct hemem_page *page); 43 | void hemem_mmgr_stats(); 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/policies/lru.h: -------------------------------------------------------------------------------- 1 | #ifndef HEMEM_LRU_H 2 | #define HEMEM_LRU_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "../hemem.h" 9 | #include "paging.h" 10 | 11 | 12 | #define KSCAND_INTERVAL (50000) // in us (20ms) 13 | #define KSWAPD_INTERVAL (1000000) // in us (1s) 14 | #define KSWAPD_MIGRATE_RATE (50UL * 1024UL * 1024UL * 1024UL) // 50GB 15 | 16 | void *lru_kswapd(); 17 | struct hemem_page* lru_pagefault(void); 18 | struct hemem_page* lru_pagefault_unlocked(void); 19 | void lru_init(void); 20 | void lru_remove_page(struct hemem_page *page); 21 | void lru_stats(); 22 | 23 | 24 | #endif /* HEMEM_LRU_MODIFIED_H */ 25 | -------------------------------------------------------------------------------- /src/policies/paging.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 
#include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../hemem.h" 22 | #include "../timer.h" 23 | #include "paging.h" 24 | #include "../interpose.h" 25 | 26 | #if 0 27 | 28 | uint64_t* va_to_pa(uint64_t va) 29 | { 30 | uint64_t pt_base = ((uint64_t)(cr3 & ADDRESS_MASK)); 31 | uint64_t *pgd; 32 | uint64_t *pud; 33 | uint64_t *pmd; 34 | uint64_t *pte; 35 | uint64_t pgd_offset; 36 | uint64_t pud_offset; 37 | uint64_t pmd_offset; 38 | uint64_t pte_offset; 39 | uint64_t pgd_entry; 40 | uint64_t pud_entry; 41 | uint64_t pmd_entry; 42 | uint64_t pte_entry; 43 | 44 | pgd = (devmem_mmap + pt_base); 45 | pgd_offset = (((va) >> HEMEM_PGDIR_SHIFT) & (HEMEM_PTRS_PER_PGD - 1)); 46 | assert(pgd_offset < PAGE_SIZE); 47 | pgd_entry = *(pgd + pgd_offset); 48 | LOG("pgd_entry: %lx\n", pgd_entry); 49 | if (!((pgd_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 50 | LOG("hemem_va_to_pa: pgd not present: %016lx\n", pgd_entry); 51 | assert(0); 52 | } 53 | 54 | pud = (uint64_t*)(pgd_entry & ADDRESS_MASK); 55 | pud_offset = (((va) >> HEMEM_PUD_SHIFT) & (HEMEM_PTRS_PER_PUD - 1)); 56 | assert(pud_offset < PAGE_SIZE); 57 | pud_entry = *(pud + pud_offset); 58 | LOG("pud_entry: %lx\n", pud_entry); 59 | if (!((pud_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 60 | LOG("hemem_va_to_pa: pud not present: %016lx\n", pud_entry); 61 | assert(0); 62 | } 63 | 64 | pmd = (uint64_t*)(pud_entry & ADDRESS_MASK); 65 | pmd_offset = (((va) >> HEMEM_PMD_SHIFT) & (HEMEM_PTRS_PER_PMD - 1)); 66 | assert(pmd_offset < PAGE_SIZE); 67 | pmd_entry = *(pmd + pmd_offset); 68 | LOG("pmd_entry: %lx\n", pmd_entry); 69 | if (!((pmd_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 70 | LOG("hemem_va_to_pa: pmd not present: %016lx\n", pmd_entry); 71 | assert(0); 72 | } 73 | 74 | if ((pmd_entry & HEMEM_HUGEPAGE_FLAG) == HEMEM_HUGEPAGE_FLAG) { 75 | return pmd + pmd_offset; 76 | } 77 | 
78 | pte = (uint64_t*)(pmd_entry & ADDRESS_MASK); 79 | pte_offset = (((va) >> HEMEM_PAGE_SHIFT) & (HEMEM_PTRS_PER_PTE - 1)); 80 | assert(pte_offset < PAGE_SIZE); 81 | pte_entry = *(pte + pte_offset); 82 | LOG("pte_entry: %lx\n", pte_entry); 83 | if (!((pte_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 84 | LOG("hemem_va_to_pa: pte not present: %016lx\n", pte_entry); 85 | assert(0); 86 | } 87 | 88 | return pte + pte_offset; 89 | } 90 | 91 | void clear_bit(uint64_t va, uint64_t bit) 92 | { 93 | uint64_t pt_base = ((uint64_t)(cr3 & ADDRESS_MASK)); 94 | uint64_t *pgd; 95 | uint64_t *pud; 96 | uint64_t *pmd; 97 | uint64_t *pte; 98 | uint64_t pgd_offset; 99 | uint64_t pud_offset; 100 | uint64_t pmd_offset; 101 | uint64_t pte_offset; 102 | uint64_t *pgd_entry; 103 | uint64_t *pud_entry; 104 | uint64_t *pmd_entry; 105 | uint64_t *pte_entry; 106 | 107 | pgd = (devmem_mmap + pt_base); 108 | pgd_offset = (((va) >> HEMEM_PGDIR_SHIFT) & (HEMEM_PTRS_PER_PGD - 1)); 109 | assert(pgd_offset < PAGE_SIZE); 110 | pgd_entry = (pgd + pgd_offset); 111 | LOG("pgd_entry: %lx\n", *pgd_entry); 112 | if (!((*pgd_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 113 | LOG("clear_bit: pgd not present: %016lx\n", *pgd_entry); 114 | //assert(0); 115 | return; 116 | } 117 | 118 | pud = (uint64_t*)(*pgd_entry & ADDRESS_MASK); 119 | pud_offset = (((va) >> HEMEM_PUD_SHIFT) & (HEMEM_PTRS_PER_PUD - 1)); 120 | assert(pud_offset < PAGE_SIZE); 121 | pud_entry = (pud + pud_offset); 122 | LOG("pud_entry: %lx\n", *pud_entry); 123 | if (!((*pud_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 124 | LOG("clear_bit: pud not present: %016lx\n", *pud_entry); 125 | //assert(0); 126 | return; 127 | } 128 | 129 | pmd = (uint64_t*)(*pud_entry & ADDRESS_MASK); 130 | pmd_offset = (((va) >> HEMEM_PMD_SHIFT) & (HEMEM_PTRS_PER_PMD - 1)); 131 | assert(pmd_offset < PAGE_SIZE); 132 | pmd_entry = (pmd + pmd_offset); 133 | LOG("pmd_entry: %lx\n", *pmd_entry); 134 | if (!((*pmd_entry & HEMEM_PRESENT_FLAG) == 
HEMEM_PRESENT_FLAG)) { 135 | LOG("clear_bit: pmd not present: %016lx\n", *pmd_entry); 136 | //assert(0); 137 | return; 138 | } 139 | 140 | if ((*pmd_entry & HEMEM_HUGEPAGE_FLAG) == HEMEM_HUGEPAGE_FLAG) { 141 | *pmd_entry = *pmd_entry & ~bit; 142 | return; 143 | } 144 | 145 | pte = (uint64_t*)(*pmd_entry & ADDRESS_MASK); 146 | pte_offset = (((va) >> HEMEM_PAGE_SHIFT) & (HEMEM_PTRS_PER_PTE - 1)); 147 | assert(pte_offset < PAGE_SIZE); 148 | pte_entry = (pte + pte_offset); 149 | LOG("pte_entry: %lx\n", *pte_entry); 150 | if (!((*pte_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 151 | LOG("clear_bit: pte not present: %016lx\n", *pte_entry); 152 | //assert(0); 153 | return; 154 | } 155 | 156 | *pte_entry = *pte_entry & ~bit; 157 | } 158 | 159 | uint64_t get_bit(uint64_t va, uint64_t bit) 160 | { 161 | uint64_t pt_base = ((uint64_t)(cr3 & ADDRESS_MASK)); 162 | uint64_t *pgd; 163 | uint64_t *pud; 164 | uint64_t *pmd; 165 | uint64_t *pte; 166 | uint64_t pgd_offset; 167 | uint64_t pud_offset; 168 | uint64_t pmd_offset; 169 | uint64_t pte_offset; 170 | uint64_t *pgd_entry; 171 | uint64_t *pud_entry; 172 | uint64_t *pmd_entry; 173 | uint64_t *pte_entry; 174 | 175 | pgd = (devmem_mmap + pt_base); 176 | pgd_offset = (((va) >> HEMEM_PGDIR_SHIFT) & (HEMEM_PTRS_PER_PGD - 1)); 177 | assert(pgd_offset < PAGE_SIZE); 178 | pgd_entry = (pgd + pgd_offset); 179 | LOG("pgd_entry: %lx\n", *pgd_entry); 180 | if (!((*pgd_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 181 | LOG("set_bit: pgd not present: %016lx\n", *pgd_entry); 182 | //assert(0); 183 | return 0; 184 | } 185 | 186 | pud = (uint64_t*)(*pgd_entry & ADDRESS_MASK); 187 | pud_offset = (((va) >> HEMEM_PUD_SHIFT) & (HEMEM_PTRS_PER_PUD - 1)); 188 | assert(pud_offset < PAGE_SIZE); 189 | pud_entry = (pud + pud_offset); 190 | LOG("pud_entry: %lx\n", *pud_entry); 191 | if (!((*pud_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 192 | LOG("set_bit: pud not present: %016lx\n", *pud_entry); 193 | //assert(0); 194 | return 
0; 195 | } 196 | 197 | pmd = (uint64_t*)(*pud_entry & ADDRESS_MASK); 198 | pmd_offset = (((va) >> HEMEM_PMD_SHIFT) & (HEMEM_PTRS_PER_PMD - 1)); 199 | assert(pmd_offset < PAGE_SIZE); 200 | pmd_entry = (pmd + pmd_offset); 201 | LOG("pmd_entry: %lx\n", *pmd_entry); 202 | if (!((*pmd_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 203 | LOG("set_bit: pmd not present: %016lx\n", *pmd_entry); 204 | //assert(0); 205 | return 0; 206 | } 207 | 208 | if ((*pmd_entry & HEMEM_HUGEPAGE_FLAG) == HEMEM_HUGEPAGE_FLAG) { 209 | return *pmd_entry & bit; 210 | } 211 | 212 | pte = (uint64_t*)(*pmd_entry & ADDRESS_MASK); 213 | pte_offset = (((va) >> HEMEM_PAGE_SHIFT) & (HEMEM_PTRS_PER_PTE - 1)); 214 | assert(pte_offset < PAGE_SIZE); 215 | pte_entry = (pte + pte_offset); 216 | LOG("pte_entry: %lx\n", *pte_entry); 217 | if (!((*pte_entry & HEMEM_PRESENT_FLAG) == HEMEM_PRESENT_FLAG)) { 218 | LOG("set_bit: pte not present: %016lx\n", *pte_entry); 219 | //assert(0); 220 | return 0; 221 | } 222 | 223 | return *pte_entry & bit; 224 | } 225 | 226 | 227 | void clear_accessed_bit(uint64_t va) 228 | { 229 | clear_bit(va, HEMEM_ACCESSED_FLAG); 230 | } 231 | 232 | 233 | uint64_t get_accessed_bit(uint64_t va) 234 | { 235 | return get_bit(va, HEMEM_ACCESSED_FLAG); 236 | } 237 | 238 | 239 | void clear_dirty_bit(uint64_t va) 240 | { 241 | clear_bit(va, HEMEM_DIRTY_FLAG); 242 | } 243 | 244 | 245 | uint64_t get_dirty_bit(uint64_t va) 246 | { 247 | return get_bit(va, HEMEM_DIRTY_FLAG); 248 | } 249 | 250 | #endif 251 | FILE *ptes, *pdes, *pdtpes, *pml4es, *valid; 252 | 253 | 254 | void scan_fourth_level(uint64_t pde, bool clear_flag, uint64_t flag) 255 | { 256 | uint64_t *ptable4_ptr; 257 | uint64_t *pte_ptr; 258 | uint64_t pte; 259 | 260 | ptable4_ptr = libc_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, devmemfd, pde & ADDRESS_MASK); 261 | if (ptable4_ptr == MAP_FAILED) { 262 | perror("third level page table mmap"); 263 | assert(0); 264 | } 265 | 266 | pte_ptr = 
(uint64_t*)ptable4_ptr; 267 | for (int i = 0; i < 512; i++) { 268 | pte = *pte_ptr; 269 | fprintf(ptes, "%016lx\n", pte); 270 | 271 | if (((pte & FLAGS_MASK) & HEMEM_PAGE_WALK_FLAGS) == HEMEM_PAGE_WALK_FLAGS) { 272 | if (((pte & FLAGS_MASK) & HEMEM_PWTPCD_FLAGS) == 0) { 273 | fprintf(valid, "pte[%x]: %016lx\n", i, pte); 274 | 275 | if (clear_flag) { 276 | pte = pte & ~flag; 277 | } 278 | } 279 | } 280 | 281 | pte_ptr++; 282 | } 283 | 284 | munmap(ptable4_ptr, PAGE_SIZE); 285 | } 286 | 287 | 288 | void scan_third_level(uint64_t pdtpe, bool clear_flag, uint64_t flag) 289 | { 290 | uint64_t *ptable3_ptr; 291 | uint64_t *pde_ptr; 292 | uint64_t pde; 293 | 294 | ptable3_ptr = libc_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, devmemfd, pdtpe & ADDRESS_MASK); 295 | if (ptable3_ptr == MAP_FAILED) { 296 | perror("third level page table mmap"); 297 | assert(0); 298 | } 299 | 300 | pde_ptr = (uint64_t*)ptable3_ptr; 301 | for (int i = 0; i < 512; i++) { 302 | pde = *pde_ptr; 303 | fprintf(pdes, "%016lx\n", pde); 304 | 305 | if (((pde & FLAGS_MASK) & HEMEM_PAGE_WALK_FLAGS) == HEMEM_PAGE_WALK_FLAGS) { 306 | if (((pde & FLAGS_MASK) & HEMEM_PWTPCD_FLAGS) == 0) { 307 | fprintf(valid, "pde[%x]: %016lx\n", i, pde); 308 | scan_fourth_level(pde, clear_flag, flag); 309 | } 310 | } 311 | 312 | pde_ptr++; 313 | } 314 | 315 | munmap(ptable3_ptr, PAGE_SIZE); 316 | } 317 | 318 | 319 | void scan_second_level(uint64_t pml4e, bool clear_flag, uint64_t flag) 320 | { 321 | uint64_t *ptable2_ptr; 322 | uint64_t *pdtpe_ptr; 323 | uint64_t pdtpe; 324 | 325 | ptable2_ptr = libc_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, devmemfd, pml4e & ADDRESS_MASK); 326 | if (ptable2_ptr == MAP_FAILED) { 327 | perror("second level page table mmap"); 328 | assert(0); 329 | } 330 | 331 | pdtpe_ptr = (uint64_t*)ptable2_ptr; 332 | for (int i = 0; i < 512; i++) { 333 | pdtpe = *pdtpe_ptr; 334 | fprintf(pdtpes, "%016lx\n", pdtpe); 335 | 336 | if (((pdtpe & 
FLAGS_MASK) & HEMEM_PAGE_WALK_FLAGS) == HEMEM_PAGE_WALK_FLAGS) { 337 | if (((pdtpe & FLAGS_MASK) & HEMEM_PWTPCD_FLAGS) == 0) { 338 | fprintf(valid, "pdtpe[%x]: %016lx\n", i, pdtpe); 339 | scan_third_level(pdtpe, clear_flag, flag); 340 | } 341 | } 342 | 343 | pdtpe_ptr++; 344 | } 345 | 346 | munmap(ptable2_ptr, PAGE_SIZE); 347 | } 348 | 349 | 350 | void _scan_pagetable(bool clear_flag, uint64_t flag) 351 | { 352 | int *rootptr; 353 | uint64_t *pml4e_ptr; 354 | uint64_t pml4e; 355 | 356 | pml4es = fopen("logs/pml4es.txt", "w+"); 357 | if (pml4es == NULL) { 358 | perror("pml4e file open"); 359 | assert(0); 360 | } 361 | 362 | pdtpes = fopen("logs/pdtpes.txt", "w+"); 363 | if (pdtpes == NULL) { 364 | perror("pdtpes open"); 365 | assert(0); 366 | } 367 | 368 | pdes = fopen("logs/pdes.txt", "w+"); 369 | if (pdes == NULL) { 370 | perror("pdes open"); 371 | assert(0); 372 | } 373 | 374 | ptes = fopen("logs/ptes.txt", "w+"); 375 | if (ptes == NULL) { 376 | perror("ptes open"); 377 | assert(0); 378 | } 379 | 380 | valid = fopen("logs/valid.txt", "w+"); 381 | if (valid == NULL) { 382 | perror("valid open"); 383 | assert(0); 384 | } 385 | 386 | rootptr = libc_mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, devmemfd, cr3 & ADDRESS_MASK); 387 | if (rootptr == MAP_FAILED) { 388 | perror("/dev/mem mmap"); 389 | assert(0); 390 | } 391 | 392 | pml4e_ptr = (uint64_t*)rootptr; 393 | for (int i = 0; i < 512; i++) { 394 | pml4e = *pml4e_ptr; 395 | fprintf(pml4es, "%016lx\n", pml4e); 396 | 397 | if (((pml4e & FLAGS_MASK) & HEMEM_PAGE_WALK_FLAGS) == HEMEM_PAGE_WALK_FLAGS) { 398 | if (((pml4e & FLAGS_MASK) & HEMEM_PWTPCD_FLAGS) == 0) { 399 | fprintf(valid, "pml4e[%x]: %016lx\n", i, pml4e); 400 | scan_second_level(pml4e, clear_flag, flag); 401 | } 402 | } 403 | pml4e_ptr++; 404 | } 405 | 406 | munmap(rootptr, PAGE_SIZE); 407 | } 408 | 409 | void scan_pagetable() 410 | { 411 | _scan_pagetable(false, 0); 412 | } 413 | 414 | #ifdef EXAMINE_PGTABLES 415 | void 
*examine_pagetables() 416 | { 417 | FILE *maps; 418 | int pagemaps; 419 | FILE *kpageflags; 420 | char *line = NULL; 421 | ssize_t nread; 422 | size_t len; 423 | uint64_t vm_start, vm_end; 424 | int n, num_pages; 425 | long index; 426 | off_t o; 427 | ssize_t t; 428 | struct pagemapEntry entry; 429 | int maps_copy; 430 | ssize_t nwritten; 431 | FILE *pfn_file; 432 | uint64_t num_pfn = 0; 433 | 434 | maps = fopen("/proc/self/maps", "r"); 435 | if (maps == NULL) { 436 | perror("/proc/self/maps fopen"); 437 | assert(0); 438 | } 439 | 440 | pagemaps = open("/proc/self/pagemap", O_RDONLY); 441 | if (pagemaps == -1) { 442 | perror("/proc/self/pagemap fopen"); 443 | ignore_this_mamp = true; 444 | assert(0); 445 | } 446 | 447 | maps_copy = open("logs/maps.txt", O_CREAT | O_RDWR); 448 | if (maps_copy == -1) { 449 | perror("map.txt open"); 450 | assert(0); 451 | } 452 | 453 | kpageflags = fopen("/proc/kpageflags", "r"); 454 | if (kpageflags == NULL) { 455 | perror("/proc/kpageflags fopen"); 456 | assert(0); 457 | } 458 | 459 | pfn_file = fopen("logs/pfn.txt", "w+"); 460 | if (pfn_file == NULL) { 461 | perror("pfn.txt open"); 462 | assert(0); 463 | } 464 | 465 | nread = getline(&line, &len, maps); 466 | while (nread != -1) { 467 | nwritten = write(maps_copy, line, nread); 468 | if (nwritten < 0) { 469 | perror("maps_copy write"); 470 | assert(0); 471 | } 472 | if (strstr(line, DRAMPATH) != NULL) { 473 | n = sscanf(line, "%lX-%lX", &vm_start, &vm_end); 474 | if (n != 2) { 475 | fprintf(stderr, "error, invalid line: %s\n", line); 476 | assert(0); 477 | } 478 | 479 | num_pages = (vm_end - vm_start) / PAGE_SIZE; 480 | if (num_pages > 0) { 481 | index = (vm_start / PAGE_SIZE) * sizeof(uint64_t); 482 | 483 | o = lseek(pagemaps, index, SEEK_SET); 484 | if (o != index) { 485 | perror("pagemaps lseek"); 486 | assert(0); 487 | } 488 | 489 | while (num_pages > 0) { 490 | uint64_t pfn; 491 | t = read(pagemaps, &pfn, sizeof(uint64_t)); 492 | if (t < 0) { 493 | perror("pagemaps read"); 494 
| assert(0); 495 | } 496 | 497 | entry.pfn = pfn & 0x7ffffffffffff; 498 | entry.soft_dirty = (pfn >> 55) & 1; 499 | entry.exclusive = (pfn >> 56) & 1; 500 | entry.file_page = (pfn >> 61) & 1; 501 | entry.swapped = (pfn >> 62) & 1; 502 | entry.present = (pfn >> 63) & 1; 503 | 504 | fprintf(pfn_file, "DRAM: %016lX\n", (entry.pfn * sysconf(_SC_PAGESIZE))); 505 | num_pages--; 506 | num_pfn++; 507 | } 508 | } 509 | } 510 | else if (strstr(line, NVMPATH) != NULL) { 511 | n = sscanf(line, "%lX-%lX", &vm_start, &vm_end); 512 | if (n != 2) { 513 | fprintf(stderr, "error, invalid line: %s\n", line); 514 | assert(0); 515 | } 516 | 517 | num_pages = (vm_end - vm_start) / PAGE_SIZE; 518 | if (num_pages > 0) { 519 | index = (vm_start / PAGE_SIZE) * sizeof(uint64_t); 520 | 521 | o = lseek(pagemaps, index, SEEK_SET); 522 | if (o != index) { 523 | perror("pagemaps lseek"); 524 | assert(0); 525 | } 526 | 527 | while (num_pages > 0) { 528 | uint64_t pfn; 529 | t = read(pagemaps, &pfn, sizeof(uint64_t)); 530 | if (t < 0) { 531 | perror("pagemaps read"); 532 | assert(0); 533 | } 534 | 535 | entry.pfn = pfn & 0x7ffffffffffff; 536 | entry.soft_dirty = (pfn >> 55) & 1; 537 | entry.exclusive = (pfn >> 56) & 1; 538 | entry.file_page = (pfn >> 61) & 1; 539 | entry.swapped = (pfn >> 62) & 1; 540 | entry.present = (pfn >> 63) & 1; 541 | 542 | fprintf(pfn_file, "NVM: %016lX\n", (entry.pfn * sysconf(_SC_PAGE_SIZE))); 543 | num_pages--; 544 | num_pfn++; 545 | } 546 | } 547 | } 548 | nread = getline(&line, &len, maps); 549 | } 550 | 551 | fclose(maps); 552 | close(pagemaps); 553 | fclose(kpageflags); 554 | close(maps_copy); 555 | fclose(pfn_file); 556 | 557 | return 0; 558 | } 559 | #endif 560 | 561 | -------------------------------------------------------------------------------- /src/policies/paging.h: -------------------------------------------------------------------------------- 1 | #ifndef HEMEM_PAGING_H 2 | #define HEMEM_PAGING_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 
"../hemem.h" 9 | 10 | 11 | #define ADDRESS_MASK ((uint64_t)0x00000ffffffff000UL) 12 | #define FLAGS_MASK ((uint64_t)0x0000000000000fffUL) 13 | 14 | #define HEMEM_PRESENT_FLAG ((uint64_t)0x0000000000000001UL) 15 | #define HEMEM_WRITE_FLAG ((uint64_t)0x0000000000000002UL) 16 | #define HEMEM_USER_FLAG ((uint64_t)0x0000000000000004UL) 17 | #define HEMEM_PWT_FLAG ((uint64_t)0x0000000000000008UL) 18 | #define HEMEM_PCD_FLAG ((uint64_t)0x0000000000000010UL) 19 | #define HEMEM_ACCESSED_FLAG ((uint64_t)0x0000000000000020UL) 20 | #define HEMEM_DIRTY_FLAG ((uint64_t)0x0000000000000040UL) 21 | #define HEMEM_HUGEPAGE_FLAG ((uint64_t)0x0000000000000080UL) 22 | 23 | 24 | #define HEMEM_PAGE_WALK_FLAGS (HEMEM_PRESENT_FLAG | \ 25 | HEMEM_WRITE_FLAG | \ 26 | HEMEM_USER_FLAG | \ 27 | HEMEM_ACCESSED_FLAG | \ 28 | HEMEM_DIRTY_FLAG) 29 | 30 | #define HEMEM_PWTPCD_FLAGS (HEMEM_PWT_FLAG | HEMEM_PCD_FLAG) 31 | 32 | #define HEMEM_PGDIR_SHIFT 39 33 | #define HEMEM_PTRS_PER_PGD 512 34 | #define HEMEM_PUD_SHIFT 30 35 | #define HEMEM_PTRS_PER_PUD 512 36 | #define HEMEM_PMD_SHIFT 21 37 | #define HEMEM_PTRS_PER_PMD 512 38 | #define HEMEM_PAGE_SHIFT 12 39 | #define HEMEM_PTRS_PER_PTE 512 40 | 41 | //#define EXAMINE_PGTABLES 42 | 43 | 44 | void scan_pagetable(); 45 | void _scan_pagetable(bool clear_flag, uint64_t flag); 46 | 47 | //void clear_accessed_bit(uint64_t pa); 48 | //uint64_t get_accessed_bit(uint64_t pa); 49 | //void clear_dirty_bit(uint64_t pa); 50 | //uint64_t get_dirty_bit(uint64_t pa); 51 | // 52 | //uint64_t* va_to_pa(uint64_t va); 53 | 54 | #ifdef EXAMINE_PGTABLES 55 | 56 | struct pagemapEntry { 57 | uint64_t pfn : 54; 58 | unsigned int soft_dirty : 1; 59 | unsigned int exclusive : 1; 60 | unsigned int file_page : 1; 61 | unsigned int swapped : 1; 62 | unsigned int present : 1; 63 | }; 64 | 65 | void *examine_pagetables(); 66 | 67 | #endif /*EXAMINE_PGTABLES*/ 68 | 69 | #endif /* HEMEM_PAGING_H */ 70 | 71 | 
-------------------------------------------------------------------------------- /src/policies/simple.c: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: simple.c 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 02/04/2020 09:58:58 AM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include "../hemem.h" 28 | #include "paging.h" 29 | #include "../timer.h" 30 | #include "../fifo.h" 31 | 32 | uint64_t fastmem = 0; 33 | uint64_t slowmem = 0; 34 | bool slowmem_switch = false; 35 | 36 | static struct fifo_list dram_free, nvm_free; 37 | 38 | void simple_remove_page(struct hemem_page *page) 39 | { 40 | if (page->in_dram) { 41 | page->present = false; 42 | enqueue_fifo(&dram_free, page); 43 | fastmem -= PAGE_SIZE; 44 | } 45 | else { 46 | page->present = false; 47 | enqueue_fifo(&nvm_free, page); 48 | slowmem -= PAGE_SIZE; 49 | } 50 | } 51 | 52 | struct hemem_page* simple_pagefault(void) 53 | { 54 | struct timeval start, end; 55 | struct hemem_page *page; 56 | 57 | gettimeofday(&start, NULL); 58 | 59 | page = dequeue_fifo(&dram_free); 60 | if (page != NULL) { 61 | assert(!page->present); 62 | page->present = true; 63 | fastmem += PAGE_SIZE; 64 | } 65 | else { 66 | assert(slowmem < NVMSIZE); 67 | page = dequeue_fifo(&nvm_free); 68 | 69 | assert(page != NULL); 70 | assert(!page->present); 71 | 72 | page->present = true; 73 | slowmem += PAGE_SIZE; 74 | } 75 | gettimeofday(&end, NULL); 76 | LOG_TIME("mem_policy_allocate_page: %f s\n", elapsed(&start, &end)); 77 | 78 | return page; 79 | } 80 | 81 | void simple_init(void) 82 | { 83 | 
pthread_mutex_init(&(dram_free.list_lock), NULL); 84 | for (int i = 0; i < DRAMSIZE / PAGE_SIZE; i++) { 85 | struct hemem_page *p = calloc(1, sizeof(struct hemem_page)); 86 | p->devdax_offset = i * PAGE_SIZE; 87 | p->present = false; 88 | p->in_dram = true; 89 | p->pt = pagesize_to_pt(PAGE_SIZE); 90 | pthread_mutex_init(&(p->page_lock), NULL); 91 | enqueue_fifo(&dram_free, p); 92 | } 93 | 94 | pthread_mutex_init(&(nvm_free.list_lock), NULL); 95 | for (int i = 0; i < NVMSIZE / PAGE_SIZE; i++) { 96 | struct hemem_page *p = calloc(1, sizeof(struct hemem_page)); 97 | p->devdax_offset = i * PAGE_SIZE; 98 | p->present = false; 99 | p->in_dram = false; 100 | p->pt = pagesize_to_pt(PAGE_SIZE); 101 | pthread_mutex_init(&(p->page_lock), NULL); 102 | enqueue_fifo(&nvm_free, p); 103 | } 104 | LOG("Memory management policy is simple\n"); 105 | } 106 | 107 | void simple_stats() 108 | { 109 | LOG_STATS("\tfastmem_allocated: [%ld]\tslowmem_allocated: [%ld]\n", fastmem, slowmem); 110 | } 111 | -------------------------------------------------------------------------------- /src/policies/simple.h: -------------------------------------------------------------------------------- 1 | /* 2 | * ===================================================================================== 3 | * 4 | * Filename: simple.h 5 | * 6 | * Description: 7 | * 8 | * Version: 1.0 9 | * Created: 02/04/2020 09:56:26 AM 10 | * Revision: none 11 | * Compiler: gcc 12 | * 13 | * Author: YOUR NAME (), 14 | * Organization: 15 | * 16 | * ===================================================================================== 17 | */ 18 | #ifndef HEMEM_SIMPLE_H 19 | #define HEMEM_SIMPLE_H 20 | 21 | #include 22 | #include 23 | 24 | #include "../hemem.h" 25 | #include "paging.h" 26 | 27 | struct hemem_page* simple_pagefault(void); 28 | void simple_init(void); 29 | void simple_remove_page(struct hemem_page *page); 30 | void simple_stats(); 31 | 32 | #endif // HEMEM_SIMPLE_H 33 | 
/* -------------------------------------------------------------------------- */
/* src/spsc-ring.c / spsc-ring.h -- ring buffer of uint64_t* slots.           */
/*                                                                            */
/* head is the next slot to write, tail the next slot to read.  One slot is   */
/* always kept empty to distinguish "full" from "empty", so a buffer created  */
/* with N slots stores at most N-1 elements.  ring_buf_put() overwrites the   */
/* oldest element when full; ring_buf_put2() refuses instead.                 */
/* -------------------------------------------------------------------------- */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdlib.h>
#include <assert.h>

/* ---- public interface (spsc-ring.h) ---- */
#ifndef SPSC_RING_H
#define SPSC_RING_H

typedef struct ring_buf_t ring_buf_t;

typedef ring_buf_t* ring_handle_t;

/* Create a ring over a caller-owned array of `size` slots (caller keeps
 * ownership of the array and of the pointed-to elements). */
ring_handle_t ring_buf_init(uint64_t** buffer, size_t size);
void ring_buf_free(ring_handle_t rbuf);
void ring_buf_reset(ring_handle_t rbuf);
/* Insert, overwriting the oldest element when full. */
void ring_buf_put(ring_handle_t rbuf, uint64_t* data);
/* Insert only if space is available; returns 0 on success, -1 when full. */
int ring_buf_put2(ring_handle_t rbuf, uint64_t* data);
/* Remove and return the oldest element, or NULL when empty. */
uint64_t* ring_buf_get(ring_handle_t rbuf);
bool ring_buf_empty(ring_handle_t rbuf);
bool ring_buf_full(ring_handle_t rbuf);
/* Number of slots passed at init time (usable capacity is one less). */
size_t ring_buf_capacity(ring_handle_t rbuf);
/* Number of elements currently stored. */
size_t ring_buf_size(ring_handle_t rbuf);

#endif //SPSC_RING_H

/* ---- implementation (spsc-ring.c) ---- */

struct ring_buf_t {
  uint64_t** buffer;  /* caller-owned slot array */
  size_t head;        /* next slot to write */
  size_t tail;        /* next slot to read */
  size_t capacity;    /* total slots; at most capacity-1 are occupied */
};

/* Advance head one slot; if the buffer is full, also advance tail so the
 * oldest element is dropped (overwrite semantics of ring_buf_put). */
static void advance_pointer(ring_handle_t rbuf)
{
  assert(rbuf);

  if (ring_buf_full(rbuf)) {
    if (++(rbuf->tail) == rbuf->capacity) {
      rbuf->tail = 0;
    }
  }

  if (++(rbuf->head) == rbuf->capacity) {
    rbuf->head = 0;
  }
}

/* Advance tail one slot after a successful read. */
static void retreat_pointer(ring_handle_t rbuf)
{
  assert(rbuf);

  if (++(rbuf->tail) == rbuf->capacity) {
    rbuf->tail = 0;
  }
}

ring_handle_t ring_buf_init(uint64_t** buffer, size_t size)
{
  assert(buffer && size);

  ring_handle_t rbuf = malloc(sizeof(ring_buf_t));
  assert(rbuf);

  rbuf->buffer = buffer;
  rbuf->capacity = size;
  ring_buf_reset(rbuf);

  assert(ring_buf_empty(rbuf));

  return rbuf;
}

void ring_buf_free(ring_handle_t rbuf)
{
  assert(rbuf);
  free(rbuf);  /* the slot array itself remains owned by the caller */
}

void ring_buf_reset(ring_handle_t rbuf)
{
  assert(rbuf);

  rbuf->head = 0;
  rbuf->tail = 0;
}

size_t ring_buf_size(ring_handle_t rbuf)
{
  assert(rbuf);

  if (ring_buf_full(rbuf)) {
    /* BUGFIX: one slot is always kept empty in this head/tail scheme, so a
     * full buffer holds capacity-1 elements -- the old code reported
     * capacity, one more than ring_buf_get() could ever return. */
    return rbuf->capacity - 1;
  }
  if (rbuf->head >= rbuf->tail) {
    return rbuf->head - rbuf->tail;
  }
  return rbuf->capacity + rbuf->head - rbuf->tail;
}

size_t ring_buf_capacity(ring_handle_t rbuf)
{
  assert(rbuf);

  return rbuf->capacity;
}

void ring_buf_put(ring_handle_t rbuf, uint64_t* data)
{
  assert(rbuf && rbuf->buffer);

  rbuf->buffer[rbuf->head] = data;

  advance_pointer(rbuf);
}

int ring_buf_put2(ring_handle_t rbuf, uint64_t* data)
{
  assert(rbuf && rbuf->buffer);

  if (ring_buf_full(rbuf)) {
    return -1;
  }

  rbuf->buffer[rbuf->head] = data;
  advance_pointer(rbuf);
  return 0;
}

uint64_t* ring_buf_get(ring_handle_t rbuf)
{
  assert(rbuf && rbuf->buffer);

  if (ring_buf_empty(rbuf)) {
    return NULL;
  }

  uint64_t* cur_read = rbuf->buffer[rbuf->tail];
  retreat_pointer(rbuf);

  return cur_read;
}

bool ring_buf_empty(ring_handle_t rbuf)
{
  assert(rbuf);

  return (!ring_buf_full(rbuf) && (rbuf->head == rbuf->tail));
}

bool ring_buf_full(ring_buf_t* rbuf)
{
  assert(rbuf);  /* BUGFIX: every sibling validated rbuf; this one did not */

  size_t head = rbuf->head + 1;
  if (head == rbuf->capacity) {
    head = 0;
  }

  return head == rbuf->tail;
}
/* -------------------------------------------------------------------------- */
/* src/timer.c -- wall-clock / monotonic-clock timing helpers.                */
/* -------------------------------------------------------------------------- */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

/* Pull in the public prototypes when building inside the tree; the guarded
 * fallback below keeps this translation unit buildable standalone. */
#if defined(__has_include)
# if __has_include("timer.h")
#  include "timer.h"
# endif
#else
# include "timer.h"
#endif

#ifndef tv_to_double
#define tv_to_double(t) ((t).tv_sec + ((t).tv_usec / 1000000.0))
#endif

/*
 * d = a - b on struct timevals, normalizing tv_usec into [0, 1000000).
 * Assumes a >= b.
 */
void timeDiff(struct timeval *d, struct timeval *a, struct timeval *b)
{
  d->tv_sec = a->tv_sec - b->tv_sec;
  d->tv_usec = a->tv_usec - b->tv_usec;
  if (d->tv_usec < 0) {
    /* borrow one second into the microsecond field */
    d->tv_sec -= 1;
    d->tv_usec += 1000000;
  }
}


/* Return the number of elapsed seconds between starttime and endtime. */
double elapsed(struct timeval *starttime, struct timeval *endtime)
{
  struct timeval diff;

  timeDiff(&diff, endtime, starttime);
  return tv_to_double(diff);
}

/*
 * Return the number of elapsed nanoseconds between start and end.
 * BUGFIX: long constant so the multiply is explicitly done in long
 * arithmetic (the old int literal relied on implicit promotion).
 */
long clock_time_elapsed(struct timespec start, struct timespec end)
{
  long seconds = end.tv_sec - start.tv_sec;
  long nanoseconds = end.tv_nsec - start.tv_nsec;
  return seconds * 1000000000L + nanoseconds;
}

/* -------------------------------------------------------------------------- */
/* src/timer.h                                                                */
/* -------------------------------------------------------------------------- */
#ifndef HEMEM_TIMER_H
#define HEMEM_TIMER_H

/* BUGFIX: the header used struct timeval/timespec without declaring them. */
#include <sys/time.h>
#include <time.h>

/* Returns the number of seconds encoded in T, a "struct timeval".
 * BUGFIX: the macro argument is now parenthesized, so expressions such as
 * tv_to_double(*p) expand correctly. */
#ifndef tv_to_double
#define tv_to_double(t) ((t).tv_sec + ((t).tv_usec / 1000000.0))
#endif

void timeDiff(struct timeval *d, struct timeval *a, struct timeval *b);
double elapsed(struct timeval *starttime, struct timeval *endtime);
long clock_time_elapsed(struct timespec start, struct timespec end);

#endif /* HEMEM_TIMER_H */

/* -------------------------------------------------------------------------- */
/* src/userfaultfd.h -- userspace copy of the kernel uapi header, extended    */
/* with non-upstream TLBFLUSH/CR3/page-flag/DMA ioctls used by HeMem.         */
/* -------------------------------------------------------------------------- */
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
 * include/linux/userfaultfd.h
 *
 * Copyright (C) 2007 Davide Libenzi
 * Copyright (C) 2015 Red Hat, Inc.
 *
 */

#ifndef _LINUX_USERFAULTFD_H
#define _LINUX_USERFAULTFD_H

/* BUGFIX: the include was garbled in this copy; __u8/__u16/__u32/__u64/__s64
 * come from <linux/types.h>, the _IOR/_IOWR macros from <linux/ioctl.h>. */
#include <linux/types.h>
#include <linux/ioctl.h>

/* BUGFIX: __packed is a kernel-internal shorthand with no userspace
 * definition; without this fallback the header does not compile. */
#ifndef __packed
#define __packed __attribute__((packed))
#endif

/*
 * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
 * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In
 * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ
 * means the userland is reading).
 */
#define UFFD_API ((__u64)0xAA)
#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
                           UFFD_FEATURE_EVENT_FORK | \
                           UFFD_FEATURE_EVENT_REMAP | \
                           UFFD_FEATURE_EVENT_REMOVE | \
                           UFFD_FEATURE_EVENT_UNMAP | \
                           UFFD_FEATURE_MISSING_HUGETLBFS | \
                           UFFD_FEATURE_MISSING_SHMEM | \
                           UFFD_FEATURE_SIGBUS | \
                           UFFD_FEATURE_THREAD_ID)
#define UFFD_API_IOCTLS \
  ((__u64)1 << _UFFDIO_REGISTER | \
   (__u64)1 << _UFFDIO_UNREGISTER | \
   (__u64)1 << _UFFDIO_API)
#define UFFD_API_RANGE_IOCTLS \
  ((__u64)1 << _UFFDIO_WAKE | \
   (__u64)1 << _UFFDIO_COPY | \
   (__u64)1 << _UFFDIO_ZEROPAGE | \
   (__u64)1 << _UFFDIO_WRITEPROTECT)
/* NOTE(review): everything from _UFFDIO_TLBFLUSH down is not in the upstream
 * kernel uapi header -- it requires HeMem's patched kernel. */
#define UFFD_API_RANGE_IOCTLS_BASIC \
  ((__u64)1 << _UFFDIO_WAKE | \
   (__u64)1 << _UFFDIO_COPY | \
   (__u64)1 << _UFFDIO_TLBFLUSH | \
   (__u64)1 << _UFFDIO_CR3 | \
   (__u64)1 << _UFFDIO_GET_FLAG | \
   (__u64)1 << _UFFDIO_CLEAR_FLAG | \
   (__u64)1 << _UFFDIO_DMA_COPY | \
   (__u64)1 << _UFFDIO_DMA_REQUEST_CHANNS | \
   (__u64)1 << _UFFDIO_DMA_RELEASE_CHANNS)

/*
 * Valid ioctl command number range with this API is from 0x00 to
 * 0x3F.  UFFDIO_API is the fixed number, everything else can be
 * changed by implementing a different UFFD_API. If sticking to the
 * same UFFD_API more ioctl can be added and userland will be aware of
 * which ioctl the running kernel implements through the ioctl command
 * bitmask written by the UFFDIO_API.
 */
#define _UFFDIO_REGISTER            (0x00)
#define _UFFDIO_UNREGISTER          (0x01)
#define _UFFDIO_WAKE                (0x02)
#define _UFFDIO_COPY                (0x03)
#define _UFFDIO_ZEROPAGE            (0x04)
#define _UFFDIO_WRITEPROTECT        (0x06)
#define _UFFDIO_API                 (0x3F)
/* HeMem extensions (0x05/0x07/0x09 left unused upstream/here). */
#define _UFFDIO_TLBFLUSH            (0x08)
#define _UFFDIO_CR3                 (0x0a)
#define _UFFDIO_GET_FLAG            (0x0b)
#define _UFFDIO_CLEAR_FLAG          (0x0c)
#define _UFFDIO_DMA_COPY            (0x0d)
#define _UFFDIO_DMA_REQUEST_CHANNS  (0x0e)
#define _UFFDIO_DMA_RELEASE_CHANNS  (0x0f)


/* userfaultfd ioctl ids */
#define UFFDIO 0xAA
#define UFFDIO_API              _IOWR(UFFDIO, _UFFDIO_API, \
                                      struct uffdio_api)
#define UFFDIO_REGISTER         _IOWR(UFFDIO, _UFFDIO_REGISTER, \
                                      struct uffdio_register)
#define UFFDIO_UNREGISTER       _IOR(UFFDIO, _UFFDIO_UNREGISTER, \
                                     struct uffdio_range)
#define UFFDIO_WAKE             _IOR(UFFDIO, _UFFDIO_WAKE, \
                                     struct uffdio_range)
#define UFFDIO_COPY             _IOWR(UFFDIO, _UFFDIO_COPY, \
                                      struct uffdio_copy)
#define UFFDIO_ZEROPAGE         _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \
                                      struct uffdio_zeropage)
#define UFFDIO_WRITEPROTECT     _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
                                      struct uffdio_writeprotect)
#define UFFDIO_TLBFLUSH         _IOR(UFFDIO, _UFFDIO_TLBFLUSH, \
                                     struct uffdio_range)
#define UFFDIO_CR3              _IOR(UFFDIO, _UFFDIO_CR3, \
                                     struct uffdio_cr3)
#define UFFDIO_GET_FLAG         _IOWR(UFFDIO, _UFFDIO_GET_FLAG, \
                                      struct uffdio_page_flags)
#define UFFDIO_CLEAR_FLAG       _IOWR(UFFDIO, _UFFDIO_CLEAR_FLAG, \
                                      struct uffdio_page_flags)
#define UFFDIO_DMA_COPY         _IOWR(UFFDIO, _UFFDIO_DMA_COPY, \
                                      struct uffdio_dma_copy)
#define UFFDIO_DMA_REQUEST_CHANNS _IOWR(UFFDIO, _UFFDIO_DMA_REQUEST_CHANNS, \
                                        struct uffdio_dma_channs)
#define UFFDIO_DMA_RELEASE_CHANNS _IOWR(UFFDIO, _UFFDIO_DMA_RELEASE_CHANNS, \
                                        struct uffdio_dma_channs)

/* read() structure */
struct uffd_msg {
  __u8  event;

  __u8  reserved1;
  __u16 reserved2;
  __u32 reserved3;

  union {
    struct {
      __u64 flags;
      __u64 address;
      union {
        __u32 ptid;
      } feat;
    } pagefault;

    struct {
      __u32 ufd;
    } fork;

    struct {
      __u64 from;
      __u64 to;
      __u64 len;
    } remap;

    struct {
      __u64 start;
      __u64 end;
    } remove;

    struct {
      /* unused reserved fields */
      __u64 reserved1;
      __u64 reserved2;
      __u64 reserved3;
    } reserved;
  } arg;
} __packed;

/*
 * Start at 0x12 and not at 0 to be more strict against bugs.
 */
#define UFFD_EVENT_PAGEFAULT 0x12
#define UFFD_EVENT_FORK      0x13
#define UFFD_EVENT_REMAP     0x14
#define UFFD_EVENT_REMOVE    0x15
#define UFFD_EVENT_UNMAP     0x16

/* flags for UFFD_EVENT_PAGEFAULT */
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP    (1<<1) /* If reason is VM_UFFD_WP */

struct uffdio_api {
  /* userland asks for an API number and the features to enable */
  __u64 api;
  /*
   * Kernel answers below with the all available features for
   * the API, this notifies userland of which events and/or
   * which flags for each event are enabled in the current
   * kernel.
   *
   * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
   * are to be considered implicitly always enabled in all kernels as
   * long as the uffdio_api.api requested matches UFFD_API.
   *
   * UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
   * with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
   * hugetlbfs virtual memory ranges. Adding or not adding
   * UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
   * no real functional effect after UFFDIO_API returns, but
   * it's only useful for an initial feature set probe at
   * UFFDIO_API time. There are two ways to use it:
   *
   * 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
   *    uffdio_api.features before calling UFFDIO_API, an error
   *    will be returned by UFFDIO_API on a kernel without
   *    hugetlbfs missing support
   *
   * 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
   *    uffdio_api.features and instead it will be set by the
   *    kernel in the uffdio_api.features if the kernel supports
   *    it, so userland can later check if the feature flag is
   *    present in uffdio_api.features after UFFDIO_API
   *    succeeded.
   *
   * UFFD_FEATURE_MISSING_SHMEM works the same as
   * UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
   * (i.e. tmpfs and other shmem based APIs).
   *
   * UFFD_FEATURE_SIGBUS feature means no page-fault
   * (UFFD_EVENT_PAGEFAULT) event will be delivered, instead
   * a SIGBUS signal will be sent to the faulting process.
   *
   * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
   * be returned, if feature is not requested 0 will be returned.
   */
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP  (1<<0)
#define UFFD_FEATURE_EVENT_FORK         (1<<1)
#define UFFD_FEATURE_EVENT_REMAP        (1<<2)
#define UFFD_FEATURE_EVENT_REMOVE       (1<<3)
#define UFFD_FEATURE_MISSING_HUGETLBFS  (1<<4)
#define UFFD_FEATURE_MISSING_SHMEM      (1<<5)
#define UFFD_FEATURE_EVENT_UNMAP        (1<<6)
#define UFFD_FEATURE_SIGBUS             (1<<7)
#define UFFD_FEATURE_THREAD_ID          (1<<8)
  __u64 features;

  __u64 ioctls;
};

struct uffdio_range {
  __u64 start;
  __u64 len;
};

struct uffdio_register {
  struct uffdio_range range;
#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
#define UFFDIO_REGISTER_MODE_WP      ((__u64)1<<1)
  __u64 mode;

  /*
   * kernel answers which ioctl commands are available for the
   * range, keep at the end as the last 8 bytes aren't read.
   */
  __u64 ioctls;
};

struct uffdio_copy {
  __u64 dst;
  __u64 src;
  __u64 len;
#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
  /*
   * UFFDIO_COPY_MODE_WP will map the page write protected on
   * the fly. UFFDIO_COPY_MODE_WP is available only if the
   * write protected ioctl is implemented for the range
   * according to the uffdio_register.ioctls.
   */
#define UFFDIO_COPY_MODE_WP ((__u64)1<<1)
  __u64 mode;

  /*
   * "copy" is written by the ioctl and must be at the end: the
   * copy_from_user will not read the last 8 bytes.
   */
  __s64 copy;
};

struct uffdio_zeropage {
  struct uffdio_range range;
#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0)
  __u64 mode;

  /*
   * "zeropage" is written by the ioctl and must be at the end:
   * the copy_from_user will not read the last 8 bytes.
   */
  __s64 zeropage;
};

struct uffdio_writeprotect {
  struct uffdio_range range;
  /*
   * UFFDIO_WRITEPROTECT_MODE_WP: set the flag to write protect a range,
   * unset the flag to undo protection of a range which was previously
   * write protected.
   *
   * UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
   * any wait thread after the operation succeeds.
   *
   * NOTE: Write protecting a region (WP=1) is unrelated to page faults,
   * therefore DONTWAKE flag is meaningless with WP=1.  Removing write
   * protection (WP=0) in response to a page fault wakes the faulting
   * task unless DONTWAKE is set.
   */
#define UFFDIO_WRITEPROTECT_MODE_WP       ((__u64)1<<0)
#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
  __u64 mode;
};

/* HeMem extension: report the faulting process's page-table base. */
struct uffdio_cr3 {
  //struct uffdio_range range;
  __u64 cr3; // base page table ptr
};

/* HeMem extension: get/clear individual PTE flags for a virtual address. */
struct uffdio_page_flags {
  __u64 va;    // virtual address
  __u64 flag1; // the first flag of interest
  __u64 flag2; // the second flag of interest
  __u64 res1;  // result of operation (flag1 value if get, success/fail if set)
  __u64 res2;  // result of operation (flag2 value)
};

#define DMA_BATCH     32
#define MAX_DMA_CHANS 16
//#define DEBUG_TM
/* HeMem extension: batched DMA-engine copy of up to DMA_BATCH ranges. */
struct uffdio_dma_copy {
  __u64 dst[DMA_BATCH];
  __u64 src[DMA_BATCH];
  __u64 len[DMA_BATCH];
  __u64 count;

  /*
   * There will be a wrprotection flag later that allows to map
   * pages wrprotected on the fly. And such a flag will be
   * available if the wrprotection ioctl are implemented for the
   * range according to the uffdio_register.ioctls.
   */
#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0)
  __u64 mode;

  /*
   * "copy" is written by the ioctl and must be at the end: the
   * copy_from_user will not read the last 8 bytes.
   */
  __s64 copy;
};

/* HeMem extension: request/release a set of DMA channels. */
struct uffdio_dma_channs {
  __u32 num_channs;
  __u32 size_per_dma_request;
};

#endif /* _LINUX_USERFAULTFD_H */