├── .gitignore
├── CMakeLists.txt
├── README.md
├── build.sh
├── client.cc
├── client.h
├── crush.c
├── crush.h
├── example-crush-tester.cc
├── example-simple.cc
├── hash.c
├── hash.h
├── libcrush-based-on-ceph0.86.diff
├── libcrush.h
├── mapper.c
└── mapper.h


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | .cproject
3 | .project
4 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # CMake >= 4.0 rejects projects that request compatibility with versions older
2 | # than 3.5, so the former "VERSION 2.6" no longer configures there.
3 | cmake_minimum_required(VERSION 3.5...3.29)
4 | 
5 | project(libcrush LANGUAGES C CXX)
6 | 
7 | # Default to Release, but never override a build type chosen by the user
8 | # (-DCMAKE_BUILD_TYPE=...) or a multi-config generator.
9 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
10 |   set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
11 | endif()
12 | 
13 | # The C++ sources need C++11 in every configuration, not only in Release.
14 | set(CMAKE_CXX_STANDARD 11)
15 | set(CMAKE_CXX_STANDARD_REQUIRED ON)
16 | 
17 | # Both configurations stay unoptimized and keep assert() active (no -DNDEBUG):
18 | # client.cc and the examples report failures through assert().
19 | set(CMAKE_CXX_FLAGS_DEBUG "-g -O0")
20 | set(CMAKE_CXX_FLAGS_RELEASE "-g -O0 -finline-limit=1000")
21 | 
22 | # Must be set before the targets are created to take effect.
23 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}")
24 | 
25 | add_library(crush SHARED
26 |   hash.c
27 |   mapper.c
28 |   crush.c
29 |   client.cc
30 | )
31 | target_include_directories(crush PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
32 | # client.h exposes librados types, so librados is a PUBLIC dependency. This also
33 | # puts libcrush ahead of librados on every consumer's link line, which the
34 | # README requires.
35 | target_link_libraries(crush PUBLIC rados)
36 | 
37 | # The examples link the library instead of compiling its sources a second time.
38 | add_executable(example-simple example-simple.cc)
39 | target_link_libraries(example-simple PRIVATE crush)
40 | 
41 | add_executable(example-crush-tester example-crush-tester.cc)
42 | target_link_libraries(example-crush-tester PRIVATE crush)
43 | 
44 | # CMAKE_BUILD_TYPE is empty under multi-config generators; skip the report then.
45 | if(CMAKE_BUILD_TYPE)
46 |   string(TOUPPER "${CMAKE_BUILD_TYPE}" build_type_upper)
47 |   message(STATUS "CXX_FLAGS = ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${build_type_upper}}")
48 | endif()
49 | 
50 | install(TARGETS crush DESTINATION lib)
51 | # Explicit list instead of file(GLOB), so a stray *.h is never installed by accident.
52 | install(FILES
53 |   client.h
54 |   crush.h
55 |   hash.h
56 |   libcrush.h
57 |   mapper.h
58 |   DESTINATION include/crush
59 | )
60 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | under development... 
2 | 
3 | If you want to use this library, note the following:
4 | - Apply `libcrush-based-on-ceph0.86.diff` to the Ceph v0.86 sources, then rebuild Ceph to get a patched librados.so.
5 | - Run `./build.sh` to build libcrush.so.
6 | - Run `cd build; make install` to install libcrush.so.
7 | - Link with `-lcrush -lrados`, putting libcrush before librados, because librados contains its own unmodified copy of the CRUSH code.
8 | 
9 | What I want to do:
10 | - Simulate the CRUSH algorithm with online data from a live Ceph cluster.
11 | - Access the crushmap.
12 | 
13 | How I do it:
14 | - Adjust the Ceph code (based on v0.86) and build a new librados.
15 | - The diff: libcrush-based-on-ceph0.86.diff.
16 | 
17 | What I tried:
18 | - Extracting the crush + client code from the kernel code -- difficult.
19 | - Using librados directly -- no access to the crushmap.
20 | 
21 | 
--------------------------------------------------------------------------------
/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # Configure and build libcrush out of tree.
4 | #
5 | # Usage: ./build.sh [make arguments...]
6 | #   BUILD_DIR (environment) selects the build directory; default: ./build
7 | # Exits with the status of the first failing step (mkdir, cd, cmake or make).
8 | 
9 | set -x
10 | 
11 | SOURCE_DIR=$(pwd)
12 | BUILD_DIR=${BUILD_DIR:-./build}
13 | 
14 | # Quote the paths and pass "$@" so directories and make arguments that contain
15 | # spaces reach the tools unchanged.
16 | mkdir -p "$BUILD_DIR" \
17 |   && cd "$BUILD_DIR" \
18 |   && cmake "$SOURCE_DIR" \
19 |   && make "$@"
20 | status=$?
21 | 
22 | set +x
23 | # Report the build result; without this the script would return the status of
24 | # `set +x`, i.e. always 0.
25 | exit "$status"
26 | 
--------------------------------------------------------------------------------
/client.cc:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include "client.h"
4 | 
5 | void create_client(librados::Rados& rados,
6 |     std::string conf_file,
7 |     std::string cluster_name,
8 |     std::string user_name,
9 |     uint64_t flags) {
10 |   int ret = rados.init2(user_name.c_str(), cluster_name.c_str(), flags);
11 |   assert(ret == 0);
12 | 
13 |   ret = rados.conf_read_file(conf_file.c_str());
14 |   assert(ret == 0);
15 | 
16 |   ret = rados.connect();
17 |   assert(ret == 0);
18 | }
19 | 
20 | /*
21 | int main() {
22 |   librados::Rados rados;
23 |   create_client(rados, "/etc/ceph/ceph.conf", "ceph", "client.admin", 0);
24 |   const struct crush_map *crushmap = rados.get_crushmap();
25 |   (void) crushmap;
26 |   rados.put_crushmap(); 
27 | } 28 | */ 29 | -------------------------------------------------------------------------------- /client.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBCRUSH_CLIENT_H 2 | #define LIBCRUSH_CLIENT_H 3 | 4 | #include 5 | #include 6 | 7 | void create_client(librados::Rados &rados, 8 | std::string conf_file, 9 | std::string cluster_name, 10 | std::string user_name, 11 | uint64_t flags); 12 | 13 | #endif 14 | 15 | -------------------------------------------------------------------------------- /crush.c: -------------------------------------------------------------------------------- 1 | # include 2 | # include 3 | # define kfree(x) do { if (x) free(x); } while (0) 4 | # define BUG_ON(x) assert(!(x)) 5 | 6 | #include "crush.h" 7 | 8 | const char *crush_bucket_alg_name(int alg) { 9 | switch (alg) { 10 | case CRUSH_BUCKET_UNIFORM: return "uniform"; 11 | case CRUSH_BUCKET_LIST: return "list"; 12 | case CRUSH_BUCKET_TREE: return "tree"; 13 | case CRUSH_BUCKET_STRAW: return "straw"; 14 | default: return "unknown"; 15 | } 16 | } 17 | 18 | /** 19 | * crush_get_bucket_item_weight - Get weight of an item in given bucket 20 | * @b: bucket pointer 21 | * @p: item index in bucket 22 | */ 23 | int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) { 24 | if ((__u32)p >= b->size) 25 | return 0; 26 | 27 | switch (b->alg) { 28 | case CRUSH_BUCKET_UNIFORM: 29 | return ((struct crush_bucket_uniform *)b)->item_weight; 30 | case CRUSH_BUCKET_LIST: 31 | return ((struct crush_bucket_list *)b)->item_weights[p]; 32 | case CRUSH_BUCKET_TREE: 33 | return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; 34 | case CRUSH_BUCKET_STRAW: 35 | return ((struct crush_bucket_straw *)b)->item_weights[p]; 36 | } 37 | return 0; 38 | } 39 | 40 | void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) { 41 | kfree(b->h.perm); 42 | kfree(b->h.items); 43 | kfree(b); 44 | } 45 | 46 | void 
crush_destroy_bucket_list(struct crush_bucket_list *b) { 47 | kfree(b->item_weights); 48 | kfree(b->sum_weights); 49 | kfree(b->h.perm); 50 | kfree(b->h.items); 51 | kfree(b); 52 | } 53 | 54 | void crush_destroy_bucket_tree(struct crush_bucket_tree *b) { 55 | kfree(b->h.perm); 56 | kfree(b->h.items); 57 | kfree(b->node_weights); 58 | kfree(b); 59 | } 60 | 61 | void crush_destroy_bucket_straw(struct crush_bucket_straw *b) { 62 | kfree(b->straws); 63 | kfree(b->item_weights); 64 | kfree(b->h.perm); 65 | kfree(b->h.items); 66 | kfree(b); 67 | } 68 | 69 | void crush_destroy_bucket(struct crush_bucket *b) { 70 | switch (b->alg) { 71 | case CRUSH_BUCKET_UNIFORM: 72 | crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); 73 | break; 74 | case CRUSH_BUCKET_LIST: 75 | crush_destroy_bucket_list((struct crush_bucket_list *)b); 76 | break; 77 | case CRUSH_BUCKET_TREE: 78 | crush_destroy_bucket_tree((struct crush_bucket_tree *)b); 79 | break; 80 | case CRUSH_BUCKET_STRAW: 81 | crush_destroy_bucket_straw((struct crush_bucket_straw *)b); 82 | break; 83 | } 84 | } 85 | 86 | /** 87 | * crush_destroy - Destroy a crush_map 88 | * @map: crush_map pointer 89 | */ 90 | void crush_destroy(struct crush_map *map) { 91 | /* buckets */ 92 | if (map->buckets) { 93 | __s32 b; 94 | for (b = 0; b < map->max_buckets; b++) { 95 | if (map->buckets[b] == NULL) 96 | continue; 97 | crush_destroy_bucket(map->buckets[b]); 98 | } 99 | kfree(map->buckets); 100 | } 101 | 102 | /* rules */ 103 | if (map->rules) { 104 | __u32 b; 105 | for (b = 0; b < map->max_rules; b++) 106 | crush_destroy_rule(map->rules[b]); 107 | kfree(map->rules); 108 | } 109 | 110 | kfree(map); 111 | } 112 | 113 | void crush_destroy_rule(struct crush_rule *rule) { 114 | kfree(rule); 115 | } 116 | -------------------------------------------------------------------------------- /crush.h: -------------------------------------------------------------------------------- 1 | #ifndef CEPH_CRUSH_CRUSH_H 2 | #define CEPH_CRUSH_CRUSH_H 
3 | 4 | #include 5 | 6 | /* 7 | * CRUSH is a pseudo-random data distribution algorithm that 8 | * efficiently distributes input values (typically, data objects) 9 | * across a heterogeneous, structured storage cluster. 10 | * 11 | * The algorithm was originally described in detail in this paper 12 | * (although the algorithm has evolved somewhat since then): 13 | * 14 | * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf 15 | * 16 | * LGPL2 17 | */ 18 | 19 | #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */ 20 | #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */ 21 | 22 | #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */ 23 | #define CRUSH_ITEM_NONE 0x7fffffff /* no result */ 24 | 25 | /* 26 | * CRUSH uses user-defined "rules" to describe how inputs should be 27 | * mapped to devices. A rule consists of sequence of steps to perform 28 | * to generate the set of output devices. 29 | */ 30 | struct crush_rule_step { 31 | __u32 op; 32 | __s32 arg1; 33 | __s32 arg2; 34 | }; 35 | 36 | /* step op codes */ 37 | enum { 38 | CRUSH_RULE_NOOP = 0, 39 | CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */ 40 | CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */ 41 | /* arg2 = type */ 42 | CRUSH_RULE_CHOOSE_INDEP = 3, /* same */ 43 | CRUSH_RULE_EMIT = 4, /* no args */ 44 | CRUSH_RULE_CHOOSELEAF_FIRSTN = 6, 45 | CRUSH_RULE_CHOOSELEAF_INDEP = 7, 46 | 47 | CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */ 48 | CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */ 49 | CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10, 50 | CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11, 51 | CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12 52 | }; 53 | 54 | /* 55 | * for specifying choose num (arg1) relative to the max parameter 56 | * passed to do_rule 57 | */ 58 | #define CRUSH_CHOOSE_N 0 59 | #define CRUSH_CHOOSE_N_MINUS(x) (-(x)) 60 | 61 | /* 62 | * The rule mask is used to describe what the rule is intended 
for. 63 | * Given a ruleset and size of output set, we search through the 64 | * rule list for a matching rule_mask. 65 | */ 66 | struct crush_rule_mask { 67 | __u8 ruleset; 68 | __u8 type; 69 | __u8 min_size; 70 | __u8 max_size; 71 | }; 72 | 73 | struct crush_rule { 74 | __u32 len; 75 | struct crush_rule_mask mask; 76 | struct crush_rule_step steps[0]; 77 | }; 78 | 79 | #define crush_rule_size(len) (sizeof(struct crush_rule) + (len)*sizeof(struct crush_rule_step)) 80 | 81 | 82 | /* 83 | * A bucket is a named container of other items (either devices or 84 | * other buckets). Items within a bucket are chosen using one of a 85 | * few different algorithms. The table summarizes how the speed of 86 | * each option measures up against mapping stability when items are 87 | * added or removed. 88 | * 89 | * Bucket Alg Speed Additions Removals 90 | * ------------------------------------------------ 91 | * uniform O(1) poor poor 92 | * list O(n) optimal poor 93 | * tree O(log n) good good 94 | * straw O(n) optimal optimal 95 | */ 96 | enum { 97 | CRUSH_BUCKET_UNIFORM = 1, 98 | CRUSH_BUCKET_LIST = 2, 99 | CRUSH_BUCKET_TREE = 3, 100 | CRUSH_BUCKET_STRAW = 4 101 | }; 102 | extern const char *crush_bucket_alg_name(int alg); 103 | 104 | struct crush_bucket { 105 | __s32 id; /* this'll be negative */ 106 | __u16 type; /* non-zero; type=0 is reserved for devices */ 107 | __u8 alg; /* one of CRUSH_BUCKET_* */ 108 | __u8 hash; /* which hash function to use, CRUSH_HASH_* */ 109 | __u32 weight; /* 16-bit fixed point */ 110 | __u32 size; /* num items */ 111 | __s32 *items; 112 | 113 | /* 114 | * cached random permutation: used for uniform bucket and for 115 | * the linear search fallback for the other bucket types. 
116 | */ 117 | __u32 perm_x; /* @x for which *perm is defined */ 118 | __u32 perm_n; /* num elements of *perm that are permuted/defined */ 119 | __u32 *perm; 120 | }; 121 | 122 | struct crush_bucket_uniform { 123 | struct crush_bucket h; 124 | __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ 125 | }; 126 | 127 | struct crush_bucket_list { 128 | struct crush_bucket h; 129 | __u32 *item_weights; /* 16-bit fixed point */ 130 | __u32 *sum_weights; /* 16-bit fixed point. element i is sum 131 | of weights 0..i, inclusive */ 132 | }; 133 | 134 | struct crush_bucket_tree { 135 | struct crush_bucket h; /* note: h.size is _tree_ size, not number of 136 | actual items */ 137 | __u8 num_nodes; 138 | __u32 *node_weights; 139 | }; 140 | 141 | struct crush_bucket_straw { 142 | struct crush_bucket h; 143 | __u32 *item_weights; /* 16-bit fixed point */ 144 | __u32 *straws; /* 16-bit fixed point */ 145 | }; 146 | 147 | 148 | 149 | /* 150 | * CRUSH map includes all buckets, rules, etc. 151 | */ 152 | struct crush_map { 153 | struct crush_bucket **buckets; 154 | struct crush_rule **rules; 155 | 156 | __s32 max_buckets; 157 | __u32 max_rules; 158 | __s32 max_devices; 159 | 160 | /* choose local retries before re-descent */ 161 | __u32 choose_local_tries; 162 | /* choose local attempts using a fallback permutation before 163 | * re-descent */ 164 | __u32 choose_local_fallback_tries; 165 | /* choose attempts before giving up */ 166 | __u32 choose_total_tries; 167 | /* attempt chooseleaf inner descent once for firstn mode; on 168 | * reject retry outer descent. Note that this does *not* 169 | * apply to a collision: in that case we will retry as we used 170 | * to. */ 171 | __u32 chooseleaf_descend_once; 172 | 173 | /* if non-zero, feed r into chooseleaf, bit-shifted right by (r-1) 174 | * bits. a value of 1 is best for new clusters. 
for legacy clusters 175 | * that want to limit reshuffling, a value of 3 or 4 will make the 176 | * mappings line up a bit better with previous mappings. */ 177 | __u8 chooseleaf_vary_r; 178 | }; 179 | 180 | 181 | /* crush.c */ 182 | extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos); 183 | extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); 184 | extern void crush_destroy_bucket_list(struct crush_bucket_list *b); 185 | extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); 186 | extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b); 187 | extern void crush_destroy_bucket(struct crush_bucket *b); 188 | extern void crush_destroy_rule(struct crush_rule *r); 189 | extern void crush_destroy(struct crush_map *map); 190 | 191 | static inline int crush_calc_tree_node(int i) { 192 | return ((i+1) << 1)-1; 193 | } 194 | 195 | #endif 196 | -------------------------------------------------------------------------------- /example-crush-tester.cc: -------------------------------------------------------------------------------- 1 | // 2 | // g++ crush_tester.cc -o test_crush -lcrush -lrados --std=c++11 -g -O0 3 | // notice: 4 | // 1. librados is modified based on ceph0.86 5 | // 2. libcrush is not official (see https://github.com/xanpeng/libcrush) 6 | // 3. link libcrush before librados, because librados contains another libcrush codes. 
7 | // 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | using namespace std; 16 | 17 | namespace internal { 18 | #define mix(a, b, c) \ 19 | do { \ 20 | a = a - b; a = a - c; a = a ^ (c >> 13); \ 21 | b = b - c; b = b - a; b = b ^ (a << 8); \ 22 | c = c - a; c = c - b; c = c ^ (b >> 13); \ 23 | a = a - b; a = a - c; a = a ^ (c >> 12); \ 24 | b = b - c; b = b - a; b = b ^ (a << 16); \ 25 | c = c - a; c = c - b; c = c ^ (b >> 5); \ 26 | a = a - b; a = a - c; a = a ^ (c >> 3); \ 27 | b = b - c; b = b - a; b = b ^ (a << 10); \ 28 | c = c - a; c = c - b; c = c ^ (b >> 15); \ 29 | } while (0) 30 | 31 | uint32_t ceph_str_hash_rjenkins(const char *str, const unsigned length) { 32 | const unsigned char *k = (const unsigned char *)str; 33 | uint32_t a, b, c; /* the internal state */ 34 | uint32_t len; /* how many key bytes still need mixing */ 35 | len = length; // internal state 36 | a = 0x9e3779b9; /* the golden ratio; an arbitrary value */ 37 | b = a; 38 | c = 0; /* variable initialization of internal state */ 39 | 40 | while (len >= 12) { /* handle most of the key */ 41 | a = a + (k[0] + ((uint32_t)k[1] << 8) + ((uint32_t)k[2] << 16) + ((uint32_t)k[3] << 24)); 42 | b = b + (k[4] + ((uint32_t)k[5] << 8) + ((uint32_t)k[6] << 16) + ((uint32_t)k[7] << 24)); 43 | c = c + (k[8] + ((uint32_t)k[9] << 8) + ((uint32_t)k[10] << 16) + ((uint32_t)k[11] << 24)); 44 | mix(a, b, c); 45 | k = k + 12; 46 | len = len - 12; 47 | } 48 | 49 | /* handle the last 11 bytes */ 50 | c = c + length; 51 | switch (len) { /* all the case statements fall through */ 52 | case 11: c = c + ((uint32_t)k[10] << 24); 53 | case 10: c = c + ((uint32_t)k[9] << 16); 54 | case 9: c = c + ((uint32_t)k[8] << 8); 55 | /* the first byte of c is reserved for the length */ 56 | case 8: b = b + ((uint32_t)k[7] << 24); 57 | case 7: b = b + ((uint32_t)k[6] << 16); 58 | case 6: b = b + ((uint32_t)k[5] << 8); 59 | case 5: b = b + k[4]; 60 | case 4: a = a + ((uint32_t)k[3] 
<< 24); 61 | case 3: a = a + ((uint32_t)k[2] << 16); 62 | case 2: a = a + ((uint32_t)k[1] << 8); 63 | case 1: a = a + k[0]; 64 | /* case 0: nothing left to add */ 65 | } 66 | mix(a, b, c); 67 | return c; 68 | } 69 | 70 | // x is the input value, b is the number of bins, and bmask is the smallest value of the form 2^n-1 with b <= bmask. 71 | // This is a "stable" modulo: it is not absolutely stable, only relatively stable. 72 | // Example with b = 12, hence bmask = 15: 73 | // x = 13: 1101 & 1111 = 13 >= b, so the result is 1101 & 0111 = 0101 = 5; 74 | // x = 7: 0111 & 1111 = 0111 = 7 < b, so the result is 7. 75 | // While b grows from 12 to 15, an existing input x keeps its result unless it maps to a newly added bin; once b exceeds 15, bmask has to grow as well. That is what "relatively stable" means here. Each row below lists the results for x = 1..23 at the given b. 76 | // b = 12: 1 2 3 4 5 6 7 8 9 10 11 4 5 6 7 0 1 2 3 4 5 6 7 77 | // b = 13: 1 2 3 4 5 6 7 8 9 10 11 12 5 6 7 0 1 2 3 4 5 6 7 78 | // b = 14: 1 2 3 4 5 6 7 8 9 10 11 12 13 6 7 0 1 2 3 4 5 6 7 79 | // b = 15: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 7 0 1 2 3 4 5 6 7 80 | // b = 16: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 81 | // b = 17: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 1 2 3 4 5 6 7 82 | // b = 18: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 2 3 4 5 6 7 83 | // b = 19: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 3 4 5 6 7 84 | // b = 20: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 4 5 6 7 85 | // b = 21: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 5 6 7 86 | // b = 22: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 6 7 87 | // b = 23: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 7 88 | int crush_stable_mod(int x, int b, int bmask) { 89 | if ((x & bmask) < b) 90 | return x & bmask; 91 | else 92 | return x & (bmask >> 1); 93 | } 94 | } // namespace internal 95 | 96 | int g_pgp_num = 128, g_pgp_num_mask = 127; 97 | int g_pg_num = 128, g_pg_num_mask = 127; 98 | int g_pool_id = 1, g_pool_type = 1; 99 | string g_pool = "rbd"; 100 | int g_replicas = 2; 101 | int g_crush_ruleset = 0; 102 | 103 | typedef uint32_t ps_t; // placement seed 104 | 105 | struct pg_t { 106 | uint32_t seed; 107 | uint64_t pool_id; 108 | int32_t preferred; 109 | pg_t(uint32_t seed, uint64_t pool_id, int32_t preferred) 110 | : seed(seed), pool_id(pool_id), 
preferred(preferred) {} 111 | }; 112 | 113 | struct object_t { 114 | string name; 115 | object_t(string name) : name(name) {} 116 | }; 117 | 118 | struct object_locator_t { 119 | uint64_t pool_id; 120 | string key; // 指定hash()的输入，一般为空 121 | string nspace; // 作为object name的前缀，影响hash值，一般为空 122 | int64_t hash; // 指定hash值 123 | object_locator_t(uint64_t pool_id) : pool_id(pool_id), key(""), nspace(""), hash(-1) {} 124 | }; 125 | 126 | // step 1: 根据object name定位pg，此时的pg其实就是字符串hash得到的一个uint32_t 127 | pg_t object_to_pg(object_t oid, object_locator_t loc) { 128 | ps_t ps = internal::ceph_str_hash_rjenkins(oid.name.c_str(), oid.name.length()); 129 | return pg_t(ps, loc.pool_id, -1); 130 | } 131 | 132 | // step 2: 133 | pg_t raw_pg_to_pg(pg_t pg) { 134 | pg.seed = internal::crush_stable_mod(pg.seed, g_pg_num, g_pg_num_mask); 135 | return pg; 136 | } 137 | 138 | // step 3: 将pg映射到osd 139 | void pg_to_up_acting_osds(pg_t pg, vector *up) { 140 | librados::Rados rados; 141 | create_client(rados, "/etc/ceph/ceph.conf", "ceph", "client.admin", 0); 142 | const struct crush_map *crushmap = rados.get_crushmap(); 143 | assert(crushmap != NULL); 144 | 145 | vector osd_weight = rados.get_osd_weights(); 146 | // vector osd_weight {1965,655,655,655,1965,655,655,655}; 147 | assert(osd_weight.size() > 0); 148 | printf(" osd_weight: "); 149 | for (uint32_t w : osd_weight) printf("%d,", w); 150 | printf("\n"); 151 | 152 | int ruleno = crush_find_rule(crushmap, g_crush_ruleset, g_pool_type, g_replicas); 153 | printf(" ruleno: %d\n", ruleno); 154 | 155 | ps_t placement_ps = crush_hash32_2(0, // crush_hash_rjenkins1 156 | internal::crush_stable_mod(pg.seed, g_pgp_num, g_pgp_num_mask), 157 | g_pool_id); 158 | printf(" placement_ps: %d\n", placement_ps); 159 | 160 | int rawout[g_replicas], scratch[g_replicas * 3]; 161 | printf("\n---start crush_do_rule---\n"); 162 | int numrep = crush_do_rule(crushmap, ruleno, placement_ps, rawout, g_replicas, &osd_weight[0], osd_weight.size(), scratch); 163 | 
printf("---finish crush_do_rule---\n\n"); 164 | up->resize(numrep); 165 | for (int i = 0; i < numrep; ++i) 166 | up->at(i) = rawout[i]; 167 | printf(" numrep: %d, raw_osds: [", numrep); 168 | for (int i = 0; i < numrep; ++i) printf("%d,", rawout[i]); 169 | printf("]\n"); 170 | 171 | // remove nonexistent osds 172 | 173 | rados.put_crushmap(); 174 | } 175 | 176 | int main(int argc, char **argv) { 177 | assert(argc == 2); 178 | string objname = argv[argc-1]; 179 | 180 | pg_t pg = object_to_pg(object_t(objname), object_locator_t(g_pool_id)); 181 | printf("objectr_to_pg: %s -> %x\n", objname.c_str(), pg.seed); 182 | 183 | pg_t mpg = raw_pg_to_pg(pg); 184 | printf("raw_pg_to_pg: %x\n", mpg.seed); 185 | 186 | printf("pg_to_osds:\n"); 187 | vector up; 188 | pg_to_up_acting_osds(mpg, &up); 189 | } 190 | -------------------------------------------------------------------------------- /example-simple.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "libcrush.h" 4 | 5 | void find_rule() { 6 | librados::Rados rados; 7 | create_client(rados, "/etc/ceph/ceph.conf", "ceph", "client.admin", 0); 8 | 9 | const struct crush_map* crushmap = rados.get_crushmap(); 10 | // function signature: 11 | // int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) 12 | // compare result with `cehp osd crush rule dump` 13 | int rule_id = crush_find_rule(crushmap, 0, 1, 2); 14 | printf("rule id: %d\n", rule_id); 15 | rados.put_crushmap(); 16 | } 17 | 18 | int main() { 19 | find_rule(); 20 | } 21 | -------------------------------------------------------------------------------- /hash.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "hash.h" 3 | 4 | /* 5 | * Robert Jenkins' function for mixing 32-bit values 6 | * http://burtleburtle.net/bob/hash/evahash.html 7 | * a, b = random bits, c = input and output 8 | * 
http://www.chasanc.com/old/hashing_func.htm 9 | */ 10 | #define crush_hashmix(a, b, c) do { \ 11 | a = a-b; a = a-c; a = a^(c>>13); \ 12 | b = b-c; b = b-a; b = b^(a<<8); \ 13 | c = c-a; c = c-b; c = c^(b>>13); \ 14 | a = a-b; a = a-c; a = a^(c>>12); \ 15 | b = b-c; b = b-a; b = b^(a<<16); \ 16 | c = c-a; c = c-b; c = c^(b>>5); \ 17 | a = a-b; a = a-c; a = a^(c>>3); \ 18 | b = b-c; b = b-a; b = b^(a<<10); \ 19 | c = c-a; c = c-b; c = c^(b>>15); \ 20 | } while (0) 21 | 22 | #define crush_hash_seed 1315423911 23 | 24 | static __u32 crush_hash32_rjenkins1(__u32 a) { 25 | __u32 hash = crush_hash_seed ^ a; 26 | __u32 b = a; 27 | __u32 x = 231232; 28 | __u32 y = 1232; 29 | crush_hashmix(b, x, hash); 30 | crush_hashmix(y, a, hash); 31 | return hash; 32 | } 33 | 34 | static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b) { 35 | __u32 hash = crush_hash_seed ^ a ^ b; 36 | __u32 x = 231232; 37 | __u32 y = 1232; 38 | crush_hashmix(a, b, hash); 39 | crush_hashmix(x, a, hash); 40 | crush_hashmix(b, y, hash); 41 | return hash; 42 | } 43 | 44 | static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c) { 45 | __u32 hash = crush_hash_seed ^ a ^ b ^ c; 46 | __u32 x = 231232; 47 | __u32 y = 1232; 48 | crush_hashmix(a, b, hash); 49 | crush_hashmix(c, x, hash); 50 | crush_hashmix(y, a, hash); 51 | crush_hashmix(b, x, hash); 52 | crush_hashmix(y, c, hash); 53 | return hash; 54 | } 55 | 56 | static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d) { 57 | __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d; 58 | __u32 x = 231232; 59 | __u32 y = 1232; 60 | crush_hashmix(a, b, hash); 61 | crush_hashmix(c, d, hash); 62 | crush_hashmix(a, x, hash); 63 | crush_hashmix(y, b, hash); 64 | crush_hashmix(c, x, hash); 65 | crush_hashmix(y, d, hash); 66 | return hash; 67 | } 68 | 69 | static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, __u32 e) { 70 | __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; 71 | __u32 x = 231232; 72 | __u32 y = 1232; 73 | 
crush_hashmix(a, b, hash); 74 | crush_hashmix(c, d, hash); 75 | crush_hashmix(e, x, hash); 76 | crush_hashmix(y, a, hash); 77 | crush_hashmix(b, x, hash); 78 | crush_hashmix(y, c, hash); 79 | crush_hashmix(d, x, hash); 80 | crush_hashmix(y, e, hash); 81 | return hash; 82 | } 83 | 84 | __u32 crush_hash32(int type, __u32 a) { 85 | switch (type) { 86 | case CRUSH_HASH_RJENKINS1: 87 | return crush_hash32_rjenkins1(a); 88 | default: 89 | return 0; 90 | } 91 | } 92 | 93 | __u32 crush_hash32_2(int type, __u32 a, __u32 b) { 94 | switch (type) { 95 | case CRUSH_HASH_RJENKINS1: 96 | return crush_hash32_rjenkins1_2(a, b); 97 | default: 98 | return 0; 99 | } 100 | } 101 | 102 | __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) { 103 | switch (type) { 104 | case CRUSH_HASH_RJENKINS1: 105 | return crush_hash32_rjenkins1_3(a, b, c); 106 | default: 107 | return 0; 108 | } 109 | } 110 | 111 | __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) { 112 | switch (type) { 113 | case CRUSH_HASH_RJENKINS1: 114 | return crush_hash32_rjenkins1_4(a, b, c, d); 115 | default: 116 | return 0; 117 | } 118 | } 119 | 120 | __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) { 121 | switch (type) { 122 | case CRUSH_HASH_RJENKINS1: 123 | return crush_hash32_rjenkins1_5(a, b, c, d, e); 124 | default: 125 | return 0; 126 | } 127 | } 128 | 129 | const char *crush_hash_name(int type) { 130 | switch (type) { 131 | case CRUSH_HASH_RJENKINS1: 132 | return "rjenkins1"; 133 | default: 134 | return "unknown"; 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /hash.h: -------------------------------------------------------------------------------- 1 | #ifndef CEPH_CRUSH_HASH_H 2 | #define CEPH_CRUSH_HASH_H 3 | 4 | #define CRUSH_HASH_RJENKINS1 0 5 | 6 | #define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1 7 | 8 | extern const char *crush_hash_name(int type); 9 | 10 | extern __u32 crush_hash32(int type, __u32 a); 11 | 
extern __u32 crush_hash32_2(int type, __u32 a, __u32 b); 12 | extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c); 13 | extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d); 14 | extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /libcrush-based-on-ceph0.86.diff: -------------------------------------------------------------------------------- 1 | diff --git a/src/crush/CrushWrapper.h b/src/crush/CrushWrapper.h 2 | index 2d1fa95..2ebf387 100644 3 | --- a/src/crush/CrushWrapper.h 4 | +++ b/src/crush/CrushWrapper.h 5 | @@ -55,9 +55,9 @@ public: 6 | std::map type_map; /* bucket/device type names */ 7 | std::map name_map; /* bucket/device names */ 8 | std::map rule_name_map; 9 | + struct crush_map *crush; 10 | 11 | private: 12 | - struct crush_map *crush; 13 | /* reverse maps */ 14 | bool have_rmaps; 15 | std::map type_rmap, name_rmap, rule_name_rmap; 16 | diff --git a/src/include/rados/librados.hpp b/src/include/rados/librados.hpp 17 | index e6e40b2..e8e2009 100644 18 | --- a/src/include/rados/librados.hpp 19 | +++ b/src/include/rados/librados.hpp 20 | @@ -14,6 +14,10 @@ 21 | #include "librados.h" 22 | #include "rados_types.hpp" 23 | 24 | +extern "C" { 25 | +#include "crush/crush.h" 26 | +} 27 | + 28 | namespace libradosstriper 29 | { 30 | class RadosStriper; 31 | @@ -936,6 +940,9 @@ namespace librados 32 | 33 | /// get/wait for the most recent osdmap 34 | int wait_for_latest_osdmap(); 35 | + const struct crush_map* get_crushmap(); 36 | + void put_crushmap(); 37 | + std::vector<__u32> get_osd_weights(); 38 | 39 | /* 40 | * pool aio 41 | diff --git a/src/librados/RadosClient.h b/src/librados/RadosClient.h 42 | index 9a394b3..3459c87 100755 43 | --- a/src/librados/RadosClient.h 44 | +++ b/src/librados/RadosClient.h 45 | @@ -59,7 +59,6 @@ private: 46 | bool ms_handle_reset(Connection *con); 47 | void 
ms_handle_remote_reset(Connection *con); 48 | 49 | - Objecter *objecter; 50 | 51 | Mutex lock; 52 | Cond cond; 53 | @@ -71,7 +70,6 @@ private: 54 | void *log_cb_arg; 55 | string log_watch; 56 | 57 | - int wait_for_osdmap(); 58 | 59 | public: 60 | Finisher finisher; 61 | @@ -85,6 +83,7 @@ public: 62 | uint64_t get_instance_id(); 63 | 64 | int wait_for_latest_osdmap(); 65 | + int wait_for_osdmap(); 66 | 67 | int create_ioctx(const char *name, IoCtxImpl **io); 68 | 69 | @@ -109,6 +108,7 @@ public: 70 | // watch/notify 71 | uint64_t max_watch_notify_cookie; 72 | map watch_notify_info; 73 | + Objecter *objecter; 74 | 75 | void register_watch_notify_callback(librados::WatchNotifyInfo *wc, 76 | uint64_t *cookie); 77 | diff --git a/src/librados/librados.cc b/src/librados/librados.cc 78 | index 45bde1a..9067a0c 100644 79 | --- a/src/librados/librados.cc 80 | +++ b/src/librados/librados.cc 81 | @@ -1779,6 +1779,30 @@ int librados::Rados::wait_for_latest_osdmap() 82 | return client->wait_for_latest_osdmap(); 83 | } 84 | 85 | +const struct crush_map* librados::Rados::get_crushmap() 86 | +{ 87 | + int ret = client->wait_for_osdmap(); 88 | + if (ret < 0) 89 | + return NULL; 90 | + return client->objecter->get_osdmap_read()->crush->crush; 91 | +} 92 | + 93 | +void librados::Rados::put_crushmap() 94 | +{ 95 | + client->objecter->put_osdmap_read(); 96 | +} 97 | + 98 | +std::vector<__u32> librados::Rados::get_osd_weights() 99 | +{ 100 | + std::vector<__u32> weights; 101 | + int ret = client->wait_for_osdmap(); 102 | + if (ret < 0) 103 | + return weights; 104 | + weights = client->objecter->get_osdmap_read()->osd_weight; 105 | + client->objecter->put_osdmap_read(); 106 | + return weights; 107 | +} 108 | + 109 | librados::PoolAsyncCompletion *librados::Rados::pool_async_create_completion() 110 | { 111 | PoolAsyncCompletionImpl *c = new PoolAsyncCompletionImpl; 112 | diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h 113 | index aafadb6..56cd379 100644 114 | --- a/src/osd/OSDMap.h 115 | 
+++ b/src/osd/OSDMap.h 116 | @@ -220,7 +220,6 @@ private: 117 | }; 118 | ceph::shared_ptr osd_addrs; 119 | 120 | - vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" 121 | vector osd_info; 122 | ceph::shared_ptr< map > > pg_temp; // temp pg mapping (e.g. while we rebuild) 123 | ceph::shared_ptr< map > primary_temp; // temp primary mapping (e.g. while we rebuild) 124 | @@ -242,6 +241,7 @@ private: 125 | 126 | public: 127 | ceph::shared_ptr crush; // hierarchical map 128 | + vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" 129 | 130 | friend class OSDMonitor; 131 | friend class PGMonitor; 132 | -------------------------------------------------------------------------------- /libcrush.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBCRUSH_H 2 | #define LIBCRUSH_H 3 | 4 | #include "client.h" 5 | extern "C" { 6 | #include "crush.h" 7 | #include "hash.h" 8 | #include "mapper.h" 9 | } 10 | 11 | #endif 12 | 13 | -------------------------------------------------------------------------------- /mapper.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #define BUG_ON(x) assert(!(x)) 6 | #define dprintk(args...) fprintf(stderr, args) 7 | #define kmalloc(x, f) malloc(x) 8 | #define kfree(x) free(x) 9 | 10 | #include "crush.h" 11 | #include "hash.h" 12 | #include "mapper.h" 13 | 14 | /* 15 | * Implement the core CRUSH mapping algorithm. 16 | */ 17 | 18 | /** 19 | * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. 
20 | * @map: the crush_map 21 | * @ruleset: the storage ruleset id (user defined) 22 | * @type: storage ruleset type (user defined) 23 | * @size: output set size 24 | */ 25 | int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) { 26 | __u32 i; 27 | 28 | for (i = 0; i < map->max_rules; i++) { 29 | if (map->rules[i] && 30 | map->rules[i]->mask.ruleset == ruleset && 31 | map->rules[i]->mask.type == type && 32 | map->rules[i]->mask.min_size <= size && 33 | map->rules[i]->mask.max_size >= size) 34 | return i; 35 | } 36 | return -1; 37 | } 38 | 39 | 40 | /* 41 | * bucket choose methods 42 | * 43 | * For each bucket algorithm, we have a "choose" method that, given a 44 | * crush input @x and replica position (usually, position in output set) @r, 45 | * will produce an item in the bucket. 46 | */ 47 | 48 | /* 49 | * Choose based on a random permutation of the bucket. 50 | * 51 | * We used to use some prime number arithmetic to do this, but it 52 | * wasn't very random, and had some other bad behaviors. Instead, we 53 | * calculate an actual random permutation of the bucket members. 54 | * Since this is expensive, we optimize for the r=0 case, which 55 | * captures the vast majority of calls. 
56 | */ 57 | static int bucket_perm_choose(struct crush_bucket *bucket, int x, int r) { 58 | unsigned int pr = r % bucket->size; 59 | unsigned int i, s; 60 | 61 | /* start a new permutation if @x has changed */ 62 | if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) { 63 | dprintk("bucket %d new x=%d\n", bucket->id, x); 64 | bucket->perm_x = x; 65 | 66 | /* optimize common r=0 case */ 67 | if (pr == 0) { 68 | s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 69 | bucket->size; 70 | bucket->perm[0] = s; 71 | bucket->perm_n = 0xffff; /* magic value, see below */ 72 | goto out; 73 | } 74 | 75 | for (i = 0; i < bucket->size; i++) 76 | bucket->perm[i] = i; 77 | bucket->perm_n = 0; 78 | } else if (bucket->perm_n == 0xffff) { 79 | /* clean up after the r=0 case above */ 80 | for (i = 1; i < bucket->size; i++) 81 | bucket->perm[i] = i; 82 | bucket->perm[bucket->perm[0]] = 0; 83 | bucket->perm_n = 1; 84 | } 85 | 86 | /* calculate permutation up to pr */ 87 | for (i = 0; i < bucket->perm_n; i++) 88 | dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 89 | while (bucket->perm_n <= pr) { 90 | unsigned int p = bucket->perm_n; 91 | /* no point in swapping the final entry */ 92 | if (p < bucket->size - 1) { 93 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 94 | (bucket->size - p); 95 | if (i) { 96 | unsigned int t = bucket->perm[p + i]; 97 | bucket->perm[p + i] = bucket->perm[p]; 98 | bucket->perm[p] = t; 99 | } 100 | dprintk(" perm_choose swap %d with %d\n", p, p+i); 101 | } 102 | bucket->perm_n++; 103 | } 104 | for (i = 0; i < bucket->size; i++) 105 | dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); 106 | 107 | s = bucket->perm[pr]; 108 | out: 109 | dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 110 | bucket->size, x, r, pr, s); 111 | return bucket->items[s]; 112 | } 113 | 114 | /* uniform */ 115 | static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, int x, int r) { 116 | return bucket_perm_choose(&bucket->h, x, 
r); 117 | } 118 | 119 | /* list */ 120 | static int bucket_list_choose(struct crush_bucket_list *bucket, int x, int r) { 121 | int i; 122 | 123 | for (i = bucket->h.size-1; i >= 0; i--) { 124 | __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], 125 | r, bucket->h.id); 126 | w &= 0xffff; 127 | dprintk("list_choose i=%d x=%d r=%d item %d weight %x " 128 | "sw %x rand %llx", 129 | i, x, r, bucket->h.items[i], bucket->item_weights[i], 130 | bucket->sum_weights[i], w); 131 | w *= bucket->sum_weights[i]; 132 | w = w >> 16; 133 | /*dprintk(" scaled %llx\n", w);*/ 134 | if (w < bucket->item_weights[i]) 135 | return bucket->h.items[i]; 136 | } 137 | 138 | dprintk("bad list sums for bucket %d\n", bucket->h.id); 139 | return bucket->h.items[0]; 140 | } 141 | 142 | 143 | /* (binary) tree */ 144 | static int height(int n) { 145 | int h = 0; 146 | while ((n & 1) == 0) { 147 | h++; 148 | n = n >> 1; 149 | } 150 | return h; 151 | } 152 | 153 | static int left(int x) { 154 | int h = height(x); 155 | return x - (1 << (h-1)); 156 | } 157 | 158 | static int right(int x) { 159 | int h = height(x); 160 | return x + (1 << (h-1)); 161 | } 162 | 163 | static int terminal(int x) { 164 | return x & 1; 165 | } 166 | 167 | static int bucket_tree_choose(struct crush_bucket_tree *bucket, int x, int r) { 168 | int n; 169 | __u32 w; 170 | __u64 t; 171 | 172 | /* start at root */ 173 | n = bucket->num_nodes >> 1; 174 | 175 | while (!terminal(n)) { 176 | int l; 177 | /* pick point in [0, w) */ 178 | w = bucket->node_weights[n]; 179 | t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, 180 | bucket->h.id) * (__u64)w; 181 | t = t >> 32; 182 | 183 | /* descend to the left or right? 
*/ 184 | l = left(n); 185 | if (t < bucket->node_weights[l]) 186 | n = l; 187 | else 188 | n = right(n); 189 | } 190 | 191 | return bucket->h.items[n >> 1]; 192 | } 193 | 194 | 195 | /* straw */ 196 | static int bucket_straw_choose(struct crush_bucket_straw *bucket, int x, int r) { 197 | __u32 i; 198 | int high = 0; 199 | __u64 high_draw = 0; 200 | __u64 draw; 201 | 202 | for (i = 0; i < bucket->h.size; i++) { 203 | draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); 204 | draw &= 0xffff; 205 | draw *= bucket->straws[i]; 206 | if (i == 0 || draw > high_draw) { 207 | high = i; 208 | high_draw = draw; 209 | } 210 | } 211 | return bucket->h.items[high]; 212 | } 213 | 214 | static int crush_bucket_choose(struct crush_bucket *in, int x, int r) { 215 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 216 | BUG_ON(in->size == 0); 217 | switch (in->alg) { 218 | case CRUSH_BUCKET_UNIFORM: 219 | return bucket_uniform_choose((struct crush_bucket_uniform *)in, 220 | x, r); 221 | case CRUSH_BUCKET_LIST: 222 | return bucket_list_choose((struct crush_bucket_list *)in, 223 | x, r); 224 | case CRUSH_BUCKET_TREE: 225 | return bucket_tree_choose((struct crush_bucket_tree *)in, 226 | x, r); 227 | case CRUSH_BUCKET_STRAW: 228 | return bucket_straw_choose((struct crush_bucket_straw *)in, 229 | x, r); 230 | default: 231 | dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 232 | return in->items[0]; 233 | } 234 | } 235 | 236 | /* 237 | * true if device is marked "out" (failed, fully offloaded) 238 | * of the cluster 239 | */ 240 | static int is_out(const struct crush_map *map, 241 | const __u32 *weight, int weight_max, 242 | int item, int x) { 243 | if (item >= weight_max) 244 | return 1; 245 | if (weight[item] >= 0x10000) 246 | return 0; 247 | if (weight[item] == 0) 248 | return 1; 249 | if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) 250 | < weight[item]) 251 | return 0; 252 | return 1; 253 | } 254 | 255 | /** 256 | * crush_choose_firstn - 
choose numrep distinct items of given type 257 | * @map: the crush_map 258 | * @bucket: the bucket we are choose an item from 259 | * @x: crush input value 260 | * @numrep: the number of items to choose 261 | * @type: the type of item to choose 262 | * @out: pointer to output vector 263 | * @outpos: our position in that vector 264 | * @tries: number of attempts to make 265 | * @recurse_tries: number of attempts to have recursive chooseleaf make 266 | * @local_retries: localized retries 267 | * @local_fallback_retries: localized fallback retries 268 | * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 269 | * @vary_r: pass r to recursive calls 270 | * @out2: second output vector for leaf items (if @recurse_to_leaf) 271 | * @parent_r: r value passed from the parent 272 | */ 273 | static int crush_choose_firstn(const struct crush_map *map, 274 | struct crush_bucket *bucket, 275 | const __u32 *weight, int weight_max, 276 | int x, int numrep, int type, 277 | int *out, int outpos, 278 | unsigned int tries, 279 | unsigned int recurse_tries, 280 | unsigned int local_retries, 281 | unsigned int local_fallback_retries, 282 | int recurse_to_leaf, 283 | unsigned int vary_r, 284 | int *out2, 285 | int parent_r) { 286 | int rep; 287 | unsigned int ftotal, flocal; 288 | int retry_descent, retry_bucket, skip_rep; 289 | struct crush_bucket *in = bucket; 290 | int r; 291 | int i; 292 | int item = 0; 293 | int itemtype; 294 | int collide, reject; 295 | 296 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", 297 | recurse_to_leaf ? 
"_LEAF" : "", 298 | bucket->id, x, outpos, numrep, 299 | tries, recurse_tries, local_retries, local_fallback_retries, 300 | parent_r); 301 | 302 | for (rep = outpos; rep < numrep; rep++) { 303 | /* keep trying until we get a non-out, non-colliding item */ 304 | ftotal = 0; 305 | skip_rep = 0; 306 | do { 307 | retry_descent = 0; 308 | in = bucket; /* initial bucket */ 309 | 310 | /* choose through intervening buckets */ 311 | flocal = 0; 312 | do { 313 | collide = 0; 314 | retry_bucket = 0; 315 | r = rep + parent_r; 316 | /* r' = r + f_total */ 317 | r += ftotal; 318 | 319 | /* bucket choose */ 320 | if (in->size == 0) { 321 | reject = 1; 322 | goto reject; 323 | } 324 | if (local_fallback_retries > 0 && 325 | flocal >= (in->size>>1) && 326 | flocal > local_fallback_retries) 327 | item = bucket_perm_choose(in, x, r); 328 | else 329 | item = crush_bucket_choose(in, x, r); 330 | if (item >= map->max_devices) { 331 | dprintk(" bad item %d\n", item); 332 | skip_rep = 1; 333 | break; 334 | } 335 | 336 | /* desired type? */ 337 | if (item < 0) 338 | itemtype = map->buckets[-1-item]->type; 339 | else 340 | itemtype = 0; 341 | dprintk(" item %d type %d\n", item, itemtype); 342 | 343 | /* keep going? */ 344 | if (itemtype != type) { 345 | if (item >= 0 || 346 | (-1-item) >= map->max_buckets) { 347 | dprintk(" bad item type %d\n", type); 348 | skip_rep = 1; 349 | break; 350 | } 351 | in = map->buckets[-1-item]; 352 | retry_bucket = 1; 353 | continue; 354 | } 355 | 356 | /* collision? 
*/ 357 | for (i = 0; i < outpos; i++) { 358 | if (out[i] == item) { 359 | collide = 1; 360 | break; 361 | } 362 | } 363 | 364 | reject = 0; 365 | if (!collide && recurse_to_leaf) { 366 | if (item < 0) { 367 | int sub_r; 368 | if (vary_r) 369 | sub_r = r >> (vary_r-1); 370 | else 371 | sub_r = 0; 372 | if (crush_choose_firstn(map, 373 | map->buckets[-1-item], 374 | weight, weight_max, 375 | x, outpos+1, 0, 376 | out2, outpos, 377 | recurse_tries, 0, 378 | local_retries, 379 | local_fallback_retries, 380 | 0, 381 | vary_r, 382 | NULL, 383 | sub_r) <= outpos) 384 | /* didn't get leaf */ 385 | reject = 1; 386 | } else { 387 | /* we already have a leaf! */ 388 | out2[outpos] = item; 389 | } 390 | } 391 | 392 | if (!reject) { 393 | /* out? */ 394 | if (itemtype == 0) 395 | reject = is_out(map, weight, 396 | weight_max, 397 | item, x); 398 | else 399 | reject = 0; 400 | } 401 | 402 | reject: 403 | if (reject || collide) { 404 | ftotal++; 405 | flocal++; 406 | 407 | if (collide && flocal <= local_retries) 408 | /* retry locally a few times */ 409 | retry_bucket = 1; 410 | else if (local_fallback_retries > 0 && 411 | flocal <= in->size + local_fallback_retries) 412 | /* exhaustive bucket search */ 413 | retry_bucket = 1; 414 | else if (ftotal < tries) 415 | /* then retry descent */ 416 | retry_descent = 1; 417 | else 418 | /* else give up */ 419 | skip_rep = 1; 420 | dprintk(" reject %d collide %d " 421 | "ftotal %u flocal %u\n", 422 | reject, collide, ftotal, 423 | flocal); 424 | } 425 | } while (retry_bucket); 426 | } while (retry_descent); 427 | 428 | if (skip_rep) { 429 | dprintk("skip rep\n"); 430 | continue; 431 | } 432 | 433 | dprintk("CHOOSE got %d\n", item); 434 | out[outpos] = item; 435 | outpos++; 436 | } 437 | 438 | dprintk("CHOOSE returns %d\n", outpos); 439 | return outpos; 440 | } 441 | 442 | 443 | /** 444 | * crush_choose_indep: alternative breadth-first positionally stable mapping 445 | * 446 | */ 447 | static void crush_choose_indep(const struct crush_map 
*map, 448 | struct crush_bucket *bucket, 449 | const __u32 *weight, int weight_max, 450 | int x, int left, int numrep, int type, 451 | int *out, int outpos, 452 | unsigned int tries, 453 | unsigned int recurse_tries, 454 | int recurse_to_leaf, 455 | int *out2, 456 | int parent_r) { 457 | struct crush_bucket *in = bucket; 458 | int endpos = outpos + left; 459 | int rep; 460 | unsigned int ftotal; 461 | int r; 462 | int i; 463 | int item = 0; 464 | int itemtype; 465 | int collide; 466 | 467 | dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 468 | bucket->id, x, outpos, numrep); 469 | 470 | /* initially my result is undefined */ 471 | for (rep = outpos; rep < endpos; rep++) { 472 | out[rep] = CRUSH_ITEM_UNDEF; 473 | if (out2) 474 | out2[rep] = CRUSH_ITEM_UNDEF; 475 | } 476 | 477 | for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { 478 | for (rep = outpos; rep < endpos; rep++) { 479 | if (out[rep] != CRUSH_ITEM_UNDEF) 480 | continue; 481 | 482 | in = bucket; /* initial bucket */ 483 | 484 | /* choose through intervening buckets */ 485 | for (;;) { 486 | /* note: we base the choice on the position 487 | * even in the nested call. that means that 488 | * if the first layer chooses the same bucket 489 | * in a different position, we will tend to 490 | * choose a different item in that bucket. 491 | * this will involve more devices in data 492 | * movement and tend to distribute the load. 
493 | */ 494 | r = rep + parent_r; 495 | 496 | /* be careful */ 497 | if (in->alg == CRUSH_BUCKET_UNIFORM && 498 | in->size % numrep == 0) 499 | /* r'=r+(n+1)*f_total */ 500 | r += (numrep+1) * ftotal; 501 | else 502 | /* r' = r + n*f_total */ 503 | r += numrep * ftotal; 504 | 505 | /* bucket choose */ 506 | if (in->size == 0) { 507 | dprintk(" empty bucket\n"); 508 | break; 509 | } 510 | 511 | item = crush_bucket_choose(in, x, r); 512 | if (item >= map->max_devices) { 513 | dprintk(" bad item %d\n", item); 514 | out[rep] = CRUSH_ITEM_NONE; 515 | if (out2) 516 | out2[rep] = CRUSH_ITEM_NONE; 517 | left--; 518 | break; 519 | } 520 | 521 | /* desired type? */ 522 | if (item < 0) 523 | itemtype = map->buckets[-1-item]->type; 524 | else 525 | itemtype = 0; 526 | dprintk(" item %d type %d\n", item, itemtype); 527 | 528 | /* keep going? */ 529 | if (itemtype != type) { 530 | if (item >= 0 || 531 | (-1-item) >= map->max_buckets) { 532 | dprintk(" bad item type %d\n", type); 533 | out[rep] = CRUSH_ITEM_NONE; 534 | if (out2) 535 | out2[rep] = 536 | CRUSH_ITEM_NONE; 537 | left--; 538 | break; 539 | } 540 | in = map->buckets[-1-item]; 541 | continue; 542 | } 543 | 544 | /* collision? */ 545 | collide = 0; 546 | for (i = outpos; i < endpos; i++) { 547 | if (out[i] == item) { 548 | collide = 1; 549 | break; 550 | } 551 | } 552 | if (collide) 553 | break; 554 | 555 | if (recurse_to_leaf) { 556 | if (item < 0) { 557 | crush_choose_indep(map, 558 | map->buckets[-1-item], 559 | weight, weight_max, 560 | x, 1, numrep, 0, 561 | out2, rep, 562 | recurse_tries, 0, 563 | 0, NULL, r); 564 | if (out2[rep] == CRUSH_ITEM_NONE) { 565 | /* placed nothing; no leaf */ 566 | break; 567 | } 568 | } else { 569 | /* we already have a leaf! */ 570 | out2[rep] = item; 571 | } 572 | } 573 | 574 | /* out? */ 575 | if (itemtype == 0 && 576 | is_out(map, weight, weight_max, item, x)) 577 | break; 578 | 579 | /* yay! 
*/ 580 | out[rep] = item; 581 | left--; 582 | break; 583 | } 584 | } 585 | } 586 | for (rep = outpos; rep < endpos; rep++) { 587 | if (out[rep] == CRUSH_ITEM_UNDEF) { 588 | out[rep] = CRUSH_ITEM_NONE; 589 | } 590 | if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { 591 | out2[rep] = CRUSH_ITEM_NONE; 592 | } 593 | } 594 | } 595 | 596 | /** 597 | * crush_do_rule - calculate a mapping with the given input and rule 598 | * @map: the crush_map 599 | * @ruleno: the rule id 600 | * @x: hash input 601 | * @result: pointer to result vector 602 | * @result_max: maximum result size 603 | * @weight: weight vector (for map leaves) 604 | * @weight_max: size of weight vector 605 | * @scratch: scratch vector for private use; must be >= 3 * result_max 606 | */ 607 | int crush_do_rule(const struct crush_map *map, 608 | int ruleno, int x, int *result, int result_max, 609 | const __u32 *weight, int weight_max, 610 | int *scratch) { 611 | int result_len; 612 | int *a = scratch; 613 | int *b = scratch + result_max; 614 | int *c = scratch + result_max*2; 615 | int recurse_to_leaf; 616 | int *w; 617 | int wsize = 0; 618 | int *o; 619 | int osize; 620 | int *tmp; 621 | struct crush_rule *rule; 622 | __u32 step; 623 | int i, j; 624 | int numrep; 625 | /* 626 | * the original choose_total_tries value was off by one (it 627 | * counted "retries" and not "tries"). add one. 
628 | */ 629 | int choose_tries = map->choose_total_tries + 1; 630 | int choose_leaf_tries = 0; 631 | /* 632 | * the local tries values were counted as "retries", though, 633 | * and need no adjustment 634 | */ 635 | int choose_local_retries = map->choose_local_tries; 636 | int choose_local_fallback_retries = map->choose_local_fallback_tries; 637 | 638 | int vary_r = map->chooseleaf_vary_r; 639 | 640 | if ((__u32)ruleno >= map->max_rules) { 641 | dprintk(" bad ruleno %d\n", ruleno); 642 | return 0; 643 | } 644 | 645 | rule = map->rules[ruleno]; 646 | result_len = 0; 647 | w = a; 648 | o = b; 649 | 650 | for (step = 0; step < rule->len; step++) { 651 | int firstn = 0; 652 | struct crush_rule_step *curstep = &rule->steps[step]; 653 | 654 | switch (curstep->op) { 655 | case CRUSH_RULE_TAKE: 656 | w[0] = curstep->arg1; 657 | wsize = 1; 658 | break; 659 | 660 | case CRUSH_RULE_SET_CHOOSE_TRIES: 661 | if (curstep->arg1 > 0) 662 | choose_tries = curstep->arg1; 663 | break; 664 | 665 | case CRUSH_RULE_SET_CHOOSELEAF_TRIES: 666 | if (curstep->arg1 > 0) 667 | choose_leaf_tries = curstep->arg1; 668 | break; 669 | 670 | case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 671 | if (curstep->arg1 >= 0) 672 | choose_local_retries = curstep->arg1; 673 | break; 674 | 675 | case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 676 | if (curstep->arg1 >= 0) 677 | choose_local_fallback_retries = curstep->arg1; 678 | break; 679 | 680 | case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: 681 | if (curstep->arg1 >= 0) 682 | vary_r = curstep->arg1; 683 | break; 684 | 685 | case CRUSH_RULE_CHOOSELEAF_FIRSTN: 686 | case CRUSH_RULE_CHOOSE_FIRSTN: 687 | firstn = 1; 688 | /* fall through */ 689 | case CRUSH_RULE_CHOOSELEAF_INDEP: 690 | case CRUSH_RULE_CHOOSE_INDEP: 691 | if (wsize == 0) 692 | break; 693 | 694 | recurse_to_leaf = 695 | curstep->op == 696 | CRUSH_RULE_CHOOSELEAF_FIRSTN || 697 | curstep->op == 698 | CRUSH_RULE_CHOOSELEAF_INDEP; 699 | 700 | /* reset output */ 701 | osize = 0; 702 | 703 | for (i = 0; i < wsize; 
i++) { 704 | /* 705 | * see CRUSH_N, CRUSH_N_MINUS macros. 706 | * basically, numrep <= 0 means relative to 707 | * the provided result_max 708 | */ 709 | numrep = curstep->arg1; 710 | if (numrep <= 0) { 711 | numrep += result_max; 712 | if (numrep <= 0) 713 | continue; 714 | } 715 | j = 0; 716 | if (firstn) { 717 | int recurse_tries; 718 | if (choose_leaf_tries) 719 | recurse_tries = 720 | choose_leaf_tries; 721 | else if (map->chooseleaf_descend_once) 722 | recurse_tries = 1; 723 | else 724 | recurse_tries = choose_tries; 725 | osize += crush_choose_firstn( 726 | map, 727 | map->buckets[-1-w[i]], 728 | weight, weight_max, 729 | x, numrep, 730 | curstep->arg2, 731 | o+osize, j, 732 | choose_tries, 733 | recurse_tries, 734 | choose_local_retries, 735 | choose_local_fallback_retries, 736 | recurse_to_leaf, 737 | vary_r, 738 | c+osize, 739 | 0); 740 | } else { 741 | crush_choose_indep( 742 | map, 743 | map->buckets[-1-w[i]], 744 | weight, weight_max, 745 | x, numrep, numrep, 746 | curstep->arg2, 747 | o+osize, j, 748 | choose_tries, 749 | choose_leaf_tries ? 
750 | choose_leaf_tries : 1, 751 | recurse_to_leaf, 752 | c+osize, 753 | 0); 754 | osize += numrep; 755 | } 756 | } 757 | 758 | if (recurse_to_leaf) 759 | /* copy final _leaf_ values to output set */ 760 | memcpy(o, c, osize*sizeof(*o)); 761 | 762 | /* swap o and w arrays */ 763 | tmp = o; 764 | o = w; 765 | w = tmp; 766 | wsize = osize; 767 | break; 768 | 769 | 770 | case CRUSH_RULE_EMIT: 771 | for (i = 0; i < wsize && result_len < result_max; i++) { 772 | result[result_len] = w[i]; 773 | result_len++; 774 | } 775 | wsize = 0; 776 | break; 777 | 778 | default: 779 | dprintk(" unknown op %d at step %d\n", 780 | curstep->op, step); 781 | break; 782 | } 783 | } 784 | return result_len; 785 | } 786 | 787 | 788 | -------------------------------------------------------------------------------- /mapper.h: -------------------------------------------------------------------------------- 1 | #ifndef CEPH_CRUSH_MAPPER_H 2 | #define CEPH_CRUSH_MAPPER_H 3 | 4 | /* 5 | * CRUSH functions for find rules and then mapping an input to an 6 | * output set. 7 | * 8 | * LGPL2 9 | */ 10 | 11 | #include "crush.h" 12 | 13 | extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); 14 | extern int crush_do_rule(const struct crush_map *map, 15 | int ruleno, 16 | int x, int *result, int result_max, 17 | const __u32 *weights, int weight_max, 18 | int *scratch); 19 | 20 | #endif 21 | --------------------------------------------------------------------------------