├── GPUexplore
│   ├── .gitignore
│   └── src
│       └── GPUexplore.cu
└── README.md

/GPUexplore/.gitignore:
--------------------------------------------------------------------------------
1 | /.ptp-sync/
2 | /Debug/
3 | /Release/
4 | .cproject
5 | .project
6 | .settings/
7 | core
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | GPUexplore
2 | ==========
3 | GPUexplore is a model checker implemented in CUDA. At the moment, it can be used to check for deadlocks as well as safety properties. Input files should be in the gpf format (more on that below).
4 | 
5 | GPUexplore supports the following command-line options:
6 | ```
7 | GPUexplore <model> [-b <nr of blocks>] [-t <nr of threads>] [-k <nr of iterations>] [-q <size>] [-v <level>] [--por [--cycle-proviso]] [-d|-p]
8 | Generate the state space of <model>.gpf (without extension), using the options specified:
9 | -b              number of blocks to use
10 | -t              number of threads per block
11 | -k              number of iterations per kernel launch
12 | -q              size of the hash table in number of 32-bit integers
13 | -v              verbosity: 0 is quiet, 1 prints the iterations, 2 prints the number of states, 3 prints state vectors
14 | --por           apply partial-order reduction
15 | --cycle-proviso apply the cycle proviso during POR
16 | -d              check for deadlocks
17 | -p              check for a safety property (should be embedded in the model)
18 | ```
19 | 
20 | Input models
21 | ------------
22 | The input models in gpf format can be generated from EXP models. Several examples can be found [here](http://tilde.snt.utwente.nl/~thomas.neele/GPUexplore-models.tar.gz). Included is a Python script `GPUnetltsgen.py` to perform this conversion. The `-p` option should be used if you later want to apply POR during state-space exploration.
23 | 
24 | Branches
25 | --------
26 | Since it is very hard to merge the three different implementations of partial-order reduction without duplicating code, there are three separate branches: `ample-por` contains a POR implementation based on the ample-set approach, `cample-por` uses the cluster-based ample-set approach, and `stubborn-por` computes the reduction based on stubborn sets.
27 | 
28 | Publications
29 | --------
30 | Anton Wijs, Thomas Neele, Dragan Bosnacki:
31 | GPUexplore 2.0: Unleashing GPU Explicit-State Model Checking. FM 2016: 694-701
32 | 
33 | Thomas Neele, Anton Wijs, Dragan Bosnacki, Jaco van de Pol:
34 | Partial-Order Reduction for GPU Model Checking. ATVA 2016: 357-374
35 | 
36 | Anton Wijs:
37 | BFS-Based Model Checking of Linear-Time Properties with an Application on GPUs. CAV (2) 2016: 472-493
38 | 
39 | Anton Wijs, Dragan Bosnacki:
40 | Many-core on-the-fly model checking of safety properties using GPUs. STTT 18(2): 169-185 (2016)
41 | 
42 | Anton Wijs, Dragan Bosnacki:
43 | GPUexplore: Many-Core On-the-Fly State Space Exploration Using GPUs. TACAS 2014: 233-247
44 | 
45 | 
46 | Previous versions
47 | --------
48 | The version of the code that was used for the ATVA 2016 paper has been tagged with `atva2016` in all three POR branches. Since then, we have mainly developed the cample-set version; the resulting code was used for the FM 2016 publication, and the corresponding commit has been tagged in the `cample-por` branch.
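
Example usage
--------
As an illustration (the parameter values are only an example, taken from the commented-out launch configuration in `GPUexplore.cu`, and `m` is a placeholder model name), the following invocation explores `m.gpf` with 3120 blocks of 512 threads, 10 iterations per kernel launch, iteration output, and deadlock detection:
```
GPUexplore m -b 3120 -t 512 -k 10 -v 1 -d
```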
-------------------------------------------------------------------------------- /GPUexplore/src/GPUexplore.cu: -------------------------------------------------------------------------------- 1 | /* 2 | ============================================================================ 3 | Name : GPUexplore.cu 4 | Author : Anton Wijs and Thomas Neele 5 | Version : 6 | Copyright : Copyright Anton Wijs and Thomas Neele 7 | Description : CUDA GPUexplore: on-the-fly state space analysis 8 | ============================================================================ 9 | */ 10 | 11 | #include <stdio.h> 12 | #include <stdlib.h> 13 | #include <string.h> 14 | #include <stdint.h> 15 | #include <assert.h> 16 | #include <time.h> 17 | #include <math.h> 18 | #include <getopt.h> 19 | #include <cuda_runtime.h> 20 | 21 | // type of elements used 22 | #define inttype uint32_t 23 | // type of indices in hash table 24 | #define indextype uint64_t 25 | 26 | enum BucketEntryStatus { EMPTY, TAKEN, FOUND }; 27 | enum PropertyStatus { NONE, DEADLOCK, SAFETY, LIVENESS }; 28 | 29 | #define MIN(a,b) \ 30 | ({ __typeof__ (a) _a = (a); \ 31 | __typeof__ (b) _b = (b); \ 32 | _a < _b ? _a : _b; }) 33 | 34 | #define MAX(a,b) \ 35 | ({ __typeof__ (a) _a = (a); \ 36 | __typeof__ (b) _b = (b); \ 37 | _a > _b ? _a : _b; }) 38 | 39 | // Nr of tiles processed in single kernel launch 40 | //#define TILEITERS 10 41 | 42 | static const int WARPSIZE = 32; 43 | static const int HALFWARPSIZE = 16; 44 | static const int INTSIZE = 32; 45 | static const int BUFFERSIZE = 256; 46 | 47 | // GPU constants 48 | __constant__ inttype d_nrbuckets; 49 | __constant__ inttype d_shared_q_size; 50 | __constant__ inttype d_nr_procs; 51 | __constant__ inttype d_max_buf_ints; 52 | __constant__ inttype d_sv_nints; 53 | __constant__ inttype d_bits_act; 54 | __constant__ inttype d_nbits_offset; 55 | __constant__ inttype d_kernel_iters; 56 | __constant__ inttype d_nbits_syncbits_offset; 57 | __constant__ PropertyStatus d_property; 58 | __constant__ inttype d_apply_por; 59 | __constant__ inttype d_check_cycle_proviso; 60 | 61 | // GPU shared memory array 62 | extern __shared__ volatile inttype shared[]; 63 | 64 | // thread ids 65 | #define WARP_ID (threadIdx.x / WARPSIZE) 66 | #define GLOBAL_WARP_ID (((blockDim.x / WARPSIZE)*blockIdx.x)+WARP_ID) 67 | #define NR_WARPS ((blockDim.x / WARPSIZE)*gridDim.x) 68 | #define LANE (threadIdx.x % WARPSIZE) 69 | #define HALFLANE (threadIdx.x % HALFWARPSIZE) 70 | //#define ENTRY_ID (LANE % d_sv_nints) 71 | #define ENTRY_ID (HALFLANE % d_sv_nints) 72 | #define GROUP_ID (LANE % d_nr_procs) 73 | #define GROUP_GID (WARP_ID * GROUPS_PER_WARP + LANE / d_nr_procs) 74 | #define NR_GROUPS ((blockDim.x / WARPSIZE) * GROUPS_PER_WARP) 75 | #define GROUPS_PER_WARP (WARPSIZE / d_nr_procs) 76 | // Group id to lane and lane to group id macros 77 | #define GTL(i) (LANE - GROUP_ID + (i)) 78 | #define LTG(i) ((i) - (LANE - GROUP_ID)) 79 | 80 | //#define NREL_IN_BUCKET ((WARPSIZE / d_sv_nints)) 81 | #define NREL_IN_BUCKET ((HALFWARPSIZE / d_sv_nints)*2) 82 | #define NREL_IN_BUCKET_HOST ((HALFWARPSIZE / sv_nints)*2) 83 | 84 | // constant for cuckoo hashing (Alcantara et al) 85 | static const inttype P = 979946131; 86 | // Retry constant to determine number of retries for element insertion 87 | #define RETRYFREQ 7 88 | #define NR_HASH_FUNCTIONS 8 89 | // Number of retries in local cache 90 | #define CACHERETRYFREQ 20
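
// Illustrative sketch (hypothetical helper, not used elsewhere): the bucket
// layout implied by NREL_IN_BUCKET above. A bucket is one warp-sized line of
// 32 integers; state vectors of sv_nints integers are packed per half warp,
// so a vector never crosses the 16-integer half-warp boundary.
static inline int bucket_capacity(int sv_nints) {
	// vectors per half warp, times two half warps per bucket
	return (HALFWARPSIZE / sv_nints) * 2;
}
// e.g. for sv_nints = 3: (16/3)*2 = 10 vectors per bucket; 32 - 10*3 = 2 of the 32 slots stay empty.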
91 | // Maximum size of state vectors (in nr. of 32-bit integers) 92 | #define MAX_SIZE 9 93 | // Empty state vectors 94 | static const inttype EMPTYVECT32 = 0x7FFFFFFF; 95 | // Constant to indicate that no more work is required 96 | #define EXPLORATION_DONE 0x7FFFFFFF 97 | // offset in shared memory from which loaded data can be read 98 | static const int SH_OFFSET = 5; 99 | //static const int KERNEL_ITERS = 10; 100 | //static const int NR_OF_BLOCKS = 3120; 101 | //static const int BLOCK_SIZE = 512; 102 | static const int KERNEL_ITERS = 1; 103 | static const int NR_OF_BLOCKS = 1; 104 | static const int BLOCK_SIZE = 32; 105 | const size_t Mb = 1<<20; 106 | 107 | // test macros 108 | #define PRINTTHREADID() {printf("Hello thread %d\n", (blockIdx.x*blockDim.x)+threadIdx.x);} 109 | #define PRINTTHREAD(j, i) {printf("%d: Seen by thread %d: %d\n", (j), (blockIdx.x*blockDim.x)+threadIdx.x, (i));} 110 | 111 | // Offset calculations for shared memory arrays 112 | #define HASHCONSTANTSLEN (2*NR_HASH_FUNCTIONS) 113 | #define VECTORPOSLEN (d_nr_procs+1) 114 | #define LTSSTATESIZELEN (d_nr_procs) 115 | #define OPENTILELEN (d_sv_nints*NR_GROUPS) 116 | #define LASTSEARCHLEN (blockDim.x/WARPSIZE) 117 | #define TGTSTATELEN (blockDim.x*d_sv_nints) 118 | #define THREADBUFFERLEN (NR_GROUPS*(THREADBUFFERSHARED+(d_nr_procs*d_max_buf_ints))) 119 | 120 | #define HASHCONSTANTSOFFSET (SH_OFFSET) 121 | #define VECTORPOSOFFSET (HASHCONSTANTSOFFSET+HASHCONSTANTSLEN) 122 | #define LTSSTATESIZEOFFSET (VECTORPOSOFFSET+VECTORPOSLEN) 123 | #define OPENTILEOFFSET (LTSSTATESIZEOFFSET+LTSSTATESIZELEN) 124 | #define LASTSEARCHOFFSET (OPENTILEOFFSET+OPENTILELEN) 125 | #define TGTSTATEOFFSET (LASTSEARCHOFFSET+LASTSEARCHLEN) 126 | #define THREADBUFFEROFFSET (TGTSTATEOFFSET+TGTSTATELEN) 127 | #define CACHEOFFSET (THREADBUFFEROFFSET+THREADBUFFERLEN) 128 | 129 | // One int for sync action counter 130 | // One int for POR counter 131 | #define THREADBUFFERSHARED 2 132 | // parameter is thread id 133 | #define THREADBUFFERGROUPSTART(i) (THREADBUFFEROFFSET+ (((i) / WARPSIZE)*GROUPS_PER_WARP+(((i) % WARPSIZE) / d_nr_procs)) * (THREADBUFFERSHARED+(d_nr_procs*d_max_buf_ints))) 134 | // parameter is group id 135 | #define THREADBUFFERGROUPPOS(i, j) shared[tbgs+THREADBUFFERSHARED+((i)*d_max_buf_ints)+(j)] 136 | #define THREADGROUPCOUNTER shared[tbgs] 137 | #define THREADGROUPPOR shared[tbgs + 1] 138 | 139 | #define THREADINGROUP (LANE < (GROUPS_PER_WARP)*d_nr_procs) 140 | 141 | #define STATESIZE(i) (shared[LTSSTATESIZEOFFSET+(i)]) 142 | #define VECTORSTATEPOS(i) (shared[VECTORPOSOFFSET+(i)]) 143 | #define NR_OF_STATES_IN_TRANSENTRY(i) ((31 - d_bits_act) / shared[LTSSTATESIZEOFFSET+(i)]) 144 | // SM local progress flags 145 | #define ITERATIONS (shared[0]) 146 | #define CONTINUE (shared[1]) 147 | #define OPENTILECOUNT (shared[2]) 148 | #define WORKSCANRESULT (shared[3]) 149 | #define SCAN (shared[4]) 150 | 151 | // BIT MANIPULATION MACROS 152 | 153 | #define SETBIT(i, x) {(x) = ((1<<(i)) | (x));} 154 | #define GETBIT(i, x) (((x) >> (i)) & 1) 155 | #define SETBITS(i, j, x) {(x) = (x) | (((1<<(j))-1)^((1<<(i))-1));} 156 | #define GETBITS(x, y, start, len) {(x) = ((y) >> (start)) & ((1 << (len)) - 1);} 157 | #define GETPROCTRANSACT(a, t) GETBITS(a, t, 1, d_bits_act) 158 | #define GETPROCTRANSSYNC(a, t) {(a) = ((t) & 1);} 159 | #define GETPROCTRANSSTATE(a, t, i, j) GETBITS(a, t, 1+d_bits_act+(i)*STATESIZE(j), STATESIZE(j)) 160 | #define GETTRANSOFFSET(a, t, i) GETBITS(a, t, (i)*d_nbits_offset, d_nbits_offset) 161 | #define GETSYNCOFFSET(a, t, i) GETBITS(a, t, (i)*d_nbits_syncbits_offset, d_nbits_syncbits_offset)
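
// Illustrative sketch (hypothetical helper): how the bit macros above unpack a
// transition entry. Bit 0 holds the sync flag (GETPROCTRANSSYNC) and the next
// d_bits_act bits hold the action label (GETPROCTRANSACT); here a 5-bit action
// label is assumed so the example can run on the host.
static inline void decode_entry_example(uint32_t entry) {
	uint32_t sync, act;
	GETPROCTRANSSYNC(sync, entry);     // sync = entry & 1
	GETBITS(act, entry, 1, 5);         // GETPROCTRANSACT with d_bits_act == 5
	printf("sync=%u act=%u\n", sync, act);
}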
162 | #define GETSTATEVECTORSTATE(b, t, i) { asm("{\n\t" \ 163 | " .reg .u64 t1;\n\t" \ 164 | " mov.b64 t1,{%1,%2};\n\t" \ 165 | " bfe.u64 t1, t1, %3, %4;\n\t" \ 166 | " cvt.u32.u64 %0,t1;\n\t" \ 167 | "}" : "=r"(b) : "r"((t)[VECTORSTATEPOS(i)/INTSIZE]), "r"(VECTORSTATEPOS(i)/INTSIZE == (VECTORSTATEPOS((i)+1)-1)/INTSIZE ? 0 : (t)[VECTORSTATEPOS(i)/INTSIZE+1]), \ 168 | "r"(VECTORSTATEPOS(i)%INTSIZE), "r"(VECTORSTATEPOS(i+1)-VECTORSTATEPOS(i))); \ 169 | } 170 | #define SETSTATEVECTORSTATE(t, i, x) { asm("bfi.b32 %0, %1, %0, %2, %3;" \ 171 | : "+r"((t)[VECTORSTATEPOS(i)/INTSIZE]) : \ 172 | "r"(x), "r"(VECTORSTATEPOS(i)%INTSIZE), "r"(VECTORSTATEPOS((i)+1)-VECTORSTATEPOS(i))); \ 173 | if (VECTORSTATEPOS(i)/INTSIZE != (VECTORSTATEPOS((i)+1)-1)/INTSIZE) { \ 174 | asm("bfi.b32 %0, %1, %0, %2, %3;" \ 175 | : "+r"((t)[VECTORSTATEPOS(i+1)/INTSIZE]) : \ 176 | "r"((x)>>(INTSIZE - (VECTORSTATEPOS(i) % INTSIZE))), "r"(0), "r"(VECTORSTATEPOS((i)+1) % INTSIZE)); \ 177 | } \ 178 | } 179 | // NEEDS FIX: USE BIT 32 OF FIRST INTEGER TO INDICATE STATE OR NOT (1 or 0), IN CASE MULTIPLE INTEGERS ARE USED FOR STATE VECTOR!!! 180 | //#define ISSTATE(t) ((t)[(d_sv_nints-1)] != EMPTYVECT32) 181 | #define ISSTATE(t) ((t)[0] != EMPTYVECT32) 182 | #define SETNEWSTATE(t) { (t)[(d_sv_nints-1)] = (t)[(d_sv_nints-1)] | 0x80000000;} 183 | #define SETOLDSTATE(t) { (t)[(d_sv_nints-1)] = (t)[(d_sv_nints-1)] & 0x7FFFFFFF;} 184 | #define ISNEWSTATE(t) ((t)[(d_sv_nints-1)] >> 31) 185 | #define ISNEWSTATE_HOST(t) ((t)[(sv_nints-1)] >> 31) 186 | #define ISNEWINT(t) ((t) >> 31) 187 | #define OLDINT(t) ((t) & 0x7FFFFFFF) 188 | #define NEWINT(t) ((t) | 0x80000000) 189 | 190 | #define SETPORSTATE(t) { (t)[(d_sv_nints-1)] = (t)[(d_sv_nints-1)] | 0x40000000;} 191 | #define SETOTHERSTATE(t) { (t)[(d_sv_nints-1)] = (t)[(d_sv_nints-1)] & 0xBFFFFFFF;} 192 | #define ISPORSTATE(t) (ISPORINT((t)[(d_sv_nints-1)])) 193 | #define ISPORSTATE_HOST(t) (ISPORINT((t)[(sv_nints-1)])) 194 | #define ISPORINT(t) (((t) & 0x40000000) >> 30) 195 | #define OTHERINT(t) ((t) & 0xBFFFFFFF) 196 | #define PORINT(t) ((t) | 0x40000000) 197 | 198 | #define STATE_FLAGS_MASK (d_apply_por ? 0x3FFFFFFF : 0x7FFFFFFF) 199 | #define STRIPSTATE(t) {(t)[(d_sv_nints-1)] = (t)[(d_sv_nints-1)] & STATE_FLAGS_MASK;} 200 | #define STRIPPEDSTATE(t, i) ((i == d_sv_nints-1) ? ((t)[i] & STATE_FLAGS_MASK) : (t)[i]) 201 | #define STRIPPEDENTRY(t, i) ((i == d_sv_nints-1) ? ((t) & STATE_FLAGS_MASK) : (t)) 202 | #define STRIPPEDENTRY_HOST(t, i) ((i == sv_nints-1) ? ((t) & (apply_por ? 0x3FFFFFFF : 0x7FFFFFFF)) : (t)) 203 | #define NEWSTATEPART(t, i) (((i) == d_sv_nints-1) ? ((t)[d_sv_nints-1] | 0x80000000) : (t)[(i)])
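
// Illustrative sketch (hypothetical helper): the flag bits kept in the last
// integer of a state vector. Bit 31 marks a state as new (unexplored), bit 30
// marks it as a POR candidate; the STRIPPED* macros remove whichever flag bits
// are in use. A host-side check of the encoding:
static inline void flag_bits_example(void) {
	inttype v = 5;
	assert(ISNEWINT(NEWINT(v)) == 1);          // bit 31 set
	assert(ISNEWINT(OLDINT(NEWINT(v))) == 0);  // bit 31 cleared again
	assert(ISPORINT(PORINT(v)) == 1);          // bit 30 set
	assert(OTHERINT(PORINT(v)) == v);          // bit 30 cleared
}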
204 | #define COMPAREENTRIES(t1, t2) (((t1) & STATE_FLAGS_MASK) == ((t2) & STATE_FLAGS_MASK)) 205 | #define GETSYNCRULE(a, t, i) GETBITS(a, t, (i)*d_nr_procs, d_nr_procs) 206 | 207 | // HASH TABLE MACROS 208 | 209 | // Returns 0 if the state was inserted, 8 if the cache is full; otherwise bit 0 is set (the state was found), bit 1 holds its 'new' flag and bit 2 its POR flag 210 | __device__ inttype STOREINCACHE(volatile inttype* t, inttype* cache, inttype* address) { 211 | inttype bi, bj, bk, bl, bitmask; 212 | indextype hashtmp; 213 | STRIPSTATE(t); 214 | hashtmp = 0; 215 | for (bi = 0; bi < d_sv_nints; bi++) { 216 | hashtmp += t[bi]; 217 | hashtmp <<= 5; 218 | } 219 | bitmask = d_sv_nints*((inttype) (hashtmp % ((d_shared_q_size - CACHEOFFSET) / d_sv_nints))); 220 | SETNEWSTATE(t); 221 | bl = 0; 222 | while (bl < CACHERETRYFREQ) { 223 | bi = atomicCAS((inttype *) &cache[bitmask+(d_sv_nints-1)], EMPTYVECT32, t[d_sv_nints-1]); 224 | if (bi == EMPTYVECT32) { 225 | for (bj = 0; bj < d_sv_nints-1; bj++) { 226 | cache[bitmask+bj] = t[bj]; 227 | } 228 | *address = bitmask; 229 | return 0; 230 | } 231 | if (COMPAREENTRIES(bi, t[d_sv_nints-1])) { 232 | if (d_sv_nints == 1) { 233 | *address = bitmask; 234 | return 1 + (ISNEWINT(bi) << 1) + (ISPORINT(bi) << 2); 235 | } 236 | else { 237 | for (bj = 0; bj < d_sv_nints-1; bj++) { 238 | if (cache[bitmask+bj] != (t)[bj]) { 239 | break; 240 | } 241 | } 242 | if (bj == d_sv_nints-1) { 243 | *address = bitmask; 244 | return 1 + (ISNEWINT(bi) << 1) + (ISPORINT(bi) << 2); 245 | } 246 | } 247 | } 248 | if (!ISNEWINT(bi)) { 249 | bj = atomicCAS((inttype *) &cache[bitmask+(d_sv_nints-1)], bi, t[d_sv_nints-1]); 250 | if (bi == bj) { 251 | for (bk = 0; bk < d_sv_nints-1; bk++) { 252 | cache[bitmask+bk] = t[bk]; 253 | } 254 | *address = bitmask; 255 | return 0; 256 | } 257 | } 258 | bl++; 259 | bitmask += d_sv_nints; 260 | if ((bitmask+(d_sv_nints-1)) >= (d_shared_q_size - CACHEOFFSET)) { 261 | bitmask = 0; 262 | } 263 | } 264 | return 8; 265 | } 266 | 267 | // Mark the state in the cache according to markNew 268 | // This function is used while applying POR to decide whether the cycle proviso 269 | // is satisfied.
270 | __device__ void MARKINCACHE(volatile inttype* t, inttype* cache, int markNew) { 271 | inttype bi, bj, bl, bitmask; 272 | indextype hashtmp; 273 | STRIPSTATE(t); 274 | hashtmp = 0; 275 | for (bi = 0; bi < d_sv_nints; bi++) { 276 | hashtmp += t[bi]; 277 | hashtmp <<= 5; 278 | } 279 | bitmask = d_sv_nints*((inttype) (hashtmp % ((d_shared_q_size - CACHEOFFSET) / d_sv_nints))); 280 | SETNEWSTATE(t); 281 | bl = 0; 282 | while (bl < CACHERETRYFREQ) { 283 | bi = cache[bitmask+(d_sv_nints-1)]; 284 | if (COMPAREENTRIES(bi, t[d_sv_nints-1])) { 285 | for (bj = 0; bj < d_sv_nints-1; bj++) { 286 | if (cache[bitmask+bj] != (t)[bj]) { 287 | break; 288 | } 289 | } 290 | if (bj == d_sv_nints-1) { 291 | if(markNew) { 292 | cache[bitmask+(d_sv_nints-1)] = NEWINT(OTHERINT(cache[bitmask+(d_sv_nints-1)] & STATE_FLAGS_MASK)); 293 | } else if(ISPORINT(bi) && ISNEWINT(bi)){ 294 | atomicCAS((inttype*) &cache[bitmask+(d_sv_nints-1)], bi, OLDINT(bi)); 295 | } 296 | return; 297 | } 298 | } 299 | bl++; 300 | bitmask += d_sv_nints; 301 | if ((bitmask+(d_sv_nints-1)) >= (d_shared_q_size - CACHEOFFSET)) { 302 | bitmask = 0; 303 | } 304 | } 305 | } 306 | 307 | // hash functions use bj variable 308 | #define FIRSTHASH(a, t) { hashtmp = 0; \ 309 | for (bj = 0; bj < d_sv_nints; bj++) { \ 310 | hashtmp += STRIPPEDSTATE(t,bj); \ 311 | hashtmp <<= 5; \ 312 | } \ 313 | hashtmp = (indextype) (d_h[0]*hashtmp+d_h[1]); \ 314 | (a) = WARPSIZE*((inttype)(hashtmp % P) % d_nrbuckets); \ 315 | } 316 | #define FIRSTHASHHOST(a) { indextype hashtmp = 0; \ 317 | hashtmp = (indextype) h[1]; \ 318 | (a) = WARPSIZE*((inttype) ((hashtmp % P) % q_size/WARPSIZE)); \ 319 | } 320 | #define HASHALL(a, i, t) { hashtmp = 0; \ 321 | for (bj = 0; bj < d_sv_nints; bj++) { \ 322 | hashtmp += STRIPPEDSTATE(t,bj); \ 323 | hashtmp <<= 5; \ 324 | } \ 325 | hashtmp = (indextype) (shared[HASHCONSTANTSOFFSET+(2*(i))]*(hashtmp)+shared[HASHCONSTANTSOFFSET+(2*(i))+1]); \ 326 | (a) = WARPSIZE*((inttype)(hashtmp % P) % d_nrbuckets); \ 327 | } 328 | #define HASHFUNCTION(a, i, t) ((HASHALL((a), (i), (t)))) 329 | 330 | #define COMPAREVECTORS(a, t1, t2) { (a) = 1; \ 331 | for (bk = 0; bk < d_sv_nints-1; bk++) { \ 332 | if ((t1)[bk] != (t2)[bk]) { \ 333 | (a) = 0; break; \ 334 | } \ 335 | } \ 336 | if ((a)) { \ 337 | if (STRIPPEDSTATE((t1),bk) != STRIPPEDSTATE((t2),bk)) { \ 338 | (a) = 0; \ 339 | } \ 340 | } \ 341 | } 342 | 343 | // check if bucket element associated with lane is a valid position to store data 344 | #define LANEPOINTSTOVALIDBUCKETPOS (HALFLANE < ((HALFWARPSIZE / d_sv_nints)*d_sv_nints)) 345 | 346 | __device__ inttype LANE_POINTS_TO_EL(inttype i) { 347 | if (i < HALFWARPSIZE / d_sv_nints) { 348 | return (LANE >= i*d_sv_nints && LANE < (i+1)*d_sv_nints); 349 | } 350 | else { 351 | return (LANE >= HALFWARPSIZE+(i-(HALFWARPSIZE / d_sv_nints))*d_sv_nints && LANE < HALFWARPSIZE+(i-(HALFWARPSIZE / d_sv_nints)+1)*d_sv_nints); 352 | } 353 | } 354 | 355 | // start position of element i in bucket 356 | #define STARTPOS_OF_EL_IN_BUCKET(i) ((i < (HALFWARPSIZE / d_sv_nints)) ? (i*d_sv_nints) : (HALFWARPSIZE + (i-(HALFWARPSIZE/d_sv_nints))*d_sv_nints)) 357 | #define STARTPOS_OF_EL_IN_BUCKET_HOST(i) ((i < (HALFWARPSIZE / sv_nints)) ? (i*sv_nints) : (HALFWARPSIZE + (i-(HALFWARPSIZE/sv_nints))*sv_nints)) 358 | 359 | 360 | // find or put element, warp version. 
t is element stored in block cache 361 | __device__ inttype FINDORPUT_WARP(inttype* t, inttype* d_q, volatile inttype* d_newstate_flags, inttype claim_work) { 362 | inttype bi, bj, bk, bl, bitmask; 363 | indextype hashtmp; 364 | inttype hash; 365 | BucketEntryStatus threadstatus; 366 | // prepare bitmask once to reason about results of threads in the same (state vector) group 367 | bitmask = 0; 368 | if (LANEPOINTSTOVALIDBUCKETPOS) { 369 | SETBITS(LANE-ENTRY_ID, LANE-ENTRY_ID+d_sv_nints, bitmask); 370 | } 371 | for (bi = 0; bi < NR_HASH_FUNCTIONS; bi++) { 372 | HASHFUNCTION(hash, bi, t); 373 | bl = d_q[hash+LANE]; 374 | bk = __ballot(STRIPPEDENTRY(bl, ENTRY_ID) == STRIPPEDSTATE(t, ENTRY_ID)); 375 | // threadstatus is used to determine whether full state vector has been found 376 | threadstatus = EMPTY; 377 | if (LANEPOINTSTOVALIDBUCKETPOS) { 378 | if ((bk & bitmask) == bitmask) { 379 | threadstatus = FOUND; 380 | } 381 | } 382 | if (__ballot(threadstatus == FOUND) != 0) { 383 | // state vector has been found in bucket. mark local copy as old. 384 | if (LANE == 0) { 385 | SETOLDSTATE(t); 386 | } 387 | return 1; 388 | } 389 | // try to find empty position to insert new state vector 390 | threadstatus = (bl == EMPTYVECT32 && LANEPOINTSTOVALIDBUCKETPOS) ? EMPTY : TAKEN; 391 | // let bk hold the smallest index of an available empty position 392 | bk = __ffs(__ballot(threadstatus == EMPTY)); 393 | while (bk != 0) { 394 | // write the state vector 395 | bk--; 396 | if (LANE >= bk && LANE < bk+d_sv_nints) { 397 | bl = atomicCAS(&(d_q[hash+LANE]), EMPTYVECT32, t[ENTRY_ID]); 398 | if (bl == EMPTYVECT32) { 399 | // success 400 | if (ENTRY_ID == d_sv_nints-1) { 401 | SETOLDSTATE(t); 402 | } 403 | // try to claim the state vector for future work 404 | bl = OPENTILELEN; 405 | if (ENTRY_ID == d_sv_nints-1) { 406 | // try to increment the OPENTILECOUNT counter 407 | if (claim_work && (bl = atomicAdd((inttype *) &OPENTILECOUNT, d_sv_nints)) < OPENTILELEN) { 408 | d_q[hash+LANE] = t[d_sv_nints-1]; 409 | } else { 410 | // There is work available for some block 411 | __threadfence(); 412 | d_newstate_flags[(hash / blockDim.x) % gridDim.x] = 1; 413 | } 414 | } 415 | // all active threads read the OPENTILECOUNT value of the last thread, and possibly store their part of the vector in the shared memory 416 | bl = __shfl(bl, LANE-ENTRY_ID+d_sv_nints-1); 417 | if (bl < OPENTILELEN) { 418 | // write part of vector to shared memory 419 | shared[OPENTILEOFFSET+bl+ENTRY_ID] = NEWSTATEPART(t, ENTRY_ID); 420 | } 421 | // write was successful. propagate this to the whole warp by setting threadstatus to FOUND 422 | threadstatus = FOUND; 423 | } 424 | else { 425 | // write was not successful. check if the state vector now in place equals the one we are trying to insert 426 | bk = __ballot(STRIPPEDENTRY(bl, ENTRY_ID) == STRIPPEDSTATE(t, ENTRY_ID)); 427 | if ((bk & bitmask) == bitmask) { 428 | // state vector has been found in bucket. mark local copy as old. 429 | if (LANE == bk) { 430 | SETOLDSTATE(t); 431 | } 432 | // propagate this result to the whole warp 433 | threadstatus = FOUND; 434 | } 435 | else { 436 | // state vector is different, and position in bucket is taken 437 | threadstatus = TAKEN; 438 | } 439 | } 440 | } 441 | // check if the state vector was either encountered or inserted 442 | if (__ballot(threadstatus == FOUND) != 0) { 443 | return 1; 444 | } 445 | // recompute bk 446 | bk = __ffs(__ballot(threadstatus == EMPTY)); 447 | } 448 | } 449 | return 0; 450 | } 451 | 452 | // find element, warp version. 
t is element stored in block cache 453 | // return 0 if not found or found and new, nonzero if found and old 454 | __device__ inttype FIND_WARP(inttype* t, inttype* d_q) { 455 | inttype bi, bj, bk, bl, bitmask; 456 | indextype hashtmp; 457 | BucketEntryStatus threadstatus; 458 | // prepare bitmask once to reason about results of threads in the same (state vector) group 459 | bitmask = 0; 460 | if (LANEPOINTSTOVALIDBUCKETPOS) { 461 | SETBITS(LANE-ENTRY_ID, LANE-ENTRY_ID+d_sv_nints, bitmask); 462 | } 463 | for (bi = 0; bi < NR_HASH_FUNCTIONS; bi++) { 464 | HASHFUNCTION(hashtmp, bi, t); 465 | bl = d_q[hashtmp+LANE]; 466 | bk = __ballot(STRIPPEDENTRY(bl, ENTRY_ID) == STRIPPEDSTATE(t, ENTRY_ID)); 467 | // threadstatus is used to determine whether full state vector has been found 468 | threadstatus = EMPTY; 469 | if (LANEPOINTSTOVALIDBUCKETPOS) { 470 | if ((bk & bitmask) == bitmask) { 471 | threadstatus = FOUND; 472 | } 473 | } 474 | if (__ballot(threadstatus == FOUND) != 0) { 475 | // state vector has been found in bucket. mark local copy as old. 476 | if (threadstatus == FOUND & ISNEWINT(bl) == 0 & ENTRY_ID == d_sv_nints - 1) { 477 | SETOLDSTATE(t); 478 | } 479 | SETPORSTATE(t); 480 | return __ballot(threadstatus == FOUND & ISNEWINT(bl) == 0 & ENTRY_ID == d_sv_nints - 1); 481 | } 482 | // try to find empty position 483 | threadstatus = (bl == EMPTYVECT32 && LANEPOINTSTOVALIDBUCKETPOS) ? EMPTY : TAKEN; 484 | if(__any(threadstatus == EMPTY)) { 485 | // There is an empty slot in this bucket and the state vector was not found 486 | // State will also not be found after rehashing, so we return 0 487 | SETPORSTATE(t); 488 | return 0; 489 | } 490 | } 491 | SETPORSTATE(t); 492 | return 0; 493 | } 494 | 495 | // macro to print state vector 496 | #define PRINTVECTOR(s) { printf ("("); \ 497 | for (bk = 0; bk < d_nr_procs; bk++) { \ 498 | GETSTATEVECTORSTATE(bj, (s), bk) \ 499 | printf ("%d", bj); \ 500 | if (bk < (d_nr_procs-1)) { \ 501 | printf (","); \ 502 | } \ 503 | } \ 504 | printf (")\n"); \ 505 | } 506 | 507 | 508 | int vmem = 0; 509 | 510 | // GPU textures 511 | texture<inttype, 1, cudaReadModeElementType> tex_proc_offsets_start; 512 | texture<inttype, 1, cudaReadModeElementType> tex_proc_offsets; 513 | texture<inttype, 1, cudaReadModeElementType> tex_proc_trans_start; 514 | texture<inttype, 1, cudaReadModeElementType> tex_proc_trans; 515 | texture<inttype, 1, cudaReadModeElementType> tex_syncbits_offsets; 516 | texture<inttype, 1, cudaReadModeElementType> tex_syncbits; 517 |
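
// Illustrative sketch (hypothetical device helper): the warp-cooperative
// service pattern used by FINDORPUT_WARP and FIND_WARP above, and by
// store_cache_overflow_warp further below. Lanes vote with __ballot, __ffs
// elects the lowest lane whose predicate holds, and the whole warp then works
// on that lane's request before the next vote.
__device__ void serve_lanes_example(int need_service) {
	while (int c = __ballot(need_service)) {
		int active_lane = __ffs(c) - 1;  // lowest lane still waiting
		// ... all 32 lanes of the warp cooperate on behalf of active_lane ...
		if (LANE == active_lane) {
			need_service = 0;            // this lane's request has been served
		}
	}
}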
518 | /** 519 | * This macro checks return value of the CUDA runtime call and exits 520 | * the application if the call failed. 521 | */ 522 | #define CUDA_CHECK_RETURN(value) { \ 523 | cudaError_t _m_cudaStat = value; \ 524 | if (_m_cudaStat != cudaSuccess) { \ 525 | fprintf(stderr, "Error %s at line %d in file %s\n", \ 526 | cudaGetErrorString(_m_cudaStat), __LINE__, __FILE__); \ 527 | exit(1); \ 528 | } } 529 | 530 | //wrapper around cudaMalloc to count allocated memory and check for error while allocating 531 | int cudaMallocCount(void **ptr, int size) { 532 | cudaError_t err = cudaSuccess; 533 | vmem += size; 534 | err = cudaMalloc(ptr, size); 535 | if (err) { 536 | printf("Error %s at line %d in file %s\n", cudaGetErrorString(err), __LINE__, __FILE__); 537 | exit(1); 538 | } 539 | fprintf (stdout, "allocated %d\n", size); 540 | return size; 541 | } 542 | 543 | //test function to print a given state vector 544 | void print_statevector(FILE* stream, inttype *state, inttype *firstbit_statevector, inttype nr_procs, inttype sv_nints, inttype apply_por) { 545 | inttype i, s, bitmask; 546 | 547 | for (i = 0; i < nr_procs; i++) { 548 | bitmask = 0; 549 | if (firstbit_statevector[i]/INTSIZE == firstbit_statevector[i+1]/INTSIZE) { 550 | bitmask = (((1<<(firstbit_statevector[i+1] % INTSIZE))-1)^((1<<(firstbit_statevector[i] % INTSIZE))-1)); 551 | s = (state[firstbit_statevector[i]/INTSIZE] & bitmask) >> (firstbit_statevector[i] % INTSIZE); 552 | } 553 | else { 554 | bitmask = 1 << (firstbit_statevector[i+1] % INTSIZE); 555 | s = (state[firstbit_statevector[i]/INTSIZE] >> (firstbit_statevector[i] % INTSIZE) 556 | | (state[firstbit_statevector[i+1]/INTSIZE] & bitmask) << (INTSIZE - (firstbit_statevector[i] % INTSIZE))); 557 | } 558 | fprintf (stream, "%d", s); 559 | if (i < (nr_procs-1)) { 560 | fprintf (stream, ","); 561 | } 562 | } 563 | fprintf (stream, " "); 564 | for (i = 0; i < sv_nints; i++) { 565 | fprintf (stream, "%d ", STRIPPEDENTRY_HOST(state[i], i)); 566 | } 567 | fprintf (stream, "\n"); 568 | } 569 | 570 | //test function to print the contents of the device queue 571 | void print_queue(inttype *d_q, inttype q_size, inttype *firstbit_statevector, inttype nr_procs, inttype sv_nints, inttype apply_por) { 572 | inttype *q_test = (inttype*) malloc(sizeof(inttype)*q_size); 573 | cudaMemcpy(q_test, d_q, q_size*sizeof(inttype), cudaMemcpyDeviceToHost); 574 | inttype nw; 575 | int count = 0; 576 | int newcount = 0; 577 | for (inttype i = 0; i < (q_size/WARPSIZE); i++) { 578 | for (inttype j = 0; j < NREL_IN_BUCKET_HOST; j++) { 579 | if (q_test[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)+(sv_nints-1)] != EMPTYVECT32) { 580 | count++; 581 | nw = ISNEWSTATE_HOST(&q_test[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)]); 582 | if (nw) { 583 | newcount++; 584 | fprintf (stdout, "new: "); 585 | } 586 | print_statevector(stdout, &(q_test[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)]), firstbit_statevector, nr_procs, sv_nints, apply_por); 587 | } 588 | } 589 | } 590 | fprintf (stdout, "nr.
of states in hash table: %d (%d unexplored states)\n", count, newcount); 591 | } 592 | 593 | //test function to print the contents of the device queue 594 | void print_local_queue(FILE* stream, inttype *q, inttype q_size, inttype *firstbit_statevector, inttype nr_procs, inttype sv_nints, inttype apply_por) { 595 | int count = 0, newcount = 0; 596 | inttype nw; 597 | for (inttype i = 0; i < (q_size/WARPSIZE); i++) { 598 | for (inttype j = 0; j < NREL_IN_BUCKET_HOST; j++) { 599 | if (q[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)+(sv_nints-1)] != EMPTYVECT32) { 600 | count++; 601 | 602 | nw = ISNEWSTATE_HOST(&q[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)]); 603 | if (nw) { 604 | newcount++; 605 | fprintf (stream, "new: "); 606 | } 607 | print_statevector(stream, &(q[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)]), firstbit_statevector, nr_procs, sv_nints, apply_por); 608 | } 609 | } 610 | } 611 | fprintf (stream, "nr. of states in hash table: %d (%d unexplored states)\n", count, newcount); 612 | } 613 | 614 | //test function to count the contents of the device queue 615 | void count_queue(inttype *d_q, inttype q_size, inttype *firstbit_statevector, inttype nr_procs, inttype sv_nints) { 616 | inttype *q_test = (inttype*) malloc(sizeof(inttype)*q_size); 617 | cudaMemcpy(q_test, d_q, q_size*sizeof(inttype), cudaMemcpyDeviceToHost); 618 | 619 | int count = 0; 620 | for (inttype i = 0; i < (q_size/WARPSIZE); i++) { 621 | for (inttype j = 0; j < NREL_IN_BUCKET_HOST; j++) { 622 | if (q_test[(i*WARPSIZE)+STARTPOS_OF_EL_IN_BUCKET_HOST(j)+(sv_nints-1)] != EMPTYVECT32) { 623 | count++; 624 | } 625 | } 626 | } 627 | fprintf (stdout, "nr. of states in hash table: %d\n", count); 628 | } 629 | 630 | //test function to count the contents of the host queue 631 | void count_local_queue(inttype *q, inttype q_size, inttype *firstbit_statevector, inttype nr_procs, inttype sv_nints) { 632 | int count = 0, newcount = 0; 633 | inttype nw; 634 | inttype nrbuckets = q_size / WARPSIZE; 635 | inttype nrels = NREL_IN_BUCKET_HOST; 636 | for (inttype i = 0; i < nrbuckets; i++) { 637 | for (inttype j = 0; j < nrels; j++) { 638 | inttype elpos = STARTPOS_OF_EL_IN_BUCKET_HOST(j); 639 | inttype abselpos = (i*WARPSIZE)+elpos+sv_nints-1; 640 | inttype q_abselpos = q[abselpos]; 641 | if (q_abselpos != EMPTYVECT32) { 642 | count++; 643 | nw = ISNEWSTATE_HOST(&q[(i*WARPSIZE)+elpos]); 644 | if (nw) { 645 | newcount++; 646 | } 647 | } 648 | } 649 | } 650 | fprintf (stdout, "nr. 
of states in hash table: %d (%d unexplored states)\n", count, newcount); 651 | } 652 | 653 | /** 654 | * CUDA kernel function to initialise the queue 655 | */ 656 | __global__ void init_queue(inttype *d_q, inttype n_elem) { 657 | inttype nthreads = blockDim.x*gridDim.x; 658 | inttype i = (blockIdx.x *blockDim.x) + threadIdx.x; 659 | 660 | for(; i < n_elem; i += nthreads) { 661 | d_q[i] = (inttype) EMPTYVECT32; 662 | } 663 | } 664 | 665 | /** 666 | * CUDA kernel to store initial state in hash table 667 | */ 668 | __global__ void store_initial(inttype *d_q, inttype *d_h, inttype *d_newstate_flags, inttype blockdim, inttype griddim) { 669 | inttype bj, hash; 670 | indextype hashtmp; 671 | inttype state[MAX_SIZE]; 672 | 673 | for (bj = 0; bj < d_sv_nints; bj++) { 674 | state[bj] = 0; 675 | } 676 | SETNEWSTATE(state); 677 | FIRSTHASH(hash, state); 678 | for (bj = 0; bj < d_sv_nints; bj++) { 679 | d_q[hash+bj] = state[bj]; 680 | } 681 | d_newstate_flags[(hash / blockdim) % griddim] = 1; 682 | } 683 | 684 | /** 685 | * Kernel that counts the amount of states in global memory 686 | */ 687 | __global__ void count_states(inttype *d_q, inttype *result) { 688 | if(threadIdx.x == 0) { 689 | shared[0] = 0; 690 | } 691 | __syncthreads(); 692 | int localResult = 0; 693 | for(int i = GLOBAL_WARP_ID; i < d_nrbuckets; i += NR_WARPS) { 694 | int tmp = d_q[i*WARPSIZE+LANE]; 695 | if (ENTRY_ID == (d_sv_nints-1) && tmp != EMPTYVECT32) { 696 | localResult++; 697 | } 698 | } 699 | atomicAdd((unsigned int*)shared, localResult); 700 | __syncthreads(); 701 | if(threadIdx.x == 0) { 702 | atomicAdd(result, shared[0]); 703 | } 704 | } 705 | 706 | // When the cache overflows, use the whole warp to store states to global memory 707 | __device__ void store_cache_overflow_warp(inttype *d_q, volatile inttype *d_newstate_flags, int has_overflow) { 708 | while(int c = __ballot(has_overflow)) { 709 | int active_lane = __ffs(c) - 1; 710 | int bj = FINDORPUT_WARP((inttype*) &shared[TGTSTATEOFFSET + (threadIdx.x-LANE+active_lane)*d_sv_nints], d_q, d_newstate_flags, 0); 711 | if(LANE == active_lane) { 712 | has_overflow = 0; 713 | if(bj == 0) { 714 | CONTINUE = 2; 715 | } 716 | } 717 | } 718 | } 719 | 720 | // Copy all states from the cache to global memory 721 | __device__ void copy_cache_to_global(inttype *d_q, inttype* cache, volatile inttype *d_newstate_flags) { 722 | int k = (d_shared_q_size-CACHEOFFSET)/d_sv_nints; 723 | for (int i = WARP_ID; i * WARPSIZE < k; i += (blockDim.x / WARPSIZE)) { 724 | int have_new_state = i * WARPSIZE + LANE < k && ISNEWSTATE(&cache[(i*WARPSIZE+LANE)*d_sv_nints]); 725 | while (int c = __ballot(have_new_state)) { 726 | int active_lane = __ffs(c) - 1; 727 | if(FINDORPUT_WARP((inttype*) &cache[(i*WARPSIZE+active_lane)*d_sv_nints], d_q, d_newstate_flags, 1) == 0) { 728 | CONTINUE = 2; 729 | } 730 | if (LANE == active_lane) { 731 | have_new_state = 0; 732 | } 733 | } 734 | } 735 | } 736 | 737 | /** 738 | * CUDA kernel function for BFS iteration state gathering 739 | * Order of data in the shared queue: 740 | * (0. index of process LTS states sizes) 741 | * (1. index of sync rules offsets) 742 | * (2. index of sync rules) 743 | * (1. index of open queue tile) 744 | * 0. the 'iterations' flag to count the number of iterations so far (nr of tiles processed by SM) 745 | * 1. the 'continue' flag for thread work 746 | * (4. index of threads buffer) 747 | * (5. index of hash table) 748 | * 2. constants for d_q hash functions (2 per function, in total 8 by default) 749 | * 3. 
state vector offsets (nr_procs+1 elements) 750 | * 4. sizes of states in process LTS states (nr_procs elements) 751 | * (9. sync rules + offsets (nr_syncbits_offsets + nr_syncbits elements)) 752 | * 5. tile of open queue to be processed by block (sv_nints*(blockDim.x / nr_procs) elements) 753 | * 6. buffer for threads ((blockDim.x*max_buf_ints)+(blockDim.x/nr_procs) elements) 754 | * 7. hash table 755 | */ 756 | __global__ void 757 | __launch_bounds__(512, 2) 758 | gather(inttype *d_q, const inttype *d_h, const inttype *d_bits_state, 759 | const inttype *d_firstbit_statevector, inttype *d_contBFS, inttype *d_property_violation, 760 | volatile inttype *d_newstate_flags, inttype *d_worktiles, const inttype scan) { 761 | inttype i, k, l, index, offset1, offset2, tmp, cont, act, sync_offset1, sync_offset2; 762 | volatile inttype* src_state = &shared[OPENTILEOFFSET+d_sv_nints*GROUP_GID]; 763 | volatile inttype* tgt_state = &shared[TGTSTATEOFFSET+threadIdx.x*d_sv_nints]; 764 | inttype* cache = (inttype*) &shared[CACHEOFFSET]; 765 | inttype bitmask, bi; 766 | int pos; 767 | int tbgs = THREADBUFFERGROUPSTART(threadIdx.x); 768 | // TODO 769 | // is at least one outgoing transition enabled for a given state (needed to detect deadlocks) 770 | inttype outtrans_enabled; 771 | 772 | // Reset the shared variables 773 | if (threadIdx.x < SH_OFFSET) { 774 | shared[threadIdx.x] = 0; 775 | } 776 | // Load the hash constants into shared memory 777 | for (int j = threadIdx.x; j < HASHCONSTANTSLEN; j += blockDim.x) { 778 | shared[j+HASHCONSTANTSOFFSET] = d_h[j]; 779 | } 780 | // Load the state sizes and offsets into shared memory 781 | for (int j = threadIdx.x; j < VECTORPOSLEN; j += blockDim.x) { 782 | VECTORSTATEPOS(j) = d_firstbit_statevector[j]; 783 | } 784 | for (int j = threadIdx.x; j < LTSSTATESIZELEN; j += blockDim.x) { 785 | STATESIZE(j) = d_bits_state[j]; 786 | } 787 | // Clean the cache 788 | for (int j = threadIdx.x; j < (d_shared_q_size - (cache-shared)); j += blockDim.x) { 789 | cache[j] = EMPTYVECT32; 790 | } 791 | if(scan) { 792 | // Copy the work tile from global mem 793 | if (threadIdx.x < OPENTILELEN + LASTSEARCHLEN) { 794 | shared[OPENTILEOFFSET+threadIdx.x] = d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + threadIdx.x]; 795 | } 796 | if(threadIdx.x == 0) { 797 | OPENTILECOUNT = d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + OPENTILELEN + LASTSEARCHLEN]; 798 | } 799 | } else if (threadIdx.x < OPENTILELEN+LASTSEARCHLEN) { 800 | // On first run: initialize the work tile to empty 801 | shared[OPENTILEOFFSET+threadIdx.x] = threadIdx.x < OPENTILELEN ? 
EMPTYVECT32 : 0; 802 | } 803 | __syncthreads(); 804 | while (ITERATIONS < d_kernel_iters) { 805 | if (threadIdx.x == 0 && OPENTILECOUNT < OPENTILELEN && d_newstate_flags[blockIdx.x]) { 806 | // Indicate that we are scanning 807 | d_newstate_flags[blockIdx.x] = 2; 808 | SCAN = 1; 809 | } 810 | __syncthreads(); 811 | // Scan the open set for work; we use the OPENTILECOUNT flag at this stage to count retrieved elements 812 | if (SCAN) { 813 | inttype last_search_location = shared[LASTSEARCHOFFSET + WARP_ID]; 814 | // This block should be able to find a new state 815 | int found_new_state = 0; 816 | for (i = GLOBAL_WARP_ID; i < d_nrbuckets && OPENTILECOUNT < OPENTILELEN; i += NR_WARPS) { 817 | int loc = i + last_search_location; 818 | if(loc >= d_nrbuckets) { 819 | last_search_location = -i + GLOBAL_WARP_ID; 820 | loc = i + last_search_location; 821 | } 822 | tmp = d_q[loc*WARPSIZE+LANE]; 823 | l = EMPTYVECT32; 824 | if (ENTRY_ID == (d_sv_nints-1)) { 825 | if (ISNEWINT(tmp)) { 826 | found_new_state = 1; 827 | // try to increment the OPENTILECOUNT counter, if successful, store the state 828 | l = atomicAdd((uint32_t *) &OPENTILECOUNT, d_sv_nints); 829 | if (l < OPENTILELEN) { 830 | d_q[loc*WARPSIZE+LANE] = OLDINT(tmp); 831 | } 832 | } 833 | } 834 | // all threads read the OPENTILECOUNT value of the 'tail' thread, and possibly store their part of the vector in the shared memory 835 | if (LANEPOINTSTOVALIDBUCKETPOS) { 836 | l = __shfl(l, LANE-ENTRY_ID+d_sv_nints-1); 837 | if (l < OPENTILELEN) { 838 | // write part of vector to shared memory 839 | shared[OPENTILEOFFSET+l+ENTRY_ID] = tmp; 840 | } 841 | } 842 | } 843 | if(i < d_nrbuckets) { 844 | last_search_location = i - GLOBAL_WARP_ID; 845 | } else { 846 | last_search_location = 0; 847 | } 848 | if(LANE == 0) { 849 | // Store the last search location, so we can continue from that point later on 850 | shared[LASTSEARCHOFFSET + WARP_ID] = last_search_location; 851 | } 852 | if(found_new_state || i < d_nrbuckets) { 853 | WORKSCANRESULT = 1; 854 | } 855 | } 856 | __syncthreads(); 857 | // if work has been retrieved, indicate this 858 | if (threadIdx.x == 0) { 859 | if (OPENTILECOUNT > 0) { 860 | (*d_contBFS) = 1; 861 | } 862 | if(SCAN && WORKSCANRESULT == 0 && d_newstate_flags[blockIdx.x] == 2) { 863 | // Scanning has completed and no new states were found by this block, 864 | // save this information to prevent unnecessary scanning later on 865 | d_newstate_flags[blockIdx.x] = 0; 866 | } else { 867 | WORKSCANRESULT = 0; 868 | } 869 | } 870 | // is the thread part of an 'active' group? 871 | offset1 = 0; 872 | offset2 = 0; 873 | // Reset the whole thread buffer (shared + private) 874 | int start = THREADBUFFEROFFSET; 875 | int end = THREADBUFFEROFFSET + THREADBUFFERLEN; 876 | for(int j = start + threadIdx.x; j < end; j+=blockDim.x) { 877 | shared[j] = 0; 878 | } 879 | if (THREADINGROUP) { 880 | // Is there work? 
881 | if (ISSTATE(src_state)) { 882 | // Gather the required transition information for all states in the tile 883 | i = tex1Dfetch(tex_proc_offsets_start, GROUP_ID); 884 | // Determine process state 885 | GETSTATEVECTORSTATE(cont, src_state, GROUP_ID); 886 | // Offset position 887 | index = cont/(INTSIZE/d_nbits_offset); 888 | pos = cont - (index*(INTSIZE/d_nbits_offset)); 889 | tmp = tex1Dfetch(tex_proc_offsets, i+index); 890 | GETTRANSOFFSET(offset1, tmp, pos); 891 | if (pos == (INTSIZE/d_nbits_offset)-1) { 892 | tmp = tex1Dfetch(tex_proc_offsets, i+index+1); 893 | GETTRANSOFFSET(offset2, tmp, 0); 894 | } 895 | else { 896 | GETTRANSOFFSET(offset2, tmp, pos+1); 897 | } 898 | } 899 | } 900 | // variable cont is used to indicate whether the buffer content of this thread still needs processing 901 | cont = 0; 902 | outtrans_enabled = 0; 903 | // First, generate successors following from local actions 904 | while (1) { 905 | i = 1; 906 | if(offset1 < offset2) { 907 | tmp = tex1Dfetch(tex_proc_trans, offset1); 908 | GETPROCTRANSSYNC(i, tmp); 909 | } 910 | if (__any(i == 0)) { 911 | if(i == 0) { 912 | // no deadlock 913 | outtrans_enabled = 1; 914 | // construct state 915 | for (int j = 0; j < d_sv_nints; j++) { 916 | tgt_state[j] = src_state[j]; 917 | } 918 | offset1++; 919 | } 920 | // loop over this transentry 921 | for (int j = 0; __any(i == 0 && j < NR_OF_STATES_IN_TRANSENTRY(GROUP_ID)); j++) { 922 | if(i == 0) { 923 | GETPROCTRANSSTATE(pos, tmp, j, GROUP_ID); 924 | if (pos > 0) { 925 | SETSTATEVECTORSTATE(tgt_state, GROUP_ID, pos-1); 926 | // check for violation of safety property, if required 927 | if (d_property == SAFETY) { 928 | if (GROUP_ID == d_nr_procs-1) { 929 | // pos contains state id + 1 930 | // error state is state 1 931 | if (pos == 2) { 932 | // error state found 933 | (*d_property_violation) = 1; 934 | } 935 | } 936 | } 937 | // store tgt_state in cache 938 | // if k == 8, cache is full, immediately store in global hash table 939 | k = STOREINCACHE(tgt_state, cache, &bi); 940 | } else { 941 | i = 1; 942 | } 943 | } 944 | store_cache_overflow_warp(d_q, d_newstate_flags, i == 0 && k == 8); 945 | } 946 | } else { 947 | break; 948 | } 949 | } 950 | // Now there are only synchronizing actions left 951 | act = 1 << d_bits_act; 952 | // While the hash table is not full and there are transitions left, 953 | // explore those transitions 954 | while (CONTINUE != 2 && __any(offset1 < offset2 || cont)) { 955 | if (offset1 < offset2 && !cont) { 956 | // Fill the buffer with transitions with the same action label 957 | tmp = tex1Dfetch(tex_proc_trans, offset1); 958 | GETPROCTRANSACT(act, tmp); 959 | // store transition entry 960 | THREADBUFFERGROUPPOS(GROUP_ID,0) = tmp; 961 | cont = 1; 962 | offset1++; 963 | bitmask = act; 964 | for (int j = 1; j < d_max_buf_ints; j++) { 965 | tmp = 0; 966 | if(offset1 < offset2 && act == bitmask) { 967 | tmp = tex1Dfetch(tex_proc_trans, offset1); 968 | GETPROCTRANSACT(bitmask, tmp); 969 | if (act == bitmask) { 970 | offset1++; 971 | } else { 972 | tmp = 0; 973 | } 974 | } 975 | THREADBUFFERGROUPPOS(GROUP_ID,j) = tmp; 976 | j++; 977 | } 978 | } 979 | int sync_act = act; 980 | if (__popc((__ballot(cont) >> (LANE - GROUP_ID)) & ((1 << d_nr_procs) - 1)) > 1) { 981 | // Find the smallest 'sync_act' with butterfly reduction 982 | for(int j = 1; j < d_nr_procs; j<<=1) { 983 | sync_act = min(__shfl(sync_act, GTL((GROUP_ID + j) % d_nr_procs)), sync_act); 984 | } 985 | } else { 986 | // Only one process with synchronizing transitions left, there will 987 | // 
be no more successors from this state 988 | cont = 0; 989 | offset1 = offset2; 990 | sync_act = 1 << d_bits_act; 991 | } 992 | // Now, we have obtained the info needed to combine process transitions 993 | sync_offset1 = sync_offset2 = 0; 994 | // Find out which processes have the smallest 'act' 995 | int proc_enabled = (__ballot(act == sync_act) >> (LANE - GROUP_ID)) & ((1 << d_nr_procs) - 1); 996 | // Only generate synchronizing successors if there are more that two processes with 'sync_act' enabled 997 | if(sync_act < (1 << d_bits_act) && (__popc(proc_enabled) >= 2)) { 998 | // syncbits Offset position 999 | i = sync_act/(INTSIZE/d_nbits_syncbits_offset); 1000 | pos = sync_act - (i*(INTSIZE/d_nbits_syncbits_offset)); 1001 | l = tex1Dfetch(tex_syncbits_offsets, i); 1002 | GETSYNCOFFSET(sync_offset1, l, pos); 1003 | pos++; 1004 | if (pos == (INTSIZE/d_nbits_syncbits_offset)) { 1005 | l = tex1Dfetch(tex_syncbits_offsets, i+1); 1006 | pos = 0; 1007 | } 1008 | GETSYNCOFFSET(sync_offset2, l, pos); 1009 | } 1010 | // iterate through the relevant syncbit filters 1011 | for (int j = GROUP_ID;__any(sync_offset1 + j / (INTSIZE/d_nr_procs) < sync_offset2);) { 1012 | 1013 | tmp = 0; 1014 | // Keep searching the array with sync rules until we have found an applicable rule or we have reached the end 1015 | // We don't need to check for THREADINGROUP, since sync_offset1 == sync_offset2 for threads outside a group 1016 | while(!(tmp != 0 && (tmp & proc_enabled) == tmp) && sync_offset1 + j / (INTSIZE/d_nr_procs) < sync_offset2) { 1017 | // Fetch the rule 1018 | index = tex1Dfetch(tex_syncbits, sync_offset1 + j / (INTSIZE/d_nr_procs)); 1019 | GETSYNCRULE(tmp, index, j % (INTSIZE/d_nr_procs)); 1020 | // Increase the counter such that threads that have not found an applicable sync rule take a smaller step 1021 | j += d_nr_procs - __popc((__ballot(tmp != 0 && (tmp & proc_enabled) == tmp) >> (LANE - GROUP_ID)) & ((1 << GROUP_ID) - 1)); 1022 | } 1023 | // Find the smallest index j for the next iteration 1024 | // We don't need to check for THREADINGROUP because there is no thread 1025 | // outside of a group with GROUP_ID == d_nr_procs - 1 1026 | if(j >= d_nr_procs - 1 && THREADGROUPCOUNTER < j) { 1027 | atomicMax((inttype*) &THREADGROUPCOUNTER, j); 1028 | } 1029 | 1030 | int work_remaining = 0; 1031 | int has_second_succ = 0; 1032 | // start combining entries in the buffer to create target states 1033 | if (tmp != 0 && (tmp & proc_enabled) == tmp) { 1034 | // source state is not a deadlock 1035 | outtrans_enabled = 1; 1036 | // copy src_state into tgt_state 1037 | for (pos = 0; pos < d_sv_nints; pos++) { 1038 | tgt_state[pos] = src_state[pos]; 1039 | } 1040 | // construct first successor 1041 | for (int rule = tmp; rule;) { 1042 | pos = __ffs(rule) - 1; 1043 | // get first state 1044 | GETPROCTRANSSTATE(k, THREADBUFFERGROUPPOS(pos,0), 0, pos); 1045 | SETSTATEVECTORSTATE(tgt_state, pos, k-1); 1046 | // Check if this buffer has a second state 1047 | GETPROCTRANSSTATE(k, THREADBUFFERGROUPPOS(pos,0), 1, pos); 1048 | if(d_max_buf_ints > 1 && !k) { 1049 | GETPROCTRANSSTATE(k, THREADBUFFERGROUPPOS(pos,1), 0, pos); 1050 | } 1051 | if(k) { 1052 | has_second_succ |= 1 << pos; 1053 | } 1054 | rule &= ~(1 << pos); 1055 | } 1056 | work_remaining = 1 + has_second_succ; 1057 | } 1058 | // while we keep getting new states, store them 1059 | while (__any(work_remaining)) { 1060 | l = 0; 1061 | if(work_remaining) { 1062 | // check for violation of safety property, if required 1063 | if (d_property == SAFETY) { 1064 | 
GETSTATEVECTORSTATE(pos, tgt_state, d_nr_procs-1); 1065 | if (pos == 1) { 1066 | // error state found 1067 | (*d_property_violation) = 1; 1068 | } 1069 | } 1070 | 1071 | // store tgt_state in cache; if i == d_shared_q_size, state was found, duplicate detected 1072 | // if i == d_shared_q_size+1, cache is full, immediately store in global hash table 1073 | l = STOREINCACHE(tgt_state, cache, &bitmask); 1074 | if(work_remaining == 1) { 1075 | // There will be no second successor 1076 | work_remaining = 0; 1077 | } 1078 | } 1079 | store_cache_overflow_warp(d_q, d_newstate_flags, l == 8); 1080 | if(work_remaining) { 1081 | // get next successor by finding the next combination from the buffer 1082 | // Only look at processes that stored more than one successor in the buffer (has_second_succ) 1083 | int rule; 1084 | for (rule = has_second_succ; rule;) { 1085 | pos = __ffs(rule) - 1; 1086 | int curr_st; 1087 | GETSTATEVECTORSTATE(curr_st, tgt_state, pos); 1088 | int st = 0; 1089 | int num_states_in_trans = NR_OF_STATES_IN_TRANSENTRY(pos); 1090 | // We search for the position of the current state in the buffer 1091 | // We don't have to compare the last position: if curr_st has not been found yet, 1092 | // then it has to be in the last position 1093 | for (k = 0; k < d_max_buf_ints * num_states_in_trans - 1; k++) { 1094 | GETPROCTRANSSTATE(st, THREADBUFFERGROUPPOS(pos,k / num_states_in_trans), k % num_states_in_trans, pos); 1095 | if (curr_st == (st-1) || st == 0) { 1096 | break; 1097 | } 1098 | } 1099 | // Try to get the next element 1100 | k++; 1101 | if (k < d_max_buf_ints * num_states_in_trans && st != 0) { 1102 | // Retrieve next element, insert it in 'tgt_state' if it is not 0, and return result, otherwise continue 1103 | GETPROCTRANSSTATE(st, THREADBUFFERGROUPPOS(pos,k / num_states_in_trans), k % num_states_in_trans, pos); 1104 | if (st > 0) { 1105 | SETSTATEVECTORSTATE(tgt_state, pos, st-1); 1106 | break; 1107 | } 1108 | } 1109 | // else, set this process state to first one, and continue to next process 1110 | GETPROCTRANSSTATE(st, THREADBUFFERGROUPPOS(pos,0), 0, pos); 1111 | SETSTATEVECTORSTATE(tgt_state, pos, st-1); 1112 | rule &= ~(1 << pos); 1113 | } 1114 | // did we find a successor? if not, all successors have been generated 1115 | if (rule == 0) { 1116 | work_remaining = 0; 1117 | } 1118 | } 1119 | } 1120 | 1121 | j = THREADINGROUP ? THREADGROUPCOUNTER + GROUP_ID + 1 : 0; 1122 | } 1123 | 1124 | // only active threads should reset 'cont' 1125 | if (cont && sync_act == act) { 1126 | cont = 0; 1127 | act = 1 << d_bits_act; 1128 | THREADGROUPCOUNTER = 0; 1129 | } 1130 | } 1131 | 1132 | // have we encountered a deadlock state? 
1133 | // we use the shared memory to communicate this to the group leaders 1134 | if (d_property == DEADLOCK) { 1135 | if (THREADINGROUP) { 1136 | if (ISSTATE(src_state)) { 1137 | THREADBUFFERGROUPPOS(GROUP_ID, 0) = outtrans_enabled; 1138 | // group leader collects results 1139 | l = 0; 1140 | if (GROUP_ID == 0) { 1141 | for (i = 0; i < d_nr_procs; i++) { 1142 | l += THREADBUFFERGROUPPOS(i, 0); 1143 | } 1144 | if (l == 0) { 1145 | // deadlock state found 1146 | (*d_property_violation) = 1; 1147 | } 1148 | } 1149 | } 1150 | } 1151 | } 1152 | int performed_work = OPENTILECOUNT != 0; 1153 | __syncthreads(); 1154 | // Reset the work tile count 1155 | if (threadIdx.x == 0) { 1156 | OPENTILECOUNT = 0; 1157 | } 1158 | __syncthreads(); 1159 | // start scanning the local cache and write results to the global hash table 1160 | if(performed_work) { 1161 | copy_cache_to_global(d_q, cache, d_newstate_flags); 1162 | } 1163 | __syncthreads(); 1164 | // Write empty state vector to part of the work tile that is not used 1165 | if (threadIdx.x < OPENTILELEN - OPENTILECOUNT) { 1166 | shared[OPENTILEOFFSET+OPENTILECOUNT+threadIdx.x] = EMPTYVECT32; 1167 | } 1168 | // Ready to start next iteration, if error has not occurred 1169 | if (threadIdx.x == 0) { 1170 | if (CONTINUE == 2) { 1171 | (*d_contBFS) = 2; 1172 | ITERATIONS = d_kernel_iters; 1173 | } 1174 | else { 1175 | ITERATIONS++; 1176 | } 1177 | CONTINUE = 0; 1178 | } 1179 | __syncthreads(); 1180 | } 1181 | 1182 | //Copy the work tile to global mem 1183 | if (threadIdx.x < OPENTILELEN+LASTSEARCHLEN) { 1184 | d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + threadIdx.x] = shared[OPENTILEOFFSET+threadIdx.x]; 1185 | } 1186 | if(threadIdx.x == 0) { 1187 | d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + OPENTILELEN+LASTSEARCHLEN] = OPENTILECOUNT; 1188 | } 1189 | } 1190 | 1191 | __global__ void 1192 | __launch_bounds__(512, 2) 1193 | gather_por(inttype *d_q, inttype *d_h, inttype *d_bits_state, 1194 | inttype *d_firstbit_statevector, inttype *d_proc_offsets_start, 1195 | inttype *d_proc_offsets, inttype *d_proc_trans, inttype *d_syncbits_offsets, 1196 | inttype *d_syncbits, inttype *d_contBFS, inttype *d_property_violation, 1197 | volatile inttype *d_newstate_flags, inttype *d_worktiles, inttype scan) { 1198 | inttype i, k, l, index, offset1, offset2, tmp, cont, act, sync_offset1, sync_offset2; 1199 | volatile inttype* src_state = &shared[OPENTILEOFFSET+d_sv_nints*GROUP_GID]; 1200 | volatile inttype* tgt_state = &shared[TGTSTATEOFFSET+threadIdx.x*d_sv_nints]; 1201 | inttype* cache = (inttype*) &shared[CACHEOFFSET]; 1202 | inttype bitmask, bi, bj; 1203 | int pos; 1204 | int tbgs = THREADBUFFERGROUPSTART(threadIdx.x); 1205 | // TODO: remove this 1206 | inttype TMPVAR; 1207 | // is at least one outgoing transition enabled for a given state (needed to detect deadlocks) 1208 | inttype outtrans_enabled; 1209 | 1210 | // Locally store the state sizes and syncbits 1211 | if (threadIdx.x < SH_OFFSET) { 1212 | shared[threadIdx.x] = 0; 1213 | } 1214 | for (i = threadIdx.x; i < HASHCONSTANTSLEN; i += blockDim.x) { 1215 | shared[i+HASHCONSTANTSOFFSET] = d_h[i]; 1216 | } 1217 | for (i = threadIdx.x; i < VECTORPOSLEN; i += blockDim.x) { 1218 | VECTORSTATEPOS(i) = d_firstbit_statevector[i]; 1219 | } 1220 | for (i = threadIdx.x; i < LTSSTATESIZELEN; i += blockDim.x) { 1221 | STATESIZE(i) = d_bits_state[i]; 1222 | } 1223 | // Clean the cache 1224 | for (i = threadIdx.x; i < (d_shared_q_size - CACHEOFFSET); i += blockDim.x) { 1225 | cache[i] = EMPTYVECT32; 1226 | 
} 1227 | if(scan) { 1228 | // Copy the work tile from global mem 1229 | if (threadIdx.x < OPENTILELEN + LASTSEARCHLEN) { 1230 | shared[OPENTILEOFFSET+threadIdx.x] = d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + threadIdx.x]; 1231 | } 1232 | if(threadIdx.x == 0) { 1233 | OPENTILECOUNT = d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + OPENTILELEN + LASTSEARCHLEN]; 1234 | } 1235 | } else if (threadIdx.x < OPENTILELEN+LASTSEARCHLEN) { 1236 | // On first run: initialize the work tile to empty 1237 | shared[OPENTILEOFFSET+threadIdx.x] = threadIdx.x < OPENTILELEN ? EMPTYVECT32 : 0; 1238 | } 1239 | __syncthreads(); 1240 | while (ITERATIONS < d_kernel_iters) { 1241 | if (threadIdx.x == 0 && OPENTILECOUNT < OPENTILELEN && d_newstate_flags[blockIdx.x]) { 1242 | // Indicate that we are scanning 1243 | d_newstate_flags[blockIdx.x] = 2; 1244 | SCAN = 1; 1245 | } 1246 | __syncthreads(); 1247 | // Scan the open set for work; we use the OPENTILECOUNT flag at this stage to count retrieved elements 1248 | if (SCAN) { 1249 | inttype last_search_location = shared[LASTSEARCHOFFSET + WARP_ID]; 1250 | // This block should be able to find a new state 1251 | int found_new_state = 0; 1252 | for (i = GLOBAL_WARP_ID; i < d_nrbuckets && OPENTILECOUNT < OPENTILELEN; i += NR_WARPS) { 1253 | int loc = i + last_search_location; 1254 | if(loc >= d_nrbuckets) { 1255 | last_search_location = -i + GLOBAL_WARP_ID; 1256 | loc = i + last_search_location; 1257 | } 1258 | tmp = d_q[loc*WARPSIZE+LANE]; 1259 | l = EMPTYVECT32; 1260 | if (ENTRY_ID == (d_sv_nints-1)) { 1261 | if (ISNEWINT(tmp)) { 1262 | found_new_state = 1; 1263 | // try to increment the OPENTILECOUNT counter, if successful, store the state 1264 | l = atomicAdd((uint32_t *) &OPENTILECOUNT, d_sv_nints); 1265 | if (l < OPENTILELEN) { 1266 | d_q[loc*WARPSIZE+LANE] = OLDINT(tmp); 1267 | } 1268 | } 1269 | } 1270 | // all threads read the OPENTILECOUNT value of the 'tail' thread, and possibly store their part of the vector in the shared memory 1271 | if (LANEPOINTSTOVALIDBUCKETPOS) { 1272 | l = __shfl(l, LANE-ENTRY_ID+d_sv_nints-1); 1273 | if (l < OPENTILELEN) { 1274 | // write part of vector to shared memory 1275 | shared[OPENTILEOFFSET+l+ENTRY_ID] = tmp; 1276 | } 1277 | } 1278 | } 1279 | if(i < d_nrbuckets) { 1280 | last_search_location = i - GLOBAL_WARP_ID; 1281 | } else { 1282 | last_search_location = 0; 1283 | } 1284 | if(LANE == 0) { 1285 | shared[LASTSEARCHOFFSET + WARP_ID] = last_search_location; 1286 | } 1287 | if(found_new_state || i < d_nrbuckets) { 1288 | WORKSCANRESULT = 1; 1289 | } 1290 | } 1291 | __syncthreads(); 1292 | // if work has been retrieved, indicate this 1293 | if (threadIdx.x == 0) { 1294 | if (OPENTILECOUNT > 0) { 1295 | (*d_contBFS) = 1; 1296 | } 1297 | if(SCAN && WORKSCANRESULT == 0 && d_newstate_flags[blockIdx.x] == 2) { 1298 | // Scanning has completed and no new states were found by this block, 1299 | // save this information to prevent unnecessary scanning later on 1300 | d_newstate_flags[blockIdx.x] = 0; 1301 | } else { 1302 | WORKSCANRESULT = 0; 1303 | } 1304 | scan = 0; 1305 | } 1306 | // is the thread part of an 'active' group? 1307 | offset1 = 0; 1308 | offset2 = 0; 1309 | // Reset the whole thread buffer (shared + private) 1310 | int start = THREADBUFFEROFFSET; 1311 | int end = THREADBUFFEROFFSET + THREADBUFFERLEN; 1312 | for(i = start + threadIdx.x; i < end; i+=blockDim.x) { 1313 | shared[i] = 0; 1314 | } 1315 | if (THREADINGROUP) { 1316 | act = 1 << d_bits_act; 1317 | // Is there work? 
1318 | if (ISSTATE(src_state)) { 1319 | // Gather the required transition information for all states in the tile 1320 | i = tex1Dfetch(tex_proc_offsets_start, GROUP_ID); 1321 | // Determine process state 1322 | GETSTATEVECTORSTATE(cont, src_state, GROUP_ID); 1323 | // Offset position 1324 | index = cont/(INTSIZE/d_nbits_offset); 1325 | pos = cont - (index*(INTSIZE/d_nbits_offset)); 1326 | tmp = tex1Dfetch(tex_proc_offsets, i+index); 1327 | GETTRANSOFFSET(offset1, tmp, pos); 1328 | if (pos == (INTSIZE/d_nbits_offset)-1) { 1329 | tmp = tex1Dfetch(tex_proc_offsets, i+index+1); 1330 | GETTRANSOFFSET(offset2, tmp, 0); 1331 | } 1332 | else { 1333 | GETTRANSOFFSET(offset2, tmp, pos+1); 1334 | } 1335 | } 1336 | if (GROUP_ID == 0) { 1337 | THREADGROUPPOR = 0; 1338 | } 1339 | } 1340 | // iterate over the outgoing transitions of state 'cont' 1341 | // variable cont is reused to indicate whether the buffer content of this thread still needs processing 1342 | cont = 0; 1343 | // while there is work to be done 1344 | outtrans_enabled = 0; 1345 | char generate = 1; 1346 | char proviso_satisfied = 0; 1347 | int cluster_trans = 1 << GROUP_ID; 1348 | int orig_offset1 = offset1; 1349 | while(generate > -1) { 1350 | while (CONTINUE != 2 && __any(offset1 < offset2 || cont)) { 1351 | if (offset1 < offset2 && !cont) { 1352 | // reset act 1353 | act = (1 << (d_bits_act)); 1354 | // reset buffer of this thread 1355 | for (l = 0; l < d_max_buf_ints; l++) { 1356 | THREADBUFFERGROUPPOS(GROUP_ID, l) = 0; 1357 | } 1358 | } 1359 | // if not sync, store in hash table 1360 | // loop over all transentries 1361 | while (1) { 1362 | i = 1; 1363 | if(offset1 < offset2 && !cont) { 1364 | tmp = tex1Dfetch(tex_proc_trans, offset1); 1365 | GETPROCTRANSSYNC(i, tmp); 1366 | } 1367 | if (__any(i == 0)) { 1368 | if(i == 0) { 1369 | // no deadlock 1370 | outtrans_enabled = 1; 1371 | // construct state 1372 | for (l = 0; l < d_sv_nints; l++) { 1373 | tgt_state[l] = src_state[l]; 1374 | } 1375 | offset1++; 1376 | } 1377 | // loop over this transentry 1378 | for (l = 0; __any(i == 0 && l < NR_OF_STATES_IN_TRANSENTRY(GROUP_ID)); l++) { 1379 | if(i == 0) { 1380 | GETPROCTRANSSTATE(pos, tmp, l, GROUP_ID); 1381 | if (pos > 0) { 1382 | SETSTATEVECTORSTATE(tgt_state, GROUP_ID, pos-1); 1383 | // check for violation of safety property, if required 1384 | if (d_property == SAFETY) { 1385 | if (GROUP_ID == d_nr_procs-1) { 1386 | // pos contains state id + 1 1387 | // error state is state 1 1388 | if (pos == 2) { 1389 | // error state found 1390 | (*d_property_violation) = 1; 1391 | } 1392 | } 1393 | } 1394 | 1395 | if (!d_check_cycle_proviso) { 1396 | // Set proviso to 1 to indicate at least one state has been found 1397 | proviso_satisfied = 1; 1398 | } 1399 | // store tgt_state in cache 1400 | // if k == 8, cache is full, immediately store in global hash table 1401 | if(generate == 1) { 1402 | k = STOREINCACHE(tgt_state, cache, &bi); 1403 | if(k >> 2) { 1404 | proviso_satisfied |= (k >> 1) & 1; 1405 | } else if (!d_check_cycle_proviso) { 1406 | SETPORSTATE(&cache[bi]); 1407 | } 1408 | } else { 1409 | MARKINCACHE(tgt_state, cache, (THREADGROUPPOR >> GROUP_ID) & 1); 1410 | } 1411 | } else { 1412 | i = 1; 1413 | } 1414 | } 1415 | store_cache_overflow_warp(d_q, d_newstate_flags, i == 0 && k == 8); 1416 | int c; 1417 | // Check cycle proviso with the whole warp 1418 | while(generate && d_check_cycle_proviso && (c = __ballot(i == 0 && (k >> 2 == 0)))) { 1419 | int active_lane = __ffs(c) - 1; 1420 | int cache_index = __shfl(bi, active_lane); 1421 | bj 
= FIND_WARP((inttype*) &cache[cache_index], d_q); 1422 | if(LANE == active_lane) { 1423 | i = 1; 1424 | if(bj == 0) { 1425 | proviso_satisfied = 1; 1426 | } 1427 | } 1428 | } 1429 | } 1430 | } else { 1431 | break; 1432 | } 1433 | } 1434 | 1435 | // i is the current relative position in the buffer for this thread 1436 | i = 0; 1437 | if (offset1 < offset2 && !cont) { 1438 | GETPROCTRANSACT(act, tmp); 1439 | // store transition entry 1440 | THREADBUFFERGROUPPOS(GROUP_ID,i) = tmp; 1441 | cont = 1; 1442 | i++; 1443 | offset1++; 1444 | while (offset1 < offset2) { 1445 | tmp = tex1Dfetch(tex_proc_trans, offset1); 1446 | GETPROCTRANSACT(bitmask, tmp); 1447 | if (act == bitmask) { 1448 | THREADBUFFERGROUPPOS(GROUP_ID,i) = tmp; 1449 | i++; 1450 | offset1++; 1451 | } 1452 | else { 1453 | break; 1454 | } 1455 | } 1456 | } 1457 | int sync_act = cont ? act : (1 << d_bits_act); 1458 | for(i = 1; i < d_nr_procs; i<<=1) { 1459 | sync_act = min(__shfl(sync_act, GTL((GROUP_ID + i) % d_nr_procs)), sync_act); 1460 | } 1461 | // Now, we have obtained the info needed to combine process transitions 1462 | sync_offset1 = sync_offset2 = 0; 1463 | int proc_enabled = (__ballot(act == sync_act) >> (LANE - GROUP_ID)) & ((1 << d_nr_procs) - 1); 1464 | if(THREADINGROUP && sync_act < (1 << d_bits_act)) { 1465 | // syncbits Offset position 1466 | i = sync_act/(INTSIZE/d_nbits_syncbits_offset); 1467 | pos = sync_act - (i*(INTSIZE/d_nbits_syncbits_offset)); 1468 | l = tex1Dfetch(tex_syncbits_offsets, i); 1469 | GETSYNCOFFSET(sync_offset1, l, pos); 1470 | if (pos == (INTSIZE/d_nbits_syncbits_offset)-1) { 1471 | l = tex1Dfetch(tex_syncbits_offsets, i+1); 1472 | GETSYNCOFFSET(sync_offset2, l, 0); 1473 | } 1474 | else { 1475 | GETSYNCOFFSET(sync_offset2, l, pos+1); 1476 | } 1477 | } 1478 | // iterate through the relevant syncbit filters 1479 | tmp = 1; 1480 | for (int j = GROUP_ID;__any(sync_offset1 + j / (INTSIZE/d_nr_procs) < sync_offset2 && tmp); j+=d_nr_procs) { 1481 | index = 0; 1482 | if(THREADINGROUP && sync_act < (1 << d_bits_act) && sync_offset1 + j / (INTSIZE/d_nr_procs) < sync_offset2 && tmp) { 1483 | index = tex1Dfetch(tex_syncbits, sync_offset1 + j / (INTSIZE/d_nr_procs)); 1484 | } 1485 | SETOLDSTATE(tgt_state); 1486 | int has_second_succ = 0; 1487 | GETSYNCRULE(tmp, index, j % (INTSIZE/d_nr_procs)); 1488 | if (tmp != 0 && (tmp & proc_enabled) == tmp) { 1489 | // source state is not a deadlock 1490 | outtrans_enabled = 1; 1491 | // start combining entries in the buffer to create target states 1492 | // if sync rule applicable, construct the first successor 1493 | // copy src_state into tgt_state 1494 | for (pos = 0; pos < d_sv_nints; pos++) { 1495 | tgt_state[pos] = src_state[pos]; 1496 | } 1497 | // construct first successor 1498 | for (int rule = tmp; rule;) { 1499 | pos = __ffs(rule) - 1; 1500 | // get first state 1501 | GETPROCTRANSSTATE(k, THREADBUFFERGROUPPOS(pos,0), 0, pos); 1502 | SETSTATEVECTORSTATE(tgt_state, pos, k-1); 1503 | GETPROCTRANSSTATE(k, THREADBUFFERGROUPPOS(pos,0), 1, pos); 1504 | has_second_succ |= k; 1505 | if(d_max_buf_ints > 1 && !k) { 1506 | GETPROCTRANSSTATE(k, THREADBUFFERGROUPPOS(pos,1), 0, pos); 1507 | has_second_succ |= k; 1508 | } 1509 | rule &= ~(1 << pos); 1510 | } 1511 | SETNEWSTATE(tgt_state); 1512 | } 1513 | int rule_proviso = 0; 1514 | // while we keep getting new states, store them 1515 | while (__any(ISNEWSTATE(tgt_state))) { 1516 | l = k = TMPVAR = bitmask = 0; 1517 | if(ISNEWSTATE(tgt_state)) { 1518 | // check for violation of safety property, if required 1519 | if 
(d_property == SAFETY) { 1520 | GETSTATEVECTORSTATE(pos, tgt_state, d_nr_procs-1); 1521 | if (pos == 1) { 1522 | // error state found 1523 | (*d_property_violation) = 1; 1524 | } 1525 | } 1526 | 1527 | if (!d_check_cycle_proviso) { 1528 | // Set rule_proviso to 1 to indicate at least one state has been found 1529 | rule_proviso = 1; 1530 | } 1531 | // store tgt_state in cache; STOREINCACHE signals via its result whether the state was already present 1532 | // (TMPVAR >> 2 != 0; bit 1 then feeds rule_proviso) or the cache is full (TMPVAR == 8; such states are flushed to the global hash table by store_cache_overflow_warp below) 1533 | if(generate == 1) { 1534 | TMPVAR = STOREINCACHE(tgt_state, cache, &bitmask); 1535 | if(TMPVAR >> 2) { 1536 | rule_proviso |= (TMPVAR >> 1) & 1; 1537 | } else if (!d_check_cycle_proviso) { 1538 | SETPORSTATE(&cache[bitmask]); 1539 | } 1540 | } else { 1541 | MARKINCACHE(tgt_state, cache, (THREADGROUPPOR & tmp) == tmp); 1542 | } 1543 | l = 1; 1544 | k = has_second_succ; 1545 | if(!has_second_succ) { 1546 | SETOLDSTATE(tgt_state); 1547 | } 1548 | } 1549 | store_cache_overflow_warp(d_q, d_newstate_flags, l && TMPVAR == 8); 1550 | int c; 1551 | // Check cycle proviso with the whole warp 1552 | while(generate && d_check_cycle_proviso && (c = __ballot(l && (TMPVAR >> 2 == 0)))) { 1553 | int active_lane = __ffs(c) - 1; 1554 | int cache_index = __shfl(bitmask, active_lane); 1555 | bj = FIND_WARP((inttype*) &cache[cache_index], d_q); 1556 | if(LANE == active_lane) { 1557 | l = 0; 1558 | if(bj == 0) { 1559 | rule_proviso = 1; 1560 | } 1561 | } 1562 | } 1563 | if(k) { 1564 | // get next successor 1565 | int rule; 1566 | for (rule = tmp; rule;) { 1567 | pos = __ffs(rule) - 1; 1568 | int curr_st; 1569 | GETSTATEVECTORSTATE(curr_st, tgt_state, pos); 1570 | int st = 0; 1571 | for (k = 0; k < d_max_buf_ints; k++) { // locate curr_st in the buffer of process 'pos' 1572 | for (l = 0; l < NR_OF_STATES_IN_TRANSENTRY(pos); l++) { 1573 | GETPROCTRANSSTATE(st, THREADBUFFERGROUPPOS(pos,k), l, pos); 1574 | if (curr_st == (st-1)) { 1575 | break; 1576 | } 1577 | } 1578 | if (curr_st == (st-1)) { 1579 | break; 1580 | } 1581 | } 1582 | // Assumption: element has been found (otherwise, 'last' was not a valid successor) 1583 | // Try to get the next element 1584 | if (l == NR_OF_STATES_IN_TRANSENTRY(pos) - 1) { 1585 | if (k >= d_max_buf_ints-1) { 1586 | st = 0; 1587 | } 1588 | else { 1589 | k++; 1590 | l = 0; 1591 | } 1592 | } 1593 | else { 1594 | l++; 1595 | } 1596 | // Retrieve next element, insert it in 'tgt_state' if it is not 0, and return result, otherwise continue 1597 | if (st != 0) { 1598 | GETPROCTRANSSTATE(st, THREADBUFFERGROUPPOS(pos,k), l, pos); 1599 | if (st > 0) { 1600 | SETSTATEVECTORSTATE(tgt_state, pos, st-1); 1601 | SETNEWSTATE(tgt_state); 1602 | break; 1603 | } 1604 | } 1605 | // else, set this process state to first one, and continue to next process 1606 | GETPROCTRANSSTATE(st, THREADBUFFERGROUPPOS(pos,0), 0, pos); 1607 | SETSTATEVECTORSTATE(tgt_state, pos, st-1); 1608 | rule &= ~(1 << pos); 1609 | } 1610 | // did we find a successor?
if not, set tgt_state to old 1611 | if (rule == 0) { 1612 | SETOLDSTATE(tgt_state); 1613 | } 1614 | } 1615 | } 1616 | for (l = 0; l < d_nr_procs; l++) { 1617 | // Exchange the sync rules so every thread can update its cluster_trans 1618 | int sync_rule = __shfl(tmp, GTL((GROUP_ID + l) % d_nr_procs)); 1619 | int proviso = __shfl(rule_proviso, GTL((GROUP_ID + l) % d_nr_procs)); 1620 | if(GETBIT(GROUP_ID, sync_rule) && sync_act == act) { 1621 | cluster_trans |= sync_rule; 1622 | proviso_satisfied |= proviso; 1623 | } 1624 | } 1625 | } 1626 | 1627 | // only active threads should reset 'cont' 1628 | if (cont && sync_act == act) { 1629 | cont = 0; 1630 | } 1631 | } // END WHILE CONTINUE == 1 1632 | 1633 | if(generate == 1 && THREADINGROUP) { 1634 | // Choose a cluster for reduction: close this thread's cluster under the clusters published by its member processes, then let the group leader pick the smallest proviso-satisfying cluster 1635 | if(!proviso_satisfied) { 1636 | cluster_trans = cluster_trans & ~(1 << GROUP_ID); 1637 | } 1638 | THREADBUFFERGROUPPOS(GROUP_ID,0) = cluster_trans; 1639 | __syncthreads(); 1640 | proviso_satisfied = 0; 1641 | int to_check = cluster_trans; // processes whose clusters still have to be merged in 1642 | while (to_check) { 1643 | i = __ffs(to_check) - 1; 1644 | to_check &= ~(1 << i); 1645 | int cluster = THREADBUFFERGROUPPOS(i, 0); 1646 | proviso_satisfied |= GETBIT(i, cluster); 1647 | to_check |= cluster & ~cluster_trans & ~(1 << i); // newly discovered members also need a visit 1648 | cluster_trans |= cluster; 1649 | } 1650 | __syncthreads(); 1651 | if(!proviso_satisfied) { 1652 | THREADBUFFERGROUPPOS(GROUP_ID,0) = 0; 1653 | } else { 1654 | THREADBUFFERGROUPPOS(GROUP_ID,0) = cluster_trans; 1655 | } 1656 | __syncthreads(); 1657 | if(GROUP_ID == 0) { 1658 | int min = d_nr_procs; 1659 | int cluster = 0xFFFFFFFF >> (INTSIZE - d_nr_procs); 1660 | for(i = 0; i < d_nr_procs; i++) { 1661 | if(THREADBUFFERGROUPPOS(i,0) > 0 && __popc(THREADBUFFERGROUPPOS(i,0)) < min) { 1662 | min = __popc(THREADBUFFERGROUPPOS(i,0)); 1663 | cluster = THREADBUFFERGROUPPOS(i,0); 1664 | } 1665 | } 1666 | THREADGROUPPOR = cluster; 1667 | if(cluster < (0xFFFFFFFF >> (INTSIZE - d_nr_procs))) { 1668 | // printf("Selected cluster %d for POR\n",cluster); 1669 | } 1670 | } 1671 | __syncthreads(); 1672 | } 1673 | offset1 = orig_offset1; 1674 | generate--; 1675 | } // END while(generate > -1) 1676 | 1677 | // have we encountered a deadlock state?
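// (i.e. no process transition and no applicable synchronization rule was enabled in src_state, so outtrans_enabled stayed 0 for every member of the group)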
1678 | // we use the shared memory to communicate this to the group leaders 1679 | if (d_property == DEADLOCK) { 1680 | if (THREADINGROUP) { 1681 | if (ISSTATE(src_state)) { 1682 | THREADBUFFERGROUPPOS(GROUP_ID, 0) = outtrans_enabled; 1683 | // group leader collects results 1684 | l = 0; 1685 | if (GROUP_ID == 0) { 1686 | for (i = 0; i < d_nr_procs; i++) { 1687 | l += THREADBUFFERGROUPPOS(i, 0); 1688 | } 1689 | if (l == 0) { 1690 | // deadlock state found 1691 | (*d_property_violation) = 1; 1692 | } 1693 | } 1694 | } 1695 | } 1696 | } 1697 | int performed_work = OPENTILECOUNT != 0; 1698 | __syncthreads(); 1699 | // Reset the open queue tile 1700 | if (threadIdx.x < OPENTILELEN) { 1701 | shared[OPENTILEOFFSET+threadIdx.x] = EMPTYVECT32; 1702 | } 1703 | if (threadIdx.x == 0) { 1704 | OPENTILECOUNT = 0; 1705 | } 1706 | __syncthreads(); 1707 | // start scanning the local cache and write results to the global hash table 1708 | if(performed_work) { 1709 | copy_cache_to_global(d_q, cache, d_newstate_flags); 1710 | } 1711 | __syncthreads(); 1712 | // Ready to start next iteration, if no error has occurred 1713 | if (threadIdx.x == 0) { 1714 | if (CONTINUE == 2) { 1715 | (*d_contBFS) = 2; 1716 | ITERATIONS = d_kernel_iters; 1717 | } 1718 | else { 1719 | ITERATIONS++; 1720 | } 1721 | CONTINUE = 0; 1722 | } 1723 | __syncthreads(); 1724 | } 1725 | 1726 | // Copy the work tile to global mem 1727 | if (threadIdx.x < OPENTILELEN+LASTSEARCHLEN) { 1728 | d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + threadIdx.x] = shared[OPENTILEOFFSET+threadIdx.x]; 1729 | } 1730 | if(threadIdx.x == 0) { 1731 | d_worktiles[(OPENTILELEN+LASTSEARCHLEN+1) * blockIdx.x + OPENTILELEN+LASTSEARCHLEN] = OPENTILECOUNT; 1732 | } 1733 | } 1734 | 1735 | /** 1736 | * Host function that prepares the data arrays and passes them to the CUDA kernels.
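 * Expected flow: parse the command line, read the .gpf network, copy it to the GPU, run the exploration kernel in a loop until the frontier is empty, then report the verdict and the number of stored states.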
1737 | */ 1738 | int main(int argc, char** argv) { 1739 | FILE *fp; 1740 | inttype nr_procs, bits_act, bits_statevector, sv_nints, nr_trans, proc_nrstates, nbits_offset, max_buf_ints, nr_syncbits_offsets, nr_syncbits, nbits_syncbits_offset; 1741 | inttype *bits_state, *firstbit_statevector, *proc_offsets, *proc_trans, *proc_offsets_start, *syncbits_offsets, *syncbits; 1742 | inttype contBFS, counted_states; 1743 | char stmp[BUFFERSIZE], fn[BUFFERSIZE]; 1744 | // to store constants for closed set hash functions 1745 | int h[NR_HASH_FUNCTIONS*2]; 1746 | // size of global hash table 1747 | size_t q_size = 0; 1748 | PropertyStatus check_property = NONE; 1749 | // nr of iterations in single kernel run 1750 | int kernel_iters = KERNEL_ITERS; 1751 | int nblocks = NR_OF_BLOCKS; 1752 | int nthreadsperblock = BLOCK_SIZE; 1753 | // POR options 1754 | int apply_por = 0; 1755 | int use_cycle_proviso = 0; 1756 | // level of verbosity (0-3, see the help text) 1757 | int verbosity = 0; 1758 | char* dump_file = NULL; 1759 | // clock to measure time 1760 | clock_t start, stop; 1761 | double runtime = 0.0; 1762 | 1763 | // Start timer 1764 | assert((start = clock())!=-1); 1765 | 1766 | cudaDeviceProp prop; 1767 | int nDevices; 1768 | 1769 | // GPU side versions of the input 1770 | inttype *d_bits_state, *d_firstbit_statevector, *d_proc_offsets_start, *d_proc_offsets, *d_proc_trans, *d_syncbits_offsets, *d_syncbits, *d_h; 1771 | // flag to keep track of progress and whether hash table errors occurred (value==2) 1772 | inttype *d_contBFS; 1773 | // flags to track which blocks have new states 1774 | inttype *d_newstate_flags; 1775 | // flag to keep track of property verification outcome 1776 | inttype *d_property_violation; 1777 | // Integer to store the number of states counted in the hash table 1778 | inttype *d_counted_states; 1779 | // Space to temporarily store work tiles 1780 | inttype *d_worktiles; 1781 | 1782 | // GPU datastructures for calculation 1783 | inttype *d_q; 1784 | 1785 | const char* help_text = 1786 | "Usage: GPUexplore <model> [OPTIONS]\n" 1787 | "Run state-space exploration on model <model>.gpf (pass the model name without the file extension).\n" 1788 | "Options:\n" 1789 | " -d Check for deadlocks\n" 1790 | " -p Check a safety property (should be embedded in the model)\n" 1791 | " --por Apply partial-order reduction\n" 1792 | " --cycle-proviso Apply the cycle proviso during partial-order reduction\n" 1793 | " -k NUM Run NUM iterations per kernel launch (default 1)\n" 1794 | " -b NUM Run the kernel on NUM blocks (default 1)\n" 1795 | " -t NUM Use NUM threads per block (default 32)\n" 1796 | " -q NUM Allocate NUM 32-bit integers for the hash table\n" 1797 | " --dump FILE Dump the state space to FILE after completing the exploration\n" 1798 | " -v NUM Change the verbosity:\n" 1799 | " 0 - minimal output\n" 1800 | " 1 - print sequence number of each kernel launch\n" 1801 | " 2 - print number of states in the hash table after each kernel launch\n" 1802 | " 3 - print state vectors after each kernel launch\n" 1803 | " -h, --help Show this help message\n"; 1804 | 1805 | if (argc == 1) { 1806 | fprintf(stderr, "ERROR: No input network given!\n"); 1807 | fprintf(stdout, help_text); 1808 | exit(1); 1809 | } else if(!strcmp(argv[1],"--help") || !strcmp(argv[1],"-h") || !strcmp(argv[1],"-?")) { 1810 | fprintf(stdout, help_text); 1811 | exit(0); 1812 | } 1813 | 1814 | strcpy(fn, argv[1]); 1815 | strcat(fn, ".gpf"); 1816 | 1817 | int i = 2; 1818 | while (i < argc) { 1819 | if (!strcmp(argv[i],"--help") || !strcmp(argv[i],"-h") ||
!strcmp(argv[i],"-?")) { 1820 | fprintf(stdout, help_text); 1821 | exit(0); 1822 | } 1823 | else if (!strcmp(argv[i],"-k")) { 1824 | // if nr. of iterations per kernel run is given, store it 1825 | kernel_iters = atoi(argv[i+1]); 1826 | i += 2; 1827 | } 1828 | else if (!strcmp(argv[i],"-b")) { 1829 | // store nr of blocks to be used 1830 | nblocks = atoi(argv[i+1]); 1831 | i += 2; 1832 | } 1833 | else if (!strcmp(argv[i],"-t")) { 1834 | // store nr of threads per block to be used 1835 | nthreadsperblock = atoi(argv[i+1]); 1836 | i += 2; 1837 | } 1838 | else if (!strcmp(argv[i],"-q")) { 1839 | // store hash table size 1840 | q_size = atoll(argv[i+1]); 1841 | i += 2; 1842 | } 1843 | else if (!strcmp(argv[i],"-v")) { 1844 | // store verbosity level 1845 | verbosity = atoi(argv[i+1]); 1846 | if (verbosity > 3) { 1847 | verbosity = 3; 1848 | } 1849 | i += 2; 1850 | } 1851 | else if (!strcmp(argv[i],"-d")) { 1852 | // check for deadlocks 1853 | check_property = DEADLOCK; 1854 | use_cycle_proviso = 0; 1855 | i += 1; 1856 | } 1857 | else if (!strcmp(argv[i],"-p")) { 1858 | // check a property 1859 | check_property = SAFETY; 1860 | use_cycle_proviso = 1; 1861 | i += 1; 1862 | } 1863 | else if (!strcmp(argv[i],"--por")) { 1864 | // apply partial-order reduction 1865 | apply_por = 1; 1866 | i += 1; 1867 | } 1868 | else if (!strcmp(argv[i],"--cycle-proviso")) { 1869 | // use cycle proviso 1870 | if (check_property == NONE) { 1871 | use_cycle_proviso = 1; 1872 | } 1873 | i += 1; 1874 | } 1875 | else if (!strcmp(argv[i],"--dump")) { 1876 | dump_file = argv[i+1]; 1877 | i += 2; 1878 | } else { 1879 | fprintf(stderr, "ERROR: unrecognized option %s\n", argv[i]); 1880 | fprintf(stdout, help_text); 1881 | exit(1); 1882 | } 1883 | } 1884 | 1885 | fp = fopen(fn, "r"); 1886 | if (fp) { 1887 | // Read the input 1888 | if (fgets(stmp, BUFFERSIZE, fp) != NULL && check_property == SAFETY) { 1889 | i = atoi(stmp); 1890 | fprintf(stdout, "Property to check is "); 1891 | if (i == 0) { 1892 | fprintf(stdout, "not "); 1893 | } 1894 | fprintf(stdout, "a liveness property\n"); 1895 | if (i == 1) { 1896 | check_property = LIVENESS; 1897 | } 1898 | } 1899 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1900 | nr_procs = atoi(stmp); 1901 | fprintf(stdout, "nr of procs: %d\n", nr_procs); 1902 | } 1903 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1904 | bits_act = atoi(stmp); 1905 | fprintf(stdout, "nr of bits for transition label: %d\n", bits_act); 1906 | } 1907 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1908 | proc_nrstates = atoi(stmp); 1909 | fprintf(stdout, "min. nr. of proc. 
states that fit in 32-bit integer: %d\n", proc_nrstates); 1910 | } 1911 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1912 | bits_statevector = atoi(stmp) + apply_por; 1913 | fprintf(stdout, "number of bits needed for a state vector: %d\n", bits_statevector); 1914 | } 1915 | firstbit_statevector = (inttype*) malloc(sizeof(inttype)*(nr_procs+1)); 1916 | for (int i = 0; i <= nr_procs; i++) { 1917 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1918 | firstbit_statevector[i] = atoi(stmp); 1919 | fprintf(stdout, "statevector offset %d: %d\n", i, firstbit_statevector[i]); 1920 | } 1921 | } 1922 | // determine the number of integers needed for a state vector 1923 | sv_nints = (bits_statevector+31) / INTSIZE; 1924 | bits_state = (inttype*) malloc(sizeof(inttype)*nr_procs); 1925 | for (int i = 0; i < nr_procs; i++) { 1926 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1927 | bits_state[i] = atoi(stmp); 1928 | fprintf(stdout, "bits for states of process LTS %d: %d\n", i, bits_state[i]); 1929 | } 1930 | } 1931 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1932 | nbits_offset = atoi(stmp); 1933 | fprintf(stdout, "size of offset in process LTSs: %d\n", nbits_offset); 1934 | } 1935 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1936 | max_buf_ints = atoi(stmp); 1937 | fprintf(stdout, "maximum label-bounded branching factor: %d\n", max_buf_ints); 1938 | } 1939 | proc_offsets_start = (inttype*) malloc(sizeof(inttype)*(nr_procs+1)); 1940 | for (int i = 0; i <= nr_procs; i++) { 1941 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1942 | proc_offsets_start[i] = atoi(stmp); 1943 | } 1944 | } 1945 | proc_offsets = (inttype*) malloc(sizeof(inttype)*proc_offsets_start[nr_procs]); 1946 | for (int i = 0; i < proc_offsets_start[nr_procs]; i++) { 1947 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1948 | proc_offsets[i] = atoi(stmp); 1949 | } 1950 | } 1951 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1952 | nr_trans = atoi(stmp); 1953 | fprintf(stdout, "total number of transition entries in network: %d\n", nr_trans); 1954 | } 1955 | proc_trans = (inttype*) malloc(sizeof(inttype)*nr_trans); 1956 | for (int i = 0; i < nr_trans; i++) { 1957 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1958 | proc_trans[i] = atoi(stmp); 1959 | } 1960 | } 1961 | 1962 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1963 | nbits_syncbits_offset = atoi(stmp); 1964 | } 1965 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1966 | nr_syncbits_offsets = atoi(stmp); 1967 | } 1968 | syncbits_offsets = (inttype*) malloc(sizeof(inttype)*nr_syncbits_offsets); 1969 | for (int i = 0; i < nr_syncbits_offsets; i++) { 1970 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1971 | syncbits_offsets[i] = atoi(stmp); 1972 | } 1973 | } 1974 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1975 | nr_syncbits = atoi(stmp); 1976 | } 1977 | syncbits = (inttype*) malloc(sizeof(inttype)*nr_syncbits); 1978 | for (int i = 0; i < nr_syncbits; i++) { 1979 | if (fgets(stmp, BUFFERSIZE, fp) != NULL) { 1980 | syncbits[i] = atoi(stmp); 1981 | } 1982 | } 1983 | } 1984 | else { 1985 | fprintf(stderr, "ERROR: input network does not exist!\n"); 1986 | exit(1); 1987 | } 1988 | 1989 | // Randomly define the closed set hash functions 1990 | srand(time(NULL)); 1991 | for (int i = 0; i < NR_HASH_FUNCTIONS*2; i++) { 1992 | h[i] = rand(); 1993 | } 1994 | 1995 | // continue flags 1996 | contBFS = 1; 1997 | 1998 | // Query the device properties and determine data structure sizes 1999 | cudaGetDeviceCount(&nDevices); 2000 | if (nDevices == 0) { 2001 | fprintf (stderr, "ERROR: No CUDA compatible GPU detected!\n"); 
2002 | exit(1); 2003 | } 2004 | cudaGetDeviceProperties(&prop, 0); 2005 | fprintf (stdout, "global mem: %lu\n", (uint64_t) prop.totalGlobalMem); 2006 | fprintf (stdout, "shared mem per block: %d\n", (int) prop.sharedMemPerBlock); 2007 | fprintf (stdout, "shared mem per SM: %d\n", (int) prop.sharedMemPerMultiprocessor); 2008 | fprintf (stdout, "max. threads per block: %d\n", (int) prop.maxThreadsPerBlock); 2009 | fprintf (stdout, "max. grid size: %d\n", (int) prop.maxGridSize[0]); 2010 | fprintf (stdout, "nr. of multiprocessors: %d\n", (int) prop.multiProcessorCount); 2011 | 2012 | // determine actual nr of blocks 2013 | nblocks = MAX(1,MIN(prop.maxGridSize[0],nblocks)); 2014 | 2015 | // Allocate memory on GPU 2016 | cudaMallocCount((void **) &d_contBFS, sizeof(inttype)); 2017 | cudaMallocCount((void **) &d_property_violation, sizeof(inttype)); 2018 | cudaMallocCount((void **) &d_counted_states, sizeof(inttype)); 2019 | cudaMallocCount((void **) &d_h, NR_HASH_FUNCTIONS*2*sizeof(inttype)); 2020 | cudaMallocCount((void **) &d_bits_state, nr_procs*sizeof(inttype)); 2021 | cudaMallocCount((void **) &d_firstbit_statevector, (nr_procs+1)*sizeof(inttype)); 2022 | cudaMallocCount((void **) &d_proc_offsets_start, (nr_procs+1)*sizeof(inttype)); 2023 | cudaMallocCount((void **) &d_proc_offsets, proc_offsets_start[nr_procs]*sizeof(inttype)); 2024 | cudaMallocCount((void **) &d_proc_trans, nr_trans*sizeof(inttype)); 2025 | cudaMallocCount((void **) &d_syncbits_offsets, nr_syncbits_offsets*sizeof(inttype)); 2026 | cudaMallocCount((void **) &d_syncbits, nr_syncbits*sizeof(inttype)); 2027 | cudaMallocCount((void **) &d_newstate_flags, nblocks*sizeof(inttype)); 2028 | cudaMallocCount((void **) &d_worktiles, nblocks * (sv_nints*(nthreadsperblock/nr_procs)+nthreadsperblock/WARPSIZE+1)*sizeof(inttype)); 2029 | 2030 | 2031 | // Copy data to GPU 2032 | CUDA_CHECK_RETURN(cudaMemcpy(d_contBFS, &contBFS, sizeof(inttype), cudaMemcpyHostToDevice)) 2033 | CUDA_CHECK_RETURN(cudaMemcpy(d_h, h, NR_HASH_FUNCTIONS*2*sizeof(inttype), cudaMemcpyHostToDevice)) 2034 | CUDA_CHECK_RETURN(cudaMemcpy(d_bits_state, bits_state, nr_procs*sizeof(inttype), cudaMemcpyHostToDevice)) 2035 | CUDA_CHECK_RETURN(cudaMemcpy(d_firstbit_statevector, firstbit_statevector, (nr_procs+1)*sizeof(inttype), cudaMemcpyHostToDevice)) 2036 | CUDA_CHECK_RETURN(cudaMemcpy(d_proc_offsets_start, proc_offsets_start, (nr_procs+1)*sizeof(inttype), cudaMemcpyHostToDevice)) 2037 | CUDA_CHECK_RETURN(cudaMemcpy(d_proc_offsets, proc_offsets, proc_offsets_start[nr_procs]*sizeof(inttype), cudaMemcpyHostToDevice)) 2038 | CUDA_CHECK_RETURN(cudaMemcpy(d_proc_trans, proc_trans, nr_trans*sizeof(inttype), cudaMemcpyHostToDevice)) 2039 | CUDA_CHECK_RETURN(cudaMemcpy(d_syncbits_offsets, syncbits_offsets, nr_syncbits_offsets*sizeof(inttype), cudaMemcpyHostToDevice)) 2040 | CUDA_CHECK_RETURN(cudaMemcpy(d_syncbits, syncbits, nr_syncbits*sizeof(inttype), cudaMemcpyHostToDevice)) 2041 | CUDA_CHECK_RETURN(cudaMemset(d_newstate_flags, 0, nblocks*sizeof(inttype))); 2042 | CUDA_CHECK_RETURN(cudaMemset(d_worktiles, 0, nblocks * (sv_nints*(nthreadsperblock/nr_procs)+nthreadsperblock/WARPSIZE+1)*sizeof(inttype))); 2043 | CUDA_CHECK_RETURN(cudaMemset(d_counted_states, 0, sizeof(inttype))); 2044 | 2045 | // Bind data to textures 2046 | cudaBindTexture(NULL, tex_proc_offsets_start, d_proc_offsets_start, (nr_procs+1)*sizeof(inttype)); 2047 | cudaBindTexture(NULL, tex_proc_offsets, d_proc_offsets, proc_offsets_start[nr_procs]*sizeof(inttype)); 2048 | cudaBindTexture(NULL, tex_proc_trans, 
d_proc_trans, nr_trans*sizeof(inttype)); 2049 | cudaBindTexture(NULL, tex_syncbits_offsets, d_syncbits_offsets, nr_syncbits_offsets*sizeof(inttype)); 2050 | cudaBindTexture(NULL, tex_syncbits, d_syncbits, nr_syncbits*sizeof(inttype)); 2051 | 2052 | size_t available, total; 2053 | cudaMemGetInfo(&available, &total); 2054 | if (q_size == 0) { 2055 | q_size = total / sizeof(inttype); 2056 | } 2057 | size_t el_per_Mb = Mb / sizeof(inttype); 2058 | 2059 | 2060 | while(cudaMalloc((void**)&d_q, q_size * sizeof(inttype)) == cudaErrorMemoryAllocation) { 2061 | q_size -= el_per_Mb; 2062 | if (q_size < el_per_Mb) { 2063 | // signal no free memory 2064 | break; 2065 | } 2066 | } 2067 | 2068 | fprintf (stdout, "global mem queue size: %lu, number of entries: %lu\n", q_size*sizeof(inttype), (indextype) q_size); 2069 | 2070 | inttype shared_q_size = (int) prop.sharedMemPerMultiprocessor / sizeof(inttype) / 2; 2071 | fprintf (stdout, "shared mem queue size: %lu, number of entries: %u\n", shared_q_size*sizeof(inttype), shared_q_size); 2072 | fprintf (stdout, "nr. of blocks: %d, block size: %d, nr of kernel iterations: %d\n", nblocks, nthreadsperblock, kernel_iters); 2073 | 2074 | // copy symbols 2075 | inttype tablesize = q_size; 2076 | inttype nrbuckets = tablesize / WARPSIZE; 2077 | cudaMemcpyToSymbol(d_nrbuckets, &nrbuckets, sizeof(inttype)); 2078 | cudaMemcpyToSymbol(d_shared_q_size, &shared_q_size, sizeof(inttype)); 2079 | cudaMemcpyToSymbol(d_nr_procs, &nr_procs, sizeof(inttype)); 2080 | cudaMemcpyToSymbol(d_max_buf_ints, &max_buf_ints, sizeof(inttype)); 2081 | cudaMemcpyToSymbol(d_sv_nints, &sv_nints, sizeof(inttype)); 2082 | cudaMemcpyToSymbol(d_bits_act, &bits_act, sizeof(inttype)); 2083 | cudaMemcpyToSymbol(d_nbits_offset, &nbits_offset, sizeof(inttype)); 2084 | cudaMemcpyToSymbol(d_nbits_syncbits_offset, &nbits_syncbits_offset, sizeof(inttype)); 2085 | cudaMemcpyToSymbol(d_kernel_iters, &kernel_iters, sizeof(inttype)); 2086 | cudaMemcpyToSymbol(d_property, &check_property, sizeof(inttype)); 2087 | cudaMemcpyToSymbol(d_apply_por, &apply_por, sizeof(inttype)); 2088 | cudaMemcpyToSymbol(d_check_cycle_proviso, &use_cycle_proviso, sizeof(inttype)); 2089 | 2090 | // init the hash table; the launch configurations below assume the nblocks and nthreadsperblock values computed above, with the block-local cache passed as dynamic shared memory where the kernel needs it 2091 | init_queue<<<nblocks, nthreadsperblock>>>(d_q, q_size); 2092 | store_initial<<<1, 1>>>(d_q, d_h, d_newstate_flags, nthreadsperblock, nblocks); 2093 | for (int i = 0; i < 2*NR_HASH_FUNCTIONS; i++) { 2094 | fprintf (stdout, "hash constant %d: %d\n", i, h[i]); 2095 | } 2096 | FIRSTHASHHOST(i); 2097 | fprintf (stdout, "hash of initial state: %d\n", i); 2098 | 2099 | inttype zero = 0; 2100 | inttype *q_test = (inttype*) malloc(sizeof(inttype)*tablesize); 2101 | int j = 0; 2102 | inttype scan = 0; 2103 | CUDA_CHECK_RETURN(cudaMemcpy(d_property_violation, &zero, sizeof(inttype), cudaMemcpyHostToDevice)) 2104 | inttype property_violation = 0; 2105 | 2106 | clock_t exploration_start; 2107 | assert((exploration_start = clock())!=-1); 2108 | 2109 | while (contBFS == 1) { 2110 | CUDA_CHECK_RETURN(cudaMemcpy(d_contBFS, &zero, sizeof(inttype), cudaMemcpyHostToDevice)) 2111 | if(apply_por) { 2112 | gather_por<<<nblocks, nthreadsperblock, shared_q_size*sizeof(inttype)>>>(d_q, d_h, d_bits_state, d_firstbit_statevector, d_proc_offsets_start, 2113 | d_proc_offsets, d_proc_trans, d_syncbits_offsets, d_syncbits, d_contBFS, d_property_violation, d_newstate_flags, d_worktiles, scan); 2114 | } else { 2115 | gather<<<nblocks, nthreadsperblock, shared_q_size*sizeof(inttype)>>>(d_q, d_h, d_bits_state, d_firstbit_statevector, 2116 | d_contBFS, d_property_violation, d_newstate_flags, d_worktiles, scan); 2117 | } 2118 | // copy progress result 2119 | 
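// Each pass of this host loop runs up to d_kernel_iters BFS iterations on the GPU: contBFS is cleared before each launch, and the kernel raises it back to 1 while unexplored states remain, or to 2 when the global hash table is full (reported below).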
//CUDA_CHECK_RETURN(cudaGetLastError()); 2120 | CUDA_CHECK_RETURN(cudaDeviceSynchronize()); 2121 | CUDA_CHECK_RETURN(cudaMemcpy(&contBFS, d_contBFS, sizeof(inttype), cudaMemcpyDeviceToHost)) 2122 | if (check_property > 0) { 2123 | CUDA_CHECK_RETURN(cudaMemcpy(&property_violation, d_property_violation, sizeof(inttype), cudaMemcpyDeviceToHost)) 2124 | if (property_violation == 1) { 2125 | contBFS = 0; 2126 | } 2127 | } 2128 | if (verbosity > 0) { 2129 | if (verbosity == 1) { 2130 | printf ("%d\n", j++); 2131 | } 2132 | else if (verbosity == 2) { 2133 | cudaMemcpy(q_test, d_q, tablesize*sizeof(inttype), cudaMemcpyDeviceToHost); 2134 | count_local_queue(q_test, tablesize, firstbit_statevector, nr_procs, sv_nints); 2135 | } 2136 | else if (verbosity == 3) { 2137 | cudaMemcpy(q_test, d_q, tablesize*sizeof(inttype), cudaMemcpyDeviceToHost); 2138 | print_local_queue(stdout, q_test, tablesize, firstbit_statevector, nr_procs, sv_nints, apply_por); 2139 | } 2140 | } 2141 | scan = 1; 2142 | } 2143 | // determine runtime 2144 | stop = clock(); 2145 | runtime = (double) (stop-start)/CLOCKS_PER_SEC; 2146 | fprintf (stdout, "Run time: %f\n", runtime); 2147 | runtime = (double) (stop-exploration_start)/CLOCKS_PER_SEC; 2148 | fprintf(stdout, "Exploration time: %f\n", runtime); 2149 | 2150 | if (property_violation == 1) { 2151 | switch (check_property) { 2152 | case DEADLOCK: 2153 | printf ("deadlock detected!\n"); 2154 | break; 2155 | case SAFETY: 2156 | printf ("safety property violation detected!\n"); 2157 | break; 2158 | case LIVENESS: 2159 | printf ("liveness property violation detected!\n"); 2160 | break; 2161 | } 2162 | } 2163 | // report error if required 2164 | if (contBFS == 2) { 2165 | fprintf (stderr, "ERROR: problem with hash table\n"); 2166 | } 2167 | 2168 | CUDA_CHECK_RETURN(cudaMemset(d_counted_states, 0, sizeof(inttype))); 2169 | count_states<<<((int) prop.multiProcessorCount)*8, 512, 1>>>(d_q, d_counted_states); 2170 | CUDA_CHECK_RETURN(cudaDeviceSynchronize()); 2171 | CUDA_CHECK_RETURN(cudaMemcpy(&counted_states, d_counted_states, sizeof(inttype), cudaMemcpyDeviceToHost)); 2172 | fprintf (stdout, "nr. of states in hash table: %d\n", counted_states); 2173 | 2174 | // Debugging functionality: print states to file 2175 | if(dump_file) { 2176 | FILE* fout; 2177 | if((fout = fopen(dump_file, "w")) != NULL) { 2178 | fprintf(stdout, "Dumping state space to file...\n"); 2179 | cudaMemcpy(q_test, d_q, tablesize*sizeof(inttype), cudaMemcpyDeviceToHost); 2180 | print_local_queue(fout, q_test, tablesize, firstbit_statevector, nr_procs, sv_nints, apply_por); 2181 | fclose(fout); 2182 | } else { 2183 | fprintf(stderr, "Could not open file to dump the state space\n"); 2184 | } 2185 | } 2186 | 2187 | return 0; 2188 | } 2189 | --------------------------------------------------------------------------------