├── .gitignore ├── examples ├── saxpy │ ├── params.h │ ├── params_directed.h │ ├── params_common.h │ ├── Makefile │ ├── saxpy_single.cu │ └── saxpy_double.cu ├── negatives │ ├── Makefile │ ├── arrival.cu │ ├── different.cu │ ├── over.cu │ └── deadlock.cu ├── sgemv │ ├── Makefile │ ├── vec_single.cu │ ├── vec_manual.cu │ ├── both_single.cu │ ├── vec_double.cu │ ├── both_manual.cu │ └── both_double.cu ├── RTM │ ├── Makefile │ ├── one_phase_single_buffer.cu │ └── two_phase_single_buffer.cu ├── PRF │ └── Makefile ├── DME │ └── Makefile ├── Heptane │ └── Makefile └── run_examples.sh ├── src ├── Makefile ├── race.h ├── graph.h ├── program.h ├── weft.h ├── race.cc ├── weft.cc └── instruction.h ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | -------------------------------------------------------------------------------- /examples/saxpy/params.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "params_directed.h" 18 | #include "params_common.h" 19 | -------------------------------------------------------------------------------- /examples/negatives/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | INPUTS := arrival.cu deadlock.cu different.cu over.cu 18 | OUTPUTS := $(INPUTS:.cu=.ptx) 19 | 20 | %.ptx : %.cu 21 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 22 | 23 | .PHONY: all 24 | all: normal 25 | 26 | normal: $(OUTPUTS) 27 | 28 | clean: 29 | rm -f *.ptx 30 | 31 | -------------------------------------------------------------------------------- /examples/negatives/arrival.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(64,1) 19 | arrival_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) 23 | asm volatile("bar.arrive 1, 64;" : : : "memory"); 24 | else if (wid == 1) 25 | asm volatile("bar.arrive 1, 64;" : : : "memory"); 26 | } 27 | 28 | -------------------------------------------------------------------------------- /examples/negatives/different.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(64,1) 19 | different_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) 23 | asm volatile("bar.sync 0, 96;" : : : "memory"); 24 | else if (wid == 1) 25 | asm volatile("bar.sync 0, 128;" : : : "memory"); 26 | } 27 | 28 | -------------------------------------------------------------------------------- /examples/saxpy/params_directed.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #define SAXPY_KERNEL saxpy_cudaDMA_doublebuffer 18 | #define CTA_COUNT 14 19 | #define COMPUTE_THREADS_PER_CTA 32 * 8 20 | #ifndef NUM_ITERS 21 | #define NUM_ITERS 2048 22 | #endif 23 | #define DMA_THREADS_PER_LD 32 * 1 24 | #define BYTES_PER_DMA_THREAD 32 25 | #define DMA_SZ 4 * COMPUTE_THREADS_PER_CTA 26 | -------------------------------------------------------------------------------- /examples/negatives/over.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(96,1) 19 | arrival_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) 23 | asm volatile("bar.sync 0, 64;" : : : "memory"); 24 | else if (wid == 1) 25 | asm volatile("bar.arrive 0, 64;" : : : "memory"); 26 | else if (wid == 2) 27 | asm volatile("bar.sync 0, 64;" : : : "memory"); 28 | } 29 | -------------------------------------------------------------------------------- /examples/negatives/deadlock.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(64,1) 19 | deadlock_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) { 23 | asm volatile("bar.sync 0, 64;" : : : "memory"); 24 | asm volatile("bar.arrive 1, 64;" : : : "memory"); 25 | } else { 26 | asm volatile("bar.sync 1, 64;" : : : "memory"); 27 | asm volatile("bar.arrive 0, 64;" : : : "memory"); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/saxpy/params_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // These are computed from the above parameters 18 | #define DMA_THREADS_PER_CTA ( (SAXPY_KERNEL==saxpy_cudaDMA_doublebuffer) ? 4 : 2 ) * DMA_THREADS_PER_LD 19 | #define THREADS_PER_CTA \ 20 | (SAXPY_KERNEL==saxpy_cudaDMA_doublebuffer) ? (COMPUTE_THREADS_PER_CTA+DMA_THREADS_PER_CTA) : \ 21 | (SAXPY_KERNEL==saxpy_cudaDMA) ? 
(COMPUTE_THREADS_PER_CTA+DMA_THREADS_PER_CTA) : \ 22 | COMPUTE_THREADS_PER_CTA 23 | -------------------------------------------------------------------------------- /examples/saxpy/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | INPUTS := saxpy_single.cu saxpy_double.cu 18 | OUTPUTS := $(INPUTS:.cu=.ptx) 19 | SMALL_OUTPUTS := $(INPUTS:.cu=_small.ptx) 20 | 21 | %_small.ptx : %.cu 22 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DNUM_ITERS=8 $< 23 | 24 | %.ptx : %.cu 25 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 26 | 27 | .PHONY: all 28 | all: normal 29 | 30 | normal: $(OUTPUTS) 31 | 32 | .PHONY: small 33 | small: $(SMALL_OUTPUTS) 34 | 35 | clean: 36 | rm -f *.ptx 37 | 38 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | OUTFILE := weft 18 | 19 | .PHONY: all 20 | all: $(OUTFILE) 21 | 22 | GCC = g++ 23 | CC_FLAGS = -O2 -Wall 24 | LD_FLAGS = -O2 -lpthread 25 | 26 | UNAME = $(shell uname) 27 | ifeq ($(UNAME),Linux) 28 | LD_FLAGS += -lrt 29 | endif 30 | 31 | FILES = weft.cc \ 32 | race.cc \ 33 | graph.cc \ 34 | program.cc \ 35 | instruction.cc 36 | 37 | OBJS := $(FILES:.cc=.o) 38 | 39 | %.o : %.cc 40 | $(GCC) -c $(CC_FLAGS) $< 41 | 42 | $(OUTFILE) : $(OBJS) 43 | $(GCC) -o $(OUTFILE) $(OBJS) $(LD_FLAGS) 44 | 45 | clean: 46 | rm -f *.o $(OUTFILE) 47 | -------------------------------------------------------------------------------- /examples/sgemv/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | INPUTS := vec_single.cu \ 18 | vec_double.cu \ 19 | vec_manual.cu \ 20 | both_single.cu \ 21 | both_double.cu \ 22 | both_manual.cu 23 | OUTPUTS := $(INPUTS:.cu=.ptx) 24 | SMALL_OUTPUTS := $(INPUTS:.cu=_small.ptx) 25 | 26 | %_small.ptx : %.cu 27 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DSGEMV_ITERS=8 $< 28 | 29 | %.ptx : %.cu 30 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 31 | 32 | .PHONY: all 33 | all: normal 34 | 35 | normal: $(OUTPUTS) 36 | 37 | .PHONY: small 38 | small: $(SMALL_OUTPUTS) 39 | 40 | clean: 41 | rm -f *.ptx 42 | 43 | -------------------------------------------------------------------------------- /examples/RTM/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | INPUTS := one_phase_single_buffer.cu \ 18 | one_phase_manual_buffer.cu \ 19 | two_phase_single_buffer.cu \ 20 | two_phase_manual_buffer.cu \ 21 | two_phase_quad_buffer.cu 22 | 23 | OUTPUTS := $(INPUTS:.cu=.ptx) 24 | SMALL_OUTPUTS := $(INPUTS:.cu=_small.ptx) 25 | DYNAMIC_OUTPUTS := $(INPUTS:.cu=_dynamic.ptx) 26 | 27 | %_small.ptx : %.cu 28 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DRTM_ELMTS=16 $< 29 | 30 | %_dynamic.ptx : %.cu 31 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DDYNAMIC $< 32 | 33 | %.ptx : %.cu 34 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 $< 35 | 36 | .PHONY: all 37 | all: normal 38 | 39 | normal: $(OUTPUTS) 40 | 41 | .PHONY: small 42 | small: $(SMALL_OUTPUTS) 43 | 44 | .PHONY: dynamic 45 | dynamic: $(DYNAMIC_OUTPUTS) 46 | 47 | clean: 48 | rm -f *.ptx 49 | 50 | -------------------------------------------------------------------------------- /examples/PRF/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | FERMI_INPUTS := diff_fermi.cu \ 18 | visc_fermi.cu 19 | 20 | KEPLER_INPUTS := diff_kepler.cu \ 21 | visc_kepler.cu 22 | 23 | FERMI_OUTPUTS := $(FERMI_INPUTS:.cu=.ptx) 24 | KEPLER_OUTPUTS := $(KEPLER_INPUTS:.cu=.ptx) 25 | 26 | SMALLF_OUTPUTS := $(FERMI_INPUTS:.cu=_small.ptx) 27 | SMALLK_OUTPUTS := $(KEPLER_INPUTS:.cu=_small.ptx) 28 | DYNAMICF_OUTPUTS:= $(FERMI_INPUTS:.cu=_dynamic.ptx) 29 | DYNAMICK_OUTPUTS:= $(KEPLER_INPUTS:.cu=_dynamic.ptx) 30 | 31 | %_fermi_small.ptx : %_fermi.cu 32 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DTOTAL_STEPS=4 $< 33 | 34 | %_fermi_dynamic.ptx : %_fermi.cu 35 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DDYNAMIC $< 36 | 37 | %_fermi.ptx : %_fermi.cu 38 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 $< 39 | 40 | %_kepler_small.ptx : %_kepler.cu 41 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DTOTAL_STEPS=4 $< 42 | 43 | %_kepler_dynamic.ptx : %_kepler.cu 44 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DDYNAMIC $< 45 | 46 | %_kepler.ptx : %_kepler.cu 47 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 $< 48 | 49 | 50 | .PHONY: all 51 | all: normal 52 | 53 | normal: $(FERMI_OUTPUTS) $(KEPLER_OUTPUTS) 54 | 55 | .PHONY: small 56 | small: $(SMALLF_OUTPUTS) $(SMALLK_OUTPUTS) 57 | 58 | .PHONY: dynamic 59 | dynamic: $(DYNAMICF_OUTPUTS) $(DYNAMICK_OUTPUTS) 60 | 61 | clean: 62 | rm -f *.ptx 63 | 64 | -------------------------------------------------------------------------------- /examples/DME/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | FERMI_INPUTS := diff_fermi.cu \ 18 | visc_fermi.cu \ 19 | chem_fermi.cu 20 | 21 | KEPLER_INPUTS := diff_kepler.cu \ 22 | visc_kepler.cu \ 23 | chem_kepler.cu 24 | 25 | FERMI_OUTPUTS := $(FERMI_INPUTS:.cu=.ptx) 26 | KEPLER_OUTPUTS := $(KEPLER_INPUTS:.cu=.ptx) 27 | 28 | SMALLF_OUTPUTS := $(FERMI_INPUTS:.cu=_small.ptx) 29 | SMALLK_OUTPUTS := $(KEPLER_INPUTS:.cu=_small.ptx) 30 | DYNAMICF_OUTPUTS:= $(FERMI_INPUTS:.cu=_dynamic.ptx) 31 | DYNAMICK_OUTPUTS:= $(KEPLER_INPUTS:.cu=_dynamic.ptx) 32 | 33 | %_fermi_small.ptx : %_fermi.cu 34 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DTOTAL_STEPS=4 $< 35 | 36 | %_fermi_dynamic.ptx : %_fermi.cu 37 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DDYNAMIC $< 38 | 39 | %_fermi.ptx : %_fermi.cu 40 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 41 | 42 | %_kepler_small.ptx : %_kepler.cu 43 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DTOTAL_STEPS=4 $< 44 | 45 | %_kepler_dynamic.ptx : %_kepler.cu 46 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DDYNAMIC $< 47 | 48 | %_kepler.ptx : %_kepler.cu 49 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 $< 50 | 51 | 52 | .PHONY: all 53 | all: normal 54 | 55 | normal: $(FERMI_OUTPUTS) $(KEPLER_OUTPUTS) 56 | 57 | .PHONY: small 58 | small: $(SMALLF_OUTPUTS) $(SMALLK_OUTPUTS) 59 | 60 | .PHONY: dynamic 61 | dynamic: $(DYNAMICF_OUTPUTS) $(DYNAMICK_OUTPUTS) 62 | 63 | clean: 64 | rm -f *.ptx 65 | 66 | -------------------------------------------------------------------------------- /examples/Heptane/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | FERMI_INPUTS := diff_fermi.cu \ 18 | visc_fermi.cu \ 19 | chem_fermi.cu 20 | 21 | KEPLER_INPUTS := diff_kepler.cu \ 22 | visc_kepler.cu \ 23 | chem_kepler.cu 24 | 25 | FERMI_OUTPUTS := $(FERMI_INPUTS:.cu=.ptx) 26 | KEPLER_OUTPUTS := $(KEPLER_INPUTS:.cu=.ptx) 27 | 28 | SMALLF_OUTPUTS := $(FERMI_INPUTS:.cu=_small.ptx) 29 | SMALLK_OUTPUTS := $(KEPLER_INPUTS:.cu=_small.ptx) 30 | DYNAMICF_OUTPUTS:= $(FERMI_INPUTS:.cu=_dynamic.ptx) 31 | DYNAMICK_OUTPUTS:= $(KEPLER_INPUTS:.cu=_dynamic.ptx) 32 | 33 | %_fermi_small.ptx : %_fermi.cu 34 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DTOTAL_STEPS=4 $< 35 | 36 | %_fermi_dynamic.ptx : %_fermi.cu 37 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DDYNAMIC $< 38 | 39 | %_fermi.ptx : %_fermi.cu 40 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 $< 41 | 42 | %_kepler_small.ptx : %_kepler.cu 43 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DTOTAL_STEPS=4 $< 44 | 45 | %_kepler_dynamic.ptx : %_kepler.cu 46 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DDYNAMIC $< 47 | 48 | %_kepler.ptx : %_kepler.cu 49 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 $< 50 | 51 | 52 | .PHONY: all 53 | all: normal 54 | 55 | normal: $(FERMI_OUTPUTS) $(KEPLER_OUTPUTS) 56 | 57 | .PHONY: small 58 | 
small: $(SMALLF_OUTPUTS) $(SMALLK_OUTPUTS) 59 | 60 | .PHONY: dynamic 61 | dynamic: $(DYNAMICF_OUTPUTS) $(DYNAMICK_OUTPUTS) 62 | 63 | clean: 64 | rm -f *.ptx 65 | 66 | -------------------------------------------------------------------------------- /examples/sgemv/vec_single.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_vec_single 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 1 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 128 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(160,1) 44 | sgemvn_cuda_dma_vec_single(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff[VEC_ELMTS]; 47 | 48 | cudaDMASequential 49 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 50 | 51 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 52 | { 53 | dma_ld_0.start_async_dma(); 54 | int ind = blockIdx.x*COMPUTE_THREADS_PER_CTA + threadIdx.x; 55 | 56 | A += ind; 57 | 58 | float res = 0.f; 59 | 60 | #ifdef DYNAMIC 61 | #pragma unroll 1 62 | for(int i=0; in1) 79 | { 80 | buff[threadIdx.x] = x[n1]; 81 | 82 | __syncthreads(); 83 | for(int j=0; j<(m-n1); j++) 84 | { 85 | res += A[0]*buff[j]; 86 | A+=lda; 87 | } 88 | } 89 | #endif 90 | 91 | if (ind 18 | #include 19 | 20 | #include "cudaDMA.h" 21 | #include "params.h" 22 | 23 | /* 24 | * This version of saxpy uses cudaDMA for DMAs (but requires 2 CTAs/SM) for double buffering. 
25 | */ 26 | __global__ void 27 | __launch_bounds__(320,2) 28 | saxpy_cudaDMA ( float* y, float* x, float a, clock_t * timer_vals) 29 | { 30 | __shared__ float sdata_x0 [COMPUTE_THREADS_PER_CTA]; 31 | __shared__ float sdata_y0 [COMPUTE_THREADS_PER_CTA]; 32 | 33 | cudaDMASequential 34 | dma_ld_x_0 (1, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA ); 35 | cudaDMASequential 36 | dma_ld_y_0 (2, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + DMA_THREADS_PER_LD ); 37 | 38 | int tid = threadIdx.x ; 39 | 40 | if ( tid < COMPUTE_THREADS_PER_CTA ) { 41 | unsigned int idx; 42 | int i; 43 | float tmp_x; 44 | float tmp_y; 45 | 46 | // Preamble: 47 | dma_ld_x_0.start_async_dma(); 48 | dma_ld_y_0.start_async_dma(); 49 | #pragma unroll 1 50 | for (i = 0; i < NUM_ITERS-1; ++i) { 51 | dma_ld_x_0.wait_for_dma_finish(); 52 | tmp_x = sdata_x0[tid]; 53 | dma_ld_x_0.start_async_dma(); 54 | dma_ld_y_0.wait_for_dma_finish(); 55 | tmp_y = sdata_y0[tid]; 56 | dma_ld_y_0.start_async_dma(); 57 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 58 | y[idx] = a * tmp_x + tmp_y; 59 | } 60 | // Postamble: 61 | dma_ld_x_0.wait_for_dma_finish(); 62 | tmp_x = sdata_x0[tid]; 63 | dma_ld_y_0.wait_for_dma_finish(); 64 | tmp_y = sdata_y0[tid]; 65 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 66 | y[idx] = a * tmp_x + tmp_y; 67 | 68 | } else if (dma_ld_x_0.owns_this_thread()) { 69 | #pragma unroll 1 70 | for (unsigned int j = 0; j < NUM_ITERS; ++j) { 71 | // idx is a pointer to the base of the chunk of memory to copy 72 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 73 | dma_ld_x_0.execute_dma( &x[idx], sdata_x0 ); 74 | } 75 | } else if (dma_ld_y_0.owns_this_thread()) { 76 | #pragma unroll 1 77 | for (unsigned int j = 0; j < NUM_ITERS; ++j) { 78 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * 
COMPUTE_THREADS_PER_CTA; 79 | dma_ld_y_0.execute_dma( &y[idx], sdata_y0 ); 80 | } 81 | } 82 | } 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /examples/sgemv/vec_manual.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_vec_manual 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 1 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 512 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 64 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(160,1) 44 | sgemvn_cuda_dma_vec_manual(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | 49 | 50 | cudaDMASequential 51 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 52 | cudaDMASequential 53 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 54 | 55 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 56 | { 57 | dma_ld_0.start_async_dma(); 58 | 
dma_ld_1.start_async_dma(); 59 | int ind = blockIdx.x*COMPUTE_THREADS_PER_CTA + threadIdx.x; 60 | 61 | A += ind; 62 | 63 | float res = 0.f; 64 | 65 | #ifdef DYNAMIC 66 | #pragma unroll 1 67 | for(int i=0; i 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | class Weft; 28 | class Thread; 29 | class Program; 30 | class WeftAccess; 31 | class WeftBarrier; 32 | class SharedMemory; 33 | class PTXInstruction; 34 | 35 | class Happens { 36 | public: 37 | Happens(int total_threads); 38 | Happens(const Happens &rhs) { assert(false); } 39 | ~Happens(void) { } 40 | public: 41 | Happens& operator=(const Happens &rhs) { assert(false); return *this; } 42 | public: 43 | void update_barriers_before(const std::vector &before); 44 | void update_barriers_after(const std::vector &after); 45 | public: 46 | void update_happens_relationships(void); 47 | bool has_happens(int thread, int line_number); 48 | protected: 49 | bool initialized; 50 | std::vector latest_before; 51 | std::vector earliest_after; 52 | std::vector happens_before; 53 | std::vector happens_after; 54 | }; 55 | 56 | class Address { 57 | public: 58 | Address(const int addr, SharedMemory *memory); 59 | Address(const Address &rhs) : address(0), memory(NULL) { assert(false); } 60 | ~Address(void); 61 | public: 62 | Address& operator=(const Address &rhs) { assert(false); return *this; } 63 | public: 64 | void add_access(WeftAccess *access); 65 | void perform_race_tests(void); 66 | int report_races(std::map< 67 | std::pair,size_t> &all_races); 68 | size_t count_race_tests(void); 69 | protected: 70 | void record_race(WeftAccess *one, WeftAccess *two); 71 | public: 72 | const int address; 73 | SharedMemory *const memory; 74 | protected: 75 | pthread_mutex_t address_lock; 76 | std::vector accesses; 77 | protected: 78 | int total_races; 79 | std::map, 80 | std::set > > ptx_races; 81 | }; 82 | 83 | class SharedMemory { 84 | public: 85 | SharedMemory(Weft *weft, Program *program); 86 | SharedMemory(const 
SharedMemory &rhs) : weft(NULL), program(NULL) { assert(false); } 87 | ~SharedMemory(void); 88 | public: 89 | SharedMemory& operator=(const SharedMemory &rhs) 90 | { assert(false); return *this; } 91 | public: 92 | void update_accesses(WeftAccess *access); 93 | int count_addresses(void) const; 94 | void enqueue_race_checks(void); 95 | void check_for_races(void); 96 | size_t count_race_tests(void); 97 | public: 98 | Weft *const weft; 99 | Program *const program; 100 | protected: 101 | pthread_mutex_t memory_lock; 102 | std::map addresses; 103 | }; 104 | 105 | #endif // __RACE_H__ 106 | -------------------------------------------------------------------------------- /examples/sgemv/both_single.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_both_single 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 5 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 32 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(288,1) 44 | sgemvn_cuda_dma_both_single(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | 47 | __shared__ float buff[VEC_ELMTS]; 48 | __shared__ float mat[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 49 | 50 | cudaDMASequential 51 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 52 | 53 | cudaDMAStrided 54 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD,4*lda); 55 | 56 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 57 | { 58 | dma_ld_0.start_async_dma(); 59 | dma_ld_1.start_async_dma(); 60 | 61 | float res = 0.f; 62 | 63 | #ifdef DYNAMIC 64 | #pragma unroll 1 65 | for(int i=0; i 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_vec_double 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 2 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 128 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(192,1) 44 | sgemvn_cuda_dma_vec_double(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | 49 | cudaDMASequential 50 | 
dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 51 | cudaDMASequential 52 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD); 53 | 54 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 55 | { 56 | dma_ld_0.start_async_dma(); 57 | dma_ld_1.start_async_dma(); 58 | int ind = blockIdx.x*COMPUTE_THREADS_PER_CTA + threadIdx.x; 59 | 60 | A += ind; 61 | 62 | float res = 0.f; 63 | 64 | #ifdef DYNAMIC 65 | #pragma unroll 1 66 | for(int i=0; i 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_both_manual 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 9 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 32 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 64 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(416,1) 44 | sgemvn_cuda_dma_both_manual(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | __shared__ float mat0[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 49 | __shared__ float mat1[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 50 | 51 | cudaDMASequential 52 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 53 | 54 | cudaDMASequential 55 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 56 | 57 | cudaDMAStrided 58 | dma_ld_2(3,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD,4*lda); 59 | 60 | cudaDMAStrided 61 | dma_ld_3(4,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD,4*lda); 62 | 63 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 64 | { 65 | dma_ld_0.start_async_dma(); 66 | dma_ld_1.start_async_dma(); 67 | dma_ld_2.start_async_dma(); 68 | dma_ld_3.start_async_dma(); 69 | 70 | float res = 0.f; 71 | 72 | #ifdef DYNAMIC 73 
| #pragma unroll 1 74 | for(int i=0; i 18 | #include 19 | 20 | #include "cudaDMA.h" 21 | #include "params.h" 22 | 23 | /* 24 | * This version of saxpy uses cudaDMA for DMAs with manual double buffering. 25 | */ 26 | __global__ void 27 | __launch_bounds__(384,2) 28 | saxpy_cudaDMA_doublebuffer ( float* y, float* x, float a, clock_t * timer_vals) 29 | { 30 | __shared__ float sdata_x0 [COMPUTE_THREADS_PER_CTA]; 31 | __shared__ float sdata_x1 [COMPUTE_THREADS_PER_CTA]; 32 | __shared__ float sdata_y0 [COMPUTE_THREADS_PER_CTA]; 33 | __shared__ float sdata_y1 [COMPUTE_THREADS_PER_CTA]; 34 | 35 | cudaDMASequential 36 | dma_ld_x_0 (1, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA); 37 | cudaDMASequential 38 | dma_ld_y_0 (2, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + DMA_THREADS_PER_LD); 39 | cudaDMASequential 40 | dma_ld_x_1 (3, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + 2*DMA_THREADS_PER_LD); 41 | cudaDMASequential 42 | dma_ld_y_1 (4, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + 3*DMA_THREADS_PER_LD); 43 | 44 | int tid = threadIdx.x ; 45 | 46 | if ( tid < COMPUTE_THREADS_PER_CTA ) { 47 | unsigned int idx; 48 | int i; 49 | float tmp_x; 50 | float tmp_y; 51 | 52 | // Preamble: 53 | dma_ld_x_0.start_async_dma(); 54 | dma_ld_y_0.start_async_dma(); 55 | dma_ld_x_1.start_async_dma(); 56 | dma_ld_y_1.start_async_dma(); 57 | #pragma unroll 1 58 | for (i = 0; i < NUM_ITERS-2; i += 2) { 59 | 60 | // Phase 1: 61 | dma_ld_x_0.wait_for_dma_finish(); 62 | tmp_x = sdata_x0[tid]; 63 | dma_ld_x_0.start_async_dma(); 64 | dma_ld_y_0.wait_for_dma_finish(); 65 | tmp_y = sdata_y0[tid]; 66 | dma_ld_y_0.start_async_dma(); 67 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 68 | y[idx] = a * tmp_x + tmp_y; 69 | 70 | // Phase 2: 71 | dma_ld_x_1.wait_for_dma_finish(); 72 | tmp_x = sdata_x1[tid]; 73 | dma_ld_x_1.start_async_dma(); 74 | dma_ld_y_1.wait_for_dma_finish(); 75 | tmp_y = sdata_y1[tid]; 76 | 
dma_ld_y_1.start_async_dma(); 77 | idx = (i+1) * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 78 | y[idx] = a * tmp_x + tmp_y; 79 | } 80 | 81 | // Postamble 82 | dma_ld_x_0.wait_for_dma_finish(); 83 | tmp_x = sdata_x0[tid]; 84 | dma_ld_y_0.wait_for_dma_finish(); 85 | tmp_y = sdata_y0[tid]; 86 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 87 | y[idx] = a * tmp_x + tmp_y; 88 | dma_ld_x_1.wait_for_dma_finish(); 89 | tmp_x = sdata_x1[tid]; 90 | dma_ld_y_1.wait_for_dma_finish(); 91 | tmp_y = sdata_y1[tid]; 92 | idx = (i+1) * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 93 | y[idx] = a * tmp_x + tmp_y; 94 | 95 | } else if (dma_ld_x_0.owns_this_thread()) { 96 | #pragma unroll 1 97 | for (unsigned int j = 0; j < NUM_ITERS; j+=2) { 98 | // idx is a pointer to the base of the chunk of memory to copy 99 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 100 | dma_ld_x_0.execute_dma( &x[idx], sdata_x0 ); 101 | } 102 | } else if (dma_ld_y_0.owns_this_thread()) { 103 | #pragma unroll 1 104 | for (unsigned int j = 0; j < NUM_ITERS; j+=2) { 105 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 106 | dma_ld_y_0.execute_dma( &y[idx], sdata_y0 ); 107 | } 108 | } else if (dma_ld_x_1.owns_this_thread()) { 109 | #pragma unroll 1 110 | for (unsigned int j = 1; j < NUM_ITERS; j+=2) { 111 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 112 | dma_ld_x_1.execute_dma( &x[idx], sdata_x1 ); 113 | } 114 | } else if (dma_ld_y_1.owns_this_thread()) { 115 | #pragma unroll 1 116 | for (unsigned int j = 1; j < NUM_ITERS; j+=2) { 117 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 118 | dma_ld_y_1.execute_dma( &y[idx], sdata_y1 ); 119 | } 120 | } 121 | } 
122 | 123 | -------------------------------------------------------------------------------- /examples/sgemv/both_double.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_both_double 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 10 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 32 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(448,1) 44 | sgemvn_cuda_dma_both_double(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | __shared__ float mat0[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 49 | __shared__ float mat1[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 50 | 51 | cudaDMASequential 52 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 53 | 54 | cudaDMASequential 55 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD); 56 | 57 | cudaDMAStrided 58 | 
dma_ld_2(3,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+2*DMA_THREADS_PER_LD,4*lda); 59 | 60 | cudaDMAStrided 61 | dma_ld_3(4,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+6*DMA_THREADS_PER_LD,4*lda); 62 | 63 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 64 | { 65 | dma_ld_0.start_async_dma(); 66 | dma_ld_1.start_async_dma(); 67 | dma_ld_2.start_async_dma(); 68 | dma_ld_3.start_async_dma(); 69 | 70 | float res = 0.f; 71 | 72 | #ifdef DYNAMIC 73 | #pragma unroll 1 74 | for(int i=0; i 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | class Weft; 27 | class Thread; 28 | class WeftBarrier; 29 | class BarrierArrive; 30 | class WeftInstruction; 31 | class BarrierDependenceGraph; 32 | 33 | class BarrierInstance { 34 | public: 35 | BarrierInstance(BarrierDependenceGraph *graph, int name, int generation); 36 | BarrierInstance(const BarrierInstance &rhs); 37 | ~BarrierInstance(void); 38 | public: 39 | BarrierInstance& operator=(const BarrierInstance &rhs); 40 | public: 41 | void update_waiting_threads(std::set &waiting_threads); 42 | bool intersects_with(const std::set &waiting_threads); 43 | bool happens_after(BarrierInstance *other); 44 | bool happens_before(const std::vector &other_participants); 45 | public: 46 | void add_participant(WeftBarrier *participant, bool sync); 47 | bool has_next(BarrierInstance *other); 48 | bool has_previous(BarrierInstance *other); 49 | void add_incoming(BarrierInstance *other); 50 | void add_outgoing(BarrierInstance *other); 51 | void remove_incoming(int name, int gen); 52 | void remove_outgoing(int name, int gen); 53 | public: 54 | void initialize_pending_counts(void); 55 | template 56 | void launch_if_ready(Weft *weft, bool forward); 57 | template 58 | void notify_dependences(Weft *weft, bool forward); 59 | void compute_reachability(Weft *weft, bool forward); 60 | void compute_transitivity(Weft *weft, bool forward); 61 | void update_latest_incoming(std::vector &other); 62 | void update_earliest_outgoing(std::vector 
&other); 63 | void update_latest_before(std::vector &other); 64 | void update_earliest_after(std::vector &other); 65 | public: 66 | void traverse_forward(std::deque &queue, 67 | std::set &visited); 68 | public: 69 | BarrierDependenceGraph *const graph; 70 | const int name; 71 | const int generation; 72 | protected: 73 | std::vector participants; 74 | // Helpful for constructing barrier dependence graph 75 | std::map syncs_only; 76 | protected: 77 | std::vector incoming; 78 | std::vector outgoing; 79 | protected: 80 | std::vector latest_incoming; 81 | std::vector earliest_outgoing; 82 | protected: 83 | std::vector latest_before; 84 | std::vector earliest_after; 85 | protected: 86 | int base_incoming; 87 | int base_outgoing; 88 | int pending_incoming; 89 | int pending_outgoing; 90 | }; 91 | 92 | class BarrierDependenceGraph { 93 | private: 94 | struct PendingState { 95 | public: 96 | PendingState(void) 97 | : expected(-1), generation(0) { } 98 | public: 99 | inline void reset(void) { 100 | expected = -1; 101 | generation++; 102 | arrivals.clear(); 103 | } 104 | public: 105 | int expected; 106 | int generation; 107 | std::set arrivals; 108 | }; 109 | struct PreceedingBarriers { 110 | public: 111 | PreceedingBarriers(void) { } 112 | public: 113 | void find_preceeding(BarrierInstance *bar); 114 | void add_instance(BarrierInstance *bar); 115 | public: 116 | // This is an upper bound on all arrivals 117 | std::set arrival_threads; 118 | std::deque previous; 119 | }; 120 | public: 121 | BarrierDependenceGraph(Weft *weft, Program *p); 122 | BarrierDependenceGraph(const BarrierDependenceGraph &rhs); 123 | ~BarrierDependenceGraph(void); 124 | public: 125 | BarrierDependenceGraph& operator=(const BarrierDependenceGraph &rhs); 126 | public: 127 | void construct_graph(const std::vector &threads); 128 | int count_validation_tasks(void); 129 | void enqueue_validation_tasks(void); 130 | void check_for_validation_errors(void); 131 | void validate_barrier(int name, int generation); 
132 | public: 133 | int count_total_barriers(void); 134 | void enqueue_reachability_tasks(void); 135 | void enqueue_transitive_happens_tasks(void); 136 | protected: 137 | bool remove_complete_barriers(std::vector &program_counters, 138 | std::vector &pending_arrives, 139 | std::vector &preceeding, 140 | const std::vector &threads); 141 | bool are_empty(const std::vector &program_counters, 142 | const std::vector &threads); 143 | bool advance_program_counters(std::vector &program_counters, 144 | std::vector &pending_arrives, 145 | const std::vector &threads); 146 | void report_state(const std::vector &program_counters, 147 | const std::vector &threads, 148 | const std::vector &pending_arrives); 149 | protected: 150 | void initialize_pending_counts(void); 151 | public: 152 | Weft *const weft; 153 | Program *const program; 154 | const int max_num_barriers; 155 | protected: 156 | std::vector > barrier_instances; 157 | // A summary of all barriers in one place 158 | std::deque all_barriers; 159 | protected: 160 | pthread_mutex_t validation_mutex; 161 | std::vector > failed_validations; 162 | }; 163 | 164 | class BFSSearch { 165 | public: 166 | BFSSearch(BarrierInstance *source, BarrierInstance *target); 167 | BFSSearch(const BFSSearch &rhs) : source(NULL), target(NULL) { assert(false); } 168 | ~BFSSearch(void) { } 169 | public: 170 | BFSSearch& operator=(const BFSSearch &rhs) { assert(false); return *this; } 171 | public: 172 | bool execute(void); 173 | public: 174 | BarrierInstance *const source; 175 | BarrierInstance *const target; 176 | protected: 177 | std::deque queue; 178 | std::set visited; 179 | }; 180 | 181 | #endif // __BARRIER_DEPENDENCE_GRAPH_H__ 182 | -------------------------------------------------------------------------------- /src/program.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef __PROGRAM_H__ 18 | #define __PROGRAM_H__ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | enum ThreadStatus { 28 | THREAD_ENABLED, 29 | THREAD_DISABLED, 30 | THREAD_EXITTED, 31 | }; 32 | 33 | enum ProgramStage { 34 | EMULATE_THREADS_STAGE, 35 | CONSTRUCT_BARRIER_GRAPH_STAGE, 36 | COMPUTE_HAPPENS_RELATIONSHIP_STAGE, 37 | CHECK_FOR_RACES_STAGE, 38 | TOTAL_STAGES, 39 | }; 40 | 41 | class Weft; 42 | class Thread; 43 | class Happens; 44 | class PTXLabel; 45 | class WeftAccess; 46 | class SharedMemory; 47 | class PTXInstruction; 48 | class WeftInstruction; 49 | 50 | struct ThreadState { 51 | public: 52 | ThreadState(void) 53 | : status(THREAD_ENABLED), next(NULL) { } 54 | public: 55 | ThreadStatus status; 56 | PTXLabel *next; 57 | }; 58 | 59 | class Program { 60 | public: 61 | struct CTAState { 62 | public: 63 | CTAState(void) 64 | : shared_memory(NULL), graph(NULL) { } 65 | public: 66 | int block_id[3]; 67 | SharedMemory *shared_memory; 68 | BarrierDependenceGraph *graph; 69 | std::vector threads; 70 | }; 71 | public: 72 | Program(Weft *weft, std::string &kernel_name); 73 | Program(const Program &rhs); 74 | ~Program(void); 75 | public: 76 | Program& operator=(const Program &rhs); 77 | public: 78 | static void parse_ptx_file(const char *file_name, Weft *weft, 79 | std::vector &programs); 80 | void report_statistics(void); 81 | void report_statistics(const std::vector 
&threads); 82 | bool has_shuffles(void) const; 83 | inline int count_instructions(void) const { return ptx_instructions.size(); } 84 | inline int barrier_upper_bound(void) const { return max_num_barriers; } 85 | inline int thread_count(void) const { return max_num_threads; } 86 | inline bool assume_warp_synchronous(void) const { return warp_synchronous; } 87 | inline const char* get_name(void) const { return kernel_name.c_str(); } 88 | protected: 89 | void emulate_threads(void); 90 | void construct_dependence_graph(void); 91 | void compute_happens_relationships(void); 92 | void check_for_race_conditions(void); 93 | void print_statistics(void); 94 | void print_files(void); 95 | int count_dynamic_instructions(void); 96 | int count_weft_statements(void); 97 | int count_total_barriers(void); 98 | int count_addresses(void); 99 | size_t count_race_tests(void); 100 | public: 101 | int emulate(Thread *thread); 102 | void emulate_warp(Thread **threads); 103 | void get_kernel_prefix(char *buffer, size_t count); 104 | public: 105 | void add_line(const std::string &line, int line_num); 106 | void set_block_dim(const int *array); 107 | void add_block_id(const int *array); 108 | void set_grid_dim(const int *array); 109 | void fill_block_dim(int *array) const; 110 | void fill_block_id(int *array) const; 111 | void fill_grid_dim(int *array) const; 112 | void verify(void); 113 | protected: 114 | void convert_to_instructions(const std::map &source_files); 115 | static bool parse_file_location(const std::string &line, 116 | std::map &source_files); 117 | static bool parse_source_location(const std::string &line, 118 | int &source_file, int &source_line); 119 | protected: 120 | void start_instrumentation(ProgramStage stage); 121 | void stop_instrumentation(ProgramStage stage); 122 | public: 123 | void report_instrumentation(size_t &accumulated_memory); 124 | public: 125 | Weft *const weft; 126 | protected: 127 | std::string kernel_name; 128 | int max_num_threads; 129 | int 
max_num_barriers; 130 | protected: 131 | int block_dim[3]; 132 | int block_id[3]; 133 | int grid_dim[3]; 134 | bool warp_synchronous; 135 | unsigned current_cta; 136 | std::vector cta_states; 137 | protected: 138 | std::vector > lines; 139 | std::vector ptx_instructions; 140 | protected: 141 | // Instrumentation 142 | unsigned long long timing[TOTAL_STAGES]; 143 | size_t memory_usage[TOTAL_STAGES]; 144 | }; 145 | 146 | class Thread { 147 | public: 148 | struct GlobalDataInfo { 149 | public: 150 | const char *name; 151 | const int *data; 152 | size_t size; 153 | }; 154 | public: 155 | Thread(unsigned thread_id, int tidx, int tidy, int tidz, 156 | Program *p, SharedMemory *s); 157 | Thread(const Thread &rhs) : thread_id(0), tid_x(-1), tid_y(-1), tid_z(-1), 158 | program(NULL), shared_memory(NULL) { assert(false); } 159 | ~Thread(void); 160 | public: 161 | Thread& operator=(const Thread &rhs) { assert(false); return *this; } 162 | public: 163 | void initialize(void); 164 | void emulate(void); 165 | void cleanup(void); 166 | public: 167 | void register_shared_location(const std::string &name, int64_t address); 168 | bool find_shared_location(const std::string &name, int64_t &addr); 169 | public: 170 | void register_global_location(const char *name, const int *data, size_t size); 171 | bool get_global_location(const char *name, int64_t &addr); 172 | bool get_global_value(int64_t addr, int64_t &value); 173 | public: 174 | void set_value(int64_t reg, int64_t value); 175 | bool get_value(int64_t reg, int64_t &value); 176 | public: 177 | void set_pred(int64_t pred, bool value); 178 | bool get_pred(int64_t pred, bool &value); 179 | public: 180 | void add_instruction(WeftInstruction *instruction); 181 | void update_max_barrier_name(int name); 182 | inline int get_max_barrier_name(void) const { return max_barrier_name; } 183 | public: 184 | void profile_instruction(PTXInstruction *instruction); 185 | int accumulate_instruction_counts(std::vector &total_counts); 186 | void 
dump_weft_thread(void); 187 | public: 188 | void update_shared_memory(WeftAccess *access); 189 | public: 190 | inline size_t get_program_size(void) const { return instructions.size(); } 191 | inline WeftInstruction* get_instruction(int idx) 192 | { return ((unsigned(idx) < instructions.size()) ? instructions[idx] : NULL); } 193 | inline int count_dynamic_instructions(void) const 194 | { return dynamic_instructions; } 195 | inline int count_weft_statements(void) const 196 | { return instructions.size(); } 197 | inline void set_dynamic_instructions(int count) { dynamic_instructions = count; } 198 | public: 199 | void initialize_happens(int total_threads, int max_num_barriers); 200 | void update_happens_relationships(void); 201 | protected: 202 | void initialize_happens_instances(int total_threads); 203 | void compute_barriers_before(int max_num_barriers); 204 | void compute_barriers_after(int max_num_barriers); 205 | public: 206 | const unsigned thread_id; 207 | const int tid_x, tid_y, tid_z; 208 | Program *const program; 209 | SharedMemory *const shared_memory; 210 | protected: 211 | std::map shared_locations; 212 | std::map register_store; 213 | std::map predicate_store; 214 | std::vector globals; 215 | protected: 216 | int max_barrier_name; 217 | int dynamic_instructions; 218 | std::vector instructions; 219 | std::vector dynamic_counts; 220 | protected: 221 | std::deque all_happens; 222 | }; 223 | 224 | class SharedStore { 225 | public: 226 | SharedStore(void) { } 227 | SharedStore(const SharedStore &rhs) { assert(false); } 228 | ~SharedStore(void) { } 229 | public: 230 | SharedStore& operator=(const SharedStore &rhs) { assert(false); return *this; } 231 | public: 232 | void write(int64_t addr, int64_t value); 233 | bool read(int64_t addr, int64_t &value); 234 | protected: 235 | std::map store; 236 | }; 237 | 238 | #endif //__PROGRAM_H__ 239 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | Weft 2 | ==== 3 | 4 | A Sound and Complete Verification Tool for Warp-Specialized GPU Kernels 5 | 6 | Update! Our paper on Weft, **Verification of Producer-Consumer 7 | Synchronization in GPU Programs** will be appearing at 8 | [PLDI 2015](http://conf.researchr.org/home/pldi2015). 9 | 10 | Navigation 11 | ---- 12 | 13 | 1. [Overview](#overview) 14 | 2. [Prerequisites](#prerequisites) 15 | 3. [Downloading Weft](#downloading-and-building-weft) 16 | 4. [Using Weft](#using-weft) 17 | 5. [Command Line Arguments](#command-line-arguments) 18 | 19 | Overview 20 | ---- 21 | 22 | Weft is a sound and complete verification tool for warp-specialized 23 | kernels that use named barriers on NVIDIA GPUs. Warp-specialized 24 | kernels can encode arbitrary producer-consumer relationships between 25 | different subsets of warps within a kernel using named barriers. 26 | This requires a more general analysis than most current GPU verification 27 | tools provide. 28 | 29 | Weft operates on the PTX code emitted by the CUDA compiler and verifies 30 | three important properties of any warp-specialized kernel. 31 | 32 | * Deadlock Freedom - the use of named barriers should not result in deadlock. 33 | * Safe Barrier Recycling - named barriers are a limited physical resource 34 | and it is important to check that they are 35 | safely recycled. 36 | * Race Freedom - checking that all shared memory accesses are properly 37 | synchronized by named barriers. 38 | 39 | Weft performs a fully static analysis which requires that the use of 40 | named barriers and shared memory accesses be statically analyzable. 41 | All operations which are not statically analyzable are ignored and 42 | can optionally be reported. 
In practice, we have found that for most 43 | GPU kernels this is not an issue because synchronization and shared 44 | memory accesses are not dependent on program input and therefore 45 | can be verified statically. 46 | 47 | Due to its generality, Weft is also capable of checking non-warp-specialized 48 | code as well for race freedom. The one caveat is that Weft currently 49 | does not attempt to check code that uses atomics. 50 | 51 | Prerequisites 52 | ---- 53 | 54 | Weft requires an installation of the CUDA compiler for generating 55 | input PTX files. The CUDA toolkit can be downloaded 56 | [here](https://developer.nvidia.com/cuda-downloads). Weft requires 57 | CUDA version 5.5 or later. 58 | 59 | Weft can be built with a standard C++ compiler. Weft has been tested 60 | with g++ and clang on both Linux and Mac systems. 61 | 62 | Downloading and Building Weft 63 | ---- 64 | 65 | Weft is available on github under the Apache Software License 66 | version 2.0. To clone a copy of the Weft source type: 67 | 68 | $ git clone https://github.com/lightsighter/Weft.git 69 | 70 | After cloning the repository, change into the `src` directory 71 | and type: 72 | 73 | $ make 74 | 75 | This will build the Weft binary `weft`. You may wish to add the 76 | directory containing the Weft binary to your path using the 77 | following command. 78 | 79 | $ export PATH=$PATH://src 80 | 81 | Using Weft 82 | ---- 83 | 84 | Using Weft to validate a CUDA source file is straightforward. 85 | The first step is to use the CUDA compiler to generate a PTX 86 | file for Weft to consume as input. Currently, Weft will only 87 | analyze the first kernel that it finds in a PTX file, so files 88 | containing multiple kernels should be divided into separate 89 | source files. 90 | 91 | To generate input for Weft, the CUDA compiler should be 92 | invoked with the `-ptx` flag to create an output PTX file. 
93 | We also recommend the CUDA compiler be called with the 94 | `-lineinfo` flag so Weft can provide output based on CUDA 95 | source code line numbers instead of PTX line numbers. In 96 | some cases, the flags for compute architecture (`-arch`) and 97 | machine size (`-m`) may need to be specified depending on the 98 | kernel being compiled. Below are the two ways that we invoke 99 | the CUDA compiler on all of our example kernels for the 100 | Fermi and Kepler architectures respectively. 101 | 102 | $ nvcc -ptx -lineinfo -m64 -arch=compute_20 source.cu 103 | $ nvcc -ptx -lineinfo -m64 -arch=compute_35 source.cu 104 | 105 | The resulting PTX file is the input to Weft. The PTX file name 106 | can either be specified to Weft using the `-f` flag or as the 107 | last argument. 108 | 109 | $ weft -f source.ptx -s -t 4 110 | $ weft -s -t 4 source.ptx 111 | 112 | As part of its validation, Weft needs to know how many threads 113 | are in each CTA. For kernels with 1-D CTAs, Weft can infer this 114 | information if the `__launch_bounds__` annotation was given on 115 | the original CUDA kernel. However, if this declaration did not exist on 116 | the original source kernel, then it must be explicitly specified 117 | using the `-n` flag. As an example, our `saxpy_single.cu` source 118 | file contains no `__launch_bounds__` declaration on its 119 | kernel, therefore we must tell Weft that the kernel requires CTAs 120 | containing 320 threads. 121 | 122 | $ weft -n 320 saxpy_single.ptx 123 | 124 | Note that the `-n` flag should also be used to specify multi-dimensional 125 | CTA shapes which cannot be captured by the `__launch_bounds__` 126 | annotation. Both of the following are valid examples: 127 | 128 | $ weft -n 320x1x1 saxpy_single.ptx 129 | $ weft -n 16x16 dgemm.ptx 130 | 131 | Weft supports a large set of command line flags which we cover in 132 | more detail [later](#command-line-arguments).
We mention two flags 133 | briefly now as they are often useful for many users. First, by default, 134 | Weft does not assume warp synchronous execution where all 135 | threads in a warp execute in lock-step. Many CUDA programs rely on 136 | this property for correctness. The warp synchronous execution assumption 137 | can be enabled in Weft by passing the `-s` flag on the command line. 138 | As an example, the Fermi chemistry kernel in `examples/DME/chem_fermi.cu` 139 | will report races if run under normal assumptions, but will always be 140 | race free under a warp synchronous execution. 141 | 142 | Another useful flag for Weft is the `-t` flag which controls the 143 | number of parallel threads that Weft will use when performing validation. 144 | For most multi-core architectures we find that 2-4 threads is a good 145 | option. Weft is primarily a memory bound application, and having two 146 | threads per socket is usually sufficient to saturate memory bandwidth. 147 | 148 | We have provided a set of test kernels for Weft in the `examples` 149 | directory. Each individual directory contains its own Makefile for 150 | generating the PTX code for individual kernels. We also have a script 151 | called `run_examples.sh` in the main `examples` directory which will 152 | validate all of the example kernels. Note that some kernels will 153 | report races. The script may take between 30 minutes 154 | and 1 hour (depending on the machine) to validate all of the kernels. 155 | 156 | Command Line Arguments 157 | ---- 158 | 159 | Below is a summary of the command line flags that Weft supports. 
160 | 161 | * `-b`: specify the CTA id to simulate (default 0x0x0) 162 | * `-d`: print detailed information when giving error output, 163 | including where threads are blocked for deadlock as 164 | well as per-thread and per-address information for races 165 | * `-f`: specify the input PTX file (can be omitted if 166 | the file is the last argument in the command line) 167 | * `-g`: specify the grid dimensions for the kernel being simulated 168 | (this argument can be omitted in most cases as many kernels 169 | will not depend on these values; regardless of the grid 170 | bounds Weft will always validate a single CTA specified 171 | by the `-b` flag) 172 | * `-i`: instrument the execution of Weft to report the 173 | time taken and memory usage for each stage 174 | * `-n`: set the number of threads per CTA. This is required 175 | if the CUDA kernel did not have a 176 | `__launch_bounds__` annotation 177 | * `-p`: print out individual files for each thread of all Weft modeled 178 | instructions, this will generate one file per thread 179 | * `-s`: assume warp-synchronous execution when checking for races 180 | * `-t`: set the size of the thread pool for Weft to use; in 181 | general, Weft is memory bound, so one or two threads per socket 182 | should be sufficient for achieving peak performance. 183 | * `-v`: enable verbose output 184 | * `-w`: enable warnings about PTX instructions that cannot be 185 | statically emulated (can result in large output) 186 | 187 | -------------------------------------------------------------------------------- /src/weft.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef __WEFT_H__ 18 | #define __WEFT_H__ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #define PTHREAD_SAFE_CALL(cmd) \ 28 | { \ 29 | int ret = (cmd); \ 30 | if (ret != 0) { \ 31 | fprintf(stderr,"PTHREAD error: %s = %d (%s)\n", #cmd, ret, strerror(ret)); \ 32 | assert(false); \ 33 | } \ 34 | } 35 | 36 | #define WARP_SIZE 32 37 | 38 | enum { 39 | WEFT_SUCCESS, 40 | WEFT_ERROR_NO_FILE_NAME, 41 | WEFT_ERROR_FILE_OPEN, 42 | WEFT_ERROR_NO_KERNELS, 43 | WEFT_ERROR_NO_THREAD_COUNT, 44 | WEFT_ERROR_ARRIVAL_MISMATCH, 45 | WEFT_ERROR_TOO_MANY_PARTICIPANTS, 46 | WEFT_ERROR_ALL_ARRIVALS, 47 | WEFT_ERROR_DEADLOCK, 48 | WEFT_ERROR_GRAPH_VALIDATION, 49 | WEFT_ERROR_INVALID_PTX_VERSION, 50 | }; 51 | 52 | class Weft; 53 | class Thread; 54 | class Program; 55 | class Address; 56 | class SharedMemory; 57 | class BarrierInstance; 58 | class BarrierDependenceGraph; 59 | 60 | class WeftTask { 61 | public: 62 | virtual ~WeftTask(void) { } 63 | virtual void execute(void) = 0; 64 | }; 65 | 66 | class EmulateThread : public WeftTask { 67 | public: 68 | EmulateThread(Thread *thread); 69 | EmulateThread(const EmulateThread &rhs) : thread(NULL) { assert(false); } 70 | virtual ~EmulateThread(void) { } 71 | public: 72 | EmulateThread& operator=(const EmulateThread &rhs) { assert(false); return *this; } 73 | public: 74 | virtual void execute(void); 75 | public: 76 | Thread *const thread; 77 | }; 78 | 79 | class EmulateWarp : public WeftTask { 80 | public: 81 | EmulateWarp(Program *p, 
Thread **start); 82 | EmulateWarp(const EmulateWarp &rhs) : program(NULL), threads(NULL) { assert(false); } 83 | virtual ~EmulateWarp(void) { } 84 | public: 85 | EmulateWarp& operator=(const EmulateWarp &rhs) { assert(false); return *this; } 86 | public: 87 | virtual void execute(void); 88 | public: 89 | Program *const program; 90 | Thread **const threads; 91 | }; 92 | 93 | class ValidationTask : public WeftTask { 94 | public: 95 | ValidationTask(BarrierDependenceGraph *graph, int name, int generation); 96 | ValidationTask(const ValidationTask &rhs) : graph(NULL), 97 | name(0), generation(0) { assert(false); } 98 | virtual ~ValidationTask(void) { } 99 | public: 100 | ValidationTask& operator=(const ValidationTask &rhs) { assert(false); return *this; } 101 | public: 102 | virtual void execute(void); 103 | public: 104 | BarrierDependenceGraph *const graph; 105 | const int name; 106 | const int generation; 107 | }; 108 | 109 | class InitializationTask : public WeftTask { 110 | public: 111 | InitializationTask(Thread *thread, int total, int max_num_barriers); 112 | InitializationTask(const InitializationTask &rhs) 113 | : thread(NULL), total_threads(0), max_num_barriers(0) { assert(false); } 114 | virtual ~InitializationTask(void) { } 115 | public: 116 | InitializationTask& operator=(const InitializationTask &rhs) 117 | { assert(false); return *this; } 118 | public: 119 | virtual void execute(void); 120 | public: 121 | Thread *const thread; 122 | const int total_threads; 123 | const int max_num_barriers; 124 | }; 125 | 126 | class ReachabilityTask : public WeftTask { 127 | public: 128 | ReachabilityTask(BarrierInstance *instance, Weft *weft, bool forward); 129 | ReachabilityTask(const ReachabilityTask &rhs) : instance(NULL), 130 | weft(NULL), forward(true) { assert(false); } 131 | virtual ~ReachabilityTask(void) { } 132 | public: 133 | ReachabilityTask& operator=(const ReachabilityTask &rhs) 134 | { assert(false); return *this; } 135 | public: 136 | virtual void 
execute(void); 137 | public: 138 | BarrierInstance *const instance; 139 | Weft *const weft; 140 | const bool forward; 141 | }; 142 | 143 | class TransitiveTask : public WeftTask { 144 | public: 145 | TransitiveTask(BarrierInstance *instance, Weft *weft, bool forward); 146 | TransitiveTask(const TransitiveTask &rhs) : instance(NULL), 147 | weft(NULL), forward(true) { assert(false); } 148 | virtual ~TransitiveTask(void) { } 149 | public: 150 | TransitiveTask& operator=(const TransitiveTask &rhs) 151 | { assert(false); return *this; } 152 | public: 153 | virtual void execute(void); 154 | public: 155 | BarrierInstance *const instance; 156 | Weft *const weft; 157 | const bool forward; 158 | }; 159 | 160 | class UpdateThreadTask : public WeftTask { 161 | public: 162 | UpdateThreadTask(Thread *thread); 163 | UpdateThreadTask(const UpdateThreadTask &rhs) : thread(NULL) { assert(false); } 164 | virtual ~UpdateThreadTask(void) { } 165 | public: 166 | UpdateThreadTask& operator=(const UpdateThreadTask &rhs) 167 | { assert(false); return *this; } 168 | public: 169 | virtual void execute(void); 170 | public: 171 | Thread *const thread; 172 | }; 173 | 174 | class RaceCheckTask : public WeftTask { 175 | public: 176 | RaceCheckTask(Address *address); 177 | RaceCheckTask(const RaceCheckTask &rhs) : address(NULL) { assert(false); } 178 | virtual ~RaceCheckTask(void) { } 179 | public: 180 | RaceCheckTask& operator=(const RaceCheckTask &rhs) 181 | { assert(false); return *this; } 182 | public: 183 | virtual void execute(void); 184 | public: 185 | Address *const address; 186 | }; 187 | 188 | class DumpThreadTask : public WeftTask { 189 | public: 190 | DumpThreadTask(Thread *thread); 191 | DumpThreadTask(const DumpThreadTask &rhs) : thread(NULL) { assert(false); } 192 | virtual ~DumpThreadTask(void) { } 193 | public: 194 | DumpThreadTask& operator=(const DumpThreadTask &rhs) 195 | { assert(false); return *this; } 196 | public: 197 | virtual void execute(void); 198 | public: 199 | Thread 
*const thread; 200 | }; 201 | 202 | class Weft { 203 | public: 204 | Weft(int argc, char **argv); 205 | ~Weft(void); 206 | public: 207 | void verify(void); 208 | void report_error(int error_code, const char *message); 209 | inline bool report_warnings(void) const { return warnings; } 210 | inline bool print_verbose(void) const { return verbose; } 211 | inline bool print_detail(void) const { return detailed; } 212 | inline bool perform_instrumentation(void) const { return instrument; } 213 | inline bool emit_program_files(void) const { return print_files; } 214 | protected: 215 | void parse_inputs(int argc, char **argv); 216 | bool parse_triple(const std::string &input, int *array, 217 | const char *flag, const char *error_str); 218 | void report_usage(int error, const char *error_str); 219 | Program* parse_ptx(void); 220 | public: 221 | bool initialize_program(Program *program) const; 222 | void start_parsing_instrumentation(void); 223 | void stop_parsing_instrumentation(void); 224 | protected: 225 | void report_instrumentation(void); 226 | protected: 227 | void start_threadpool(void); 228 | void stop_threadpool(void); 229 | public: 230 | void initialize_count(unsigned count); 231 | void wait_until_done(void); 232 | public: 233 | void enqueue_task(WeftTask *task); 234 | WeftTask* dequeue_task(void); 235 | void complete_task(WeftTask *task); 236 | public: 237 | static void* worker_loop(void *arg); 238 | static unsigned long long get_current_time_in_micros(void); 239 | static size_t get_memory_usage(void); 240 | protected: 241 | const char *file_name; 242 | int block_dim[3]; // x, y, z 243 | int block_id[3]; // x, y, z 244 | int grid_dim[3]; // x, y, z 245 | int thread_pool_size; 246 | bool verbose; 247 | bool detailed; 248 | bool instrument; 249 | bool warnings; 250 | bool warp_synchronous; 251 | bool print_files; 252 | std::vector programs; 253 | protected: 254 | pthread_t *worker_threads; 255 | bool threadpool_finished; 256 | protected: 257 | pthread_mutex_t 
count_lock; 258 | pthread_cond_t count_cond; 259 | unsigned int pending_count; 260 | protected: 261 | pthread_mutex_t queue_lock; 262 | pthread_cond_t queue_cond; 263 | std::deque queue; 264 | protected: 265 | unsigned long long parsing_time; 266 | size_t parsing_memory; 267 | }; 268 | 269 | #endif // __WEFT_H__ 270 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/race.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "weft.h" 18 | #include "race.h" 19 | #include "graph.h" 20 | #include "program.h" 21 | #include "instruction.h" 22 | 23 | Happens::Happens(int total_threads) 24 | : initialized(false) 25 | { 26 | happens_before.resize(total_threads, -1); 27 | happens_after.resize(total_threads, -1); 28 | } 29 | 30 | void Happens::update_barriers_before(const std::vector &before) 31 | { 32 | assert(latest_before.empty()); 33 | latest_before = before; 34 | } 35 | 36 | void Happens::update_barriers_after(const std::vector &after) 37 | { 38 | assert(earliest_after.empty()); 39 | earliest_after = after; 40 | } 41 | 42 | void Happens::update_happens_relationships(void) 43 | { 44 | for (std::vector::const_iterator it = 45 | latest_before.begin(); it != latest_before.end(); it++) 46 | { 47 | if ((*it) == NULL) 48 | continue; 49 | (*it)->get_instance()->update_latest_before(happens_after); 50 | } 51 | for (std::vector::const_iterator it = 52 | earliest_after.begin(); it != earliest_after.end(); it++) 53 | { 54 | if ((*it) == NULL) 55 | continue; 56 | (*it)->get_instance()->update_earliest_after(happens_before); 57 | } 58 | } 59 | 60 | bool Happens::has_happens(int thread, int line_number) 61 | { 62 | if (happens_before[thread] <= line_number) 63 | return true; 64 | if (happens_after[thread] >= line_number) 65 | return true; 66 | return false; 67 | } 68 | 69 | Address::Address(const int addr, SharedMemory *mem) 70 | : address(addr), memory(mem), total_races(0) 71 | { 72 | PTHREAD_SAFE_CALL( pthread_mutex_init(&address_lock,NULL) ); 73 | } 74 | 75 | Address::~Address(void) 76 | { 77 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&address_lock) ); 78 | } 79 | 80 | void Address::add_access(WeftAccess *access) 81 | { 82 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&address_lock) ); 83 | accesses.push_back(access); 84 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&address_lock) ); 85 | } 86 | 87 | void Address::perform_race_tests(void) 88 | { 89 | if 
(memory->program->assume_warp_synchronous()) 90 | { 91 | for (unsigned idx1 = 0; idx1 < accesses.size(); idx1++) 92 | { 93 | WeftAccess *first = accesses[idx1]; 94 | if (first->is_read()) 95 | { 96 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 97 | { 98 | WeftAccess *second = accesses[idx2]; 99 | // Check for both reads 100 | if (second->is_read()) 101 | continue; 102 | // Check for warp-synchronous 103 | if (first->is_warp_synchronous(second)) 104 | continue; 105 | if (!first->has_happens_relationship(second)) 106 | record_race(first, second); 107 | } 108 | } 109 | else 110 | { 111 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 112 | { 113 | WeftAccess *second = accesses[idx2]; 114 | // Check for warp-synchronous 115 | if (first->is_warp_synchronous(second)) 116 | continue; 117 | if (!first->has_happens_relationship(second)) 118 | record_race(first, second); 119 | } 120 | } 121 | } 122 | } 123 | else 124 | { 125 | // For every pair of addresses, check to see if we can 126 | // establish a happens before or a happens after relationship 127 | for (unsigned idx1 = 0; idx1 < accesses.size(); idx1++) 128 | { 129 | WeftAccess *first = accesses[idx1]; 130 | if (first->is_read()) 131 | { 132 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 133 | { 134 | WeftAccess *second = accesses[idx2]; 135 | // Check for both reads 136 | if (second->is_read()) 137 | continue; 138 | if (!first->has_happens_relationship(second)) 139 | record_race(first, second); 140 | } 141 | } 142 | else 143 | { 144 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 145 | { 146 | WeftAccess *second = accesses[idx2]; 147 | if (!first->has_happens_relationship(second)) 148 | record_race(first, second); 149 | } 150 | } 151 | } 152 | } 153 | } 154 | 155 | void Address::record_race(WeftAccess *one, WeftAccess *two) 156 | { 157 | // Alternative race reporting 158 | //printf("Race between threads %d and %d on instructions " 159 | // "%d and %d (PTX %d 
and %d)\n", 160 | // one->thread->thread_id, two->thread->thread_id, 161 | // one->thread_line_number, two->thread_line_number, 162 | // one->instruction->line_number, two->instruction->line_number); 163 | total_races++; 164 | // Save the races based on the PTX instructions 165 | int ptx_one = one->instruction->line_number; 166 | int ptx_two = two->instruction->line_number; 167 | if (ptx_one <= ptx_two) 168 | { 169 | std::pair 170 | key(one->instruction, two->instruction); 171 | if (one->thread->thread_id <= two->thread->thread_id) 172 | ptx_races[key].insert( 173 | std::pair(one->thread, two->thread)); 174 | else 175 | ptx_races[key].insert( 176 | std::pair(two->thread, one->thread)); 177 | } 178 | else 179 | { 180 | std::pair 181 | key(two->instruction, one->instruction); 182 | if (one->thread->thread_id <= two->thread->thread_id) 183 | ptx_races[key].insert( 184 | std::pair(one->thread, two->thread)); 185 | else 186 | ptx_races[key].insert( 187 | std::pair(two->thread, one->thread)); 188 | } 189 | } 190 | 191 | int Address::report_races(std::map< 192 | std::pair,size_t> &all_races) 193 | { 194 | if (total_races > 0) 195 | { 196 | if (memory->weft->print_detail()) 197 | { 198 | fprintf(stderr,"WEFT INFO: Found %d races on address %d!\n", 199 | total_races, address); 200 | for (std::map,std::set< 201 | std::pair > >::const_iterator it = 202 | ptx_races.begin(); it != ptx_races.end(); it++) 203 | { 204 | PTXInstruction *one = it->first.first; 205 | PTXInstruction *two = it->first.second; 206 | if (one->source_file != NULL) 207 | { 208 | assert(two->source_file != NULL); 209 | if (one == two) 210 | fprintf(stderr,"\tThere are %ld races between different threads " 211 | "on line %d of %s with address %d\n", it->second.size(), 212 | one->source_line_number, one->source_file, address); 213 | else 214 | fprintf(stderr,"\tThere are %ld races between line %d of %s " 215 | " and line %d of %s with address %d\n", it->second.size(), 216 | one->source_line_number, 
one->source_file, 217 | two->source_line_number, two->source_file, address); 218 | } 219 | else 220 | { 221 | assert(two->source_file == NULL); 222 | if (one == two) 223 | fprintf(stderr,"\tThere are %ld races between different threads " 224 | "on PTX line %d with address %d\n", it->second.size(), 225 | one->line_number, address); 226 | else 227 | fprintf(stderr,"\tThere are %ld races between PTX line %d " 228 | " and PTX line %d with address %d\n", it->second.size(), 229 | one->line_number, two->line_number, address); 230 | } 231 | const std::set > &threads = it->second; 232 | for (std::set >::const_iterator 233 | thread_it = threads.begin(); 234 | thread_it != threads.end(); thread_it++) 235 | { 236 | Thread *first = thread_it->first; 237 | Thread *second = thread_it->second; 238 | fprintf(stderr,"\t\t... between thread (%d,%d,%d) and (%d,%d,%d)\n", 239 | first->tid_x, first->tid_y, first->tid_z, 240 | second->tid_x, second->tid_y, second->tid_z); 241 | } 242 | } 243 | } 244 | else 245 | { 246 | for (std::map, 247 | std::set > >::const_iterator 248 | it = ptx_races.begin(); it != ptx_races.end(); it++) 249 | { 250 | std::map,size_t>::iterator 251 | finder = all_races.find(it->first); 252 | if (finder == all_races.end()) 253 | all_races[it->first] = it->second.size(); 254 | else 255 | finder->second += it->second.size(); 256 | } 257 | } 258 | } 259 | return total_races; 260 | } 261 | 262 | size_t Address::count_race_tests(void) 263 | { 264 | size_t num_accesses = accesses.size(); 265 | // OLA's equality 266 | // 1 + 2 + 3 + ... 
+ n-1 = (n-1)*n/2 267 | return ((num_accesses * (num_accesses-1))/2); 268 | } 269 | 270 | SharedMemory::SharedMemory(Weft *w, Program *p) 271 | : weft(w), program(p) 272 | { 273 | PTHREAD_SAFE_CALL( pthread_mutex_init(&memory_lock,NULL) ); 274 | } 275 | 276 | SharedMemory::~SharedMemory(void) 277 | { 278 | for (std::map::iterator it = addresses.begin(); 279 | it != addresses.end(); it++) 280 | { 281 | delete it->second; 282 | } 283 | addresses.clear(); 284 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&memory_lock) ); 285 | } 286 | 287 | void SharedMemory::update_accesses(WeftAccess *access) 288 | { 289 | Address *address; 290 | // These lookups need to be thread safe 291 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&memory_lock) ); 292 | std::map::const_iterator finder = 293 | addresses.find(access->address); 294 | if (finder == addresses.end()) 295 | { 296 | address = new Address(access->address, this); 297 | addresses[access->address] = address; 298 | } 299 | else 300 | address = finder->second; 301 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&memory_lock) ); 302 | address->add_access(access); 303 | } 304 | 305 | int SharedMemory::count_addresses(void) const 306 | { 307 | return addresses.size(); 308 | } 309 | 310 | void SharedMemory::enqueue_race_checks(void) 311 | { 312 | for (std::map::const_iterator it = addresses.begin(); 313 | it != addresses.end(); it++) 314 | { 315 | weft->enqueue_task(new RaceCheckTask(it->second)); 316 | } 317 | } 318 | 319 | void SharedMemory::check_for_races(void) 320 | { 321 | int total_races = 0; 322 | std::map,size_t> all_races; 323 | for (std::map::const_iterator it = 324 | addresses.begin(); it != addresses.end(); it++) 325 | { 326 | total_races += it->second->report_races(all_races); 327 | } 328 | if (total_races > 0) 329 | { 330 | if (!weft->print_detail()) 331 | { 332 | for (std::map,size_t>::const_iterator 333 | it = all_races.begin(); it != all_races.end(); it++) 334 | { 335 | PTXInstruction *one = it->first.first; 336 | 
PTXInstruction *two = it->first.second; 337 | if (one->source_file != NULL) 338 | { 339 | assert(two->source_file != NULL); 340 | if (one == two) 341 | fprintf(stderr,"\tFound races between %ld pairs of " 342 | "threads on line %d of %s\n", it->second, 343 | one->source_line_number, one->source_file); 344 | else 345 | fprintf(stderr,"\tFound races between %ld pairs of threads " 346 | "on line %d of %s and line %d of %s\n", it->second, 347 | one->source_line_number, one->source_file, 348 | two->source_line_number, two->source_file); 349 | } 350 | else 351 | { 352 | assert(two->source_file == NULL); 353 | if (one == two) 354 | fprintf(stderr,"\tFound races between %ld pairs of " 355 | "threads on PTX line number %d\n", 356 | it->second, one->line_number); 357 | else 358 | fprintf(stderr,"\tFound races between %ld pairs of threads on " 359 | "PTX line %d and PTX line %d\n", it->second, 360 | one->line_number, two->line_number); 361 | } 362 | } 363 | fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n" 364 | " Run with '-d' flag to see detailed per-thread " 365 | "and per-address races\n", total_races, program->get_name()); 366 | } 367 | else 368 | fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n", 369 | total_races, program->get_name()); 370 | fprintf(stderr,"WEFT INFO: RACES DETECTED IN KERNEL %s!\n", 371 | program->get_name()); 372 | } 373 | else 374 | fprintf(stdout,"WEFT INFO: No races detected in kernel %s!\n", 375 | program->get_name()); 376 | } 377 | 378 | size_t SharedMemory::count_race_tests(void) 379 | { 380 | size_t result = 0; 381 | for (std::map::const_iterator it = addresses.begin(); 382 | it != addresses.end(); it++) 383 | { 384 | result += it->second->count_race_tests(); 385 | } 386 | return result; 387 | } 388 | 389 | RaceCheckTask::RaceCheckTask(Address *addr) 390 | : address(addr) 391 | { 392 | } 393 | 394 | void RaceCheckTask::execute(void) 395 | { 396 | address->perform_race_tests(); 397 | } 398 | 399 | 
-------------------------------------------------------------------------------- /src/weft.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "weft.h" 18 | #include "race.h" 19 | #include "graph.h" 20 | #include "program.h" 21 | #include "instruction.h" 22 | 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | #include 32 | 33 | #ifdef __MACH__ 34 | #include "mach/clock.h" 35 | #include "mach/mach.h" 36 | #endif 37 | 38 | Weft::Weft(int argc, char **argv) 39 | : file_name(NULL), thread_pool_size(1), 40 | verbose(false), detailed(false), instrument(false), 41 | warnings(false), warp_synchronous(false), print_files(false), 42 | worker_threads(NULL), pending_count(0) 43 | { 44 | for (int i = 0; i < 3; i++) 45 | block_dim[i] = 1; 46 | for (int i = 0; i < 3; i++) 47 | block_id[i] = 0; 48 | for (int i = 0; i < 3; i++) 49 | grid_dim[i] = 1; 50 | parse_inputs(argc, argv); 51 | start_threadpool(); 52 | } 53 | 54 | Weft::~Weft(void) 55 | { 56 | stop_threadpool(); 57 | for (std::vector::iterator it = programs.begin(); 58 | it != programs.end(); it++) 59 | { 60 | delete (*it); 61 | } 62 | programs.clear(); 63 | } 64 | 65 | void Weft::verify(void) 66 | { 67 | Program::parse_ptx_file(file_name, this, programs); 68 | for 
(std::vector::const_iterator it = programs.begin(); 69 | it != programs.end(); it++) 70 | { 71 | Program *program = *it; 72 | program->verify(); 73 | } 74 | if (instrument) 75 | report_instrumentation(); 76 | } 77 | 78 | void Weft::report_error(int error_code, const char *message) 79 | { 80 | assert(error_code != WEFT_SUCCESS); 81 | fprintf(stderr,"WEFT ERROR %d: %s!\n", error_code, message); 82 | fprintf(stderr,"WEFT WILL NOW EXIT...\n"); 83 | fflush(stderr); 84 | stop_threadpool(); 85 | exit(error_code); 86 | } 87 | 88 | void Weft::parse_inputs(int argc, char **argv) 89 | { 90 | for (int i = 1; i < argc; i++) 91 | { 92 | if (!strcmp(argv[i],"-b")) 93 | { 94 | std::string block(argv[++i]); 95 | parse_triple(block, block_id, "-b", "CTA ID"); 96 | continue; 97 | } 98 | if (!strcmp(argv[i],"-d")) 99 | { 100 | detailed = true; 101 | continue; 102 | } 103 | if (!strcmp(argv[i],"-f")) 104 | { 105 | file_name = argv[++i]; 106 | continue; 107 | } 108 | if (!strcmp(argv[i],"-g")) 109 | { 110 | std::string grid(argv[++i]); 111 | parse_triple(grid, grid_dim, "-g", "Grid Size"); 112 | continue; 113 | } 114 | if (!strcmp(argv[i],"-i")) 115 | { 116 | instrument = true; 117 | continue; 118 | } 119 | if (!strcmp(argv[i],"-n")) 120 | { 121 | std::string threads(argv[++i]); 122 | parse_triple(threads, block_dim, "-n", "CTA size"); 123 | continue; 124 | } 125 | if (!strcmp(argv[i],"-p")) 126 | { 127 | print_files = true; 128 | continue; 129 | } 130 | if (!strcmp(argv[i],"-s")) 131 | { 132 | warp_synchronous = true; 133 | continue; 134 | } 135 | if (!strcmp(argv[i],"-t")) 136 | { 137 | thread_pool_size = atoi(argv[++i]); 138 | if (thread_pool_size < 1) 139 | thread_pool_size = 1; 140 | continue; 141 | } 142 | if (!strcmp(argv[i],"-v")) 143 | { 144 | verbose = true; 145 | continue; 146 | } 147 | if (!strcmp(argv[i],"-w")) 148 | { 149 | warnings = true; 150 | continue; 151 | } 152 | // If it has a ptx ending then guess it is the file name 153 | std::string file(argv[i]); 154 | if 
(file.find(".ptx") != std::string::npos) 155 | { 156 | file_name = argv[i]; 157 | continue; 158 | } 159 | fprintf(stderr,"WEFT WARNING: skipping argument %s\n", argv[i]); 160 | } 161 | if (file_name == NULL) 162 | report_usage(WEFT_ERROR_NO_FILE_NAME, "No file name specified"); 163 | if (verbose) 164 | { 165 | fprintf(stdout,"INITIAL WEFT SETTINGS:\n"); 166 | fprintf(stdout," File Name: %s\n", file_name); 167 | fprintf(stdout," CTA dimensions: (%d,%d,%d)\n", 168 | block_dim[0], block_dim[1], block_dim[2]); 169 | fprintf(stdout," Block ID: (%d,%d,%d)\n", 170 | block_id[0], block_id[1], block_id[2]); 171 | fprintf(stdout," Grid dimensions: (%d,%d,%d)\n", 172 | grid_dim[0], grid_dim[1], grid_dim[2]); 173 | fprintf(stdout," Thread Pool Size: %d\n", thread_pool_size); 174 | fprintf(stdout," Verbose: %s\n", (verbose ? "yes" : "no")); 175 | fprintf(stdout," Detailed: %s\n", (detailed ? "yes" : "no")); 176 | fprintf(stdout," Instrument: %s\n", (instrument ? "yes" : "no")); 177 | fprintf(stdout," Report Warnings: %s\n", (warnings ? "yes" : "no")); 178 | fprintf(stdout," Warp-Synchronous Execution: %s\n", (warnings ? "yes" : "no")); 179 | fprintf(stdout," Dump Weft thread files: %s\n", (print_files ? 
"yes" : "no")); 180 | } 181 | } 182 | 183 | bool Weft::parse_triple(const std::string &input, int *array, 184 | const char *flag, const char *error_str) 185 | { 186 | bool success = true; 187 | if (input.find("x") != std::string::npos) 188 | { 189 | // Try parsing this block configuration 190 | std::vector values; 191 | split(values, input.c_str(), 'x'); 192 | if (!values.empty() && (values.size() <= 3)) 193 | { 194 | // Try parsing each of the arguments 195 | for (unsigned i = 0; i < values.size(); i++) 196 | { 197 | int count = atoi(values[i].c_str()); 198 | if (count < 1) 199 | { 200 | fprintf(stderr,"WEFT WARNING: Failed to parse dimension %d " 201 | "of %s: \"%s %s\"!\n", 202 | i, error_str, flag, input.c_str()); 203 | success = false; 204 | break; 205 | } 206 | array[i] = count; 207 | } 208 | } 209 | else 210 | { 211 | fprintf(stderr,"WEFT WARNING: Failed to parse %s with %ld" 212 | "dimensions from input: \"%s %s\"!\n", 213 | error_str, values.size(), flag, input.c_str()); 214 | success = false; 215 | } 216 | } 217 | else 218 | { 219 | int count = atoi(input.c_str()); 220 | if (count >= 1) 221 | array[0] = count; 222 | else 223 | { 224 | success = false; 225 | fprintf(stderr,"WEFT WARNING: Ignoring invalid input for %s " 226 | "\"%s %s\"!\n", error_str, flag, input.c_str()); 227 | } 228 | } 229 | return success; 230 | } 231 | 232 | void Weft::report_usage(int error, const char *error_str) 233 | { 234 | fprintf(stderr,"WEFT ERROR %d: %s!\nWEFT WILL NOW EXIT...\n", 235 | error, error_str); 236 | fprintf(stderr,"Usage: Weft [args]\n"); 237 | fprintf(stderr," -b: specify the CTA id to simulate (default 0x0x0)\n"); 238 | fprintf(stderr," can be an integer or an x-separated tuple e.g. 
0x0x1 or 1x2\n"); 239 | fprintf(stderr," -d: print detailed information for error reporting\n"); 240 | fprintf(stderr," this includes line numbers for blocked threads under deadlock and\n"); 241 | fprintf(stderr," and per-thread and per-address information for races\n"); 242 | fprintf(stderr," -f: specify the input file\n"); 243 | fprintf(stderr," -g: specify the grid dimensions for the kernel being simulated\n"); 244 | fprintf(stderr," can be an integer or an x-separated tuple e.g. 32x32x2 or 32x1\n"); 245 | fprintf(stderr," Weft will still only simulate a single CTA specified by '-b'\n"); 246 | fprintf(stderr," -i: instrument execution\n"); 247 | fprintf(stderr," -n: number of threads per CTA\n"); 248 | fprintf(stderr," can be an integer or an x-separated tuple e.g. 64x2 or 32x8x1\n"); 249 | fprintf(stderr," -p: print individual Weft thread files (one file per thread!)\n"); 250 | fprintf(stderr," -s: assume warp-synchronous execution\n"); 251 | fprintf(stderr," -t: thread pool size\n"); 252 | fprintf(stderr," -v: print verbose output\n"); 253 | fprintf(stderr," -w: report emulation warnings (this may generate considerable output)\n"); 254 | exit(error); 255 | } 256 | 257 | bool Weft::initialize_program(Program *program) const 258 | { 259 | program->set_block_dim(block_dim); 260 | program->add_block_id(block_id); 261 | program->set_grid_dim(grid_dim); 262 | return warp_synchronous; 263 | } 264 | 265 | void Weft::start_parsing_instrumentation(void) 266 | { 267 | parsing_time = get_current_time_in_micros(); 268 | } 269 | 270 | void Weft::stop_parsing_instrumentation(void) 271 | { 272 | unsigned long long stop = get_current_time_in_micros(); 273 | unsigned long long start = parsing_time; 274 | parsing_time = stop - start; 275 | parsing_memory = get_memory_usage(); 276 | } 277 | 278 | void Weft::report_instrumentation(void) 279 | { 280 | fprintf(stdout,"WEFT INSTRUMENTATION FOR PARSING FILE %s\n", file_name); 281 | #ifdef __MACH__ 282 | fprintf(stdout," %50s: %10.3lf 
ms %12ld MB\n", 283 | "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / (1024 * 1024)); 284 | #else 285 | fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", 286 | "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / 1024); 287 | #endif 288 | size_t accumulated_memory = parsing_memory; 289 | for (std::vector::const_iterator it = programs.begin(); 290 | it != programs.end(); it++) 291 | { 292 | (*it)->report_instrumentation(accumulated_memory); 293 | } 294 | } 295 | 296 | void Weft::start_threadpool(void) 297 | { 298 | assert(thread_pool_size > 0); 299 | PTHREAD_SAFE_CALL( pthread_mutex_init(&count_lock, NULL) ); 300 | PTHREAD_SAFE_CALL( pthread_cond_init(&count_cond, NULL) ); 301 | PTHREAD_SAFE_CALL( pthread_mutex_init(&queue_lock, NULL) ); 302 | PTHREAD_SAFE_CALL( pthread_cond_init(&queue_cond, NULL) ); 303 | assert(worker_threads == NULL); 304 | worker_threads = (pthread_t*)malloc(thread_pool_size * sizeof(pthread_t)); 305 | threadpool_finished = false; 306 | for (int i = 0; i < thread_pool_size; i++) 307 | { 308 | PTHREAD_SAFE_CALL( pthread_create(worker_threads+i, NULL, 309 | Weft::worker_loop, this) ); 310 | } 311 | } 312 | 313 | void Weft::stop_threadpool(void) 314 | { 315 | // Wake up all the worker threads so that they exit 316 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&queue_lock) ); 317 | threadpool_finished = true; 318 | PTHREAD_SAFE_CALL( pthread_cond_broadcast(&queue_cond) ); 319 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&queue_lock) ); 320 | for (int i = 0; i < thread_pool_size; i++) 321 | { 322 | PTHREAD_SAFE_CALL( pthread_join(worker_threads[i], NULL) ) ; 323 | } 324 | free(worker_threads); 325 | worker_threads = NULL; 326 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&count_lock) ); 327 | PTHREAD_SAFE_CALL( pthread_cond_destroy(&count_cond) ); 328 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&queue_lock) ); 329 | PTHREAD_SAFE_CALL( pthread_cond_destroy(&queue_cond) ); 330 | } 331 | 332 | void Weft::initialize_count(unsigned count) 333 | { 334 | 
PTHREAD_SAFE_CALL( pthread_mutex_lock(&count_lock) ); 335 | assert(pending_count == 0); 336 | pending_count = count; 337 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&count_lock) ); 338 | } 339 | 340 | void Weft::wait_until_done(void) 341 | { 342 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&count_lock) ); 343 | if (pending_count > 0) 344 | { 345 | PTHREAD_SAFE_CALL( pthread_cond_wait(&count_cond, &count_lock) ); 346 | } 347 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&count_lock) ); 348 | } 349 | 350 | void Weft::enqueue_task(WeftTask *task) 351 | { 352 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&queue_lock) ); 353 | queue.push_back(task); 354 | PTHREAD_SAFE_CALL( pthread_cond_signal(&queue_cond) ); 355 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&queue_lock) ); 356 | } 357 | 358 | WeftTask* Weft::dequeue_task(void) 359 | { 360 | WeftTask *result = NULL; 361 | bool done = false; 362 | while (!done) 363 | { 364 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&queue_lock) ); 365 | if (queue.empty()) 366 | { 367 | if (!threadpool_finished) 368 | { 369 | PTHREAD_SAFE_CALL( pthread_cond_wait(&queue_cond, &queue_lock) ); 370 | } 371 | else 372 | done = true; 373 | } 374 | else 375 | { 376 | result = queue.front(); 377 | queue.pop_front(); 378 | done = true; 379 | } 380 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&queue_lock) ); 381 | } 382 | return result; 383 | } 384 | 385 | void Weft::complete_task(WeftTask *task) 386 | { 387 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&count_lock) ); 388 | assert(pending_count > 0); 389 | pending_count--; 390 | if (pending_count == 0) 391 | PTHREAD_SAFE_CALL( pthread_cond_signal(&count_cond) ); 392 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&count_lock) ); 393 | // Clean up the task 394 | delete task; 395 | } 396 | 397 | /*static*/ 398 | void* Weft::worker_loop(void *arg) 399 | { 400 | Weft *weft = (Weft*)arg; 401 | while (true) 402 | { 403 | WeftTask *task = weft->dequeue_task(); 404 | // If we ever get a NULL task then we are done 405 | if (task == NULL) 406 | 
break; 407 | task->execute(); 408 | weft->complete_task(task); 409 | } 410 | return NULL; 411 | } 412 | 413 | /*static*/ 414 | unsigned long long Weft::get_current_time_in_micros(void) 415 | { 416 | #ifdef __MACH__ 417 | mach_timespec_t spec; 418 | clock_serv_t cclock; 419 | host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); 420 | clock_get_time(cclock, &spec); 421 | mach_port_deallocate(mach_host_self(), cclock); 422 | #else 423 | struct timespec spec; 424 | clock_gettime(CLOCK_MONOTONIC, &spec); 425 | #endif 426 | unsigned long long result = (((unsigned long long)spec.tv_sec) * 1000000) + 427 | (((unsigned long long)spec.tv_nsec) / 1000); 428 | return result; 429 | } 430 | 431 | /*static*/ 432 | size_t Weft::get_memory_usage(void) 433 | { 434 | struct rusage usage; 435 | getrusage(RUSAGE_SELF, &usage); 436 | return usage.ru_maxrss; 437 | } 438 | 439 | int main(int argc, char **argv) 440 | { 441 | Weft weft(argc, argv); 442 | weft.verify(); 443 | fflush(stderr); 444 | fflush(stdout); 445 | return 0; 446 | } 447 | 448 | -------------------------------------------------------------------------------- /examples/RTM/one_phase_single_buffer.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cudaDMAK.h" 23 | 24 | #define LDG_BYTES (12*16) 25 | 26 | #ifndef USE_REDUCTION 27 | #define USE_REDUCTION 0 28 | #endif 29 | 30 | #ifndef X_BEFORE_Y_FOR_PXY 31 | #define X_BEFORE_Y_FOR_PXY 1 32 | #endif 33 | 34 | #define Z_BEFORE_X_FOR_PXZ 1 35 | #define Z_BEFORE_Y_FOR_PYZ 1 36 | 37 | #ifndef R 38 | #define R 4 39 | #endif 40 | 41 | #ifndef USE_CONSTANT_FOR_ZCOEFF 42 | #define USE_CONSTANT_FOR_ZCOEFF 1 43 | #endif 44 | 45 | #ifndef TILE_X 46 | #define TILE_X 32 47 | #endif 48 | #ifndef TILE_Y 49 | #define TILE_Y 4 50 | #endif 51 | 52 | #ifndef BLOCKING_TYPE 53 | #define BLOCKING_TYPE float2 54 | #endif 55 | const int x_b = sizeof(BLOCKING_TYPE)/sizeof(float); 56 | // Need radius in X-dimension to be a multiple of the X-dimension blocking factor 57 | const int RX = (R % x_b ? R + x_b - (R % x_b) : R); 58 | 59 | #define FP_RAND() (float(rand())/float(RAND_MAX) < 0.5 ? -(float((rand()+1)%5)) : (float((rand()+1)%5))) 60 | #define FP_RAND_RAD() (float((rand()%4)) * M_PI_2) 61 | 62 | #ifndef MAX_Z_DIM 63 | #define MAX_Z_DIM 500 64 | #endif 65 | 66 | #ifndef RTM_ELMTS 67 | #define RTM_ELMTS 64 68 | #endif 69 | 70 | __constant__ float c_xx[R+1], c_yy[R+1]; 71 | __constant__ float c_x[R+1], c_y[R+1]; 72 | __constant__ float vsz2_constant; 73 | 74 | #ifdef USE_CONSTANT_FOR_ZCOEFF 75 | __constant__ float c_zz[(2*R+1)*MAX_Z_DIM], c_z[(2*R+1)*MAX_Z_DIM]; 76 | #endif 77 | 78 | template 79 | __device__ __forceinline__ void 80 | shift(T b[ecnt+1]) { 81 | #pragma unroll 82 | for(unsigned idx=0;idx 87 | __device__ inline void 88 | init(float b[ecnt+1], const T ival) { 89 | #pragma unroll 90 | for(unsigned idx = 0; idx < ecnt + 1; idx ++) 91 | b[idx] = ival; 92 | } 93 | 94 | template 95 | struct deriv_t { 96 | T dx2; 97 | T dy2; 98 | T dz2; 99 | T dxy; 100 | T dxz; 101 | T dyz; 102 | }; 103 | 104 | struct range { 105 | int start; 106 | int end; 107 | range(int s, int e) : start(s), end(e) {} 108 
| }; 109 | 110 | enum Param_type_e { 111 | Vpz2, 112 | Delta, 113 | Epsln, 114 | Alpha, 115 | Beta, 116 | ParameterCnt 117 | }; 118 | 119 | #define REGS_PER_THREAD 64 120 | #define REGS_PER_SM 65536 121 | 122 | __device__ __forceinline__ float 123 | cachedRead(const float* data, const int index) 124 | { 125 | const float* address = &data[index]; 126 | float result; 127 | asm("ld.ca.f32 %0, [%1];\n" 128 | : "=f" (result) 129 | #if defined(_WIN64) || defined(__LP64__) 130 | : "l"(address) 131 | #else 132 | : "r"(address) 133 | #endif 134 | : "memory" 135 | ); 136 | return result; 137 | } 138 | 139 | __device__ __forceinline__ void 140 | _myRedAdd(const float* address, const float update) 141 | { 142 | asm("red.global.add.f32 [%0], %1;\n" 143 | : 144 | #if defined(_WIN64) || defined(__LP64__) 145 | : "l"(address) 146 | #else 147 | : "r"(address) 148 | #endif 149 | , "f" (update) 150 | : "memory" 151 | ); 152 | } 153 | 154 | #define DIV_CEILING(x,y) (x/y + (x % y ? 1 : 0)) 155 | #define EPT(x) (DIV_CEILING(2*R+x, x)) 156 | #define WARP_WIDTH 32 157 | #define WPAD 0 158 | #define MAX(a, b) (a < b ? 
b : a) 159 | #define PQW_WIDTH MAX(WARP_WIDTH+WPAD, tile_x+2*R+WPAD) 160 | #define halfWarpCnt (TILE_Y * TILE_X / WARP_WIDTH) 161 | #define haloCnt ((2*R)/TILE_Y + 1) 162 | #ifdef USE_TEX 163 | texture tex_PQ2; 164 | texture tex_PnQn2; 165 | texture tex_PQ; 166 | texture tex_PnQn; 167 | texture tex_abde; 168 | texture tex_vpz2; 169 | texture tex_P; 170 | texture tex_Pn; 171 | texture tex_Q; 172 | texture tex_Qn; 173 | #endif 174 | #ifdef USE_TEX 175 | #define ld_PQ2_ro(_loc) tex1Dfetch(tex_PQ2, _loc) 176 | #define ld_PnQn2_ro(_loc) tex1Dfetch(tex_PnQn2, _loc) 177 | #define ld_PQ_ro(_loc) tex1Dfetch(tex_PQ, _loc) 178 | #define ld_P_ro(_loc) tex1Dfetch(tex_P, _loc) 179 | #define ld_Q_ro(_loc) tex1Dfetch(tex_Q, _loc) 180 | #define ld_PQ_ro_cached(_loc) tex1Dfetch(tex_PQ, _loc) 181 | #define ld_PnQn_ro(_loc) tex1Dfetch(tex_PnQn, _loc) 182 | #define ld_Pn_ro(_loc) tex1Dfetch(tex_Pn, _loc) 183 | #define ld_Qn_ro(_loc) tex1Dfetch(tex_Qn, _loc) 184 | #define ld_param_ro(_loc, _param) tex1Dfetch(tex_##_param, _loc) 185 | #else 186 | #define ld_PQ2_ro(_loc) g_PQ[_loc] 187 | #define ld_PnQn2_ro(_loc) g_PnQn[_loc] 188 | #define ld_PQ_ro(_loc) g_PQ[_loc] 189 | #define ld_P_ro(_loc) g_P[_loc] 190 | #define ld_Q_ro(_loc) g_Q[_loc] 191 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 192 | #define ld_PQ_ro_cached(_loc) cachedRead(g_PQ, _loc) 193 | #define ld_P_ro_cached(_loc) cachedRead(g_P, _loc) 194 | #define ld_Q_ro_cached(_loc) cachedRead(g_Q, _loc) 195 | #else 196 | #define ld_PQ_ro_cached(_loc) ld_PQ_ro(_loc) 197 | #define ld_P_ro_cached(_loc) ld_P_ro(_loc) 198 | #define ld_Q_ro_cached(_loc) ld_Q_ro(_loc) 199 | #endif 200 | #define ld_PnQn_ro(_loc) g_PnQn[_loc] 201 | #define ld_Pn_ro(_loc) g_Pn[_loc] 202 | #define ld_Qn_ro(_loc) g_Qn[_loc] 203 | #define ld_param_ro(_loc, _param) g_##_param[_loc] 204 | #endif 205 | 206 | #define ZPENCIL_LENGTH (2*R+1) 207 | #define ZPENCIL_LAST (ZPENCIL_LENGTH-1) 208 | #define ZPENCIL_FIRST 0 209 | #define ZPENCIL(_n, _t) _t _n[ZPENCIL_LENGTH] 210 | 
#define ZPENCIL_SHIFT(_n) shift(_n) 211 | #define ZPENCIL_INIT(_n) init(_n, 0.0f) 212 | #define ZPENCIL_CTR_PRESHIFT R+1 213 | #define ZPENCIL_CTR_POSTSHIFT R 214 | 215 | #define Q_LENGTH R 216 | #ifdef Q_IN_REGISTERS 217 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH]; init(_n, 0.0f) 218 | #define qidx 219 | #define Q_COMMON(_i) 220 | #define Q_CURR(_q, _i) _q[0] 221 | #define Q_LAST(_q, _i) _q[Q_LENGTH-1] 222 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3) 223 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4, _q5, _q6, _i) shift(_q1);shift(_q2);shift(_q3);shift(_q4);shift(_q5);shift(_q6) 224 | #define ADVANCE_3Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3); 225 | #else 226 | #define Q_COMMON(_i) int _i = 0 227 | #ifdef Q_IN_SMEM 228 | #if 0 229 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][1*tile_y][tile_x] 230 | #define Q_CURR(_q, _i) _q[_i][threadIdx.y][threadIdx.x] 231 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.y][threadIdx.x] 232 | #endif 233 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 234 | #define Q_CURR(_q, _i) _q[_i][threadIdx.x] 235 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.x] 236 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 237 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 238 | #else 239 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH] 240 | #define Q_CURR(_q, _i) _q[_i] 241 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)] 242 | #define ADVANCE_Qs(_q1,_q2,_q3, _i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 243 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 244 | #endif 245 | #endif 246 | 247 | #define SMEM_Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 248 | #define SMEM_Q_CURR(_q, _i) _q[_i][threadIdx.x] 249 | #define SMEM_Q_LAST(_q, _i) _q[(_i == 0 ? 
Q_LENGTH-1 : _i-1)][threadIdx.x] 250 | #define SMEM_ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 251 | #define SMEM_ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 252 | #define SMEM_ADVANCE_3Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 253 | 254 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 255 | #define READ_Z_COEFF(_a, _i) cachedRead(_a, _i) 256 | #else 257 | #define READ_Z_COEFF(_a, _i) _a[_i] 258 | #endif 259 | 260 | #define SMEM_ROW_WIDTH (2*(tile_x+2*R)) 261 | 262 | #if R <= TILE_Y 263 | #define HALO_CNT 1 264 | #define HALO_INIT {0.f} 265 | #define LAST_HIDX 0 266 | #else 267 | #if R <= 2*TILE_Y 268 | #define HALO_CNT 2 269 | #define HALO_INIT {0.f, 0.f} 270 | #define LAST_HIDX 1 271 | #else 272 | #error "Not coded to handle the case when R > 2*TILE_Y" 273 | #endif 274 | #endif 275 | 276 | #if USE_REDUCTION == 1 277 | #define REDUCTION_LD(_mv_ld_statement) 278 | // _mv = memory value, _uv = update value 279 | #define REDUCTION_SUB(_l, _mv, _uv) _myRedAdd(&_l, -(_uv)) 280 | #else 281 | #define REDUCTION_LD(_mv_ld_statement) _mv_ld_statement 282 | #define REDUCTION_SUB(_l, _mv, _uv) _l = (_mv) - (_uv) 283 | #endif 284 | 285 | #define COMPUTE_THREADS_PER_CTA (TILE_X*TILE_Y) 286 | #define COUNT_PER_STRIDE_ROW (TILE_X+2*R) // The extra 32 covers the 2*R halo elements 287 | #define DMA_THREADS_PER_LD 32 288 | //#define DMA_THREADS_PER_LD 64 289 | //#define DMA_THREADS_PER_LD 128 290 | //#define DMA_THREADS_PER_LD 256 291 | #define DMA_THREADS_PER_CTA (1*DMA_THREADS_PER_LD) 292 | #define BYTES_PER_THREAD (sizeof(float2)*COUNT_PER_STRIDE_ROW*(TILE_Y+2*R)/DMA_THREADS_PER_LD) 293 | 294 | #define PQY_BUF_OFFSET (-(R*COUNT_PER_STRIDE_ROW)) 295 | 296 | __global__ void 297 | __launch_bounds__(160,3) 298 | single_pass_ktuned_DMA_single_specialized_one_phase(float2 *g_PnQn, 299 | float2* g_PQ, 300 | #ifndef USE_TEX 301 | float4* g_abde, float* g_vpz2, 302 | #endif 303 | const int row_stride, const int slice_stride, const int 
nz, 304 | const int R_x_row_stride, const int tile_y_x_row_stride, 305 | const int Pn_P_diff, const int lead_pad, const int offset, 306 | const int q_start_idx 307 | #ifndef USE_CONSTANT_FOR_ZCOEFF 308 | , const float* c_z, const float* c_zz 309 | #endif 310 | ) 311 | { 312 | int tid = threadIdx.x; 313 | __shared__ float2 s_PQ[(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+TILE_Y*COUNT_PER_STRIDE_ROW]; 314 | float2 *PQy_buf = s_PQ+(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+PQY_BUF_OFFSET; 315 | cudaDMAStrided 316 | dma_ld_pq(1, 317 | COMPUTE_THREADS_PER_CTA, 318 | COMPUTE_THREADS_PER_CTA, 319 | row_stride*sizeof(float2), 320 | COUNT_PER_STRIDE_ROW*sizeof(float2) 321 | ); 322 | int gid = offset + blockIdx.x*TILE_X + 323 | + blockIdx.y*TILE_Y*row_stride; 324 | 325 | if(tid 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cudaDMAK.h" 23 | 24 | #define LDG_BYTES (12*16) 25 | 26 | #ifndef USE_REDUCTION 27 | #define USE_REDUCTION 0 28 | #endif 29 | 30 | #ifndef X_BEFORE_Y_FOR_PXY 31 | #define X_BEFORE_Y_FOR_PXY 1 32 | #endif 33 | 34 | #define Z_BEFORE_X_FOR_PXZ 1 35 | #define Z_BEFORE_Y_FOR_PYZ 1 36 | 37 | #ifndef R 38 | #define R 4 39 | #endif 40 | 41 | #ifndef USE_CONSTANT_FOR_ZCOEFF 42 | #define USE_CONSTANT_FOR_ZCOEFF 1 43 | #endif 44 | 45 | #ifndef TILE_X 46 | #define TILE_X 32 47 | #endif 48 | #ifndef TILE_Y 49 | #define TILE_Y 4 50 | #endif 51 | 52 | #ifndef BLOCKING_TYPE 53 | #define BLOCKING_TYPE float2 54 | #endif 55 | const int x_b = sizeof(BLOCKING_TYPE)/sizeof(float); 56 | // Need radius in X-dimension to be a multiple of the X-dimension blocking factor 57 | const int RX = (R % x_b ? R + x_b - (R % x_b) : R); 58 | 59 | #define FP_RAND() (float(rand())/float(RAND_MAX) < 0.5 ? 
-(float((rand()+1)%5)) : (float((rand()+1)%5))) 60 | #define FP_RAND_RAD() (float((rand()%4)) * M_PI_2) 61 | 62 | #ifndef MAX_Z_DIM 63 | #define MAX_Z_DIM 500 64 | #endif 65 | 66 | #ifndef RTM_ELMTS 67 | #define RTM_ELMTS 64 68 | #endif 69 | 70 | __constant__ float c_xx[R+1], c_yy[R+1]; 71 | __constant__ float c_x[R+1], c_y[R+1]; 72 | __constant__ float vsz2_constant; 73 | 74 | #ifdef USE_CONSTANT_FOR_ZCOEFF 75 | __constant__ float c_zz[(2*R+1)*MAX_Z_DIM], c_z[(2*R+1)*MAX_Z_DIM]; 76 | #endif 77 | 78 | template 79 | __device__ __forceinline__ void 80 | shift(T b[ecnt+1]) { 81 | #pragma unroll 82 | for(unsigned idx=0;idx 87 | __device__ inline void 88 | init(float b[ecnt+1], const T ival) { 89 | #pragma unroll 90 | for(unsigned idx = 0; idx < ecnt + 1; idx ++) 91 | b[idx] = ival; 92 | } 93 | 94 | template 95 | struct deriv_t { 96 | T dx2; 97 | T dy2; 98 | T dz2; 99 | T dxy; 100 | T dxz; 101 | T dyz; 102 | }; 103 | 104 | struct range { 105 | int start; 106 | int end; 107 | range(int s, int e) : start(s), end(e) {} 108 | }; 109 | 110 | enum Param_type_e { 111 | Vpz2, 112 | Delta, 113 | Epsln, 114 | Alpha, 115 | Beta, 116 | ParameterCnt 117 | }; 118 | 119 | #define REGS_PER_THREAD 64 120 | #define REGS_PER_SM 65536 121 | 122 | __device__ __forceinline__ float 123 | cachedRead(const float* data, const int index) 124 | { 125 | const float* address = &data[index]; 126 | float result; 127 | asm("ld.ca.f32 %0, [%1];\n" 128 | : "=f" (result) 129 | #if defined(_WIN64) || defined(__LP64__) 130 | : "l"(address) 131 | #else 132 | : "r"(address) 133 | #endif 134 | : "memory" 135 | ); 136 | return result; 137 | } 138 | 139 | __device__ __forceinline__ void 140 | _myRedAdd(const float* address, const float update) 141 | { 142 | asm("red.global.add.f32 [%0], %1;\n" 143 | : 144 | #if defined(_WIN64) || defined(__LP64__) 145 | : "l"(address) 146 | #else 147 | : "r"(address) 148 | #endif 149 | , "f" (update) 150 | : "memory" 151 | ); 152 | } 153 | 154 | #define DIV_CEILING(x,y) (x/y + 
(x % y ? 1 : 0)) 155 | #define EPT(x) (DIV_CEILING(2*R+x, x)) 156 | #define WARP_WIDTH 32 157 | #define WPAD 0 158 | #define MAX(a, b) (a < b ? b : a) 159 | #define PQW_WIDTH MAX(WARP_WIDTH+WPAD, tile_x+2*R+WPAD) 160 | #define halfWarpCnt (TILE_Y * TILE_X / WARP_WIDTH) 161 | #define haloCnt ((2*R)/TILE_Y + 1) 162 | #ifdef USE_TEX 163 | texture tex_PQ2; 164 | texture tex_PnQn2; 165 | texture tex_PQ; 166 | texture tex_PnQn; 167 | texture tex_abde; 168 | texture tex_vpz2; 169 | texture tex_P; 170 | texture tex_Pn; 171 | texture tex_Q; 172 | texture tex_Qn; 173 | #endif 174 | #ifdef USE_TEX 175 | #define ld_PQ2_ro(_loc) tex1Dfetch(tex_PQ2, _loc) 176 | #define ld_PnQn2_ro(_loc) tex1Dfetch(tex_PnQn2, _loc) 177 | #define ld_PQ_ro(_loc) tex1Dfetch(tex_PQ, _loc) 178 | #define ld_P_ro(_loc) tex1Dfetch(tex_P, _loc) 179 | #define ld_Q_ro(_loc) tex1Dfetch(tex_Q, _loc) 180 | #define ld_PQ_ro_cached(_loc) tex1Dfetch(tex_PQ, _loc) 181 | #define ld_PnQn_ro(_loc) tex1Dfetch(tex_PnQn, _loc) 182 | #define ld_Pn_ro(_loc) tex1Dfetch(tex_Pn, _loc) 183 | #define ld_Qn_ro(_loc) tex1Dfetch(tex_Qn, _loc) 184 | #define ld_param_ro(_loc, _param) tex1Dfetch(tex_##_param, _loc) 185 | #else 186 | #define ld_PQ2_ro(_loc) g_PQ[_loc] 187 | #define ld_PnQn2_ro(_loc) g_PnQn[_loc] 188 | #define ld_PQ_ro(_loc) g_PQ[_loc] 189 | #define ld_P_ro(_loc) g_P[_loc] 190 | #define ld_Q_ro(_loc) g_Q[_loc] 191 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 192 | #define ld_PQ_ro_cached(_loc) cachedRead(g_PQ, _loc) 193 | #define ld_P_ro_cached(_loc) cachedRead(g_P, _loc) 194 | #define ld_Q_ro_cached(_loc) cachedRead(g_Q, _loc) 195 | #else 196 | #define ld_PQ_ro_cached(_loc) ld_PQ_ro(_loc) 197 | #define ld_P_ro_cached(_loc) ld_P_ro(_loc) 198 | #define ld_Q_ro_cached(_loc) ld_Q_ro(_loc) 199 | #endif 200 | #define ld_PnQn_ro(_loc) g_PnQn[_loc] 201 | #define ld_Pn_ro(_loc) g_Pn[_loc] 202 | #define ld_Qn_ro(_loc) g_Qn[_loc] 203 | #define ld_param_ro(_loc, _param) g_##_param[_loc] 204 | #endif 205 | 206 | #define 
ZPENCIL_LENGTH (2*R+1) 207 | #define ZPENCIL_LAST (ZPENCIL_LENGTH-1) 208 | #define ZPENCIL_FIRST 0 209 | #define ZPENCIL(_n, _t) _t _n[ZPENCIL_LENGTH] 210 | #define ZPENCIL_SHIFT(_n) shift(_n) 211 | #define ZPENCIL_INIT(_n) init(_n, 0.0f) 212 | #define ZPENCIL_CTR_PRESHIFT R+1 213 | #define ZPENCIL_CTR_POSTSHIFT R 214 | 215 | #define Q_LENGTH R 216 | #ifdef Q_IN_REGISTERS 217 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH]; init(_n, 0.0f) 218 | #define qidx 219 | #define Q_COMMON(_i) 220 | #define Q_CURR(_q, _i) _q[0] 221 | #define Q_LAST(_q, _i) _q[Q_LENGTH-1] 222 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3) 223 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4, _q5, _q6, _i) shift(_q1);shift(_q2);shift(_q3);shift(_q4);shift(_q5);shift(_q6) 224 | #define ADVANCE_3Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3); 225 | #else 226 | #define Q_COMMON(_i) int _i = 0 227 | #ifdef Q_IN_SMEM 228 | #if 0 229 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][1*tile_y][tile_x] 230 | #define Q_CURR(_q, _i) _q[_i][threadIdx.y][threadIdx.x] 231 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.y][threadIdx.x] 232 | #endif 233 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 234 | #define Q_CURR(_q, _i) _q[_i][threadIdx.x] 235 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.x] 236 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 237 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 238 | #else 239 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH] 240 | #define Q_CURR(_q, _i) _q[_i] 241 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)] 242 | #define ADVANCE_Qs(_q1,_q2,_q3, _i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 243 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 
0 : _i+1) 244 | #endif 245 | #endif 246 | 247 | #define SMEM_Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 248 | #define SMEM_Q_CURR(_q, _i) _q[_i][threadIdx.x] 249 | #define SMEM_Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.x] 250 | #define SMEM_ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 251 | #define SMEM_ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 252 | #define SMEM_ADVANCE_3Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 253 | 254 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 255 | #define READ_Z_COEFF(_a, _i) cachedRead(_a, _i) 256 | #else 257 | #define READ_Z_COEFF(_a, _i) _a[_i] 258 | #endif 259 | 260 | #define SMEM_ROW_WIDTH (2*(tile_x+2*R)) 261 | 262 | #if R <= TILE_Y 263 | #define HALO_CNT 1 264 | #define HALO_INIT {0.f} 265 | #define LAST_HIDX 0 266 | #else 267 | #if R <= 2*TILE_Y 268 | #define HALO_CNT 2 269 | #define HALO_INIT {0.f, 0.f} 270 | #define LAST_HIDX 1 271 | #else 272 | #error "Not coded to handle the case when R > 2*TILE_Y" 273 | #endif 274 | #endif 275 | 276 | #if USE_REDUCTION == 1 277 | #define REDUCTION_LD(_mv_ld_statement) 278 | // _mv = memory value, _uv = update value 279 | #define REDUCTION_SUB(_l, _mv, _uv) _myRedAdd(&_l, -(_uv)) 280 | #else 281 | #define REDUCTION_LD(_mv_ld_statement) _mv_ld_statement 282 | #define REDUCTION_SUB(_l, _mv, _uv) _l = (_mv) - (_uv) 283 | #endif 284 | 285 | #define COMPUTE_THREADS_PER_CTA (TILE_X*TILE_Y) 286 | #define COUNT_PER_STRIDE_ROW (TILE_X+2*R) // The extra 32 covers the 2*R halo elements 287 | #define DMA_THREADS_PER_LD 32 288 | //#define DMA_THREADS_PER_LD 64 289 | //#define DMA_THREADS_PER_LD 128 290 | //#define DMA_THREADS_PER_LD 256 291 | #define DMA_THREADS_PER_CTA (1*DMA_THREADS_PER_LD) 292 | #define BYTES_PER_THREAD (sizeof(float2)*COUNT_PER_STRIDE_ROW*(TILE_Y+2*R)/DMA_THREADS_PER_LD) 293 | 294 | #define PQY_BUF_OFFSET (-(R*COUNT_PER_STRIDE_ROW)) 295 | 296 | __global__ void 297 | __launch_bounds__(160,3) 
298 | single_pass_ktuned_DMA_single_specialized_two_phase(float2 *g_PnQn, 299 | float2* g_PQ, 300 | #ifndef USE_TEX 301 | float4* g_abde, float* g_vpz2, 302 | #endif 303 | const int row_stride, const int slice_stride, const int nz, 304 | const int R_x_row_stride, const int tile_y_x_row_stride, 305 | const int Pn_P_diff, const int lead_pad, const int offset, 306 | const int q_start_idx 307 | #ifndef USE_CONSTANT_FOR_ZCOEFF 308 | , const float* c_z, const float* c_zz 309 | #endif 310 | ) 311 | { 312 | int tid = threadIdx.x; 313 | __shared__ float2 s_PQ[(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+TILE_Y*COUNT_PER_STRIDE_ROW]; 314 | float2 *PQy_buf = s_PQ+(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+PQY_BUF_OFFSET; 315 | cudaDMAStridedTwoPhase 316 | dma_ld_pq(1, 317 | COMPUTE_THREADS_PER_CTA, 318 | COMPUTE_THREADS_PER_CTA, 319 | row_stride*sizeof(float2), 320 | COUNT_PER_STRIDE_ROW*sizeof(float2) 321 | ); 322 | int gid = offset + blockIdx.x*TILE_X + 323 | + blockIdx.y*TILE_Y*row_stride; 324 | 325 | if(tid 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | // Special registers get special values 30 | // Make these macros so they can be negative 31 | #define WEFT_TID_X_REG (-1) 32 | #define WEFT_TID_Y_REG (-2) 33 | #define WEFT_TID_Z_REG (-3) 34 | #define WEFT_NTID_X_REG (-4) 35 | #define WEFT_NTID_Y_REG (-5) 36 | #define WEFT_NTID_Z_REG (-6) 37 | #define WEFT_LANE_REG (-7) 38 | #define WEFT_WARP_REG (-8) 39 | #define WEFT_NWARP_REG (-9) 40 | #define WEFT_CTA_X_REG (-10) 41 | #define WEFT_CTA_Y_REG (-11) 42 | #define WEFT_CTA_Z_REG (-12) 43 | #define WEFT_NCTA_X_REG (-13) 44 | #define WEFT_NCTA_Y_REG (-14) 45 | #define WEFT_NCTA_Z_REG (-15) 46 | 47 | #define SDDRINC (100000000) 48 | 49 | enum PTXKind { 50 | PTX_SHARED_DECL, 51 | PTX_MOVE, 52 | PTX_RIGHT_SHIFT, 53 | PTX_LEFT_SHIFT, 54 | PTX_AND, 55 | PTX_OR, 56 | PTX_XOR, 57 | PTX_NOT, 58 | PTX_ADD, 59 | PTX_SUB, 60 | PTX_NEGATE, 61 | PTX_CONVERT, 62 | PTX_CONVERT_ADDRESS, 63 | PTX_BFE, 64 | 
  // --- tail of enum PTXKind; the enum's opening lies earlier in the file ---
  PTX_MULTIPLY,
  PTX_MAD,
  PTX_SET_PREDICATE,
  PTX_SELECT_PREDICATE,
  PTX_BARRIER,
  PTX_SHARED_ACCESS,
  PTX_LABEL,
  PTX_BRANCH,
  PTX_UNIFORM_BRANCH,
  PTX_SHFL,
  PTX_EXIT,
  PTX_GLOBAL_DECL,
  PTX_GLOBAL_LOAD,
  PTX_LAST, // this one must be last
};

// Comparison operators recognized when parsing predicate-setting instructions
// (see PTXSetPred below).
enum CompType {
  COMP_GT,
  COMP_GE,
  COMP_EQ,
  COMP_NE,
  COMP_LE,
  COMP_LT,
};

// Some helper methods
// Tokenize 'str' into 'results': tokens are delimited by spaces, tabs, or the
// separator character 'c' (default ','); empty tokens are dropped.
// NOTE(review): template arguments were lost in extraction; reconstructed as
// std::vector<std::string> -- confirm against the original header.
inline void split(std::vector<std::string> &results,
                  const char *str, char c = ',')
{
  do {
    const char *begin = str;
    // Advance to the next delimiter or the terminating NUL.
    while ((*str != ' ') && (*str != '\t') &&
           (*str != c) && (*str)) str++;

    std::string result(begin, str);
    if (!result.empty())
      results.push_back(result);
  } while (0 != *str++);
}

// Forward declarations.
class Thread;
class PTXLabel;
class PTXBranch;
class PTXBarrier;
class WeftBarrier;
class WeftAccess;
class BarrierSync;
class BarrierArrive;
class SharedWrite;
class SharedRead;
class SharedStore;
class BarrierInstance;

// Abstract base class for every parsed PTX instruction.  Instructions are
// chained via set_next(); emulate() executes the instruction for one thread
// and returns the instruction to execute next.
class PTXInstruction {
public:
  PTXInstruction(void);
  PTXInstruction(PTXKind kind, int line_num);
  virtual ~PTXInstruction(void);
public:
  // Execute for a single thread; returns the next instruction to run.
  virtual PTXInstruction* emulate(Thread *thread) = 0;
  // Most instructions do the same thing, but some need
  // to override this behavior so make it virtual
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  // Cheap RTTI: kind predicates, overridden by the matching subclasses.
  virtual bool is_label(void) const { return false; }
  virtual bool is_branch(void) const { return false; }
  virtual bool is_barrier(void) const { return false; }
  virtual bool is_shuffle(void) const { return false; }
public:
  // Checked downcasts paired with the predicates above.
  virtual PTXLabel* as_label(void) { return NULL; }
  virtual PTXBranch* as_branch(void) { return NULL; }
  virtual PTXBarrier* as_barrier(void) { return NULL; }
public:
  inline PTXKind get_kind(void) const { return kind; }
public:
  void set_next(PTXInstruction *next);
  void set_source_location(const char *file, int line);
public:
  // Parse one PTX source line into the appropriate subclass instance.
  static PTXInstruction* interpret(const std::string &line, int line_num);
  static const char* get_kind_name(PTXKind k);
public:
  // Pack/unpack a short identifier string into a 64-bit key.
  static uint64_t compress_identifier(const char *buffer, size_t buffer_size);
  static void decompress_identifier(uint64_t id, char *buffer, size_t buffer_size);
public:
  const PTXKind kind;
  const int line_number;     // line number in the parsed PTX file
protected:
  PTXInstruction *next;      // next instruction in program order
public:
  // Original CUDA source location (from -lineinfo), if available.
  const char *source_file;
  int source_line_number;
};

// A branch target label.  Copy construction/assignment are deliberately
// aborted (assert(false)) -- instructions are not copyable.
class PTXLabel : public PTXInstruction {
public:
  PTXLabel(const std::string &label, int line_num);
  PTXLabel(const PTXLabel &rhs) { assert(false); }
  virtual ~PTXLabel(void) { }
public:
  PTXLabel& operator=(const PTXLabel &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  virtual bool is_label(void) const { return true; }
public:
  virtual PTXLabel* as_label(void) { return this; }
public:
  // Register this label in the name -> label map used to resolve branches.
  // NOTE(review): map template arguments reconstructed -- confirm.
  void update_labels(std::map<std::string,PTXLabel*> &labels);
protected:
  std::string label;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// A (possibly predicated) branch to a named label.
class PTXBranch : public PTXInstruction {
public:
  PTXBranch(const std::string &label, int line_num);
  PTXBranch(int64_t predicate, bool negate, const std::string &label, int line_num);
  PTXBranch(const PTXBranch &rhs) { assert(false); }
  virtual ~PTXBranch(void) { }
public:
  PTXBranch& operator=(const PTXBranch &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  virtual bool is_branch(void) const { return true; }
public:
  virtual PTXBranch* as_branch(void) { return this; }
public:
  // Resolve 'label' to a PTXLabel pointer once all labels are known.
  // NOTE(review): map template arguments reconstructed -- confirm.
  void set_targets(const std::map<std::string,PTXLabel*> &labels);
protected:
  int64_t predicate;   // guard predicate register (if predicated form)
  bool negate;         // branch on !predicate when true
  std::string label;
  PTXLabel *target;    // filled in by set_targets
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Declaration of a named shared-memory variable at a fixed address.
class PTXSharedDecl : public PTXInstruction {
public:
  PTXSharedDecl(const std::string &name, int64_t address, int line_num);
  PTXSharedDecl(const PTXSharedDecl &rhs) { assert(false); }
  virtual ~PTXSharedDecl(void) { }
public:
  PTXSharedDecl& operator=(const PTXSharedDecl &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  std::string name;
  int64_t address;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Register move: register-to-register, immediate-to-register, or from a
// named symbol (string form of the second constructor).
class PTXMove : public PTXInstruction {
public:
  PTXMove(int64_t dst, int64_t src, bool immediate, int line_num);
  PTXMove(int64_t dst, const std::string &src, int line_num);
  PTXMove(const PTXMove &rhs) { assert(false); }
  virtual ~PTXMove(void) { }
public:
  PTXMove& operator=(const PTXMove &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[2];       // [dst, src] (src unused in the named-symbol form)
  std::string source;    // symbol name for the named-symbol form
  bool immediate;
public:
  // (continuation of PTXMove, begun above)
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// PTX right-shift: args = [dst, src, amount]; 'immediate' marks a literal
// shift amount.
class PTXRightShift : public PTXInstruction {
public:
  PTXRightShift(int64_t zero, int64_t one, int64_t two,
                bool immediate, int line_num);
  PTXRightShift(const PTXRightShift &rhs) { assert(false); }
  virtual ~PTXRightShift(void) { }
public:
  PTXRightShift& operator=(const PTXRightShift &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// PTX left-shift: args = [dst, src, amount].
class PTXLeftShift : public PTXInstruction {
public:
  PTXLeftShift(int64_t zero, int64_t one, int64_t two,
               bool immediate, int line_num);
  PTXLeftShift(const PTXLeftShift &rhs) { assert(false); }
  virtual ~PTXLeftShift(void) { }
public:
  PTXLeftShift& operator=(const PTXLeftShift &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise AND; 'predicate' distinguishes the predicate-register form from
// the ordinary register form.
class PTXAnd : public PTXInstruction {
public:
  PTXAnd(int64_t zero, int64_t one, int64_t two,
         bool immediate, bool predicate, int line_num);
  PTXAnd(const PTXAnd &rhs) { assert(false); }
  virtual ~PTXAnd(void) { }
public:
  PTXAnd& operator=(const PTXAnd &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise OR (register or predicate form).
class PTXOr : public PTXInstruction {
public:
  PTXOr(int64_t zero, int64_t one, int64_t two,
        bool immediate, bool predicate, int line_num);
  PTXOr(const PTXOr &rhs) { assert(false); }
  virtual ~PTXOr(void) { }
public:
  PTXOr& operator=(const PTXOr &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise XOR (register or predicate form).
class PTXXor : public PTXInstruction {
public:
  PTXXor(int64_t zero, int64_t one, int64_t two,
         bool immediate, bool predicate, int line_num);
  PTXXor(const PTXXor &rhs) { assert(false); }
  virtual ~PTXXor(void) { }
public:
  PTXXor& operator=(const PTXXor &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise NOT: args = [dst, src] (register or predicate form).
class PTXNot : public PTXInstruction {
public:
  PTXNot(int64_t zero, int64_t one, bool predicate, int line_num);
  PTXNot(const PTXNot &rhs) { assert(false); }
  virtual ~PTXNot(void) { }
public:
  PTXNot& operator=(const PTXNot &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[2];
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Integer add: args = [dst, lhs, rhs].
class PTXAdd : public PTXInstruction {
public:
  PTXAdd(int64_t zero, int64_t one, int64_t two,
         bool immediate, int line_num);
  PTXAdd(const PTXAdd &rhs) { assert(false); }
  virtual ~PTXAdd(void) { }
public:
  PTXAdd& operator=(const PTXAdd &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Integer subtract: args = [dst, lhs, rhs].
class PTXSub : public PTXInstruction {
public:
  PTXSub(int64_t zero, int64_t one, int64_t two,
         bool immediate, int line_num);
  PTXSub(const PTXSub &rhs) { assert(false); }
  virtual ~PTXSub(void) { }
public:
  PTXSub& operator=(const PTXSub &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Arithmetic negate: args = [dst, src].
class PTXNeg : public PTXInstruction {
public:
  PTXNeg(int64_t zero, int64_t one, bool immediate, int line_num);
  PTXNeg(const PTXNeg &rhs) { assert(false); }
  virtual ~PTXNeg(void) { }
public:
  PTXNeg& operator=(const PTXNeg &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[2];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Integer multiply: args = [dst, lhs, rhs].
class PTXMul : public PTXInstruction {
public:
  PTXMul(int64_t zero, int64_t one, int64_t two,
         bool immediate, int line_num);
  PTXMul(const PTXMul &rhs) { assert(false); }
  virtual ~PTXMul(void) { }
public:
  PTXMul& operator=(const PTXMul &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Multiply-add: four operands, each independently flagged as immediate.
class PTXMad : public PTXInstruction {
public:
  PTXMad(int64_t args[4], bool immediates[4], int line_num);
  PTXMad(const PTXMad &rhs) { assert(false); }
  virtual ~PTXMad(void) { }
public:
  PTXMad& operator=(const PTXMad &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[4];
  bool immediate[4];
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Set a predicate register from a comparison (see CompType).
class PTXSetPred : public PTXInstruction {
public:
  PTXSetPred(int64_t zero, int64_t one, int64_t two, bool immediate,
             CompType comparison, int line_num);
  PTXSetPred(const PTXSetPred &rhs) { assert(false); }
  virtual ~PTXSetPred(void) { }
public:
  virtual PTXInstruction* emulate(Thread *thread);
public:
  PTXSetPred& operator=(const PTXSetPred &rhs) { assert(false); return *this; }
protected:
  int64_t args[3];
  CompType comparison;
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Select between two operands based on a predicate (selp-style).
class PTXSelectPred : public PTXInstruction {
public:
  PTXSelectPred(int64_t zero, int64_t one, int64_t two, int64_t three,
                bool negate, bool two_imm, bool three_imm, int line_num);
  PTXSelectPred(const PTXSelectPred &rhs) { assert(false); }
  virtual ~PTXSelectPred(void) { }
public:
  PTXSelectPred& operator=(const PTXSelectPred &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  bool negate;
  int64_t predicate;
  int64_t args[3];
  bool immediate[2];   // immediate flags for the two selectable operands
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Named barrier (bar.sync / bar.arrive): 'sync' selects blocking semantics;
// name/count may each be immediates or registers.
class PTXBarrier : public PTXInstruction {
public:
  PTXBarrier(int64_t name, int64_t count, bool sync,
             bool name_imm, bool count_imm, int line_num);
  PTXBarrier(const PTXBarrier &rhs) { assert(false); }
  virtual ~PTXBarrier(void) { }
public:
  PTXBarrier& operator=(const PTXBarrier &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  virtual bool is_barrier(void) const { return true; }
  virtual PTXBarrier* as_barrier(void) { return this; }
  // Fill in the expected arrival count once it is known.
  void update_count(unsigned arrival_count);
  // NOTE(review): narrows int64_t 'name' to int -- assumed to always fit.
  int get_barrier_name(void) const { return name; }
protected:
  int64_t name, count;
  bool sync;                             // true for bar.sync, false for bar.arrive
  bool name_immediate, count_immediate;  // literal vs. register operands
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// A shared-memory load or store ('write' selects direction), addressed either
// numerically or by declared symbol name plus offset.
class PTXSharedAccess : public PTXInstruction {
public:
  PTXSharedAccess(int64_t addr, int64_t offset, bool write,
                  bool has_arg, int64_t arg, bool immediate, int line_num);
  PTXSharedAccess(const std::string &name, int64_t offset, bool write,
                  bool has_arg, int64_t arg, bool immediate, int line_num);
  PTXSharedAccess(const PTXSharedAccess &rhs) { assert(false); }
  virtual ~PTXSharedAccess(void) { }
public:
  PTXSharedAccess& operator=(const PTXSharedAccess &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
protected:
  bool has_name;               // true when constructed with a symbol name
  std::string name;
  int64_t addr, offset, arg;
  bool write, has_arg, immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Type conversion between registers (cvt): src -> dst.
class PTXConvert : public PTXInstruction {
public:
  PTXConvert(int64_t zero, int64_t one, int line_num);
  PTXConvert(const PTXConvert &rhs) { assert(false); }
  virtual ~PTXConvert(void) { }
public:
  PTXConvert& operator=(const PTXConvert &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t src, dst;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Generic-address conversion (cvta); source may be a register or a named
// symbol (second constructor).
class PTXConvertAddress : public PTXInstruction {
public:
  PTXConvertAddress(int64_t zero, int64_t one, int line_num);
  PTXConvertAddress(int64_t zero, const std::string &name, int line_num);
  PTXConvertAddress(const PTXConvertAddress &rhs) { assert(false); }
  virtual ~PTXConvertAddress(void) { }
public:
  PTXConvertAddress& operator=(const PTXConvertAddress &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  bool has_name;
  int64_t src, dst;
  std::string name;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bit-field extract (bfe): four operands, each independently immediate.
class PTXBitFieldExtract : public PTXInstruction {
public:
  PTXBitFieldExtract(int64_t args[4], bool immediates[4], int line_num);
  PTXBitFieldExtract(const PTXBitFieldExtract &rhs) { assert(false); }
  virtual ~PTXBitFieldExtract(void) { }
public:
  PTXBitFieldExtract& operator=(const PTXBitFieldExtract &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[4];
  bool immediate[4];
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Warp shuffle (shfl) in its four variants.
class PTXShuffle : public PTXInstruction {
public:
  enum ShuffleKind {
    SHUFFLE_UP,
    SHUFFLE_DOWN,
    SHUFFLE_BUTTERFLY,
    SHUFFLE_IDX,
  };
public:
  PTXShuffle(ShuffleKind kind, int64_t args[4], bool immediates[4], int line_num);
  PTXShuffle(const PTXShuffle &rhs) { assert(false); }
  virtual ~PTXShuffle(void) { }
public:
  PTXShuffle& operator=(const PTXShuffle &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
  virtual bool is_shuffle(void) const { return true; }
protected:
  // NOTE(review): this 'kind' shadows PTXInstruction::kind (a PTXKind);
  // intentional here but easy to misread.
  ShuffleKind kind;
  int64_t args[4];
  bool immediate[4];
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Thread exit, optionally guarded by a (possibly negated) predicate.
class PTXExit : public PTXInstruction {
public:
  PTXExit(int line_num);
  PTXExit(int64_t predicate, bool negate, int line_num);
  PTXExit(const PTXExit &rhs) { assert(false); }
  virtual ~PTXExit(void) { }
public:
  PTXExit& operator=(const PTXExit &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
protected:
  bool has_predicate;
  bool negate;
  int64_t predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Global constant declaration.  Takes ownership of 'name' and 'values'
// (both freed in the destructor, so they must be heap-allocated).
class PTXGlobalDecl : public PTXInstruction {
public:
  PTXGlobalDecl(char *name, int *values, size_t size, int line_num);
  PTXGlobalDecl(const PTXGlobalDecl &rhs) { assert(false); }
  virtual ~PTXGlobalDecl(void) { free(name); free(values); }
public:
  PTXGlobalDecl& operator=(const PTXGlobalDecl &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  char *name;
  int *values;
  size_t size;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Load from global memory: addr -> dst.
class PTXGlobalLoad : public PTXInstruction {
public:
  PTXGlobalLoad(int64_t dst, int64_t addr, int line_num);
  PTXGlobalLoad(const PTXGlobalLoad &rhs) { assert(false); }
  virtual ~PTXGlobalLoad(void) { };
public:
  PTXGlobalLoad& operator=(const PTXGlobalLoad &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t dst, addr;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Base class for the dynamic (per-thread, per-execution) events Weft records:
// each WeftInstruction ties a static PTXInstruction to the thread that
// executed it.
class WeftInstruction {
public:
  WeftInstruction(PTXInstruction *instruction, Thread *thread);
  WeftInstruction(const WeftInstruction &rhs) : instruction(NULL),
    thread(NULL), thread_line_number(-1) { assert(false); }
  virtual ~WeftInstruction(void) { }
public:
  WeftInstruction& operator=(const WeftInstruction &rhs)
    { assert(false); return *this; }
public:
  // Cheap RTTI over the Weft event hierarchy, mirrored by checked downcasts.
  virtual bool is_barrier(void) const { return false; }
  virtual WeftBarrier* as_barrier(void) { return NULL; }
public:
  virtual bool is_access(void) const { return false; }
  virtual WeftAccess* as_access(void) { return NULL; }
public:
  virtual bool is_sync(void) const { return false; }
  virtual BarrierSync* as_sync(void) { return NULL; }
public:
  virtual bool is_arrive(void) const { return false; }
  virtual BarrierArrive* as_arrive(void) { return NULL; }
public:
  virtual bool is_write(void) const { return false; }
  virtual SharedWrite* as_write(void) { return NULL; }
public:
  virtual bool is_read(void) const { return false; }
  virtual SharedRead* as_read(void) { return NULL; }
public:
  // Attach the happens-before/after relationship computed for this event.
  void initialize_happens(Happens *happens);
  inline Happens* get_happens(void) const { return happens_relationship; }
public:
  virtual void print_instruction(FILE *target) = 0;
public:
  PTXInstruction *const instruction;   // static instruction this event ran
  Thread *const thread;                // thread that executed it
  const int thread_line_number;
protected:
  Happens *happens_relationship;
};

// A barrier event (sync or arrive) observed on a named barrier.
class WeftBarrier : public WeftInstruction {
public:
  WeftBarrier(int name, int count, PTXBarrier *bar, Thread *thread);
  WeftBarrier(const WeftBarrier &rhs) : WeftInstruction(NULL, NULL),
    name(0), count(0), barrier(NULL) { assert(false); }
  virtual ~WeftBarrier(void) { }
public:
  WeftBarrier& operator=(const WeftBarrier &rhs) { assert(false); return *this; }
public:
  virtual bool is_barrier(void) const { return true; }
  virtual WeftBarrier* as_barrier(void) { return this; }
public:
  // Bind this event to the specific dynamic instance of its barrier.
  void set_instance(BarrierInstance *instance);
  inline BarrierInstance* get_instance(void) const { return instance; }
public:
  virtual void print_instruction(FILE *target) = 0;
public:
  const int name;
  const int count;
  PTXBarrier *const barrier;    // static barrier instruction
protected:
  BarrierInstance *instance;
};

// Blocking barrier participation (bar.sync).
class BarrierSync : public WeftBarrier {
public:
  BarrierSync(int name, int count, PTXBarrier *bar, Thread *thread);
  BarrierSync(const BarrierSync &rhs) : WeftBarrier(0, 0, NULL, NULL)
    { assert(false); }
  virtual ~BarrierSync(void) { }
public:
  BarrierSync& operator=(const BarrierSync &rhs) { assert(false); return *this; }
public:
  virtual bool is_sync(void) const { return true; }
  virtual BarrierSync* as_sync(void) { return this; }
  virtual void print_instruction(FILE *target);
};

// Non-blocking barrier arrival (bar.arrive).
class BarrierArrive : public WeftBarrier {
public:
  BarrierArrive(int name, int count, PTXBarrier *bar, Thread *thread);
  BarrierArrive(const BarrierArrive &rhs) : WeftBarrier(0, 0, NULL, NULL)
    { assert(false); }
  virtual ~BarrierArrive(void) { }
public:
  BarrierArrive& operator=(const BarrierArrive &rhs) { assert(false); return *this; }
public:
  virtual bool is_arrive(void) const { return true; }
  virtual BarrierArrive* as_arrive(void) { return this; }
  virtual void print_instruction(FILE *target);
};

// A shared-memory access event at a concrete address.
class WeftAccess : public WeftInstruction {
public:
  WeftAccess(int address, PTXSharedAccess *access, Thread *thread, int access_id);
  WeftAccess(const WeftAccess &rhs) : WeftInstruction(NULL, NULL),
    address(0), access(NULL), access_id(-1) { assert(false); }
  virtual ~WeftAccess(void) { }
public:
  WeftAccess& operator=(const WeftAccess &rhs) { assert(false); return *this; }
public:
  virtual bool is_access(void) const { return true; }
  virtual WeftAccess* as_access(void) { return this; }
public:
  // Race queries against another access to the same location.
  bool has_happens_relationship(WeftAccess *other);
  bool is_warp_synchronous(WeftAccess *other);
public:
  virtual void print_instruction(FILE *target) = 0;
public:
  const int address;
  PTXSharedAccess *const access;   // static access instruction
  const int access_id; // for warp-synchronous execution
};

// A shared-memory write event.
class SharedWrite : public WeftAccess {
public:
  SharedWrite(int address, PTXSharedAccess *access,
              Thread *thread, int access_id = -1);
  SharedWrite(const SharedWrite &rhs) : WeftAccess(0, NULL, NULL, -1)
    { assert(false); }
  virtual ~SharedWrite(void) { }
public:
  SharedWrite& operator=(const SharedWrite &rhs) { assert(false); return *this; }
public:
  virtual bool is_write(void) const { return true; }
  virtual SharedWrite* as_write(void) { return this; }
  virtual void print_instruction(FILE *target);
};

// A shared-memory read event.
class SharedRead : public WeftAccess {
public:
  SharedRead(int address, PTXSharedAccess *access,
             Thread *thread, int access_id = -1);
  SharedRead(const SharedRead &rhs) : WeftAccess(0, NULL, NULL, -1)
    { assert(false); }
  virtual ~SharedRead(void) { }
public:
  SharedRead& operator=(const SharedRead &rhs) { assert(false); return *this; }
public:
  virtual bool is_read(void) const { return true; }
  virtual SharedRead* as_read(void) { return this; }
  virtual void print_instruction(FILE *target);
};

#endif // __INSTRUCTION_H__