├── .gitignore ├── examples ├── saxpy │ ├── params.h │ ├── params_directed.h │ ├── params_common.h │ ├── Makefile │ ├── saxpy_single.cu │ └── saxpy_double.cu ├── negatives │ ├── Makefile │ ├── arrival.cu │ ├── different.cu │ ├── over.cu │ └── deadlock.cu ├── sgemv │ ├── Makefile │ ├── vec_single.cu │ ├── vec_manual.cu │ ├── both_single.cu │ ├── vec_double.cu │ ├── both_manual.cu │ └── both_double.cu ├── RTM │ ├── Makefile │ ├── one_phase_single_buffer.cu │ └── two_phase_single_buffer.cu ├── PRF │ └── Makefile ├── DME │ └── Makefile ├── Heptane │ └── Makefile └── run_examples.sh ├── src ├── Makefile ├── race.h ├── graph.h ├── program.h ├── weft.h ├── race.cc ├── weft.cc └── instruction.h ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | -------------------------------------------------------------------------------- /examples/saxpy/params.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "params_directed.h" 18 | #include "params_common.h" 19 | -------------------------------------------------------------------------------- /examples/negatives/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | INPUTS := arrival.cu deadlock.cu different.cu over.cu 18 | OUTPUTS := $(INPUTS:.cu=.ptx) 19 | 20 | %.ptx : %.cu 21 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 22 | 23 | .PHONY: all 24 | all: normal 25 | 26 | normal: $(OUTPUTS) 27 | 28 | clean: 29 | rm -f *.ptx 30 | 31 | -------------------------------------------------------------------------------- /examples/negatives/arrival.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(64,1) 19 | arrival_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) 23 | asm volatile("bar.arrive 1, 64;" : : : "memory"); 24 | else if (wid == 1) 25 | asm volatile("bar.arrive 1, 64;" : : : "memory"); 26 | } 27 | 28 | -------------------------------------------------------------------------------- /examples/negatives/different.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(64,1) 19 | different_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) 23 | asm volatile("bar.sync 0, 96;" : : : "memory"); 24 | else if (wid == 1) 25 | asm volatile("bar.sync 0, 128;" : : : "memory"); 26 | } 27 | 28 | -------------------------------------------------------------------------------- /examples/saxpy/params_directed.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #define SAXPY_KERNEL saxpy_cudaDMA_doublebuffer 18 | #define CTA_COUNT 14 19 | #define COMPUTE_THREADS_PER_CTA 32 * 8 20 | #ifndef NUM_ITERS 21 | #define NUM_ITERS 2048 22 | #endif 23 | #define DMA_THREADS_PER_LD 32 * 1 24 | #define BYTES_PER_DMA_THREAD 32 25 | #define DMA_SZ 4 * COMPUTE_THREADS_PER_CTA 26 | -------------------------------------------------------------------------------- /examples/negatives/over.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(96,1) 19 | arrival_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) 23 | asm volatile("bar.sync 0, 64;" : : : "memory"); 24 | else if (wid == 1) 25 | asm volatile("bar.arrive 0, 64;" : : : "memory"); 26 | else if (wid == 2) 27 | asm volatile("bar.sync 0, 64;" : : : "memory"); 28 | } 29 | -------------------------------------------------------------------------------- /examples/negatives/deadlock.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | __global__ void 18 | __launch_bounds__(64,1) 19 | deadlock_test(void) 20 | { 21 | int wid = threadIdx.x >> 5; 22 | if (wid == 0) { 23 | asm volatile("bar.sync 0, 64;" : : : "memory"); 24 | asm volatile("bar.arrive 1, 64;" : : : "memory"); 25 | } else { 26 | asm volatile("bar.sync 1, 64;" : : : "memory"); 27 | asm volatile("bar.arrive 0, 64;" : : : "memory"); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/saxpy/params_common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | // These are computed from the above parameters 18 | #define DMA_THREADS_PER_CTA ( (SAXPY_KERNEL==saxpy_cudaDMA_doublebuffer) ? 4 : 2 ) * DMA_THREADS_PER_LD 19 | #define THREADS_PER_CTA \ 20 | (SAXPY_KERNEL==saxpy_cudaDMA_doublebuffer) ? (COMPUTE_THREADS_PER_CTA+DMA_THREADS_PER_CTA) : \ 21 | (SAXPY_KERNEL==saxpy_cudaDMA) ? 
(COMPUTE_THREADS_PER_CTA+DMA_THREADS_PER_CTA) : \ 22 | COMPUTE_THREADS_PER_CTA 23 | -------------------------------------------------------------------------------- /examples/saxpy/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | INPUTS := saxpy_single.cu saxpy_double.cu 18 | OUTPUTS := $(INPUTS:.cu=.ptx) 19 | SMALL_OUTPUTS := $(INPUTS:.cu=_small.ptx) 20 | 21 | %_small.ptx : %.cu 22 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DNUM_ITERS=8 $< 23 | 24 | %.ptx : %.cu 25 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 26 | 27 | .PHONY: all 28 | all: normal 29 | 30 | normal: $(OUTPUTS) 31 | 32 | .PHONY: small 33 | small: $(SMALL_OUTPUTS) 34 | 35 | clean: 36 | rm -f *.ptx 37 | 38 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | OUTFILE := weft 18 | 19 | .PHONY: all 20 | all: $(OUTFILE) 21 | 22 | GCC = g++ 23 | CC_FLAGS = -O2 -Wall 24 | LD_FLAGS = -O2 -lpthread 25 | 26 | UNAME = $(shell uname) 27 | ifeq ($(UNAME),Linux) 28 | LD_FLAGS += -lrt 29 | endif 30 | 31 | FILES = weft.cc \ 32 | race.cc \ 33 | graph.cc \ 34 | program.cc \ 35 | instruction.cc 36 | 37 | OBJS := $(FILES:.cc=.o) 38 | 39 | %.o : %.cc 40 | $(GCC) -c $(CC_FLAGS) $< 41 | 42 | $(OUTFILE) : $(OBJS) 43 | $(GCC) -o $(OUTFILE) $(OBJS) $(LD_FLAGS) 44 | 45 | clean: 46 | rm -f *.o $(OUTFILE) 47 | -------------------------------------------------------------------------------- /examples/sgemv/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | INPUTS := vec_single.cu \ 18 | vec_double.cu \ 19 | vec_manual.cu \ 20 | both_single.cu \ 21 | both_double.cu \ 22 | both_manual.cu 23 | OUTPUTS := $(INPUTS:.cu=.ptx) 24 | SMALL_OUTPUTS := $(INPUTS:.cu=_small.ptx) 25 | 26 | %_small.ptx : %.cu 27 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DSGEMV_ITERS=8 $< 28 | 29 | %.ptx : %.cu 30 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 31 | 32 | .PHONY: all 33 | all: normal 34 | 35 | normal: $(OUTPUTS) 36 | 37 | .PHONY: small 38 | small: $(SMALL_OUTPUTS) 39 | 40 | clean: 41 | rm -f *.ptx 42 | 43 | -------------------------------------------------------------------------------- /examples/RTM/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | INPUTS := one_phase_single_buffer.cu \ 18 | one_phase_manual_buffer.cu \ 19 | two_phase_single_buffer.cu \ 20 | two_phase_manual_buffer.cu \ 21 | two_phase_quad_buffer.cu 22 | 23 | OUTPUTS := $(INPUTS:.cu=.ptx) 24 | SMALL_OUTPUTS := $(INPUTS:.cu=_small.ptx) 25 | DYNAMIC_OUTPUTS := $(INPUTS:.cu=_dynamic.ptx) 26 | 27 | %_small.ptx : %.cu 28 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DRTM_ELMTS=16 $< 29 | 30 | %_dynamic.ptx : %.cu 31 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DDYNAMIC $< 32 | 33 | %.ptx : %.cu 34 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 $< 35 | 36 | .PHONY: all 37 | all: normal 38 | 39 | normal: $(OUTPUTS) 40 | 41 | .PHONY: small 42 | small: $(SMALL_OUTPUTS) 43 | 44 | .PHONY: dynamic 45 | dynamic: $(DYNAMIC_OUTPUTS) 46 | 47 | clean: 48 | rm -f *.ptx 49 | 50 | -------------------------------------------------------------------------------- /examples/PRF/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | FERMI_INPUTS := diff_fermi.cu \ 18 | visc_fermi.cu 19 | 20 | KEPLER_INPUTS := diff_kepler.cu \ 21 | visc_kepler.cu 22 | 23 | FERMI_OUTPUTS := $(FERMI_INPUTS:.cu=.ptx) 24 | KEPLER_OUTPUTS := $(KEPLER_INPUTS:.cu=.ptx) 25 | 26 | SMALLF_OUTPUTS := $(FERMI_INPUTS:.cu=_small.ptx) 27 | SMALLK_OUTPUTS := $(KEPLER_INPUTS:.cu=_small.ptx) 28 | DYNAMICF_OUTPUTS:= $(FERMI_INPUTS:.cu=_dynamic.ptx) 29 | DYNAMICK_OUTPUTS:= $(KEPLER_INPUTS:.cu=_dynamic.ptx) 30 | 31 | %_fermi_small.ptx : %_fermi.cu 32 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DTOTAL_STEPS=4 $< 33 | 34 | %_fermi_dynamic.ptx : %_fermi.cu 35 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DDYNAMIC $< 36 | 37 | %_fermi.ptx : %_fermi.cu 38 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 $< 39 | 40 | %_kepler_small.ptx : %_kepler.cu 41 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DTOTAL_STEPS=4 $< 42 | 43 | %_kepler_dynamic.ptx : %_kepler.cu 44 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DDYNAMIC $< 45 | 46 | %_kepler.ptx : %_kepler.cu 47 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 $< 48 | 49 | 50 | .PHONY: all 51 | all: normal 52 | 53 | normal: $(FERMI_OUTPUTS) $(KEPLER_OUTPUTS) 54 | 55 | .PHONY: small 56 | small: $(SMALLF_OUTPUTS) $(SMALLK_OUTPUTS) 57 | 58 | .PHONY: dynamic 59 | dynamic: $(DYNAMICF_OUTPUTS) $(DYNAMICK_OUTPUTS) 60 | 61 | clean: 62 | rm -f *.ptx 63 | 64 | -------------------------------------------------------------------------------- /examples/DME/Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | FERMI_INPUTS := diff_fermi.cu \ 18 | visc_fermi.cu \ 19 | chem_fermi.cu 20 | 21 | KEPLER_INPUTS := diff_kepler.cu \ 22 | visc_kepler.cu \ 23 | chem_kepler.cu 24 | 25 | FERMI_OUTPUTS := $(FERMI_INPUTS:.cu=.ptx) 26 | KEPLER_OUTPUTS := $(KEPLER_INPUTS:.cu=.ptx) 27 | 28 | SMALLF_OUTPUTS := $(FERMI_INPUTS:.cu=_small.ptx) 29 | SMALLK_OUTPUTS := $(KEPLER_INPUTS:.cu=_small.ptx) 30 | DYNAMICF_OUTPUTS:= $(FERMI_INPUTS:.cu=_dynamic.ptx) 31 | DYNAMICK_OUTPUTS:= $(KEPLER_INPUTS:.cu=_dynamic.ptx) 32 | 33 | %_fermi_small.ptx : %_fermi.cu 34 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DTOTAL_STEPS=4 $< 35 | 36 | %_fermi_dynamic.ptx : %_fermi.cu 37 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 -DDYNAMIC $< 38 | 39 | %_fermi.ptx : %_fermi.cu 40 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_20 $< 41 | 42 | %_kepler_small.ptx : %_kepler.cu 43 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DTOTAL_STEPS=4 $< 44 | 45 | %_kepler_dynamic.ptx : %_kepler.cu 46 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 -DDYNAMIC $< 47 | 48 | %_kepler.ptx : %_kepler.cu 49 | nvcc -o $@ -ptx -lineinfo -m64 -arch=compute_35 $< 50 | 51 | 52 | .PHONY: all 53 | all: normal 54 | 55 | normal: $(FERMI_OUTPUTS) $(KEPLER_OUTPUTS) 56 | 57 | .PHONY: small 58 | small: $(SMALLF_OUTPUTS) $(SMALLK_OUTPUTS) 59 | 60 | .PHONY: dynamic 61 | dynamic: $(DYNAMICF_OUTPUTS) $(DYNAMICK_OUTPUTS) 62 | 63 | clean: 64 | rm -f *.ptx 65 | 66 | -------------------------------------------------------------------------------- /examples/Heptane/Makefile: 
-------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2015 Stanford University and NVIDIA 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | FERMI_INPUTS := diff_fermi.cu \ 18 | visc_fermi.cu \ 19 | chem_fermi.cu 20 | 21 | KEPLER_INPUTS := diff_kepler.cu \ 22 | visc_kepler.cu \ 23 | chem_kepler.cu 24 | 25 | FERMI_OUTPUTS := $(FERMI_INPUTS:.cu=.ptx) 26 | KEPLER_OUTPUTS := $(KEPLER_INPUTS:.cu=.ptx) 27 | 28 | SMALLF_OUTPUTS := $(FERMI_INPUTS:.cu=_small.ptx) 29 | SMALLK_OUTPUTS := $(KEPLER_INPUTS:.cu=_small.ptx) 30 | DYNAMICF_OUTPUTS:= $(FERMI_INPUTS:.cu=_dynamic.ptx) 31 | DYNAMICK_OUTPUTS:= $(KEPLER_INPUTS:.cu=_dynamic.ptx) 32 | 33 | %_fermi_small.ptx : %_fermi.cu 34 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DTOTAL_STEPS=4 $< 35 | 36 | %_fermi_dynamic.ptx : %_fermi.cu 37 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 -DDYNAMIC $< 38 | 39 | %_fermi.ptx : %_fermi.cu 40 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_20 $< 41 | 42 | %_kepler_small.ptx : %_kepler.cu 43 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DTOTAL_STEPS=4 $< 44 | 45 | %_kepler_dynamic.ptx : %_kepler.cu 46 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 -DDYNAMIC $< 47 | 48 | %_kepler.ptx : %_kepler.cu 49 | nvcc -o $@ -ptx -m64 -lineinfo -arch=compute_35 $< 50 | 51 | 52 | .PHONY: all 53 | all: normal 54 | 55 | normal: $(FERMI_OUTPUTS) $(KEPLER_OUTPUTS) 56 | 57 | .PHONY: small 58 | 
small: $(SMALLF_OUTPUTS) $(SMALLK_OUTPUTS) 59 | 60 | .PHONY: dynamic 61 | dynamic: $(DYNAMICF_OUTPUTS) $(DYNAMICK_OUTPUTS) 62 | 63 | clean: 64 | rm -f *.ptx 65 | 66 | -------------------------------------------------------------------------------- /examples/sgemv/vec_single.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_vec_single 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 1 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 128 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(160,1) 44 | sgemvn_cuda_dma_vec_single(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff[VEC_ELMTS]; 47 | 48 | cudaDMASequential 49 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 50 | 51 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 52 | { 53 | dma_ld_0.start_async_dma(); 54 | int ind = blockIdx.x*COMPUTE_THREADS_PER_CTA + threadIdx.x; 55 | 56 | A += ind; 57 | 58 | float res = 0.f; 59 | 60 | #ifdef DYNAMIC 61 | #pragma unroll 1 62 | for(int i=0; in1) 79 | { 80 | buff[threadIdx.x] = x[n1]; 81 | 82 | __syncthreads(); 83 | for(int j=0; j<(m-n1); j++) 84 | { 85 | res += A[0]*buff[j]; 86 | A+=lda; 87 | } 88 | } 89 | #endif 90 | 91 | if (ind 18 | #include 19 | 20 | #include "cudaDMA.h" 21 | #include "params.h" 22 | 23 | /* 24 | * This version of saxpy uses cudaDMA for DMAs (but requires 2 CTAs/SM) for double buffering. 
25 | */ 26 | __global__ void 27 | __launch_bounds__(320,2) 28 | saxpy_cudaDMA ( float* y, float* x, float a, clock_t * timer_vals) 29 | { 30 | __shared__ float sdata_x0 [COMPUTE_THREADS_PER_CTA]; 31 | __shared__ float sdata_y0 [COMPUTE_THREADS_PER_CTA]; 32 | 33 | cudaDMASequential 34 | dma_ld_x_0 (1, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA ); 35 | cudaDMASequential 36 | dma_ld_y_0 (2, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + DMA_THREADS_PER_LD ); 37 | 38 | int tid = threadIdx.x ; 39 | 40 | if ( tid < COMPUTE_THREADS_PER_CTA ) { 41 | unsigned int idx; 42 | int i; 43 | float tmp_x; 44 | float tmp_y; 45 | 46 | // Preamble: 47 | dma_ld_x_0.start_async_dma(); 48 | dma_ld_y_0.start_async_dma(); 49 | #pragma unroll 1 50 | for (i = 0; i < NUM_ITERS-1; ++i) { 51 | dma_ld_x_0.wait_for_dma_finish(); 52 | tmp_x = sdata_x0[tid]; 53 | dma_ld_x_0.start_async_dma(); 54 | dma_ld_y_0.wait_for_dma_finish(); 55 | tmp_y = sdata_y0[tid]; 56 | dma_ld_y_0.start_async_dma(); 57 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 58 | y[idx] = a * tmp_x + tmp_y; 59 | } 60 | // Postamble: 61 | dma_ld_x_0.wait_for_dma_finish(); 62 | tmp_x = sdata_x0[tid]; 63 | dma_ld_y_0.wait_for_dma_finish(); 64 | tmp_y = sdata_y0[tid]; 65 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 66 | y[idx] = a * tmp_x + tmp_y; 67 | 68 | } else if (dma_ld_x_0.owns_this_thread()) { 69 | #pragma unroll 1 70 | for (unsigned int j = 0; j < NUM_ITERS; ++j) { 71 | // idx is a pointer to the base of the chunk of memory to copy 72 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 73 | dma_ld_x_0.execute_dma( &x[idx], sdata_x0 ); 74 | } 75 | } else if (dma_ld_y_0.owns_this_thread()) { 76 | #pragma unroll 1 77 | for (unsigned int j = 0; j < NUM_ITERS; ++j) { 78 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * 
COMPUTE_THREADS_PER_CTA; 79 | dma_ld_y_0.execute_dma( &y[idx], sdata_y0 ); 80 | } 81 | } 82 | } 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /examples/sgemv/vec_manual.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_vec_manual 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 1 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 512 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 64 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(160,1) 44 | sgemvn_cuda_dma_vec_manual(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | 49 | 50 | cudaDMASequential 51 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 52 | cudaDMASequential 53 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 54 | 55 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 56 | { 57 | dma_ld_0.start_async_dma(); 58 | 
dma_ld_1.start_async_dma(); 59 | int ind = blockIdx.x*COMPUTE_THREADS_PER_CTA + threadIdx.x; 60 | 61 | A += ind; 62 | 63 | float res = 0.f; 64 | 65 | #ifdef DYNAMIC 66 | #pragma unroll 1 67 | for(int i=0; i 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | class Weft; 28 | class Thread; 29 | class Program; 30 | class WeftAccess; 31 | class WeftBarrier; 32 | class SharedMemory; 33 | class PTXInstruction; 34 | 35 | class Happens { 36 | public: 37 | Happens(int total_threads); 38 | Happens(const Happens &rhs) { assert(false); } 39 | ~Happens(void) { } 40 | public: 41 | Happens& operator=(const Happens &rhs) { assert(false); return *this; } 42 | public: 43 | void update_barriers_before(const std::vector &before); 44 | void update_barriers_after(const std::vector &after); 45 | public: 46 | void update_happens_relationships(void); 47 | bool has_happens(int thread, int line_number); 48 | protected: 49 | bool initialized; 50 | std::vector latest_before; 51 | std::vector earliest_after; 52 | std::vector happens_before; 53 | std::vector happens_after; 54 | }; 55 | 56 | class Address { 57 | public: 58 | Address(const int addr, SharedMemory *memory); 59 | Address(const Address &rhs) : address(0), memory(NULL) { assert(false); } 60 | ~Address(void); 61 | public: 62 | Address& operator=(const Address &rhs) { assert(false); return *this; } 63 | public: 64 | void add_access(WeftAccess *access); 65 | void perform_race_tests(void); 66 | int report_races(std::map< 67 | std::pair,size_t> &all_races); 68 | size_t count_race_tests(void); 69 | protected: 70 | void record_race(WeftAccess *one, WeftAccess *two); 71 | public: 72 | const int address; 73 | SharedMemory *const memory; 74 | protected: 75 | pthread_mutex_t address_lock; 76 | std::vector accesses; 77 | protected: 78 | int total_races; 79 | std::map, 80 | std::set > > ptx_races; 81 | }; 82 | 83 | class SharedMemory { 84 | public: 85 | SharedMemory(Weft *weft, Program *program); 86 | SharedMemory(const 
SharedMemory &rhs) : weft(NULL), program(NULL) { assert(false); } 87 | ~SharedMemory(void); 88 | public: 89 | SharedMemory& operator=(const SharedMemory &rhs) 90 | { assert(false); return *this; } 91 | public: 92 | void update_accesses(WeftAccess *access); 93 | int count_addresses(void) const; 94 | void enqueue_race_checks(void); 95 | void check_for_races(void); 96 | size_t count_race_tests(void); 97 | public: 98 | Weft *const weft; 99 | Program *const program; 100 | protected: 101 | pthread_mutex_t memory_lock; 102 | std::map addresses; 103 | }; 104 | 105 | #endif // __RACE_H__ 106 | -------------------------------------------------------------------------------- /examples/sgemv/both_single.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_both_single 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 5 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 32 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(288,1) 44 | sgemvn_cuda_dma_both_single(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | 47 | __shared__ float buff[VEC_ELMTS]; 48 | __shared__ float mat[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 49 | 50 | cudaDMASequential 51 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 52 | 53 | cudaDMAStrided 54 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD,4*lda); 55 | 56 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 57 | { 58 | dma_ld_0.start_async_dma(); 59 | dma_ld_1.start_async_dma(); 60 | 61 | float res = 0.f; 62 | 63 | #ifdef DYNAMIC 64 | #pragma unroll 1 65 | for(int i=0; i 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_vec_double 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 2 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 128 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(192,1) 44 | sgemvn_cuda_dma_vec_double(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | 49 | cudaDMASequential 50 | 
dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 51 | cudaDMASequential 52 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD); 53 | 54 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 55 | { 56 | dma_ld_0.start_async_dma(); 57 | dma_ld_1.start_async_dma(); 58 | int ind = blockIdx.x*COMPUTE_THREADS_PER_CTA + threadIdx.x; 59 | 60 | A += ind; 61 | 62 | float res = 0.f; 63 | 64 | #ifdef DYNAMIC 65 | #pragma unroll 1 66 | for(int i=0; i 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_both_manual 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 9 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 32 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 64 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(416,1) 44 | sgemvn_cuda_dma_both_manual(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | __shared__ float mat0[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 49 | __shared__ float mat1[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 50 | 51 | cudaDMASequential 52 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 53 | 54 | cudaDMASequential 55 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 56 | 57 | cudaDMAStrided 58 | dma_ld_2(3,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD,4*lda); 59 | 60 | cudaDMAStrided 61 | dma_ld_3(4,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD,4*lda); 62 | 63 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 64 | { 65 | dma_ld_0.start_async_dma(); 66 | dma_ld_1.start_async_dma(); 67 | dma_ld_2.start_async_dma(); 68 | dma_ld_3.start_async_dma(); 69 | 70 | float res = 0.f; 71 | 72 | #ifdef DYNAMIC 73 
| #pragma unroll 1 74 | for(int i=0; i 18 | #include 19 | 20 | #include "cudaDMA.h" 21 | #include "params.h" 22 | 23 | /* 24 | * This version of saxpy uses cudaDMA for DMAs with manual double buffering. 25 | */ 26 | __global__ void 27 | __launch_bounds__(384,2) 28 | saxpy_cudaDMA_doublebuffer ( float* y, float* x, float a, clock_t * timer_vals) 29 | { 30 | __shared__ float sdata_x0 [COMPUTE_THREADS_PER_CTA]; 31 | __shared__ float sdata_x1 [COMPUTE_THREADS_PER_CTA]; 32 | __shared__ float sdata_y0 [COMPUTE_THREADS_PER_CTA]; 33 | __shared__ float sdata_y1 [COMPUTE_THREADS_PER_CTA]; 34 | 35 | cudaDMASequential 36 | dma_ld_x_0 (1, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA); 37 | cudaDMASequential 38 | dma_ld_y_0 (2, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + DMA_THREADS_PER_LD); 39 | cudaDMASequential 40 | dma_ld_x_1 (3, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + 2*DMA_THREADS_PER_LD); 41 | cudaDMASequential 42 | dma_ld_y_1 (4, COMPUTE_THREADS_PER_CTA, COMPUTE_THREADS_PER_CTA + 3*DMA_THREADS_PER_LD); 43 | 44 | int tid = threadIdx.x ; 45 | 46 | if ( tid < COMPUTE_THREADS_PER_CTA ) { 47 | unsigned int idx; 48 | int i; 49 | float tmp_x; 50 | float tmp_y; 51 | 52 | // Preamble: 53 | dma_ld_x_0.start_async_dma(); 54 | dma_ld_y_0.start_async_dma(); 55 | dma_ld_x_1.start_async_dma(); 56 | dma_ld_y_1.start_async_dma(); 57 | #pragma unroll 1 58 | for (i = 0; i < NUM_ITERS-2; i += 2) { 59 | 60 | // Phase 1: 61 | dma_ld_x_0.wait_for_dma_finish(); 62 | tmp_x = sdata_x0[tid]; 63 | dma_ld_x_0.start_async_dma(); 64 | dma_ld_y_0.wait_for_dma_finish(); 65 | tmp_y = sdata_y0[tid]; 66 | dma_ld_y_0.start_async_dma(); 67 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 68 | y[idx] = a * tmp_x + tmp_y; 69 | 70 | // Phase 2: 71 | dma_ld_x_1.wait_for_dma_finish(); 72 | tmp_x = sdata_x1[tid]; 73 | dma_ld_x_1.start_async_dma(); 74 | dma_ld_y_1.wait_for_dma_finish(); 75 | tmp_y = sdata_y1[tid]; 76 | 
dma_ld_y_1.start_async_dma(); 77 | idx = (i+1) * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 78 | y[idx] = a * tmp_x + tmp_y; 79 | } 80 | 81 | // Postamble 82 | dma_ld_x_0.wait_for_dma_finish(); 83 | tmp_x = sdata_x0[tid]; 84 | dma_ld_y_0.wait_for_dma_finish(); 85 | tmp_y = sdata_y0[tid]; 86 | idx = i * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 87 | y[idx] = a * tmp_x + tmp_y; 88 | dma_ld_x_1.wait_for_dma_finish(); 89 | tmp_x = sdata_x1[tid]; 90 | dma_ld_y_1.wait_for_dma_finish(); 91 | tmp_y = sdata_y1[tid]; 92 | idx = (i+1) * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA + threadIdx.x; 93 | y[idx] = a * tmp_x + tmp_y; 94 | 95 | } else if (dma_ld_x_0.owns_this_thread()) { 96 | #pragma unroll 1 97 | for (unsigned int j = 0; j < NUM_ITERS; j+=2) { 98 | // idx is a pointer to the base of the chunk of memory to copy 99 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 100 | dma_ld_x_0.execute_dma( &x[idx], sdata_x0 ); 101 | } 102 | } else if (dma_ld_y_0.owns_this_thread()) { 103 | #pragma unroll 1 104 | for (unsigned int j = 0; j < NUM_ITERS; j+=2) { 105 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 106 | dma_ld_y_0.execute_dma( &y[idx], sdata_y0 ); 107 | } 108 | } else if (dma_ld_x_1.owns_this_thread()) { 109 | #pragma unroll 1 110 | for (unsigned int j = 1; j < NUM_ITERS; j+=2) { 111 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 112 | dma_ld_x_1.execute_dma( &x[idx], sdata_x1 ); 113 | } 114 | } else if (dma_ld_y_1.owns_this_thread()) { 115 | #pragma unroll 1 116 | for (unsigned int j = 1; j < NUM_ITERS; j+=2) { 117 | unsigned int idx = j * COMPUTE_THREADS_PER_CTA * CTA_COUNT + blockIdx.x * COMPUTE_THREADS_PER_CTA; 118 | dma_ld_y_1.execute_dma( &y[idx], sdata_y1 ); 119 | } 120 | } 121 | } 
122 | 123 | -------------------------------------------------------------------------------- /examples/sgemv/both_double.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cuda.h" 23 | #include "cuda_runtime.h" 24 | 25 | #include "cudaDMA.h" 26 | 27 | #define SIZE_N 896 28 | #define SIZE_M SIZE_N 29 | 30 | #define DMA_KERNEL sgemvn_cuda_dma_both_double 31 | #define COMPUTE_THREADS_PER_CTA 128 32 | #define DMA_THREADS_PER_LD 32 33 | #define DMA_LDS 10 34 | #ifndef VEC_ELMTS 35 | #define VEC_ELMTS 32 36 | #endif 37 | 38 | #ifndef SGEMV_ITERS 39 | #define SGEMV_ITERS 128 40 | #endif 41 | 42 | __global__ void 43 | __launch_bounds__(448,1) 44 | sgemvn_cuda_dma_both_double(int n, int m, int n1, float alpha, float *A, int lda, float *x, float *y) 45 | { 46 | __shared__ float buff0[VEC_ELMTS]; 47 | __shared__ float buff1[VEC_ELMTS]; 48 | __shared__ float mat0[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 49 | __shared__ float mat1[VEC_ELMTS][COMPUTE_THREADS_PER_CTA]; 50 | 51 | cudaDMASequential 52 | dma_ld_0(1,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA); 53 | 54 | cudaDMASequential 55 | dma_ld_1(2,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+1*DMA_THREADS_PER_LD); 56 | 57 | cudaDMAStrided 58 | 
dma_ld_2(3,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+2*DMA_THREADS_PER_LD,4*lda); 59 | 60 | cudaDMAStrided 61 | dma_ld_3(4,COMPUTE_THREADS_PER_CTA,COMPUTE_THREADS_PER_CTA+6*DMA_THREADS_PER_LD,4*lda); 62 | 63 | if (threadIdx.x < COMPUTE_THREADS_PER_CTA) 64 | { 65 | dma_ld_0.start_async_dma(); 66 | dma_ld_1.start_async_dma(); 67 | dma_ld_2.start_async_dma(); 68 | dma_ld_3.start_async_dma(); 69 | 70 | float res = 0.f; 71 | 72 | #ifdef DYNAMIC 73 | #pragma unroll 1 74 | for(int i=0; i 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | class Weft; 27 | class Thread; 28 | class WeftBarrier; 29 | class BarrierArrive; 30 | class WeftInstruction; 31 | class BarrierDependenceGraph; 32 | 33 | class BarrierInstance { 34 | public: 35 | BarrierInstance(BarrierDependenceGraph *graph, int name, int generation); 36 | BarrierInstance(const BarrierInstance &rhs); 37 | ~BarrierInstance(void); 38 | public: 39 | BarrierInstance& operator=(const BarrierInstance &rhs); 40 | public: 41 | void update_waiting_threads(std::set &waiting_threads); 42 | bool intersects_with(const std::set &waiting_threads); 43 | bool happens_after(BarrierInstance *other); 44 | bool happens_before(const std::vector &other_participants); 45 | public: 46 | void add_participant(WeftBarrier *participant, bool sync); 47 | bool has_next(BarrierInstance *other); 48 | bool has_previous(BarrierInstance *other); 49 | void add_incoming(BarrierInstance *other); 50 | void add_outgoing(BarrierInstance *other); 51 | void remove_incoming(int name, int gen); 52 | void remove_outgoing(int name, int gen); 53 | public: 54 | void initialize_pending_counts(void); 55 | template 56 | void launch_if_ready(Weft *weft, bool forward); 57 | template 58 | void notify_dependences(Weft *weft, bool forward); 59 | void compute_reachability(Weft *weft, bool forward); 60 | void compute_transitivity(Weft *weft, bool forward); 61 | void update_latest_incoming(std::vector &other); 62 | void update_earliest_outgoing(std::vector 
&other); 63 | void update_latest_before(std::vector &other); 64 | void update_earliest_after(std::vector &other); 65 | public: 66 | void traverse_forward(std::deque &queue, 67 | std::set &visited); 68 | public: 69 | BarrierDependenceGraph *const graph; 70 | const int name; 71 | const int generation; 72 | protected: 73 | std::vector participants; 74 | // Helpful for constructing barrier dependence graph 75 | std::map syncs_only; 76 | protected: 77 | std::vector incoming; 78 | std::vector outgoing; 79 | protected: 80 | std::vector latest_incoming; 81 | std::vector earliest_outgoing; 82 | protected: 83 | std::vector latest_before; 84 | std::vector earliest_after; 85 | protected: 86 | int base_incoming; 87 | int base_outgoing; 88 | int pending_incoming; 89 | int pending_outgoing; 90 | }; 91 | 92 | class BarrierDependenceGraph { 93 | private: 94 | struct PendingState { 95 | public: 96 | PendingState(void) 97 | : expected(-1), generation(0) { } 98 | public: 99 | inline void reset(void) { 100 | expected = -1; 101 | generation++; 102 | arrivals.clear(); 103 | } 104 | public: 105 | int expected; 106 | int generation; 107 | std::set arrivals; 108 | }; 109 | struct PreceedingBarriers { 110 | public: 111 | PreceedingBarriers(void) { } 112 | public: 113 | void find_preceeding(BarrierInstance *bar); 114 | void add_instance(BarrierInstance *bar); 115 | public: 116 | // This is an upper bound on all arrivals 117 | std::set arrival_threads; 118 | std::deque previous; 119 | }; 120 | public: 121 | BarrierDependenceGraph(Weft *weft, Program *p); 122 | BarrierDependenceGraph(const BarrierDependenceGraph &rhs); 123 | ~BarrierDependenceGraph(void); 124 | public: 125 | BarrierDependenceGraph& operator=(const BarrierDependenceGraph &rhs); 126 | public: 127 | void construct_graph(const std::vector &threads); 128 | int count_validation_tasks(void); 129 | void enqueue_validation_tasks(void); 130 | void check_for_validation_errors(void); 131 | void validate_barrier(int name, int generation); 
132 | public: 133 | int count_total_barriers(void); 134 | void enqueue_reachability_tasks(void); 135 | void enqueue_transitive_happens_tasks(void); 136 | protected: 137 | bool remove_complete_barriers(std::vector &program_counters, 138 | std::vector &pending_arrives, 139 | std::vector &preceeding, 140 | const std::vector &threads); 141 | bool are_empty(const std::vector &program_counters, 142 | const std::vector &threads); 143 | bool advance_program_counters(std::vector &program_counters, 144 | std::vector &pending_arrives, 145 | const std::vector &threads); 146 | void report_state(const std::vector &program_counters, 147 | const std::vector &threads, 148 | const std::vector &pending_arrives); 149 | protected: 150 | void initialize_pending_counts(void); 151 | public: 152 | Weft *const weft; 153 | Program *const program; 154 | const int max_num_barriers; 155 | protected: 156 | std::vector > barrier_instances; 157 | // A summary of all barriers in one place 158 | std::deque all_barriers; 159 | protected: 160 | pthread_mutex_t validation_mutex; 161 | std::vector > failed_validations; 162 | }; 163 | 164 | class BFSSearch { 165 | public: 166 | BFSSearch(BarrierInstance *source, BarrierInstance *target); 167 | BFSSearch(const BFSSearch &rhs) : source(NULL), target(NULL) { assert(false); } 168 | ~BFSSearch(void) { } 169 | public: 170 | BFSSearch& operator=(const BFSSearch &rhs) { assert(false); return *this; } 171 | public: 172 | bool execute(void); 173 | public: 174 | BarrierInstance *const source; 175 | BarrierInstance *const target; 176 | protected: 177 | std::deque queue; 178 | std::set visited; 179 | }; 180 | 181 | #endif // __BARRIER_DEPENDENCE_GRAPH_H__ 182 | -------------------------------------------------------------------------------- /src/program.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef __PROGRAM_H__ 18 | #define __PROGRAM_H__ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | enum ThreadStatus { 28 | THREAD_ENABLED, 29 | THREAD_DISABLED, 30 | THREAD_EXITTED, 31 | }; 32 | 33 | enum ProgramStage { 34 | EMULATE_THREADS_STAGE, 35 | CONSTRUCT_BARRIER_GRAPH_STAGE, 36 | COMPUTE_HAPPENS_RELATIONSHIP_STAGE, 37 | CHECK_FOR_RACES_STAGE, 38 | TOTAL_STAGES, 39 | }; 40 | 41 | class Weft; 42 | class Thread; 43 | class Happens; 44 | class PTXLabel; 45 | class WeftAccess; 46 | class SharedMemory; 47 | class PTXInstruction; 48 | class WeftInstruction; 49 | 50 | struct ThreadState { 51 | public: 52 | ThreadState(void) 53 | : status(THREAD_ENABLED), next(NULL) { } 54 | public: 55 | ThreadStatus status; 56 | PTXLabel *next; 57 | }; 58 | 59 | class Program { 60 | public: 61 | struct CTAState { 62 | public: 63 | CTAState(void) 64 | : shared_memory(NULL), graph(NULL) { } 65 | public: 66 | int block_id[3]; 67 | SharedMemory *shared_memory; 68 | BarrierDependenceGraph *graph; 69 | std::vector threads; 70 | }; 71 | public: 72 | Program(Weft *weft, std::string &kernel_name); 73 | Program(const Program &rhs); 74 | ~Program(void); 75 | public: 76 | Program& operator=(const Program &rhs); 77 | public: 78 | static void parse_ptx_file(const char *file_name, Weft *weft, 79 | std::vector &programs); 80 | void report_statistics(void); 81 | void report_statistics(const std::vector 
&threads); 82 | bool has_shuffles(void) const; 83 | inline int count_instructions(void) const { return ptx_instructions.size(); } 84 | inline int barrier_upper_bound(void) const { return max_num_barriers; } 85 | inline int thread_count(void) const { return max_num_threads; } 86 | inline bool assume_warp_synchronous(void) const { return warp_synchronous; } 87 | inline const char* get_name(void) const { return kernel_name.c_str(); } 88 | protected: 89 | void emulate_threads(void); 90 | void construct_dependence_graph(void); 91 | void compute_happens_relationships(void); 92 | void check_for_race_conditions(void); 93 | void print_statistics(void); 94 | void print_files(void); 95 | int count_dynamic_instructions(void); 96 | int count_weft_statements(void); 97 | int count_total_barriers(void); 98 | int count_addresses(void); 99 | size_t count_race_tests(void); 100 | public: 101 | int emulate(Thread *thread); 102 | void emulate_warp(Thread **threads); 103 | void get_kernel_prefix(char *buffer, size_t count); 104 | public: 105 | void add_line(const std::string &line, int line_num); 106 | void set_block_dim(const int *array); 107 | void add_block_id(const int *array); 108 | void set_grid_dim(const int *array); 109 | void fill_block_dim(int *array) const; 110 | void fill_block_id(int *array) const; 111 | void fill_grid_dim(int *array) const; 112 | void verify(void); 113 | protected: 114 | void convert_to_instructions(const std::map &source_files); 115 | static bool parse_file_location(const std::string &line, 116 | std::map &source_files); 117 | static bool parse_source_location(const std::string &line, 118 | int &source_file, int &source_line); 119 | protected: 120 | void start_instrumentation(ProgramStage stage); 121 | void stop_instrumentation(ProgramStage stage); 122 | public: 123 | void report_instrumentation(size_t &accumulated_memory); 124 | public: 125 | Weft *const weft; 126 | protected: 127 | std::string kernel_name; 128 | int max_num_threads; 129 | int 
max_num_barriers; 130 | protected: 131 | int block_dim[3]; 132 | int block_id[3]; 133 | int grid_dim[3]; 134 | bool warp_synchronous; 135 | unsigned current_cta; 136 | std::vector cta_states; 137 | protected: 138 | std::vector > lines; 139 | std::vector ptx_instructions; 140 | protected: 141 | // Instrumentation 142 | unsigned long long timing[TOTAL_STAGES]; 143 | size_t memory_usage[TOTAL_STAGES]; 144 | }; 145 | 146 | class Thread { 147 | public: 148 | struct GlobalDataInfo { 149 | public: 150 | const char *name; 151 | const int *data; 152 | size_t size; 153 | }; 154 | public: 155 | Thread(unsigned thread_id, int tidx, int tidy, int tidz, 156 | Program *p, SharedMemory *s); 157 | Thread(const Thread &rhs) : thread_id(0), tid_x(-1), tid_y(-1), tid_z(-1), 158 | program(NULL), shared_memory(NULL) { assert(false); } 159 | ~Thread(void); 160 | public: 161 | Thread& operator=(const Thread &rhs) { assert(false); return *this; } 162 | public: 163 | void initialize(void); 164 | void emulate(void); 165 | void cleanup(void); 166 | public: 167 | void register_shared_location(const std::string &name, int64_t address); 168 | bool find_shared_location(const std::string &name, int64_t &addr); 169 | public: 170 | void register_global_location(const char *name, const int *data, size_t size); 171 | bool get_global_location(const char *name, int64_t &addr); 172 | bool get_global_value(int64_t addr, int64_t &value); 173 | public: 174 | void set_value(int64_t reg, int64_t value); 175 | bool get_value(int64_t reg, int64_t &value); 176 | public: 177 | void set_pred(int64_t pred, bool value); 178 | bool get_pred(int64_t pred, bool &value); 179 | public: 180 | void add_instruction(WeftInstruction *instruction); 181 | void update_max_barrier_name(int name); 182 | inline int get_max_barrier_name(void) const { return max_barrier_name; } 183 | public: 184 | void profile_instruction(PTXInstruction *instruction); 185 | int accumulate_instruction_counts(std::vector &total_counts); 186 | void 
dump_weft_thread(void); 187 | public: 188 | void update_shared_memory(WeftAccess *access); 189 | public: 190 | inline size_t get_program_size(void) const { return instructions.size(); } 191 | inline WeftInstruction* get_instruction(int idx) 192 | { return ((unsigned(idx) < instructions.size()) ? instructions[idx] : NULL); } 193 | inline int count_dynamic_instructions(void) const 194 | { return dynamic_instructions; } 195 | inline int count_weft_statements(void) const 196 | { return instructions.size(); } 197 | inline void set_dynamic_instructions(int count) { dynamic_instructions = count; } 198 | public: 199 | void initialize_happens(int total_threads, int max_num_barriers); 200 | void update_happens_relationships(void); 201 | protected: 202 | void initialize_happens_instances(int total_threads); 203 | void compute_barriers_before(int max_num_barriers); 204 | void compute_barriers_after(int max_num_barriers); 205 | public: 206 | const unsigned thread_id; 207 | const int tid_x, tid_y, tid_z; 208 | Program *const program; 209 | SharedMemory *const shared_memory; 210 | protected: 211 | std::map shared_locations; 212 | std::map register_store; 213 | std::map predicate_store; 214 | std::vector globals; 215 | protected: 216 | int max_barrier_name; 217 | int dynamic_instructions; 218 | std::vector instructions; 219 | std::vector dynamic_counts; 220 | protected: 221 | std::deque all_happens; 222 | }; 223 | 224 | class SharedStore { 225 | public: 226 | SharedStore(void) { } 227 | SharedStore(const SharedStore &rhs) { assert(false); } 228 | ~SharedStore(void) { } 229 | public: 230 | SharedStore& operator=(const SharedStore &rhs) { assert(false); return *this; } 231 | public: 232 | void write(int64_t addr, int64_t value); 233 | bool read(int64_t addr, int64_t &value); 234 | protected: 235 | std::map store; 236 | }; 237 | 238 | #endif //__PROGRAM_H__ 239 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | Weft 2 | ==== 3 | 4 | A Sound and Complete Verification Tool for Warp-Specialized GPU Kernels 5 | 6 | Update! Our paper on Weft, **Verification of Producer-Consumer 7 | Synchronization in GPU Programs** will be appearing at 8 | [PLDI 2015](http://conf.researchr.org/home/pldi2015). 9 | 10 | Navigation 11 | ---- 12 | 13 | 1. [Overview](#overview) 14 | 2. [Prerequisites](#prerequisites) 15 | 3. [Downloading Weft](#downloading-and-building-weft) 16 | 4. [Using Weft](#using-weft) 17 | 5. [Command Line Arguments](#command-line-arguments) 18 | 19 | Overview 20 | ---- 21 | 22 | Weft is a sound and complete verification tool for warp-specialized 23 | kernels that use named barriers on NVIDIA GPUs. Warp-specialized 24 | kernels can encode arbitrary producer-consumer relationships between 25 | different subsets of warps within a kernel using named barriers. 26 | This requires a more general analysis than most current GPU verification 27 | tools provide. 28 | 29 | Weft operates on the PTX code emitted by the CUDA compiler and verifies 30 | three important properties of any warp-specialized kernel. 31 | 32 | * Deadlock Freedom - the use of named barriers should not result in deadlock. 33 | * Safe Barrier Recycling - named barriers are a limited physical resource 34 | and it is important to check that they are 35 | safely recycled. 36 | * Race Freedom - checking that all shared memory accesses are properly 37 | synchronized by named barriers. 38 | 39 | Weft performs a fully static analysis which requires that the use of 40 | named barriers and shared memory accesses be statically analyzable. 41 | All operations which are not statically analyzable are ignored and 42 | can optionally be reported. 
In practice, we have found that for most 43 | GPU kernels this is not an issue because synchronization and shared 44 | memory accesses are not dependent on program input and therefore 45 | can be verified statically. 46 | 47 | Due to its generality, Weft is also capable of checking non-warp-specialized 48 | code as well for race freedom. The one caveat is that Weft currently 49 | does not attempt to check code that uses atomics. 50 | 51 | Prerequisites 52 | ---- 53 | 54 | Weft requires an installation of the CUDA compiler for generating 55 | input PTX files. The CUDA toolkit can be downloaded 56 | [here](https://developer.nvidia.com/cuda-downloads). Weft requires 57 | CUDA version 5.5 or later. 58 | 59 | Weft can be built with a standard C++ compiler. Weft has been tested 60 | with g++ and clang on both Linux and Mac systems. 61 | 62 | Downloading and Building Weft 63 | ---- 64 | 65 | Weft is available on github under the Apache Software License 66 | version 2.0. To clone a copy of the Weft source type: 67 | 68 | $ git clone https://github.com/lightsighter/Weft.git 69 | 70 | After cloning the repository, change into the `src` directory 71 | and type: 72 | 73 | $ make 74 | 75 | This will build the Weft binary `weft`. You may wish to add the 76 | directory containing the Weft binary to your path using the 77 | following command. 78 | 79 | $ export PATH=$PATH://src 80 | 81 | Using Weft 82 | ---- 83 | 84 | Using Weft to validate a CUDA source file is straightforward. 85 | The first step is to use the CUDA compiler to generate a PTX 86 | file for Weft to consume as input. Currently, Weft will only 87 | analyze the first kernel that it finds in a PTX file, so files 88 | containing multiple kernels should be divided into separate 89 | source files. 90 | 91 | To generate input for Weft, the CUDA compiler should be 92 | invoked with the `-ptx` flag to create an output PTX file. 
93 | We also recommend the CUDA compiler be called with the 94 | `-lineinfo` flag so Weft can provide output based on CUDA 95 | source code line numbers instead of PTX line numbers. In 96 | some cases, the flags for compute architecture (`-arch`) and 97 | machine size (`-m`) may need to be specified depending on the 98 | kernel being compiled. Below are the two ways that we invoke 99 | the CUDA compiler on all of our example kernels for the 100 | Fermi and Kepler architectures respectively. 101 | 102 | $ nvcc -ptx -lineinfo -m64 -arch=compute_20 source.cu 103 | $ nvcc -ptx -lineinfo -m64 -arch=compute_35 source.cu 104 | 105 | The resulting PTX file is the input to Weft. The PTX file name 106 | can either be specified to Weft using the `-f` flag or as the 107 | last argument. 108 | 109 | $ weft -f source.ptx -s -t 4 110 | $ weft -s -t 4 source.ptx 111 | 112 | As part of its validation, Weft needs to know how many threads 113 | are in each CTA. For kernels with 1-D CTAs, Weft can infer this 114 | information if the `__launch_bounds__` annotation was given on 115 | the original CUDA kernel. However, if this declaration did not exist on 116 | the original source kernel, then it must be explicitly specified 117 | using the `-n` flag. As an example, our `saxpy_single.cu` source 118 | file contains no `__launch_bounds__` declaration on its 119 | kernel, therefore we must tell Weft that the kernel requires CTAs 120 | containing 320 threads. 121 | 122 | $ weft -n 320 saxpy_single.ptx 123 | 124 | Note that the `-n` flag should also be used to specify multi-dimensional 125 | CTA shapes which cannot be captured by the `__launch_bounds__` 126 | annotation. Both of the following are valid examples: 127 | 128 | $ weft -n 320x1x1 saxpy_single.ptx 129 | $ weft -n 16x16 dgemm.ptx 130 | 131 | Weft supports a large set of command line flags which we cover in 132 | more detail [later](#command-line-arguments).
We mention two flags 133 | briefly now as they are often useful for many users. First, by default, 134 | Weft does not assume warp synchronous execution where all 135 | threads in a warp execute in lock-step. Many CUDA programs rely on 136 | this property for correctness. The warp synchronous execution assumption 137 | can be enabled in Weft by passing the `-s` flag on the command line. 138 | As an example, the Fermi chemistry kernel in `examples/DME/chem_fermi.cu` 139 | will report races if run under normal assumptions, but will always be 140 | race free under a warp synchronous execution. 141 | 142 | Another useful flag for Weft is the `-t` flag which controls the 143 | number of parallel threads that Weft will use when performing validation. 144 | For most multi-core architectures we find that 2-4 threads is a good 145 | option. Weft is primarily a memory bound application, and having two 146 | threads per socket is usually sufficient to saturate memory bandwidth. 147 | 148 | We have provided a set of test kernels for Weft in the `examples` 149 | directory. Each individual directory contains its own Makefile for 150 | generating the PTX code for individual kernels. We also have a script 151 | called `run_examples.sh` in the main `examples` directory which will 152 | validate all of the example kernels. Note that some kernels will 153 | report races. The script may take between 30 minutes 154 | and 1 hour (depending on the machine) to validate all of the kernels. 155 | 156 | Command Line Arguments 157 | ---- 158 | 159 | Below is a summary of the command line flags that Weft supports. 
160 | 161 | * `-b`: specify the CTA id to simulate (default 0x0x0) 162 | * `-d`: print detailed information when giving error output, 163 | including where threads are blocked for deadlock as 164 | well as per-thread and per-address information for races 165 | * `-f`: specify the input PTX file (can be omitted if 166 | the file is the last argument in the command line) 167 | * `-g`: specify the grid dimensions for the kernel being simulated 168 | (this argument can be omitted in most cases as many kernels 169 | will not depend on these values; regardless of the grid 170 | bounds Weft will always validate a single CTA specified 171 | by the `-b` flag) 172 | * `-i`: instrument the execution of Weft to report the 173 | time taken and memory usage for each stage 174 | * `-n`: set the number of threads per CTA. This is required 175 | if the CUDA kernel did not have a 176 | `__launch_bounds__` annotation 177 | * `-p`: print out individual files for each thread of all Weft modeled 178 | instructions, this will generate one file per thread 179 | * `-s`: assume warp-synchronous execution when checking for races 180 | * `-t`: set the size of the thread pool for Weft to use; in 181 | general, Weft is memory bound, so one or two threads per socket 182 | should be sufficient for achieving peak performance. 183 | * `-v`: enable verbose output 184 | * `-w`: enable warnings about PTX instructions that cannot be 185 | statically emulated (can result in large output) 186 | 187 | -------------------------------------------------------------------------------- /src/weft.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #ifndef __WEFT_H__ 18 | #define __WEFT_H__ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #define PTHREAD_SAFE_CALL(cmd) \ 28 | { \ 29 | int ret = (cmd); \ 30 | if (ret != 0) { \ 31 | fprintf(stderr,"PTHREAD error: %s = %d (%s)\n", #cmd, ret, strerror(ret)); \ 32 | assert(false); \ 33 | } \ 34 | } 35 | 36 | #define WARP_SIZE 32 37 | 38 | enum { 39 | WEFT_SUCCESS, 40 | WEFT_ERROR_NO_FILE_NAME, 41 | WEFT_ERROR_FILE_OPEN, 42 | WEFT_ERROR_NO_KERNELS, 43 | WEFT_ERROR_NO_THREAD_COUNT, 44 | WEFT_ERROR_ARRIVAL_MISMATCH, 45 | WEFT_ERROR_TOO_MANY_PARTICIPANTS, 46 | WEFT_ERROR_ALL_ARRIVALS, 47 | WEFT_ERROR_DEADLOCK, 48 | WEFT_ERROR_GRAPH_VALIDATION, 49 | WEFT_ERROR_INVALID_PTX_VERSION, 50 | }; 51 | 52 | class Weft; 53 | class Thread; 54 | class Program; 55 | class Address; 56 | class SharedMemory; 57 | class BarrierInstance; 58 | class BarrierDependenceGraph; 59 | 60 | class WeftTask { 61 | public: 62 | virtual ~WeftTask(void) { } 63 | virtual void execute(void) = 0; 64 | }; 65 | 66 | class EmulateThread : public WeftTask { 67 | public: 68 | EmulateThread(Thread *thread); 69 | EmulateThread(const EmulateThread &rhs) : thread(NULL) { assert(false); } 70 | virtual ~EmulateThread(void) { } 71 | public: 72 | EmulateThread& operator=(const EmulateThread &rhs) { assert(false); return *this; } 73 | public: 74 | virtual void execute(void); 75 | public: 76 | Thread *const thread; 77 | }; 78 | 79 | class EmulateWarp : public WeftTask { 80 | public: 81 | EmulateWarp(Program *p, 
Thread **start); 82 | EmulateWarp(const EmulateWarp &rhs) : program(NULL), threads(NULL) { assert(false); } 83 | virtual ~EmulateWarp(void) { } 84 | public: 85 | EmulateWarp& operator=(const EmulateWarp &rhs) { assert(false); return *this; } 86 | public: 87 | virtual void execute(void); 88 | public: 89 | Program *const program; 90 | Thread **const threads; 91 | }; 92 | 93 | class ValidationTask : public WeftTask { 94 | public: 95 | ValidationTask(BarrierDependenceGraph *graph, int name, int generation); 96 | ValidationTask(const ValidationTask &rhs) : graph(NULL), 97 | name(0), generation(0) { assert(false); } 98 | virtual ~ValidationTask(void) { } 99 | public: 100 | ValidationTask& operator=(const ValidationTask &rhs) { assert(false); return *this; } 101 | public: 102 | virtual void execute(void); 103 | public: 104 | BarrierDependenceGraph *const graph; 105 | const int name; 106 | const int generation; 107 | }; 108 | 109 | class InitializationTask : public WeftTask { 110 | public: 111 | InitializationTask(Thread *thread, int total, int max_num_barriers); 112 | InitializationTask(const InitializationTask &rhs) 113 | : thread(NULL), total_threads(0), max_num_barriers(0) { assert(false); } 114 | virtual ~InitializationTask(void) { } 115 | public: 116 | InitializationTask& operator=(const InitializationTask &rhs) 117 | { assert(false); return *this; } 118 | public: 119 | virtual void execute(void); 120 | public: 121 | Thread *const thread; 122 | const int total_threads; 123 | const int max_num_barriers; 124 | }; 125 | 126 | class ReachabilityTask : public WeftTask { 127 | public: 128 | ReachabilityTask(BarrierInstance *instance, Weft *weft, bool forward); 129 | ReachabilityTask(const ReachabilityTask &rhs) : instance(NULL), 130 | weft(NULL), forward(true) { assert(false); } 131 | virtual ~ReachabilityTask(void) { } 132 | public: 133 | ReachabilityTask& operator=(const ReachabilityTask &rhs) 134 | { assert(false); return *this; } 135 | public: 136 | virtual void 
execute(void); 137 | public: 138 | BarrierInstance *const instance; 139 | Weft *const weft; 140 | const bool forward; 141 | }; 142 | 143 | class TransitiveTask : public WeftTask { 144 | public: 145 | TransitiveTask(BarrierInstance *instance, Weft *weft, bool forward); 146 | TransitiveTask(const TransitiveTask &rhs) : instance(NULL), 147 | weft(NULL), forward(true) { assert(false); } 148 | virtual ~TransitiveTask(void) { } 149 | public: 150 | TransitiveTask& operator=(const TransitiveTask &rhs) 151 | { assert(false); return *this; } 152 | public: 153 | virtual void execute(void); 154 | public: 155 | BarrierInstance *const instance; 156 | Weft *const weft; 157 | const bool forward; 158 | }; 159 | 160 | class UpdateThreadTask : public WeftTask { 161 | public: 162 | UpdateThreadTask(Thread *thread); 163 | UpdateThreadTask(const UpdateThreadTask &rhs) : thread(NULL) { assert(false); } 164 | virtual ~UpdateThreadTask(void) { } 165 | public: 166 | UpdateThreadTask& operator=(const UpdateThreadTask &rhs) 167 | { assert(false); return *this; } 168 | public: 169 | virtual void execute(void); 170 | public: 171 | Thread *const thread; 172 | }; 173 | 174 | class RaceCheckTask : public WeftTask { 175 | public: 176 | RaceCheckTask(Address *address); 177 | RaceCheckTask(const RaceCheckTask &rhs) : address(NULL) { assert(false); } 178 | virtual ~RaceCheckTask(void) { } 179 | public: 180 | RaceCheckTask& operator=(const RaceCheckTask &rhs) 181 | { assert(false); return *this; } 182 | public: 183 | virtual void execute(void); 184 | public: 185 | Address *const address; 186 | }; 187 | 188 | class DumpThreadTask : public WeftTask { 189 | public: 190 | DumpThreadTask(Thread *thread); 191 | DumpThreadTask(const DumpThreadTask &rhs) : thread(NULL) { assert(false); } 192 | virtual ~DumpThreadTask(void) { } 193 | public: 194 | DumpThreadTask& operator=(const DumpThreadTask &rhs) 195 | { assert(false); return *this; } 196 | public: 197 | virtual void execute(void); 198 | public: 199 | Thread 
*const thread; 200 | }; 201 | 202 | class Weft { 203 | public: 204 | Weft(int argc, char **argv); 205 | ~Weft(void); 206 | public: 207 | void verify(void); 208 | void report_error(int error_code, const char *message); 209 | inline bool report_warnings(void) const { return warnings; } 210 | inline bool print_verbose(void) const { return verbose; } 211 | inline bool print_detail(void) const { return detailed; } 212 | inline bool perform_instrumentation(void) const { return instrument; } 213 | inline bool emit_program_files(void) const { return print_files; } 214 | protected: 215 | void parse_inputs(int argc, char **argv); 216 | bool parse_triple(const std::string &input, int *array, 217 | const char *flag, const char *error_str); 218 | void report_usage(int error, const char *error_str); 219 | Program* parse_ptx(void); 220 | public: 221 | bool initialize_program(Program *program) const; 222 | void start_parsing_instrumentation(void); 223 | void stop_parsing_instrumentation(void); 224 | protected: 225 | void report_instrumentation(void); 226 | protected: 227 | void start_threadpool(void); 228 | void stop_threadpool(void); 229 | public: 230 | void initialize_count(unsigned count); 231 | void wait_until_done(void); 232 | public: 233 | void enqueue_task(WeftTask *task); 234 | WeftTask* dequeue_task(void); 235 | void complete_task(WeftTask *task); 236 | public: 237 | static void* worker_loop(void *arg); 238 | static unsigned long long get_current_time_in_micros(void); 239 | static size_t get_memory_usage(void); 240 | protected: 241 | const char *file_name; 242 | int block_dim[3]; // x, y, z 243 | int block_id[3]; // x, y, z 244 | int grid_dim[3]; // x, y, z 245 | int thread_pool_size; 246 | bool verbose; 247 | bool detailed; 248 | bool instrument; 249 | bool warnings; 250 | bool warp_synchronous; 251 | bool print_files; 252 | std::vector programs; 253 | protected: 254 | pthread_t *worker_threads; 255 | bool threadpool_finished; 256 | protected: 257 | pthread_mutex_t 
count_lock; 258 | pthread_cond_t count_cond; 259 | unsigned int pending_count; 260 | protected: 261 | pthread_mutex_t queue_lock; 262 | pthread_cond_t queue_cond; 263 | std::deque queue; 264 | protected: 265 | unsigned long long parsing_time; 266 | size_t parsing_memory; 267 | }; 268 | 269 | #endif // __WEFT_H__ 270 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /src/race.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include "weft.h" 18 | #include "race.h" 19 | #include "graph.h" 20 | #include "program.h" 21 | #include "instruction.h" 22 | 23 | Happens::Happens(int total_threads) 24 | : initialized(false) 25 | { 26 | happens_before.resize(total_threads, -1); 27 | happens_after.resize(total_threads, -1); 28 | } 29 | 30 | void Happens::update_barriers_before(const std::vector &before) 31 | { 32 | assert(latest_before.empty()); 33 | latest_before = before; 34 | } 35 | 36 | void Happens::update_barriers_after(const std::vector &after) 37 | { 38 | assert(earliest_after.empty()); 39 | earliest_after = after; 40 | } 41 | 42 | void Happens::update_happens_relationships(void) 43 | { 44 | for (std::vector::const_iterator it = 45 | latest_before.begin(); it != latest_before.end(); it++) 46 | { 47 | if ((*it) == NULL) 48 | continue; 49 | (*it)->get_instance()->update_latest_before(happens_after); 50 | } 51 | for (std::vector::const_iterator it = 52 | earliest_after.begin(); it != earliest_after.end(); it++) 53 | { 54 | if ((*it) == NULL) 55 | continue; 56 | (*it)->get_instance()->update_earliest_after(happens_before); 57 | } 58 | } 59 | 60 | bool Happens::has_happens(int thread, int line_number) 61 | { 62 | if (happens_before[thread] <= line_number) 63 | return true; 64 | if (happens_after[thread] >= line_number) 65 | return true; 66 | return false; 67 | } 68 | 69 | Address::Address(const int addr, SharedMemory *mem) 70 | : address(addr), memory(mem), total_races(0) 71 | { 72 | PTHREAD_SAFE_CALL( pthread_mutex_init(&address_lock,NULL) ); 73 | } 74 | 75 | Address::~Address(void) 76 | { 77 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&address_lock) ); 78 | } 79 | 80 | void Address::add_access(WeftAccess *access) 81 | { 82 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&address_lock) ); 83 | accesses.push_back(access); 84 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&address_lock) ); 85 | } 86 | 87 | void Address::perform_race_tests(void) 88 | { 89 | if 
(memory->program->assume_warp_synchronous()) 90 | { 91 | for (unsigned idx1 = 0; idx1 < accesses.size(); idx1++) 92 | { 93 | WeftAccess *first = accesses[idx1]; 94 | if (first->is_read()) 95 | { 96 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 97 | { 98 | WeftAccess *second = accesses[idx2]; 99 | // Check for both reads 100 | if (second->is_read()) 101 | continue; 102 | // Check for warp-synchronous 103 | if (first->is_warp_synchronous(second)) 104 | continue; 105 | if (!first->has_happens_relationship(second)) 106 | record_race(first, second); 107 | } 108 | } 109 | else 110 | { 111 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 112 | { 113 | WeftAccess *second = accesses[idx2]; 114 | // Check for warp-synchronous 115 | if (first->is_warp_synchronous(second)) 116 | continue; 117 | if (!first->has_happens_relationship(second)) 118 | record_race(first, second); 119 | } 120 | } 121 | } 122 | } 123 | else 124 | { 125 | // For every pair of addresses, check to see if we can 126 | // establish a happens before or a happens after relationship 127 | for (unsigned idx1 = 0; idx1 < accesses.size(); idx1++) 128 | { 129 | WeftAccess *first = accesses[idx1]; 130 | if (first->is_read()) 131 | { 132 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 133 | { 134 | WeftAccess *second = accesses[idx2]; 135 | // Check for both reads 136 | if (second->is_read()) 137 | continue; 138 | if (!first->has_happens_relationship(second)) 139 | record_race(first, second); 140 | } 141 | } 142 | else 143 | { 144 | for (unsigned idx2 = idx1+1; idx2 < accesses.size(); idx2++) 145 | { 146 | WeftAccess *second = accesses[idx2]; 147 | if (!first->has_happens_relationship(second)) 148 | record_race(first, second); 149 | } 150 | } 151 | } 152 | } 153 | } 154 | 155 | void Address::record_race(WeftAccess *one, WeftAccess *two) 156 | { 157 | // Alternative race reporting 158 | //printf("Race between threads %d and %d on instructions " 159 | // "%d and %d (PTX %d 
and %d)\n", 160 | // one->thread->thread_id, two->thread->thread_id, 161 | // one->thread_line_number, two->thread_line_number, 162 | // one->instruction->line_number, two->instruction->line_number); 163 | total_races++; 164 | // Save the races based on the PTX instructions 165 | int ptx_one = one->instruction->line_number; 166 | int ptx_two = two->instruction->line_number; 167 | if (ptx_one <= ptx_two) 168 | { 169 | std::pair 170 | key(one->instruction, two->instruction); 171 | if (one->thread->thread_id <= two->thread->thread_id) 172 | ptx_races[key].insert( 173 | std::pair(one->thread, two->thread)); 174 | else 175 | ptx_races[key].insert( 176 | std::pair(two->thread, one->thread)); 177 | } 178 | else 179 | { 180 | std::pair 181 | key(two->instruction, one->instruction); 182 | if (one->thread->thread_id <= two->thread->thread_id) 183 | ptx_races[key].insert( 184 | std::pair(one->thread, two->thread)); 185 | else 186 | ptx_races[key].insert( 187 | std::pair(two->thread, one->thread)); 188 | } 189 | } 190 | 191 | int Address::report_races(std::map< 192 | std::pair,size_t> &all_races) 193 | { 194 | if (total_races > 0) 195 | { 196 | if (memory->weft->print_detail()) 197 | { 198 | fprintf(stderr,"WEFT INFO: Found %d races on address %d!\n", 199 | total_races, address); 200 | for (std::map,std::set< 201 | std::pair > >::const_iterator it = 202 | ptx_races.begin(); it != ptx_races.end(); it++) 203 | { 204 | PTXInstruction *one = it->first.first; 205 | PTXInstruction *two = it->first.second; 206 | if (one->source_file != NULL) 207 | { 208 | assert(two->source_file != NULL); 209 | if (one == two) 210 | fprintf(stderr,"\tThere are %ld races between different threads " 211 | "on line %d of %s with address %d\n", it->second.size(), 212 | one->source_line_number, one->source_file, address); 213 | else 214 | fprintf(stderr,"\tThere are %ld races between line %d of %s " 215 | " and line %d of %s with address %d\n", it->second.size(), 216 | one->source_line_number, 
one->source_file, 217 | two->source_line_number, two->source_file, address); 218 | } 219 | else 220 | { 221 | assert(two->source_file == NULL); 222 | if (one == two) 223 | fprintf(stderr,"\tThere are %ld races between different threads " 224 | "on PTX line %d with address %d\n", it->second.size(), 225 | one->line_number, address); 226 | else 227 | fprintf(stderr,"\tThere are %ld races between PTX line %d " 228 | " and PTX line %d with address %d\n", it->second.size(), 229 | one->line_number, two->line_number, address); 230 | } 231 | const std::set > &threads = it->second; 232 | for (std::set >::const_iterator 233 | thread_it = threads.begin(); 234 | thread_it != threads.end(); thread_it++) 235 | { 236 | Thread *first = thread_it->first; 237 | Thread *second = thread_it->second; 238 | fprintf(stderr,"\t\t... between thread (%d,%d,%d) and (%d,%d,%d)\n", 239 | first->tid_x, first->tid_y, first->tid_z, 240 | second->tid_x, second->tid_y, second->tid_z); 241 | } 242 | } 243 | } 244 | else 245 | { 246 | for (std::map, 247 | std::set > >::const_iterator 248 | it = ptx_races.begin(); it != ptx_races.end(); it++) 249 | { 250 | std::map,size_t>::iterator 251 | finder = all_races.find(it->first); 252 | if (finder == all_races.end()) 253 | all_races[it->first] = it->second.size(); 254 | else 255 | finder->second += it->second.size(); 256 | } 257 | } 258 | } 259 | return total_races; 260 | } 261 | 262 | size_t Address::count_race_tests(void) 263 | { 264 | size_t num_accesses = accesses.size(); 265 | // OLA's equality 266 | // 1 + 2 + 3 + ... 
+ n-1 = (n-1)*n/2 267 | return ((num_accesses * (num_accesses-1))/2); 268 | } 269 | 270 | SharedMemory::SharedMemory(Weft *w, Program *p) 271 | : weft(w), program(p) 272 | { 273 | PTHREAD_SAFE_CALL( pthread_mutex_init(&memory_lock,NULL) ); 274 | } 275 | 276 | SharedMemory::~SharedMemory(void) 277 | { 278 | for (std::map::iterator it = addresses.begin(); 279 | it != addresses.end(); it++) 280 | { 281 | delete it->second; 282 | } 283 | addresses.clear(); 284 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&memory_lock) ); 285 | } 286 | 287 | void SharedMemory::update_accesses(WeftAccess *access) 288 | { 289 | Address *address; 290 | // These lookups need to be thread safe 291 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&memory_lock) ); 292 | std::map::const_iterator finder = 293 | addresses.find(access->address); 294 | if (finder == addresses.end()) 295 | { 296 | address = new Address(access->address, this); 297 | addresses[access->address] = address; 298 | } 299 | else 300 | address = finder->second; 301 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&memory_lock) ); 302 | address->add_access(access); 303 | } 304 | 305 | int SharedMemory::count_addresses(void) const 306 | { 307 | return addresses.size(); 308 | } 309 | 310 | void SharedMemory::enqueue_race_checks(void) 311 | { 312 | for (std::map::const_iterator it = addresses.begin(); 313 | it != addresses.end(); it++) 314 | { 315 | weft->enqueue_task(new RaceCheckTask(it->second)); 316 | } 317 | } 318 | 319 | void SharedMemory::check_for_races(void) 320 | { 321 | int total_races = 0; 322 | std::map,size_t> all_races; 323 | for (std::map::const_iterator it = 324 | addresses.begin(); it != addresses.end(); it++) 325 | { 326 | total_races += it->second->report_races(all_races); 327 | } 328 | if (total_races > 0) 329 | { 330 | if (!weft->print_detail()) 331 | { 332 | for (std::map,size_t>::const_iterator 333 | it = all_races.begin(); it != all_races.end(); it++) 334 | { 335 | PTXInstruction *one = it->first.first; 336 | 
PTXInstruction *two = it->first.second; 337 | if (one->source_file != NULL) 338 | { 339 | assert(two->source_file != NULL); 340 | if (one == two) 341 | fprintf(stderr,"\tFound races between %ld pairs of " 342 | "threads on line %d of %s\n", it->second, 343 | one->source_line_number, one->source_file); 344 | else 345 | fprintf(stderr,"\tFound races between %ld pairs of threads " 346 | "on line %d of %s and line %d of %s\n", it->second, 347 | one->source_line_number, one->source_file, 348 | two->source_line_number, two->source_file); 349 | } 350 | else 351 | { 352 | assert(two->source_file == NULL); 353 | if (one == two) 354 | fprintf(stderr,"\tFound races between %ld pairs of " 355 | "threads on PTX line number %d\n", 356 | it->second, one->line_number); 357 | else 358 | fprintf(stderr,"\tFound races between %ld pairs of threads on " 359 | "PTX line %d and PTX line %d\n", it->second, 360 | one->line_number, two->line_number); 361 | } 362 | } 363 | fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n" 364 | " Run with '-d' flag to see detailed per-thread " 365 | "and per-address races\n", total_races, program->get_name()); 366 | } 367 | else 368 | fprintf(stderr,"WEFT INFO: Found %d total races in kernel %s!\n", 369 | total_races, program->get_name()); 370 | fprintf(stderr,"WEFT INFO: RACES DETECTED IN KERNEL %s!\n", 371 | program->get_name()); 372 | } 373 | else 374 | fprintf(stdout,"WEFT INFO: No races detected in kernel %s!\n", 375 | program->get_name()); 376 | } 377 | 378 | size_t SharedMemory::count_race_tests(void) 379 | { 380 | size_t result = 0; 381 | for (std::map::const_iterator it = addresses.begin(); 382 | it != addresses.end(); it++) 383 | { 384 | result += it->second->count_race_tests(); 385 | } 386 | return result; 387 | } 388 | 389 | RaceCheckTask::RaceCheckTask(Address *addr) 390 | : address(addr) 391 | { 392 | } 393 | 394 | void RaceCheckTask::execute(void) 395 | { 396 | address->perform_race_tests(); 397 | } 398 | 399 | 
-------------------------------------------------------------------------------- /src/weft.cc: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #include "weft.h" 18 | #include "race.h" 19 | #include "graph.h" 20 | #include "program.h" 21 | #include "instruction.h" 22 | 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | #include 32 | 33 | #ifdef __MACH__ 34 | #include "mach/clock.h" 35 | #include "mach/mach.h" 36 | #endif 37 | 38 | Weft::Weft(int argc, char **argv) 39 | : file_name(NULL), thread_pool_size(1), 40 | verbose(false), detailed(false), instrument(false), 41 | warnings(false), warp_synchronous(false), print_files(false), 42 | worker_threads(NULL), pending_count(0) 43 | { 44 | for (int i = 0; i < 3; i++) 45 | block_dim[i] = 1; 46 | for (int i = 0; i < 3; i++) 47 | block_id[i] = 0; 48 | for (int i = 0; i < 3; i++) 49 | grid_dim[i] = 1; 50 | parse_inputs(argc, argv); 51 | start_threadpool(); 52 | } 53 | 54 | Weft::~Weft(void) 55 | { 56 | stop_threadpool(); 57 | for (std::vector::iterator it = programs.begin(); 58 | it != programs.end(); it++) 59 | { 60 | delete (*it); 61 | } 62 | programs.clear(); 63 | } 64 | 65 | void Weft::verify(void) 66 | { 67 | Program::parse_ptx_file(file_name, this, programs); 68 | for 
(std::vector::const_iterator it = programs.begin(); 69 | it != programs.end(); it++) 70 | { 71 | Program *program = *it; 72 | program->verify(); 73 | } 74 | if (instrument) 75 | report_instrumentation(); 76 | } 77 | 78 | void Weft::report_error(int error_code, const char *message) 79 | { 80 | assert(error_code != WEFT_SUCCESS); 81 | fprintf(stderr,"WEFT ERROR %d: %s!\n", error_code, message); 82 | fprintf(stderr,"WEFT WILL NOW EXIT...\n"); 83 | fflush(stderr); 84 | stop_threadpool(); 85 | exit(error_code); 86 | } 87 | 88 | void Weft::parse_inputs(int argc, char **argv) 89 | { 90 | for (int i = 1; i < argc; i++) 91 | { 92 | if (!strcmp(argv[i],"-b")) 93 | { 94 | std::string block(argv[++i]); 95 | parse_triple(block, block_id, "-b", "CTA ID"); 96 | continue; 97 | } 98 | if (!strcmp(argv[i],"-d")) 99 | { 100 | detailed = true; 101 | continue; 102 | } 103 | if (!strcmp(argv[i],"-f")) 104 | { 105 | file_name = argv[++i]; 106 | continue; 107 | } 108 | if (!strcmp(argv[i],"-g")) 109 | { 110 | std::string grid(argv[++i]); 111 | parse_triple(grid, grid_dim, "-g", "Grid Size"); 112 | continue; 113 | } 114 | if (!strcmp(argv[i],"-i")) 115 | { 116 | instrument = true; 117 | continue; 118 | } 119 | if (!strcmp(argv[i],"-n")) 120 | { 121 | std::string threads(argv[++i]); 122 | parse_triple(threads, block_dim, "-n", "CTA size"); 123 | continue; 124 | } 125 | if (!strcmp(argv[i],"-p")) 126 | { 127 | print_files = true; 128 | continue; 129 | } 130 | if (!strcmp(argv[i],"-s")) 131 | { 132 | warp_synchronous = true; 133 | continue; 134 | } 135 | if (!strcmp(argv[i],"-t")) 136 | { 137 | thread_pool_size = atoi(argv[++i]); 138 | if (thread_pool_size < 1) 139 | thread_pool_size = 1; 140 | continue; 141 | } 142 | if (!strcmp(argv[i],"-v")) 143 | { 144 | verbose = true; 145 | continue; 146 | } 147 | if (!strcmp(argv[i],"-w")) 148 | { 149 | warnings = true; 150 | continue; 151 | } 152 | // If it has a ptx ending then guess it is the file name 153 | std::string file(argv[i]); 154 | if 
(file.find(".ptx") != std::string::npos) 155 | { 156 | file_name = argv[i]; 157 | continue; 158 | } 159 | fprintf(stderr,"WEFT WARNING: skipping argument %s\n", argv[i]); 160 | } 161 | if (file_name == NULL) 162 | report_usage(WEFT_ERROR_NO_FILE_NAME, "No file name specified"); 163 | if (verbose) 164 | { 165 | fprintf(stdout,"INITIAL WEFT SETTINGS:\n"); 166 | fprintf(stdout," File Name: %s\n", file_name); 167 | fprintf(stdout," CTA dimensions: (%d,%d,%d)\n", 168 | block_dim[0], block_dim[1], block_dim[2]); 169 | fprintf(stdout," Block ID: (%d,%d,%d)\n", 170 | block_id[0], block_id[1], block_id[2]); 171 | fprintf(stdout," Grid dimensions: (%d,%d,%d)\n", 172 | grid_dim[0], grid_dim[1], grid_dim[2]); 173 | fprintf(stdout," Thread Pool Size: %d\n", thread_pool_size); 174 | fprintf(stdout," Verbose: %s\n", (verbose ? "yes" : "no")); 175 | fprintf(stdout," Detailed: %s\n", (detailed ? "yes" : "no")); 176 | fprintf(stdout," Instrument: %s\n", (instrument ? "yes" : "no")); 177 | fprintf(stdout," Report Warnings: %s\n", (warnings ? "yes" : "no")); 178 | fprintf(stdout," Warp-Synchronous Execution: %s\n", (warnings ? "yes" : "no")); 179 | fprintf(stdout," Dump Weft thread files: %s\n", (print_files ? 
"yes" : "no")); 180 | } 181 | } 182 | 183 | bool Weft::parse_triple(const std::string &input, int *array, 184 | const char *flag, const char *error_str) 185 | { 186 | bool success = true; 187 | if (input.find("x") != std::string::npos) 188 | { 189 | // Try parsing this block configuration 190 | std::vector values; 191 | split(values, input.c_str(), 'x'); 192 | if (!values.empty() && (values.size() <= 3)) 193 | { 194 | // Try parsing each of the arguments 195 | for (unsigned i = 0; i < values.size(); i++) 196 | { 197 | int count = atoi(values[i].c_str()); 198 | if (count < 1) 199 | { 200 | fprintf(stderr,"WEFT WARNING: Failed to parse dimension %d " 201 | "of %s: \"%s %s\"!\n", 202 | i, error_str, flag, input.c_str()); 203 | success = false; 204 | break; 205 | } 206 | array[i] = count; 207 | } 208 | } 209 | else 210 | { 211 | fprintf(stderr,"WEFT WARNING: Failed to parse %s with %ld" 212 | "dimensions from input: \"%s %s\"!\n", 213 | error_str, values.size(), flag, input.c_str()); 214 | success = false; 215 | } 216 | } 217 | else 218 | { 219 | int count = atoi(input.c_str()); 220 | if (count >= 1) 221 | array[0] = count; 222 | else 223 | { 224 | success = false; 225 | fprintf(stderr,"WEFT WARNING: Ignoring invalid input for %s " 226 | "\"%s %s\"!\n", error_str, flag, input.c_str()); 227 | } 228 | } 229 | return success; 230 | } 231 | 232 | void Weft::report_usage(int error, const char *error_str) 233 | { 234 | fprintf(stderr,"WEFT ERROR %d: %s!\nWEFT WILL NOW EXIT...\n", 235 | error, error_str); 236 | fprintf(stderr,"Usage: Weft [args]\n"); 237 | fprintf(stderr," -b: specify the CTA id to simulate (default 0x0x0)\n"); 238 | fprintf(stderr," can be an integer or an x-separated tuple e.g. 
0x0x1 or 1x2\n"); 239 | fprintf(stderr," -d: print detailed information for error reporting\n"); 240 | fprintf(stderr," this includes line numbers for blocked threads under deadlock and\n"); 241 | fprintf(stderr," and per-thread and per-address information for races\n"); 242 | fprintf(stderr," -f: specify the input file\n"); 243 | fprintf(stderr," -g: specify the grid dimensions for the kernel being simulated\n"); 244 | fprintf(stderr," can be an integer or an x-separated tuple e.g. 32x32x2 or 32x1\n"); 245 | fprintf(stderr," Weft will still only simulate a single CTA specified by '-b'\n"); 246 | fprintf(stderr," -i: instrument execution\n"); 247 | fprintf(stderr," -n: number of threads per CTA\n"); 248 | fprintf(stderr," can be an integer or an x-separated tuple e.g. 64x2 or 32x8x1\n"); 249 | fprintf(stderr," -p: print individual Weft thread files (one file per thread!)\n"); 250 | fprintf(stderr," -s: assume warp-synchronous execution\n"); 251 | fprintf(stderr," -t: thread pool size\n"); 252 | fprintf(stderr," -v: print verbose output\n"); 253 | fprintf(stderr," -w: report emulation warnings (this may generate considerable output)\n"); 254 | exit(error); 255 | } 256 | 257 | bool Weft::initialize_program(Program *program) const 258 | { 259 | program->set_block_dim(block_dim); 260 | program->add_block_id(block_id); 261 | program->set_grid_dim(grid_dim); 262 | return warp_synchronous; 263 | } 264 | 265 | void Weft::start_parsing_instrumentation(void) 266 | { 267 | parsing_time = get_current_time_in_micros(); 268 | } 269 | 270 | void Weft::stop_parsing_instrumentation(void) 271 | { 272 | unsigned long long stop = get_current_time_in_micros(); 273 | unsigned long long start = parsing_time; 274 | parsing_time = stop - start; 275 | parsing_memory = get_memory_usage(); 276 | } 277 | 278 | void Weft::report_instrumentation(void) 279 | { 280 | fprintf(stdout,"WEFT INSTRUMENTATION FOR PARSING FILE %s\n", file_name); 281 | #ifdef __MACH__ 282 | fprintf(stdout," %50s: %10.3lf 
ms %12ld MB\n", 283 | "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / (1024 * 1024)); 284 | #else 285 | fprintf(stdout," %50s: %10.3lf ms %12ld MB\n", 286 | "Parse PTX", double(parsing_time) * 1e-3, parsing_memory / 1024); 287 | #endif 288 | size_t accumulated_memory = parsing_memory; 289 | for (std::vector::const_iterator it = programs.begin(); 290 | it != programs.end(); it++) 291 | { 292 | (*it)->report_instrumentation(accumulated_memory); 293 | } 294 | } 295 | 296 | void Weft::start_threadpool(void) 297 | { 298 | assert(thread_pool_size > 0); 299 | PTHREAD_SAFE_CALL( pthread_mutex_init(&count_lock, NULL) ); 300 | PTHREAD_SAFE_CALL( pthread_cond_init(&count_cond, NULL) ); 301 | PTHREAD_SAFE_CALL( pthread_mutex_init(&queue_lock, NULL) ); 302 | PTHREAD_SAFE_CALL( pthread_cond_init(&queue_cond, NULL) ); 303 | assert(worker_threads == NULL); 304 | worker_threads = (pthread_t*)malloc(thread_pool_size * sizeof(pthread_t)); 305 | threadpool_finished = false; 306 | for (int i = 0; i < thread_pool_size; i++) 307 | { 308 | PTHREAD_SAFE_CALL( pthread_create(worker_threads+i, NULL, 309 | Weft::worker_loop, this) ); 310 | } 311 | } 312 | 313 | void Weft::stop_threadpool(void) 314 | { 315 | // Wake up all the worker threads so that they exit 316 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&queue_lock) ); 317 | threadpool_finished = true; 318 | PTHREAD_SAFE_CALL( pthread_cond_broadcast(&queue_cond) ); 319 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&queue_lock) ); 320 | for (int i = 0; i < thread_pool_size; i++) 321 | { 322 | PTHREAD_SAFE_CALL( pthread_join(worker_threads[i], NULL) ) ; 323 | } 324 | free(worker_threads); 325 | worker_threads = NULL; 326 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&count_lock) ); 327 | PTHREAD_SAFE_CALL( pthread_cond_destroy(&count_cond) ); 328 | PTHREAD_SAFE_CALL( pthread_mutex_destroy(&queue_lock) ); 329 | PTHREAD_SAFE_CALL( pthread_cond_destroy(&queue_cond) ); 330 | } 331 | 332 | void Weft::initialize_count(unsigned count) 333 | { 334 | 
PTHREAD_SAFE_CALL( pthread_mutex_lock(&count_lock) ); 335 | assert(pending_count == 0); 336 | pending_count = count; 337 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&count_lock) ); 338 | } 339 | 340 | void Weft::wait_until_done(void) 341 | { 342 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&count_lock) ); 343 | if (pending_count > 0) 344 | { 345 | PTHREAD_SAFE_CALL( pthread_cond_wait(&count_cond, &count_lock) ); 346 | } 347 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&count_lock) ); 348 | } 349 | 350 | void Weft::enqueue_task(WeftTask *task) 351 | { 352 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&queue_lock) ); 353 | queue.push_back(task); 354 | PTHREAD_SAFE_CALL( pthread_cond_signal(&queue_cond) ); 355 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&queue_lock) ); 356 | } 357 | 358 | WeftTask* Weft::dequeue_task(void) 359 | { 360 | WeftTask *result = NULL; 361 | bool done = false; 362 | while (!done) 363 | { 364 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&queue_lock) ); 365 | if (queue.empty()) 366 | { 367 | if (!threadpool_finished) 368 | { 369 | PTHREAD_SAFE_CALL( pthread_cond_wait(&queue_cond, &queue_lock) ); 370 | } 371 | else 372 | done = true; 373 | } 374 | else 375 | { 376 | result = queue.front(); 377 | queue.pop_front(); 378 | done = true; 379 | } 380 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&queue_lock) ); 381 | } 382 | return result; 383 | } 384 | 385 | void Weft::complete_task(WeftTask *task) 386 | { 387 | PTHREAD_SAFE_CALL( pthread_mutex_lock(&count_lock) ); 388 | assert(pending_count > 0); 389 | pending_count--; 390 | if (pending_count == 0) 391 | PTHREAD_SAFE_CALL( pthread_cond_signal(&count_cond) ); 392 | PTHREAD_SAFE_CALL( pthread_mutex_unlock(&count_lock) ); 393 | // Clean up the task 394 | delete task; 395 | } 396 | 397 | /*static*/ 398 | void* Weft::worker_loop(void *arg) 399 | { 400 | Weft *weft = (Weft*)arg; 401 | while (true) 402 | { 403 | WeftTask *task = weft->dequeue_task(); 404 | // If we ever get a NULL task then we are done 405 | if (task == NULL) 406 | 
break; 407 | task->execute(); 408 | weft->complete_task(task); 409 | } 410 | return NULL; 411 | } 412 | 413 | /*static*/ 414 | unsigned long long Weft::get_current_time_in_micros(void) 415 | { 416 | #ifdef __MACH__ 417 | mach_timespec_t spec; 418 | clock_serv_t cclock; 419 | host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); 420 | clock_get_time(cclock, &spec); 421 | mach_port_deallocate(mach_host_self(), cclock); 422 | #else 423 | struct timespec spec; 424 | clock_gettime(CLOCK_MONOTONIC, &spec); 425 | #endif 426 | unsigned long long result = (((unsigned long long)spec.tv_sec) * 1000000) + 427 | (((unsigned long long)spec.tv_nsec) / 1000); 428 | return result; 429 | } 430 | 431 | /*static*/ 432 | size_t Weft::get_memory_usage(void) 433 | { 434 | struct rusage usage; 435 | getrusage(RUSAGE_SELF, &usage); 436 | return usage.ru_maxrss; 437 | } 438 | 439 | int main(int argc, char **argv) 440 | { 441 | Weft weft(argc, argv); 442 | weft.verify(); 443 | fflush(stderr); 444 | fflush(stdout); 445 | return 0; 446 | } 447 | 448 | -------------------------------------------------------------------------------- /examples/RTM/one_phase_single_buffer.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Stanford University and NVIDIA 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cudaDMAK.h" 23 | 24 | #define LDG_BYTES (12*16) 25 | 26 | #ifndef USE_REDUCTION 27 | #define USE_REDUCTION 0 28 | #endif 29 | 30 | #ifndef X_BEFORE_Y_FOR_PXY 31 | #define X_BEFORE_Y_FOR_PXY 1 32 | #endif 33 | 34 | #define Z_BEFORE_X_FOR_PXZ 1 35 | #define Z_BEFORE_Y_FOR_PYZ 1 36 | 37 | #ifndef R 38 | #define R 4 39 | #endif 40 | 41 | #ifndef USE_CONSTANT_FOR_ZCOEFF 42 | #define USE_CONSTANT_FOR_ZCOEFF 1 43 | #endif 44 | 45 | #ifndef TILE_X 46 | #define TILE_X 32 47 | #endif 48 | #ifndef TILE_Y 49 | #define TILE_Y 4 50 | #endif 51 | 52 | #ifndef BLOCKING_TYPE 53 | #define BLOCKING_TYPE float2 54 | #endif 55 | const int x_b = sizeof(BLOCKING_TYPE)/sizeof(float); 56 | // Need radius in X-dimension to be a multiple of the X-dimension blocking factor 57 | const int RX = (R % x_b ? R + x_b - (R % x_b) : R); 58 | 59 | #define FP_RAND() (float(rand())/float(RAND_MAX) < 0.5 ? -(float((rand()+1)%5)) : (float((rand()+1)%5))) 60 | #define FP_RAND_RAD() (float((rand()%4)) * M_PI_2) 61 | 62 | #ifndef MAX_Z_DIM 63 | #define MAX_Z_DIM 500 64 | #endif 65 | 66 | #ifndef RTM_ELMTS 67 | #define RTM_ELMTS 64 68 | #endif 69 | 70 | __constant__ float c_xx[R+1], c_yy[R+1]; 71 | __constant__ float c_x[R+1], c_y[R+1]; 72 | __constant__ float vsz2_constant; 73 | 74 | #ifdef USE_CONSTANT_FOR_ZCOEFF 75 | __constant__ float c_zz[(2*R+1)*MAX_Z_DIM], c_z[(2*R+1)*MAX_Z_DIM]; 76 | #endif 77 | 78 | template 79 | __device__ __forceinline__ void 80 | shift(T b[ecnt+1]) { 81 | #pragma unroll 82 | for(unsigned idx=0;idx 87 | __device__ inline void 88 | init(float b[ecnt+1], const T ival) { 89 | #pragma unroll 90 | for(unsigned idx = 0; idx < ecnt + 1; idx ++) 91 | b[idx] = ival; 92 | } 93 | 94 | template 95 | struct deriv_t { 96 | T dx2; 97 | T dy2; 98 | T dz2; 99 | T dxy; 100 | T dxz; 101 | T dyz; 102 | }; 103 | 104 | struct range { 105 | int start; 106 | int end; 107 | range(int s, int e) : start(s), end(e) {} 108 
| }; 109 | 110 | enum Param_type_e { 111 | Vpz2, 112 | Delta, 113 | Epsln, 114 | Alpha, 115 | Beta, 116 | ParameterCnt 117 | }; 118 | 119 | #define REGS_PER_THREAD 64 120 | #define REGS_PER_SM 65536 121 | 122 | __device__ __forceinline__ float 123 | cachedRead(const float* data, const int index) 124 | { 125 | const float* address = &data[index]; 126 | float result; 127 | asm("ld.ca.f32 %0, [%1];\n" 128 | : "=f" (result) 129 | #if defined(_WIN64) || defined(__LP64__) 130 | : "l"(address) 131 | #else 132 | : "r"(address) 133 | #endif 134 | : "memory" 135 | ); 136 | return result; 137 | } 138 | 139 | __device__ __forceinline__ void 140 | _myRedAdd(const float* address, const float update) 141 | { 142 | asm("red.global.add.f32 [%0], %1;\n" 143 | : 144 | #if defined(_WIN64) || defined(__LP64__) 145 | : "l"(address) 146 | #else 147 | : "r"(address) 148 | #endif 149 | , "f" (update) 150 | : "memory" 151 | ); 152 | } 153 | 154 | #define DIV_CEILING(x,y) (x/y + (x % y ? 1 : 0)) 155 | #define EPT(x) (DIV_CEILING(2*R+x, x)) 156 | #define WARP_WIDTH 32 157 | #define WPAD 0 158 | #define MAX(a, b) (a < b ? 
b : a) 159 | #define PQW_WIDTH MAX(WARP_WIDTH+WPAD, tile_x+2*R+WPAD) 160 | #define halfWarpCnt (TILE_Y * TILE_X / WARP_WIDTH) 161 | #define haloCnt ((2*R)/TILE_Y + 1) 162 | #ifdef USE_TEX 163 | texture tex_PQ2; 164 | texture tex_PnQn2; 165 | texture tex_PQ; 166 | texture tex_PnQn; 167 | texture tex_abde; 168 | texture tex_vpz2; 169 | texture tex_P; 170 | texture tex_Pn; 171 | texture tex_Q; 172 | texture tex_Qn; 173 | #endif 174 | #ifdef USE_TEX 175 | #define ld_PQ2_ro(_loc) tex1Dfetch(tex_PQ2, _loc) 176 | #define ld_PnQn2_ro(_loc) tex1Dfetch(tex_PnQn2, _loc) 177 | #define ld_PQ_ro(_loc) tex1Dfetch(tex_PQ, _loc) 178 | #define ld_P_ro(_loc) tex1Dfetch(tex_P, _loc) 179 | #define ld_Q_ro(_loc) tex1Dfetch(tex_Q, _loc) 180 | #define ld_PQ_ro_cached(_loc) tex1Dfetch(tex_PQ, _loc) 181 | #define ld_PnQn_ro(_loc) tex1Dfetch(tex_PnQn, _loc) 182 | #define ld_Pn_ro(_loc) tex1Dfetch(tex_Pn, _loc) 183 | #define ld_Qn_ro(_loc) tex1Dfetch(tex_Qn, _loc) 184 | #define ld_param_ro(_loc, _param) tex1Dfetch(tex_##_param, _loc) 185 | #else 186 | #define ld_PQ2_ro(_loc) g_PQ[_loc] 187 | #define ld_PnQn2_ro(_loc) g_PnQn[_loc] 188 | #define ld_PQ_ro(_loc) g_PQ[_loc] 189 | #define ld_P_ro(_loc) g_P[_loc] 190 | #define ld_Q_ro(_loc) g_Q[_loc] 191 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 192 | #define ld_PQ_ro_cached(_loc) cachedRead(g_PQ, _loc) 193 | #define ld_P_ro_cached(_loc) cachedRead(g_P, _loc) 194 | #define ld_Q_ro_cached(_loc) cachedRead(g_Q, _loc) 195 | #else 196 | #define ld_PQ_ro_cached(_loc) ld_PQ_ro(_loc) 197 | #define ld_P_ro_cached(_loc) ld_P_ro(_loc) 198 | #define ld_Q_ro_cached(_loc) ld_Q_ro(_loc) 199 | #endif 200 | #define ld_PnQn_ro(_loc) g_PnQn[_loc] 201 | #define ld_Pn_ro(_loc) g_Pn[_loc] 202 | #define ld_Qn_ro(_loc) g_Qn[_loc] 203 | #define ld_param_ro(_loc, _param) g_##_param[_loc] 204 | #endif 205 | 206 | #define ZPENCIL_LENGTH (2*R+1) 207 | #define ZPENCIL_LAST (ZPENCIL_LENGTH-1) 208 | #define ZPENCIL_FIRST 0 209 | #define ZPENCIL(_n, _t) _t _n[ZPENCIL_LENGTH] 210 | 
#define ZPENCIL_SHIFT(_n) shift(_n) 211 | #define ZPENCIL_INIT(_n) init(_n, 0.0f) 212 | #define ZPENCIL_CTR_PRESHIFT R+1 213 | #define ZPENCIL_CTR_POSTSHIFT R 214 | 215 | #define Q_LENGTH R 216 | #ifdef Q_IN_REGISTERS 217 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH]; init(_n, 0.0f) 218 | #define qidx 219 | #define Q_COMMON(_i) 220 | #define Q_CURR(_q, _i) _q[0] 221 | #define Q_LAST(_q, _i) _q[Q_LENGTH-1] 222 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3) 223 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4, _q5, _q6, _i) shift(_q1);shift(_q2);shift(_q3);shift(_q4);shift(_q5);shift(_q6) 224 | #define ADVANCE_3Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3); 225 | #else 226 | #define Q_COMMON(_i) int _i = 0 227 | #ifdef Q_IN_SMEM 228 | #if 0 229 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][1*tile_y][tile_x] 230 | #define Q_CURR(_q, _i) _q[_i][threadIdx.y][threadIdx.x] 231 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.y][threadIdx.x] 232 | #endif 233 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 234 | #define Q_CURR(_q, _i) _q[_i][threadIdx.x] 235 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.x] 236 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 237 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 238 | #else 239 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH] 240 | #define Q_CURR(_q, _i) _q[_i] 241 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)] 242 | #define ADVANCE_Qs(_q1,_q2,_q3, _i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 243 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 244 | #endif 245 | #endif 246 | 247 | #define SMEM_Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 248 | #define SMEM_Q_CURR(_q, _i) _q[_i][threadIdx.x] 249 | #define SMEM_Q_LAST(_q, _i) _q[(_i == 0 ? 
Q_LENGTH-1 : _i-1)][threadIdx.x] 250 | #define SMEM_ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 251 | #define SMEM_ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 252 | #define SMEM_ADVANCE_3Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 253 | 254 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 255 | #define READ_Z_COEFF(_a, _i) cachedRead(_a, _i) 256 | #else 257 | #define READ_Z_COEFF(_a, _i) _a[_i] 258 | #endif 259 | 260 | #define SMEM_ROW_WIDTH (2*(tile_x+2*R)) 261 | 262 | #if R <= TILE_Y 263 | #define HALO_CNT 1 264 | #define HALO_INIT {0.f} 265 | #define LAST_HIDX 0 266 | #else 267 | #if R <= 2*TILE_Y 268 | #define HALO_CNT 2 269 | #define HALO_INIT {0.f, 0.f} 270 | #define LAST_HIDX 1 271 | #else 272 | #error "Not coded to handle the case when R > 2*TILE_Y" 273 | #endif 274 | #endif 275 | 276 | #if USE_REDUCTION == 1 277 | #define REDUCTION_LD(_mv_ld_statement) 278 | // _mv = memory value, _uv = update value 279 | #define REDUCTION_SUB(_l, _mv, _uv) _myRedAdd(&_l, -(_uv)) 280 | #else 281 | #define REDUCTION_LD(_mv_ld_statement) _mv_ld_statement 282 | #define REDUCTION_SUB(_l, _mv, _uv) _l = (_mv) - (_uv) 283 | #endif 284 | 285 | #define COMPUTE_THREADS_PER_CTA (TILE_X*TILE_Y) 286 | #define COUNT_PER_STRIDE_ROW (TILE_X+2*R) // The extra 32 covers the 2*R halo elements 287 | #define DMA_THREADS_PER_LD 32 288 | //#define DMA_THREADS_PER_LD 64 289 | //#define DMA_THREADS_PER_LD 128 290 | //#define DMA_THREADS_PER_LD 256 291 | #define DMA_THREADS_PER_CTA (1*DMA_THREADS_PER_LD) 292 | #define BYTES_PER_THREAD (sizeof(float2)*COUNT_PER_STRIDE_ROW*(TILE_Y+2*R)/DMA_THREADS_PER_LD) 293 | 294 | #define PQY_BUF_OFFSET (-(R*COUNT_PER_STRIDE_ROW)) 295 | 296 | __global__ void 297 | __launch_bounds__(160,3) 298 | single_pass_ktuned_DMA_single_specialized_one_phase(float2 *g_PnQn, 299 | float2* g_PQ, 300 | #ifndef USE_TEX 301 | float4* g_abde, float* g_vpz2, 302 | #endif 303 | const int row_stride, const int slice_stride, const int 
nz, 304 | const int R_x_row_stride, const int tile_y_x_row_stride, 305 | const int Pn_P_diff, const int lead_pad, const int offset, 306 | const int q_start_idx 307 | #ifndef USE_CONSTANT_FOR_ZCOEFF 308 | , const float* c_z, const float* c_zz 309 | #endif 310 | ) 311 | { 312 | int tid = threadIdx.x; 313 | __shared__ float2 s_PQ[(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+TILE_Y*COUNT_PER_STRIDE_ROW]; 314 | float2 *PQy_buf = s_PQ+(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+PQY_BUF_OFFSET; 315 | cudaDMAStrided 316 | dma_ld_pq(1, 317 | COMPUTE_THREADS_PER_CTA, 318 | COMPUTE_THREADS_PER_CTA, 319 | row_stride*sizeof(float2), 320 | COUNT_PER_STRIDE_ROW*sizeof(float2) 321 | ); 322 | int gid = offset + blockIdx.x*TILE_X + 323 | + blockIdx.y*TILE_Y*row_stride; 324 | 325 | if(tid 18 | #include 19 | #include 20 | #include 21 | 22 | #include "cudaDMAK.h" 23 | 24 | #define LDG_BYTES (12*16) 25 | 26 | #ifndef USE_REDUCTION 27 | #define USE_REDUCTION 0 28 | #endif 29 | 30 | #ifndef X_BEFORE_Y_FOR_PXY 31 | #define X_BEFORE_Y_FOR_PXY 1 32 | #endif 33 | 34 | #define Z_BEFORE_X_FOR_PXZ 1 35 | #define Z_BEFORE_Y_FOR_PYZ 1 36 | 37 | #ifndef R 38 | #define R 4 39 | #endif 40 | 41 | #ifndef USE_CONSTANT_FOR_ZCOEFF 42 | #define USE_CONSTANT_FOR_ZCOEFF 1 43 | #endif 44 | 45 | #ifndef TILE_X 46 | #define TILE_X 32 47 | #endif 48 | #ifndef TILE_Y 49 | #define TILE_Y 4 50 | #endif 51 | 52 | #ifndef BLOCKING_TYPE 53 | #define BLOCKING_TYPE float2 54 | #endif 55 | const int x_b = sizeof(BLOCKING_TYPE)/sizeof(float); 56 | // Need radius in X-dimension to be a multiple of the X-dimension blocking factor 57 | const int RX = (R % x_b ? R + x_b - (R % x_b) : R); 58 | 59 | #define FP_RAND() (float(rand())/float(RAND_MAX) < 0.5 ? 
-(float((rand()+1)%5)) : (float((rand()+1)%5))) 60 | #define FP_RAND_RAD() (float((rand()%4)) * M_PI_2) 61 | 62 | #ifndef MAX_Z_DIM 63 | #define MAX_Z_DIM 500 64 | #endif 65 | 66 | #ifndef RTM_ELMTS 67 | #define RTM_ELMTS 64 68 | #endif 69 | 70 | __constant__ float c_xx[R+1], c_yy[R+1]; 71 | __constant__ float c_x[R+1], c_y[R+1]; 72 | __constant__ float vsz2_constant; 73 | 74 | #ifdef USE_CONSTANT_FOR_ZCOEFF 75 | __constant__ float c_zz[(2*R+1)*MAX_Z_DIM], c_z[(2*R+1)*MAX_Z_DIM]; 76 | #endif 77 | 78 | template 79 | __device__ __forceinline__ void 80 | shift(T b[ecnt+1]) { 81 | #pragma unroll 82 | for(unsigned idx=0;idx 87 | __device__ inline void 88 | init(float b[ecnt+1], const T ival) { 89 | #pragma unroll 90 | for(unsigned idx = 0; idx < ecnt + 1; idx ++) 91 | b[idx] = ival; 92 | } 93 | 94 | template 95 | struct deriv_t { 96 | T dx2; 97 | T dy2; 98 | T dz2; 99 | T dxy; 100 | T dxz; 101 | T dyz; 102 | }; 103 | 104 | struct range { 105 | int start; 106 | int end; 107 | range(int s, int e) : start(s), end(e) {} 108 | }; 109 | 110 | enum Param_type_e { 111 | Vpz2, 112 | Delta, 113 | Epsln, 114 | Alpha, 115 | Beta, 116 | ParameterCnt 117 | }; 118 | 119 | #define REGS_PER_THREAD 64 120 | #define REGS_PER_SM 65536 121 | 122 | __device__ __forceinline__ float 123 | cachedRead(const float* data, const int index) 124 | { 125 | const float* address = &data[index]; 126 | float result; 127 | asm("ld.ca.f32 %0, [%1];\n" 128 | : "=f" (result) 129 | #if defined(_WIN64) || defined(__LP64__) 130 | : "l"(address) 131 | #else 132 | : "r"(address) 133 | #endif 134 | : "memory" 135 | ); 136 | return result; 137 | } 138 | 139 | __device__ __forceinline__ void 140 | _myRedAdd(const float* address, const float update) 141 | { 142 | asm("red.global.add.f32 [%0], %1;\n" 143 | : 144 | #if defined(_WIN64) || defined(__LP64__) 145 | : "l"(address) 146 | #else 147 | : "r"(address) 148 | #endif 149 | , "f" (update) 150 | : "memory" 151 | ); 152 | } 153 | 154 | #define DIV_CEILING(x,y) (x/y + 
(x % y ? 1 : 0)) 155 | #define EPT(x) (DIV_CEILING(2*R+x, x)) 156 | #define WARP_WIDTH 32 157 | #define WPAD 0 158 | #define MAX(a, b) (a < b ? b : a) 159 | #define PQW_WIDTH MAX(WARP_WIDTH+WPAD, tile_x+2*R+WPAD) 160 | #define halfWarpCnt (TILE_Y * TILE_X / WARP_WIDTH) 161 | #define haloCnt ((2*R)/TILE_Y + 1) 162 | #ifdef USE_TEX 163 | texture tex_PQ2; 164 | texture tex_PnQn2; 165 | texture tex_PQ; 166 | texture tex_PnQn; 167 | texture tex_abde; 168 | texture tex_vpz2; 169 | texture tex_P; 170 | texture tex_Pn; 171 | texture tex_Q; 172 | texture tex_Qn; 173 | #endif 174 | #ifdef USE_TEX 175 | #define ld_PQ2_ro(_loc) tex1Dfetch(tex_PQ2, _loc) 176 | #define ld_PnQn2_ro(_loc) tex1Dfetch(tex_PnQn2, _loc) 177 | #define ld_PQ_ro(_loc) tex1Dfetch(tex_PQ, _loc) 178 | #define ld_P_ro(_loc) tex1Dfetch(tex_P, _loc) 179 | #define ld_Q_ro(_loc) tex1Dfetch(tex_Q, _loc) 180 | #define ld_PQ_ro_cached(_loc) tex1Dfetch(tex_PQ, _loc) 181 | #define ld_PnQn_ro(_loc) tex1Dfetch(tex_PnQn, _loc) 182 | #define ld_Pn_ro(_loc) tex1Dfetch(tex_Pn, _loc) 183 | #define ld_Qn_ro(_loc) tex1Dfetch(tex_Qn, _loc) 184 | #define ld_param_ro(_loc, _param) tex1Dfetch(tex_##_param, _loc) 185 | #else 186 | #define ld_PQ2_ro(_loc) g_PQ[_loc] 187 | #define ld_PnQn2_ro(_loc) g_PnQn[_loc] 188 | #define ld_PQ_ro(_loc) g_PQ[_loc] 189 | #define ld_P_ro(_loc) g_P[_loc] 190 | #define ld_Q_ro(_loc) g_Q[_loc] 191 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 192 | #define ld_PQ_ro_cached(_loc) cachedRead(g_PQ, _loc) 193 | #define ld_P_ro_cached(_loc) cachedRead(g_P, _loc) 194 | #define ld_Q_ro_cached(_loc) cachedRead(g_Q, _loc) 195 | #else 196 | #define ld_PQ_ro_cached(_loc) ld_PQ_ro(_loc) 197 | #define ld_P_ro_cached(_loc) ld_P_ro(_loc) 198 | #define ld_Q_ro_cached(_loc) ld_Q_ro(_loc) 199 | #endif 200 | #define ld_PnQn_ro(_loc) g_PnQn[_loc] 201 | #define ld_Pn_ro(_loc) g_Pn[_loc] 202 | #define ld_Qn_ro(_loc) g_Qn[_loc] 203 | #define ld_param_ro(_loc, _param) g_##_param[_loc] 204 | #endif 205 | 206 | #define 
ZPENCIL_LENGTH (2*R+1) 207 | #define ZPENCIL_LAST (ZPENCIL_LENGTH-1) 208 | #define ZPENCIL_FIRST 0 209 | #define ZPENCIL(_n, _t) _t _n[ZPENCIL_LENGTH] 210 | #define ZPENCIL_SHIFT(_n) shift(_n) 211 | #define ZPENCIL_INIT(_n) init(_n, 0.0f) 212 | #define ZPENCIL_CTR_PRESHIFT R+1 213 | #define ZPENCIL_CTR_POSTSHIFT R 214 | 215 | #define Q_LENGTH R 216 | #ifdef Q_IN_REGISTERS 217 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH]; init(_n, 0.0f) 218 | #define qidx 219 | #define Q_COMMON(_i) 220 | #define Q_CURR(_q, _i) _q[0] 221 | #define Q_LAST(_q, _i) _q[Q_LENGTH-1] 222 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3) 223 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4, _q5, _q6, _i) shift(_q1);shift(_q2);shift(_q3);shift(_q4);shift(_q5);shift(_q6) 224 | #define ADVANCE_3Qs(_q1,_q2,_q3,_i) shift(_q1);shift(_q2);shift(_q3); 225 | #else 226 | #define Q_COMMON(_i) int _i = 0 227 | #ifdef Q_IN_SMEM 228 | #if 0 229 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][1*tile_y][tile_x] 230 | #define Q_CURR(_q, _i) _q[_i][threadIdx.y][threadIdx.x] 231 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.y][threadIdx.x] 232 | #endif 233 | #define Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 234 | #define Q_CURR(_q, _i) _q[_i][threadIdx.x] 235 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.x] 236 | #define ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 237 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 238 | #else 239 | #define Q_DEF(_n, _t) _t _n[Q_LENGTH] 240 | #define Q_CURR(_q, _i) _q[_i] 241 | #define Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)] 242 | #define ADVANCE_Qs(_q1,_q2,_q3, _i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 243 | #define ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 
0 : _i+1) 244 | #endif 245 | #endif 246 | 247 | #define SMEM_Q_DEF(_n, _t) volatile __shared__ _t _n[Q_LENGTH][TILE_Y*TILE_X] 248 | #define SMEM_Q_CURR(_q, _i) _q[_i][threadIdx.x] 249 | #define SMEM_Q_LAST(_q, _i) _q[(_i == 0 ? Q_LENGTH-1 : _i-1)][threadIdx.x] 250 | #define SMEM_ADVANCE_Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 251 | #define SMEM_ADVANCE_6Qs(_q1,_q2,_q3,_q4,_q5,_q6,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 252 | #define SMEM_ADVANCE_3Qs(_q1,_q2,_q3,_i) _i = (_i == Q_LENGTH-1 ? 0 : _i+1) 253 | 254 | #if NVCC_SUPPORTS_UNROLL_INLINE_ASM==1 255 | #define READ_Z_COEFF(_a, _i) cachedRead(_a, _i) 256 | #else 257 | #define READ_Z_COEFF(_a, _i) _a[_i] 258 | #endif 259 | 260 | #define SMEM_ROW_WIDTH (2*(tile_x+2*R)) 261 | 262 | #if R <= TILE_Y 263 | #define HALO_CNT 1 264 | #define HALO_INIT {0.f} 265 | #define LAST_HIDX 0 266 | #else 267 | #if R <= 2*TILE_Y 268 | #define HALO_CNT 2 269 | #define HALO_INIT {0.f, 0.f} 270 | #define LAST_HIDX 1 271 | #else 272 | #error "Not coded to handle the case when R > 2*TILE_Y" 273 | #endif 274 | #endif 275 | 276 | #if USE_REDUCTION == 1 277 | #define REDUCTION_LD(_mv_ld_statement) 278 | // _mv = memory value, _uv = update value 279 | #define REDUCTION_SUB(_l, _mv, _uv) _myRedAdd(&_l, -(_uv)) 280 | #else 281 | #define REDUCTION_LD(_mv_ld_statement) _mv_ld_statement 282 | #define REDUCTION_SUB(_l, _mv, _uv) _l = (_mv) - (_uv) 283 | #endif 284 | 285 | #define COMPUTE_THREADS_PER_CTA (TILE_X*TILE_Y) 286 | #define COUNT_PER_STRIDE_ROW (TILE_X+2*R) // The extra 32 covers the 2*R halo elements 287 | #define DMA_THREADS_PER_LD 32 288 | //#define DMA_THREADS_PER_LD 64 289 | //#define DMA_THREADS_PER_LD 128 290 | //#define DMA_THREADS_PER_LD 256 291 | #define DMA_THREADS_PER_CTA (1*DMA_THREADS_PER_LD) 292 | #define BYTES_PER_THREAD (sizeof(float2)*COUNT_PER_STRIDE_ROW*(TILE_Y+2*R)/DMA_THREADS_PER_LD) 293 | 294 | #define PQY_BUF_OFFSET (-(R*COUNT_PER_STRIDE_ROW)) 295 | 296 | __global__ void 297 | __launch_bounds__(160,3) 
298 | single_pass_ktuned_DMA_single_specialized_two_phase(float2 *g_PnQn, 299 | float2* g_PQ, 300 | #ifndef USE_TEX 301 | float4* g_abde, float* g_vpz2, 302 | #endif 303 | const int row_stride, const int slice_stride, const int nz, 304 | const int R_x_row_stride, const int tile_y_x_row_stride, 305 | const int Pn_P_diff, const int lead_pad, const int offset, 306 | const int q_start_idx 307 | #ifndef USE_CONSTANT_FOR_ZCOEFF 308 | , const float* c_z, const float* c_zz 309 | #endif 310 | ) 311 | { 312 | int tid = threadIdx.x; 313 | __shared__ float2 s_PQ[(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+TILE_Y*COUNT_PER_STRIDE_ROW]; 314 | float2 *PQy_buf = s_PQ+(TILE_Y+2*R)*COUNT_PER_STRIDE_ROW+PQY_BUF_OFFSET; 315 | cudaDMAStridedTwoPhase 316 | dma_ld_pq(1, 317 | COMPUTE_THREADS_PER_CTA, 318 | COMPUTE_THREADS_PER_CTA, 319 | row_stride*sizeof(float2), 320 | COUNT_PER_STRIDE_ROW*sizeof(float2) 321 | ); 322 | int gid = offset + blockIdx.x*TILE_X + 323 | + blockIdx.y*TILE_Y*row_stride; 324 | 325 | if(tid 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | // Special registers get special values 30 | // Make these macros so they can be negative 31 | #define WEFT_TID_X_REG (-1) 32 | #define WEFT_TID_Y_REG (-2) 33 | #define WEFT_TID_Z_REG (-3) 34 | #define WEFT_NTID_X_REG (-4) 35 | #define WEFT_NTID_Y_REG (-5) 36 | #define WEFT_NTID_Z_REG (-6) 37 | #define WEFT_LANE_REG (-7) 38 | #define WEFT_WARP_REG (-8) 39 | #define WEFT_NWARP_REG (-9) 40 | #define WEFT_CTA_X_REG (-10) 41 | #define WEFT_CTA_Y_REG (-11) 42 | #define WEFT_CTA_Z_REG (-12) 43 | #define WEFT_NCTA_X_REG (-13) 44 | #define WEFT_NCTA_Y_REG (-14) 45 | #define WEFT_NCTA_Z_REG (-15) 46 | 47 | #define SDDRINC (100000000) 48 | 49 | enum PTXKind { 50 | PTX_SHARED_DECL, 51 | PTX_MOVE, 52 | PTX_RIGHT_SHIFT, 53 | PTX_LEFT_SHIFT, 54 | PTX_AND, 55 | PTX_OR, 56 | PTX_XOR, 57 | PTX_NOT, 58 | PTX_ADD, 59 | PTX_SUB, 60 | PTX_NEGATE, 61 | PTX_CONVERT, 62 | PTX_CONVERT_ADDRESS, 63 | PTX_BFE, 64 | 
  // --- tail of enum PTXKind; the enum's opening lies earlier in the file ---
  PTX_MULTIPLY,
  PTX_MAD,
  PTX_SET_PREDICATE,
  PTX_SELECT_PREDICATE,
  PTX_BARRIER,
  PTX_SHARED_ACCESS,
  PTX_LABEL,
  PTX_BRANCH,
  PTX_UNIFORM_BRANCH,
  PTX_SHFL,
  PTX_EXIT,
  PTX_GLOBAL_DECL,
  PTX_GLOBAL_LOAD,
  PTX_LAST, // this one must be last
};

// Comparison operators recognized when parsing predicate-setting instructions
// (see PTXSetPred below).
enum CompType {
  COMP_GT,
  COMP_GE,
  COMP_EQ,
  COMP_NE,
  COMP_LE,
  COMP_LT,
};

// Some helper methods
// Tokenize 'str' into 'results': tokens are delimited by spaces, tabs, or the
// separator character 'c' (default ','); empty tokens are dropped.
// NOTE(review): template arguments were lost in extraction; reconstructed as
// std::vector<std::string> -- confirm against the original header.
inline void split(std::vector<std::string> &results,
                  const char *str, char c = ',')
{
  do {
    const char *begin = str;
    // Advance to the next delimiter or the terminating NUL.
    while ((*str != ' ') && (*str != '\t') &&
           (*str != c) && (*str)) str++;

    std::string result(begin, str);
    if (!result.empty())
      results.push_back(result);
  } while (0 != *str++);
}

// Forward declarations.
class Thread;
class PTXLabel;
class PTXBranch;
class PTXBarrier;
class WeftBarrier;
class WeftAccess;
class BarrierSync;
class BarrierArrive;
class SharedWrite;
class SharedRead;
class SharedStore;
class BarrierInstance;

// Abstract base class for every parsed PTX instruction.  Instructions are
// chained via set_next(); emulate() executes the instruction for one thread
// and returns the instruction to execute next.
class PTXInstruction {
public:
  PTXInstruction(void);
  PTXInstruction(PTXKind kind, int line_num);
  virtual ~PTXInstruction(void);
public:
  // Execute for a single thread; returns the next instruction to run.
  virtual PTXInstruction* emulate(Thread *thread) = 0;
  // Most instructions do the same thing, but some need
  // to override this behavior so make it virtual
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  // Cheap RTTI: kind predicates, overridden by the matching subclasses.
  virtual bool is_label(void) const { return false; }
  virtual bool is_branch(void) const { return false; }
  virtual bool is_barrier(void) const { return false; }
  virtual bool is_shuffle(void) const { return false; }
public:
  // Checked downcasts paired with the predicates above.
  virtual PTXLabel* as_label(void) { return NULL; }
  virtual PTXBranch* as_branch(void) { return NULL; }
  virtual PTXBarrier* as_barrier(void) { return NULL; }
public:
  inline PTXKind get_kind(void) const { return kind; }
public:
  void set_next(PTXInstruction *next);
  void set_source_location(const char *file, int line);
public:
  // Parse one PTX source line into the appropriate subclass instance.
  static PTXInstruction* interpret(const std::string &line, int line_num);
  static const char* get_kind_name(PTXKind k);
public:
  // Pack/unpack a short identifier string into a 64-bit key.
  static uint64_t compress_identifier(const char *buffer, size_t buffer_size);
  static void decompress_identifier(uint64_t id, char *buffer, size_t buffer_size);
public:
  const PTXKind kind;
  const int line_number;     // line number in the parsed PTX file
protected:
  PTXInstruction *next;      // next instruction in program order
public:
  // Original CUDA source location (from -lineinfo), if available.
  const char *source_file;
  int source_line_number;
};

// A branch target label.  Copy construction/assignment are deliberately
// aborted (assert(false)) -- instructions are not copyable.
class PTXLabel : public PTXInstruction {
public:
  PTXLabel(const std::string &label, int line_num);
  PTXLabel(const PTXLabel &rhs) { assert(false); }
  virtual ~PTXLabel(void) { }
public:
  PTXLabel& operator=(const PTXLabel &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  virtual bool is_label(void) const { return true; }
public:
  virtual PTXLabel* as_label(void) { return this; }
public:
  // Register this label in the name -> label map used to resolve branches.
  // NOTE(review): map template arguments reconstructed -- confirm.
  void update_labels(std::map<std::string,PTXLabel*> &labels);
protected:
  std::string label;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// A (possibly predicated) branch to a named label.
class PTXBranch : public PTXInstruction {
public:
  PTXBranch(const std::string &label, int line_num);
  PTXBranch(int64_t predicate, bool negate, const std::string &label, int line_num);
  PTXBranch(const PTXBranch &rhs) { assert(false); }
  virtual ~PTXBranch(void) { }
public:
  PTXBranch& operator=(const PTXBranch &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  virtual bool is_branch(void) const { return true; }
public:
  virtual PTXBranch* as_branch(void) { return this; }
public:
  // Resolve 'label' to a PTXLabel pointer once all labels are known.
  // NOTE(review): map template arguments reconstructed -- confirm.
  void set_targets(const std::map<std::string,PTXLabel*> &labels);
protected:
  int64_t predicate;   // guard predicate register (if predicated form)
  bool negate;         // branch on !predicate when true
  std::string label;
  PTXLabel *target;    // filled in by set_targets
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Declaration of a named shared-memory variable at a fixed address.
class PTXSharedDecl : public PTXInstruction {
public:
  PTXSharedDecl(const std::string &name, int64_t address, int line_num);
  PTXSharedDecl(const PTXSharedDecl &rhs) { assert(false); }
  virtual ~PTXSharedDecl(void) { }
public:
  PTXSharedDecl& operator=(const PTXSharedDecl &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  std::string name;
  int64_t address;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Register move: register-to-register, immediate-to-register, or from a
// named symbol (string form of the second constructor).
class PTXMove : public PTXInstruction {
public:
  PTXMove(int64_t dst, int64_t src, bool immediate, int line_num);
  PTXMove(int64_t dst, const std::string &src, int line_num);
  PTXMove(const PTXMove &rhs) { assert(false); }
  virtual ~PTXMove(void) { }
public:
  PTXMove& operator=(const PTXMove &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[2];       // [dst, src] (src unused in the named-symbol form)
  std::string source;    // symbol name for the named-symbol form
  bool immediate;
public:
  // (continuation of PTXMove, begun above)
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// PTX right-shift: args = [dst, src, amount]; 'immediate' marks a literal
// shift amount.
class PTXRightShift : public PTXInstruction {
public:
  PTXRightShift(int64_t zero, int64_t one, int64_t two,
                bool immediate, int line_num);
  PTXRightShift(const PTXRightShift &rhs) { assert(false); }
  virtual ~PTXRightShift(void) { }
public:
  PTXRightShift& operator=(const PTXRightShift &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// PTX left-shift: args = [dst, src, amount].
class PTXLeftShift : public PTXInstruction {
public:
  PTXLeftShift(int64_t zero, int64_t one, int64_t two,
               bool immediate, int line_num);
  PTXLeftShift(const PTXLeftShift &rhs) { assert(false); }
  virtual ~PTXLeftShift(void) { }
public:
  PTXLeftShift& operator=(const PTXLeftShift &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise AND; 'predicate' distinguishes the predicate-register form from
// the ordinary register form.
class PTXAnd : public PTXInstruction {
public:
  PTXAnd(int64_t zero, int64_t one, int64_t two,
         bool immediate, bool predicate, int line_num);
  PTXAnd(const PTXAnd &rhs) { assert(false); }
  virtual ~PTXAnd(void) { }
public:
  PTXAnd& operator=(const PTXAnd &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise OR (register or predicate form).
class PTXOr : public PTXInstruction {
public:
  PTXOr(int64_t zero, int64_t one, int64_t two,
        bool immediate, bool predicate, int line_num);
  PTXOr(const PTXOr &rhs) { assert(false); }
  virtual ~PTXOr(void) { }
public:
  PTXOr& operator=(const PTXOr &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise XOR (register or predicate form).
class PTXXor : public PTXInstruction {
public:
  PTXXor(int64_t zero, int64_t one, int64_t two,
         bool immediate, bool predicate, int line_num);
  PTXXor(const PTXXor &rhs) { assert(false); }
  virtual ~PTXXor(void) { }
public:
  PTXXor& operator=(const PTXXor &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bitwise NOT: args = [dst, src] (register or predicate form).
class PTXNot : public PTXInstruction {
public:
  PTXNot(int64_t zero, int64_t one, bool predicate, int line_num);
  PTXNot(const PTXNot &rhs) { assert(false); }
  virtual ~PTXNot(void) { }
public:
  PTXNot& operator=(const PTXNot &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[2];
  bool predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Integer add: args = [dst, lhs, rhs].
class PTXAdd : public PTXInstruction {
public:
  PTXAdd(int64_t zero, int64_t one, int64_t two,
         bool immediate, int line_num);
  PTXAdd(const PTXAdd &rhs) { assert(false); }
  virtual ~PTXAdd(void) { }
public:
  PTXAdd& operator=(const PTXAdd &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Integer subtract: args = [dst, lhs, rhs].
class PTXSub : public PTXInstruction {
public:
  PTXSub(int64_t zero, int64_t one, int64_t two,
         bool immediate, int line_num);
  PTXSub(const PTXSub &rhs) { assert(false); }
  virtual ~PTXSub(void) { }
public:
  PTXSub& operator=(const PTXSub &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Arithmetic negate: args = [dst, src].
class PTXNeg : public PTXInstruction {
public:
  PTXNeg(int64_t zero, int64_t one, bool immediate, int line_num);
  PTXNeg(const PTXNeg &rhs) { assert(false); }
  virtual ~PTXNeg(void) { }
public:
  PTXNeg& operator=(const PTXNeg &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[2];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Integer multiply: args = [dst, lhs, rhs].
class PTXMul : public PTXInstruction {
public:
  PTXMul(int64_t zero, int64_t one, int64_t two,
         bool immediate, int line_num);
  PTXMul(const PTXMul &rhs) { assert(false); }
  virtual ~PTXMul(void) { }
public:
  PTXMul& operator=(const PTXMul &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[3];
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Multiply-add: four operands, each independently flagged as immediate.
class PTXMad : public PTXInstruction {
public:
  PTXMad(int64_t args[4], bool immediates[4], int line_num);
  PTXMad(const PTXMad &rhs) { assert(false); }
  virtual ~PTXMad(void) { }
public:
  PTXMad& operator=(const PTXMad &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[4];
  bool immediate[4];
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Set a predicate register from a comparison (see CompType).
class PTXSetPred : public PTXInstruction {
public:
  PTXSetPred(int64_t zero, int64_t one, int64_t two, bool immediate,
             CompType comparison, int line_num);
  PTXSetPred(const PTXSetPred &rhs) { assert(false); }
  virtual ~PTXSetPred(void) { }
public:
  virtual PTXInstruction* emulate(Thread *thread);
public:
  PTXSetPred& operator=(const PTXSetPred &rhs) { assert(false); return *this; }
protected:
  int64_t args[3];
  CompType comparison;
  bool immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Select between two operands based on a predicate (selp-style).
class PTXSelectPred : public PTXInstruction {
public:
  PTXSelectPred(int64_t zero, int64_t one, int64_t two, int64_t three,
                bool negate, bool two_imm, bool three_imm, int line_num);
  PTXSelectPred(const PTXSelectPred &rhs) { assert(false); }
  virtual ~PTXSelectPred(void) { }
public:
  PTXSelectPred& operator=(const PTXSelectPred &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  bool negate;
  int64_t predicate;
  int64_t args[3];
  bool immediate[2];   // immediate flags for the two selectable operands
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Named barrier (bar.sync / bar.arrive): 'sync' selects blocking semantics;
// name/count may each be immediates or registers.
class PTXBarrier : public PTXInstruction {
public:
  PTXBarrier(int64_t name, int64_t count, bool sync,
             bool name_imm, bool count_imm, int line_num);
  PTXBarrier(const PTXBarrier &rhs) { assert(false); }
  virtual ~PTXBarrier(void) { }
public:
  PTXBarrier& operator=(const PTXBarrier &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
public:
  virtual bool is_barrier(void) const { return true; }
  virtual PTXBarrier* as_barrier(void) { return this; }
  // Fill in the expected arrival count once it is known.
  void update_count(unsigned arrival_count);
  // NOTE(review): narrows int64_t 'name' to int -- assumed to always fit.
  int get_barrier_name(void) const { return name; }
protected:
  int64_t name, count;
  bool sync;                             // true for bar.sync, false for bar.arrive
  bool name_immediate, count_immediate;  // literal vs. register operands
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// A shared-memory load or store ('write' selects direction), addressed either
// numerically or by declared symbol name plus offset.
class PTXSharedAccess : public PTXInstruction {
public:
  PTXSharedAccess(int64_t addr, int64_t offset, bool write,
                  bool has_arg, int64_t arg, bool immediate, int line_num);
  PTXSharedAccess(const std::string &name, int64_t offset, bool write,
                  bool has_arg, int64_t arg, bool immediate, int line_num);
  PTXSharedAccess(const PTXSharedAccess &rhs) { assert(false); }
  virtual ~PTXSharedAccess(void) { }
public:
  PTXSharedAccess& operator=(const PTXSharedAccess &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
protected:
  bool has_name;               // true when constructed with a symbol name
  std::string name;
  int64_t addr, offset, arg;
  bool write, has_arg, immediate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Type conversion between registers (cvt): src -> dst.
class PTXConvert : public PTXInstruction {
public:
  PTXConvert(int64_t zero, int64_t one, int line_num);
  PTXConvert(const PTXConvert &rhs) { assert(false); }
  virtual ~PTXConvert(void) { }
public:
  PTXConvert& operator=(const PTXConvert &rhs) { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t src, dst;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Generic-address conversion (cvta); source may be a register or a named
// symbol (second constructor).
class PTXConvertAddress : public PTXInstruction {
public:
  PTXConvertAddress(int64_t zero, int64_t one, int line_num);
  PTXConvertAddress(int64_t zero, const std::string &name, int line_num);
  PTXConvertAddress(const PTXConvertAddress &rhs) { assert(false); }
  virtual ~PTXConvertAddress(void) { }
public:
  PTXConvertAddress& operator=(const PTXConvertAddress &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  bool has_name;
  int64_t src, dst;
  std::string name;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Bit-field extract (bfe): four operands, each independently immediate.
class PTXBitFieldExtract : public PTXInstruction {
public:
  PTXBitFieldExtract(int64_t args[4], bool immediates[4], int line_num);
  PTXBitFieldExtract(const PTXBitFieldExtract &rhs) { assert(false); }
  virtual ~PTXBitFieldExtract(void) { }
public:
  PTXBitFieldExtract& operator=(const PTXBitFieldExtract &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t args[4];
  bool immediate[4];
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Warp shuffle (shfl) in its four variants.
class PTXShuffle : public PTXInstruction {
public:
  enum ShuffleKind {
    SHUFFLE_UP,
    SHUFFLE_DOWN,
    SHUFFLE_BUTTERFLY,
    SHUFFLE_IDX,
  };
public:
  PTXShuffle(ShuffleKind kind, int64_t args[4], bool immediates[4], int line_num);
  PTXShuffle(const PTXShuffle &rhs) { assert(false); }
  virtual ~PTXShuffle(void) { }
public:
  PTXShuffle& operator=(const PTXShuffle &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
  virtual bool is_shuffle(void) const { return true; }
protected:
  // NOTE(review): this 'kind' shadows PTXInstruction::kind (a PTXKind);
  // intentional here but easy to misread.
  ShuffleKind kind;
  int64_t args[4];
  bool immediate[4];
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Thread exit, optionally guarded by a (possibly negated) predicate.
class PTXExit : public PTXInstruction {
public:
  PTXExit(int line_num);
  PTXExit(int64_t predicate, bool negate, int line_num);
  PTXExit(const PTXExit &rhs) { assert(false); }
  virtual ~PTXExit(void) { }
public:
  PTXExit& operator=(const PTXExit &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
  // Override for warp-synchronous execution!
  virtual PTXInstruction* emulate_warp(Thread **threads,
                                       ThreadState *thread_state,
                                       int &shared_access_id,
                                       SharedStore &store);
protected:
  bool has_predicate;
  bool negate;
  int64_t predicate;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Global constant declaration.  Takes ownership of 'name' and 'values'
// (both freed in the destructor, so they must be heap-allocated).
class PTXGlobalDecl : public PTXInstruction {
public:
  PTXGlobalDecl(char *name, int *values, size_t size, int line_num);
  PTXGlobalDecl(const PTXGlobalDecl &rhs) { assert(false); }
  virtual ~PTXGlobalDecl(void) { free(name); free(values); }
public:
  PTXGlobalDecl& operator=(const PTXGlobalDecl &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  char *name;
  int *values;
  size_t size;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Load from global memory: addr -> dst.
class PTXGlobalLoad : public PTXInstruction {
public:
  PTXGlobalLoad(int64_t dst, int64_t addr, int line_num);
  PTXGlobalLoad(const PTXGlobalLoad &rhs) { assert(false); }
  virtual ~PTXGlobalLoad(void) { };
public:
  PTXGlobalLoad& operator=(const PTXGlobalLoad &rhs)
    { assert(false); return *this; }
public:
  virtual PTXInstruction* emulate(Thread *thread);
protected:
  int64_t dst, addr;
public:
  static bool interpret(const std::string &line, int line_num,
                        PTXInstruction *&result);
};

// Base class for the dynamic (per-thread, per-execution) events Weft records:
// each WeftInstruction ties a static PTXInstruction to the thread that
// executed it.
class WeftInstruction {
public:
  WeftInstruction(PTXInstruction *instruction, Thread *thread);
  WeftInstruction(const WeftInstruction &rhs) : instruction(NULL),
    thread(NULL), thread_line_number(-1) { assert(false); }
  virtual ~WeftInstruction(void) { }
public:
  WeftInstruction& operator=(const WeftInstruction &rhs)
    { assert(false); return *this; }
public:
  // Cheap RTTI over the Weft event hierarchy, mirrored by checked downcasts.
  virtual bool is_barrier(void) const { return false; }
  virtual WeftBarrier* as_barrier(void) { return NULL; }
public:
  virtual bool is_access(void) const { return false; }
  virtual WeftAccess* as_access(void) { return NULL; }
public:
  virtual bool is_sync(void) const { return false; }
  virtual BarrierSync* as_sync(void) { return NULL; }
public:
  virtual bool is_arrive(void) const { return false; }
  virtual BarrierArrive* as_arrive(void) { return NULL; }
public:
  virtual bool is_write(void) const { return false; }
  virtual SharedWrite* as_write(void) { return NULL; }
public:
  virtual bool is_read(void) const { return false; }
  virtual SharedRead* as_read(void) { return NULL; }
public:
  // Attach the happens-before/after relationship computed for this event.
  void initialize_happens(Happens *happens);
  inline Happens* get_happens(void) const { return happens_relationship; }
public:
  virtual void print_instruction(FILE *target) = 0;
public:
  PTXInstruction *const instruction;   // static instruction this event ran
  Thread *const thread;                // thread that executed it
  const int thread_line_number;
protected:
  Happens *happens_relationship;
};

// A barrier event (sync or arrive) observed on a named barrier.
class WeftBarrier : public WeftInstruction {
public:
  WeftBarrier(int name, int count, PTXBarrier *bar, Thread *thread);
  WeftBarrier(const WeftBarrier &rhs) : WeftInstruction(NULL, NULL),
    name(0), count(0), barrier(NULL) { assert(false); }
  virtual ~WeftBarrier(void) { }
public:
  WeftBarrier& operator=(const WeftBarrier &rhs) { assert(false); return *this; }
public:
  virtual bool is_barrier(void) const { return true; }
  virtual WeftBarrier* as_barrier(void) { return this; }
public:
  // Bind this event to the specific dynamic instance of its barrier.
  void set_instance(BarrierInstance *instance);
  inline BarrierInstance* get_instance(void) const { return instance; }
public:
  virtual void print_instruction(FILE *target) = 0;
public:
  const int name;
  const int count;
  PTXBarrier *const barrier;    // static barrier instruction
protected:
  BarrierInstance *instance;
};

// Blocking barrier participation (bar.sync).
class BarrierSync : public WeftBarrier {
public:
  BarrierSync(int name, int count, PTXBarrier *bar, Thread *thread);
  BarrierSync(const BarrierSync &rhs) : WeftBarrier(0, 0, NULL, NULL)
    { assert(false); }
  virtual ~BarrierSync(void) { }
public:
  BarrierSync& operator=(const BarrierSync &rhs) { assert(false); return *this; }
public:
  virtual bool is_sync(void) const { return true; }
  virtual BarrierSync* as_sync(void) { return this; }
  virtual void print_instruction(FILE *target);
};

// Non-blocking barrier arrival (bar.arrive).
class BarrierArrive : public WeftBarrier {
public:
  BarrierArrive(int name, int count, PTXBarrier *bar, Thread *thread);
  BarrierArrive(const BarrierArrive &rhs) : WeftBarrier(0, 0, NULL, NULL)
    { assert(false); }
  virtual ~BarrierArrive(void) { }
public:
  BarrierArrive& operator=(const BarrierArrive &rhs) { assert(false); return *this; }
public:
  virtual bool is_arrive(void) const { return true; }
  virtual BarrierArrive* as_arrive(void) { return this; }
  virtual void print_instruction(FILE *target);
};

// A shared-memory access event at a concrete address.
class WeftAccess : public WeftInstruction {
public:
  WeftAccess(int address, PTXSharedAccess *access, Thread *thread, int access_id);
  WeftAccess(const WeftAccess &rhs) : WeftInstruction(NULL, NULL),
    address(0), access(NULL), access_id(-1) { assert(false); }
  virtual ~WeftAccess(void) { }
public:
  WeftAccess& operator=(const WeftAccess &rhs) { assert(false); return *this; }
public:
  virtual bool is_access(void) const { return true; }
  virtual WeftAccess* as_access(void) { return this; }
public:
  // Race queries against another access to the same location.
  bool has_happens_relationship(WeftAccess *other);
  bool is_warp_synchronous(WeftAccess *other);
public:
  virtual void print_instruction(FILE *target) = 0;
public:
  const int address;
  PTXSharedAccess *const access;   // static access instruction
  const int access_id; // for warp-synchronous execution
};

// A shared-memory write event.
class SharedWrite : public WeftAccess {
public:
  SharedWrite(int address, PTXSharedAccess *access,
              Thread *thread, int access_id = -1);
  SharedWrite(const SharedWrite &rhs) : WeftAccess(0, NULL, NULL, -1)
    { assert(false); }
  virtual ~SharedWrite(void) { }
public:
  SharedWrite& operator=(const SharedWrite &rhs) { assert(false); return *this; }
public:
  virtual bool is_write(void) const { return true; }
  virtual SharedWrite* as_write(void) { return this; }
  virtual void print_instruction(FILE *target);
};

// A shared-memory read event.
class SharedRead : public WeftAccess {
public:
  SharedRead(int address, PTXSharedAccess *access,
             Thread *thread, int access_id = -1);
  SharedRead(const SharedRead &rhs) : WeftAccess(0, NULL, NULL, -1)
    { assert(false); }
  virtual ~SharedRead(void) { }
public:
  SharedRead& operator=(const SharedRead &rhs) { assert(false); return *this; }
public:
  virtual bool is_read(void) const { return true; }
  virtual SharedRead* as_read(void) { return this; }
  virtual void print_instruction(FILE *target);
};

#endif // __INSTRUCTION_H__