├── .gitignore
├── Changes
├── LICENSE
├── MANIFEST
├── Makefile.PL
├── README.md
├── bin
    └── maxas.pl
├── cpanfile
├── lib
    └── MaxAs
    │   ├── Cubin.pm
    │   ├── MaxAs.pm
    │   └── MaxAsGrammar.pm
├── microbench
    ├── microbench.cpp
    ├── microbench.cu
    ├── microbench.sass
    ├── shared.pl
    ├── shared_lds.sass
    ├── shared_sts16.sass
    ├── throughput.pl
    ├── throughput.sass
    ├── throughput2.pl
    ├── throughput2.sass
    ├── throughput3.pl
    ├── throughput4.pl
    ├── throughput5.pl
    ├── xmad.pl
    └── xmad2.sass
├── sgemm
    ├── batched_gemm.xlsx
    ├── cublas_sgemm.ptx
    ├── sgemm.cpp
    ├── sgemm.cu
    ├── sgemm.pl
    ├── sgemm.sln
    ├── sgemm.vcxproj
    ├── sgemm128.sass
    ├── sgemm64.sass
    ├── sgemm_final_128.sass
    ├── sgemm_final_64.sass
    ├── sgemm_pre_128.sass
    └── sgemm_pre_64.sass
└── t
    └── MaxAs-MaxAs.t


/.gitignore:
--------------------------------------------------------------------------------
1 | Makefile
2 | Makefile.old
3 | pm_to_blib
4 | blib
5 | MYMETA.*
6 | 


--------------------------------------------------------------------------------
/Changes:
--------------------------------------------------------------------------------
1 | Revision history for Perl extension MaxAs::MaxAs.
2 | 
3 | 1.01  Thu Mar 26 17:09:57 2015
4 | 	- original Perl packaged version
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Scott Gray
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
 1 | bin/maxas.pl
 2 | Changes
 3 | lib/MaxAs/Cubin.pm
 4 | lib/MaxAs/MaxAs.pm
 5 | lib/MaxAs/MaxAsGrammar.pm
 6 | LICENSE
 7 | Makefile.PL
 8 | MANIFEST
 9 | microbench/microbench.cpp
10 | microbench/microbench.cu
11 | microbench/microbench.sass
12 | microbench/shared.pl
13 | microbench/shared_lds.sass
14 | microbench/shared_sts16.sass
15 | microbench/throughput.pl
16 | microbench/throughput.sass
17 | microbench/throughput2.pl
18 | microbench/throughput2.sass
19 | microbench/throughput3.pl
20 | microbench/throughput4.pl
21 | microbench/throughput5.pl
22 | microbench/xmad.pl
23 | microbench/xmad2.sass
24 | README.md
25 | sgemm/batched_gemm.xlsx
26 | sgemm/cublas_sgemm.ptx
27 | sgemm/sgemm.cpp
28 | sgemm/sgemm.cu
29 | sgemm/sgemm.pl
30 | sgemm/sgemm.sln
31 | sgemm/sgemm.vcxproj
32 | sgemm/sgemm128.sass
33 | sgemm/sgemm64.sass
34 | sgemm/sgemm_final_128.sass
35 | sgemm/sgemm_final_64.sass
36 | sgemm/sgemm_pre_128.sass
37 | sgemm/sgemm_pre_64.sass
38 | t/MaxAs-MaxAs.t
39 | 


--------------------------------------------------------------------------------
/Makefile.PL:
--------------------------------------------------------------------------------
 1 | require 5.10.0;
 2 | use ExtUtils::MakeMaker;
 3 | # Write the Makefile for the MaxAs distribution.  The meaning of every key
 4 | # passed below is described in the ExtUtils::MakeMaker documentation.
 5 | WriteMakefile(
 6 |     'NAME'         => 'MaxAs::MaxAs',
 7 |     'VERSION_FROM' => 'lib/MaxAs/MaxAs.pm',    # $VERSION is read from this module
 8 |     'EXE_FILES'    => [ 'bin/maxas.pl' ],
 9 |     'PREREQ_PM'    => { 'Carp' => 1.29, 'Data::Dumper' => 2.145 },
10 |     'LICENSE'      => 'MIT',
11 |     ($] < 5.005 ? () :    # the two keys below are only passed on perl 5.005 or newer
12 |       ('ABSTRACT_FROM' => 'lib/MaxAs/MaxAs.pm',    # abstract is taken from the module
13 |        'AUTHOR'        => 'Scott Gray <sgray@nervanasys.com>')),
14 | );
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DISCONTINUATION OF PROJECT #
 2 | This project will no longer be maintained by Intel.
 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project.
 4 | Intel no longer accepts patches to this project.
 5 | # MaxAs
 6 | Assembler for NVIDIA Maxwell architecture
 7 | 
 8 | To install (system-wide):
 9 | 
10 |     sudo cpanm git://github.com/NervanaSystems/maxas.git
11 | 
12 | or
13 | 
14 |     perl Makefile.PL
15 |     make
16 |     sudo make install
17 | 
18 | 
19 | See wiki pages for more information:
20 | 
21 | - [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction)
22 | - [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started)
23 | - [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes)
24 | - [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM)
25 | 
26 | Related work with lots of additional shader assembly (sass) examples:
27 | 
28 | - [Nervana Neon](https://github.com/NervanaSystems/neon)
29 | 
30 | This project is released under the [MIT License](http://opensource.org/licenses/MIT).
31 | 
32 | -- Scott Gray
33 | 


--------------------------------------------------------------------------------
/bin/maxas.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | use MaxAs::Cubin;
  4 | use MaxAs::MaxAs;
  5 | use Data::Dumper;
  6 | use File::Spec;
  7 | 
  8 | require 5.10.0;
  9 | 
 10 | $Data::Dumper::Sortkeys = 1;
 11 | 
 12 | my $mode = shift;
 13 | 
 14 | # --list / -l: print a one-line summary of the cubin, then its kernels and symbols
 15 | if ($mode =~ /^\-?\-l/i)
 16 | {
 17 |     my $path = shift or usage();
 18 | 
 19 |     my $elf = MaxAs::Cubin->new($path);
 20 | 
 21 |     # Header line describing the file as a whole
 22 |     printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n",
 23 |         $path, $elf->arch, $elf->class, $elf->address_size;
 24 | 
 25 |     # One line per kernel, sorted by kernel name
 26 |     my $kernelMap = $elf->listKernels;
 27 |     my @fields    = qw(Linkage ParamCnt size RegCnt SharedSize BarCnt);
 28 |     for my $name (sort keys %$kernelMap)
 29 |     {
 30 |         my $info = $kernelMap->{$name};
 31 |         printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n", $name, @{$info}{@fields};
 32 |     }
 33 | 
 34 |     # One line per recorded symbol, sorted by symbol name
 35 |     my $symbolMap = $elf->listSymbols;
 36 |     printf "Symbol: %s\n", $_ for sort keys %$symbolMap;
 37 | }
 38 | # --test / -t: check that the assembler can reproduce the op codes this cubin or sass contains.
 39 | elsif ($mode =~ /^\-?\-t/i)
 40 | {
 41 |     my $reg  = $ARGV[0] =~ /^\-?\-r/i ? shift(@ARGV) : undef;   # optional --reg flag (must come first)
 42 |     my $all  = $ARGV[0] =~ /^\-?\-a/i ? shift(@ARGV) : undef;   # optional --all flag (must come second)
 43 |     my $file = shift or usage();
 44 |     my $fh;
 45 |     # sass file: a text file is read directly
 46 |     if (-T $file)
 47 |     {
 48 |         open $fh, '<', $file or die "$file: $!";
 49 |     }
 50 |     # cubin file: anything else is disassembled with cuobjdump and read through a pipe
 51 |     else
 52 |     {
 53 |         my $cubin = MaxAs::Cubin->new($file);
 54 |         my $arch  = $cubin->arch;
 55 | 
 56 |         open $fh, "cuobjdump -arch sm_$arch -sass $file |" or die "cuobjdump -arch sm_$arch -sass $file: $!";
 57 |         my $first = <$fh>;   # the first output line is consumed here and only inspected for an error
 58 |         if ($first =~ /cuobjdump fatal/)
 59 |         {
 60 |             print $first;
 61 |             exit(1);
 62 |         }
 63 |     }
 64 |     exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 1 : 0);   # exit status 1 when Test returns true, 0 otherwise
 65 | }
 66 | # --extract / -e: dump one kernel of a cubin as an asm file that can be re-assembled later
 67 | elsif ($mode =~ /^\-?\-e/i)
 68 | {
 69 |     my $kernelName;
 70 |     if ($ARGV[0] =~ /^\-?\-k/i)
 71 |     {
 72 |         shift;
 73 |         $kernelName = shift or usage();
 74 |     }
 75 |     my $cubinFile = shift or usage();
 76 |     my $asmFile   = shift;
 77 |     my $cubin     = MaxAs::Cubin->new($cubinFile);
 78 |     my $arch      = $cubin->arch;
 79 |     my $kernels   = $cubin->listKernels;
 80 | 
 81 |     # default to the alphabetically first kernel if none was specified.
 82 |     $kernelName ||= (sort keys %$kernels)[0];
 83 | 
 84 |     my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";
 85 | 
 86 |     open my $in, "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile |" or die "cuobjdump -arch sm_$arch -sass -fun $kernelName $cubinFile: $!";
 87 |     my $first = <$in>;   # the first output line is consumed here and only inspected for an error
 88 |     if ($first =~ /cuobjdump fatal/)
 89 |     {
 90 |         print $first;
 91 |         exit(1);
 92 |     }
 93 |     my $out;
 94 |     if ($asmFile)
 95 |     {
 96 |         open $out, '>', $asmFile or die "$asmFile: $!";
 97 |     }
 98 |     else
 99 |     {
100 |         $out = \*STDOUT;
101 |     }
102 | 
103 |     # Comment header: kernel name, arch, resource counts and the parameter list
104 |     print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";
105 |     print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));
106 | 
107 |     print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";
108 | 
109 |     print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};
110 | 
111 |     print $out "#\n# Instructions:\n\n";
112 | 
113 |     MaxAs::MaxAs::Extract($in, $out, $kernel->{Params});
114 | 
115 |     close $out if $asmFile;   # STDOUT is left open
116 |     close $in;
117 | }
118 | # -s: extract a kernel from an existing sass dump file (no cubin, hence no parameter list)
119 | elsif ($mode =~ /^\-?\-s/i)
120 | {
121 |     my $sassFile  = shift or usage();
122 |     my $asmFile   = shift;
123 | 
124 |     open my $in, '<', $sassFile or die "$sassFile: $!";
125 | 
126 |     my $out;
127 |     if ($asmFile)
128 |     {
129 |         open $out, '>', $asmFile or die "$asmFile: $!";
130 |     }
131 |     else
132 |     {
133 |         $out = \*STDOUT;
134 |     }
135 | 
136 |     MaxAs::MaxAs::Extract($in, $out, []);
137 | 
138 |     close $out if $asmFile;   # STDOUT is left open
139 |     close $in;
140 | }
141 | # --insert / -i: assemble an asm file and write it into a cubin.  Options are parsed in the fixed order -w, -k, -n, -D.
142 | elsif ($mode =~ /^\-?\-i/i)
143 | {
144 |     my $nowarn;
145 |     if ($ARGV[0] =~ /^\-?\-w/i)
146 |     {
147 |         $nowarn = shift;
148 |     }
149 |     my $kernelName;
150 |     if ($ARGV[0] =~ /^\-?\-k/i)
151 |     {
152 |         shift;
153 |         $kernelName = shift or usage();
154 |     }
155 |     my $noReuse   = $ARGV[0] =~ /^\-?\-n/i ? shift(@ARGV) : undef;
156 |     while ($ARGV[0] =~ /^\-?\-D(\w+)/)
157 |     {
158 |         shift;
159 |         my $name  = $1;
160 |         my $value = shift;
161 |         { no strict 'refs'; ${"MaxAs::MaxAs::CODE::$name"} = defined $value ? $value : ''; }   # set $MaxAs::MaxAs::CODE::<name> without evaluating user text as code
162 |     }
163 | 
164 |     my $asmFile   = shift or usage();
165 |     my $cubinFile = shift or usage();
166 |     my $newCubin  = shift || $cubinFile;   # overwrite the input cubin unless a new name is given
167 | 
168 |     my $file;
169 |     if (open my $fh, '<', $asmFile)
170 |     {
171 |         local $/;   # slurp the whole asm file
172 |         $file = <$fh>;
173 |         close $fh;
174 |     }
175 |     else { die "$asmFile: $!" }
176 | 
177 |     my ($vol,$dir) = File::Spec->splitpath($asmFile);
178 |     my $include = [$vol, $dir];
179 | 
180 |     # take the kernel name from the "# Kernel:" header of the file unless -k supplied one
181 |     ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
182 |     die "asm file missing kernel name or is badly formatted" unless $kernelName;
183 | 
184 |     my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn);
185 | 
186 |     my $cubin  = MaxAs::Cubin->new($cubinFile);
187 |     $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";
188 | 
189 |     $cubin->modifyKernel(%$kernel);
190 | 
191 |     $cubin->write($newCubin);
192 | 
193 |     printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
194 |         $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
195 | 
196 | }
197 | # --pre / -p: run only the preprocessor on an asm file.  Options are parsed in the fixed order -D, -d.
198 | elsif ($mode =~ /^\-?\-p/i)
199 | {
200 |     while ($ARGV[0] =~ /^\-?\-D(\w+)/)
201 |     {
202 |         shift;
203 |         my $name  = $1;
204 |         my $value = shift;
205 |         { no strict 'refs'; ${"MaxAs::MaxAs::CODE::$name"} = defined $value ? $value : ''; }   # set $MaxAs::MaxAs::CODE::<name> without evaluating user text as code
206 |     }
207 |     my $debug     = $ARGV[0] =~ /^\-?\-d/i ? shift(@ARGV) : undef;
208 |     my $asmFile   = shift or usage();
209 |     my $asmFile2  = shift;
210 | 
211 |     die "source and destination probably shouldn't be the same file\n" if defined $asmFile2 && $asmFile eq $asmFile2;
212 | 
213 |     open my $fh, '<', $asmFile or die "$asmFile: $!";
214 |     # Slurp the file; $/ is undefined only for the read itself, not while Preprocess runs
215 |     my $file = do { local $/; <$fh> };
216 |     close $fh;
217 | 
218 |     my ($vol,$dir) = File::Spec->splitpath($asmFile);
219 |     my $include = [$vol, $dir];
220 | 
221 |     if ($asmFile2)
222 |     {
223 |         open $fh, '>', $asmFile2 or die "$asmFile2: $!";
224 |     }
225 |     else
226 |     {
227 |         $fh = \*STDOUT;
228 |     }
229 |     print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug);
230 |     close $fh if $asmFile2;   # STDOUT is left open, as in the other modes
231 | }
232 | # --version / -v: print the MaxAs::MaxAs module version
233 | elsif ($mode =~ /^\-?\-v/i)
234 | {
235 |     printf "%s\n", $MaxAs::MaxAs::VERSION;
236 | }
237 | else
238 | {
239 |     printf "%s\n", $mode;   # echo the unrecognized mode before the help text
240 |     usage();
241 | }
242 | 
243 | exit 0;
244 | 
245 | 
246 | # Print the command-line help to STDOUT and terminate the program with exit status 1.
247 | sub usage
248 | {
249 |     print <<EOF;
250 | Usage:
251 | 
252 |   List kernels and symbols:
253 | 
254 |     maxas.pl --list|-l <cubin_file>
255 | 
256 |   Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
257 |   Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
258 |   With the --reg flag it will show register bank conflicts not hidden by reuse flags.
259 | 
260 |     maxas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>
261 | 
262 |   Extract a single kernel into an asm file from a cubin.
263 |   Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.
264 | 
265 |     maxas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]
266 | 
267 |   Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
268 |   Include the debug flag to print out detailed scheduler info.
269 | 
270 |     maxas.pl --pre|-p [-Dname value ...] [--debug|-d] <asm_file> [new_asm_file]
271 | 
272 |   Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
273 |   Optionally you can skip register reuse flag auto insertion.  This allows you to observe
274 |   performance without any reuse or you can use it to set the flags manually in your sass.
275 | 
276 |     maxas.pl --insert|-i [-w] [--kernel|-k kernel_name] [--noreuse|-n] [-Dname value ...] <asm_file> <cubin_file> [new_cubin_file]
277 | 
278 |   Display version information and exit:
279 | 
280 |     maxas.pl --version|-v
281 | 
282 | EOF
283 |     exit(1);
284 | }
285 | 
286 | __END__
287 | 


--------------------------------------------------------------------------------
/cpanfile:
--------------------------------------------------------------------------------
1 | requires 'perl' => '5.10.0';    # minimum perl version
2 | 
3 | requires 'Carp'         => '1.29';
4 | requires 'Data::Dumper' => '2.145';
5 | 


--------------------------------------------------------------------------------
/lib/MaxAs/Cubin.pm:
--------------------------------------------------------------------------------
  1 | package MaxAs::Cubin;
  2 | 
  3 | use strict;
  4 | use Data::Dumper;
  5 | # ELF structure layouts, each a flat list of (pack type code, field name) pairs in the order they are unpacked.
  6 | my @Elf32_Hdr = qw(
  7 |     H8  magic
  8 |     C   fileClass
  9 |     C   encoding
 10 |     C   fileVersion
 11 |     H18 padding
 12 |     S   type
 13 |     S   machine
 14 |     L   version
 15 |     L   entry
 16 |     L   phOffset
 17 |     L   shOffset
 18 |     L   flags
 19 |     S   ehSize
 20 |     S   phEntSize
 21 |     S   phNum
 22 |     S   shEntSize
 23 |     S   shNum
 24 |     S   shStrIndx
 25 | ); # file header, 32-bit class
 26 | my @Elf64_Hdr = qw(
 27 |     H8  magic
 28 |     C   fileClass
 29 |     C   encoding
 30 |     C   fileVersion
 31 |     H18 padding
 32 |     S   type
 33 |     S   machine
 34 |     L   version
 35 |     Q   entry
 36 |     Q   phOffset
 37 |     Q   shOffset
 38 |     L   flags
 39 |     S   ehSize
 40 |     S   phEntSize
 41 |     S   phNum
 42 |     S   shEntSize
 43 |     S   shNum
 44 |     S   shStrIndx
 45 | ); # file header, 64-bit class (entry and the two offsets widen from L to Q)
 46 | my @Elf32_PrgHdr = qw(
 47 |     L   type
 48 |     L   offset
 49 |     L   vaddr
 50 |     L   paddr
 51 |     L   fileSize
 52 |     L   memSize
 53 |     L   flags
 54 |     L   align
 55 | ); # program header, 32-bit class
 56 | my @Elf64_PrgHdr = qw(
 57 |     L   type
 58 |     L   flags
 59 |     Q   offset
 60 |     Q   vaddr
 61 |     Q   paddr
 62 |     Q   fileSize
 63 |     Q   memSize
 64 |     Q   align
 65 | ); # program header, 64-bit class (note: flags comes second here, not seventh)
 66 | my @Elf32_SecHdr = qw(
 67 |     L   name
 68 |     L   type
 69 |     L   flags
 70 |     L   addr
 71 |     L   offset
 72 |     L   size
 73 |     L   link
 74 |     L   info
 75 |     L   align
 76 |     L   entSize
 77 | ); # section header, 32-bit class
 78 | my @Elf64_SecHdr = qw(
 79 |     L   name
 80 |     L   type
 81 |     Q   flags
 82 |     Q   addr
 83 |     Q   offset
 84 |     Q   size
 85 |     L   link
 86 |     L   info
 87 |     Q   align
 88 |     Q   entSize
 89 | ); # section header, 64-bit class
 90 | my @Elf32_SymEnt = qw(
 91 |     L   name
 92 |     L   value
 93 |     L   size
 94 |     C   info
 95 |     C   other
 96 |     S   shIndx
 97 | ); # symbol table entry, 32-bit class
 98 | my @Elf64_SymEnt = qw(
 99 |     L   name
100 |     C   info
101 |     C   other
102 |     S   shIndx
103 |     Q   value
104 |     Q   size
105 | ); # symbol table entry, 64-bit class (note: field order differs from the 32-bit entry)
106 | my @symBind = qw(LOCAL GLOBAL WEAK); # linkage names, indexed by the high nibble of a symbol's info byte
107 | 
108 | # Split the Elf Header defs into unpack template strings (T) and the matching hash key columns (C)
109 | my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC);
110 | 
111 | # Each row lists: template array, column array, 32-bit definition, 64-bit definition
112 | foreach my $layout (
113 |     [ \@elfHdrT, \@elfHdrC, \@Elf32_Hdr,    \@Elf64_Hdr    ],
114 |     [ \@prgHdrT, \@prgHdrC, \@Elf32_PrgHdr, \@Elf64_PrgHdr ],
115 |     [ \@secHdrT, \@secHdrC, \@Elf32_SecHdr, \@Elf64_SecHdr ],
116 |     [ \@symHdrT, \@symHdrC, \@Elf32_SymEnt, \@Elf64_SymEnt ],
117 | )
118 | {
119 |     my ($templates, $columns, @defs) = @$layout;
120 | 
121 |     # Slot 1 holds the 32-bit layout and slot 2 the 64-bit one; slot 0 is left unset
122 |     foreach my $class (1, 2)
123 |     {
124 |         my @def = @{ $defs[$class - 1] };
125 |         # Type codes are at most 3 characters long, every field name is longer
126 |         $templates->[$class] = join '', grep { length($_) <= 3 } @def;
127 |         $columns->[$class]   = [ grep { length($_) > 3 } @def ];
128 |     }
129 | }
130 | 
131 | # Load a cubin ELF file: parse headers, sections and symbols, and index its kernels.  Dies if the file cannot be opened or is older than sm_50.
132 | sub new
133 | {
134 |     my ($package, $file) = @_;
135 | 
136 |     my $cubin = bless { fileName => $file }, $package;
137 |     # 3-arg open: characters in the file name are never interpreted as an open mode or a pipe
138 |     open my $fh, '<', $file or die "$file: $!";
139 |     binmode($fh);
140 | 
141 |     # Read in assuming 32 bit header
142 |     my $data;
143 |     read $fh, $data, 0x34;
144 |     my $elfHdr = $cubin->{elfHdr} = {};
145 |     @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data;
146 | 
147 |     # 1: 32bit, 2: 64bit
148 |     my $class = $elfHdr->{fileClass};
149 | 
150 |     # re-read in with 64 bit header if needed
151 |     if ($class == 2)
152 |     {
153 |         seek $fh, 0, 0;
154 |         read $fh, $data, 0x46;
155 |         @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data;
156 | 
157 |         $cubin->{Class} = 64;
158 |     }
159 |     else
160 |     {
161 |         $cubin->{Class} = 32;
162 |     }
163 | 
164 |     # verify sm_50 cubin: the arch number is the low byte of the header flags
165 |     $cubin->{Arch} = $elfHdr->{flags} & 0xFF;
166 |     die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50;
167 | 
168 |     $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32;
169 | 
170 |     # Read in Program Headers
171 |     seek $fh, $elfHdr->{phOffset}, 0;
172 |     foreach (1 .. $elfHdr->{phNum})
173 |     {
174 |         read $fh, $data, $elfHdr->{phEntSize};
175 | 
176 |         my %prgHdr = (Indx => $_ - 1);
177 |         @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data;
178 |         push @{$cubin->{prgHdrs}}, \%prgHdr;
179 |     }
180 | 
181 |     # Read in Section Headers
182 |     seek $fh, $elfHdr->{shOffset}, 0;
183 |     foreach (1 .. $elfHdr->{shNum})
184 |     {
185 |         read $fh, $data, $elfHdr->{shEntSize};
186 | 
187 |         my %secHdr = (Indx => $_ - 1);
188 |         @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data;
189 |         push @{$cubin->{secHdrs}}, \%secHdr;
190 |     }
191 | 
192 |     # Read in Section data
193 |     foreach my $secHdr (@{$cubin->{secHdrs}})
194 |     {
195 |         $data = '';
196 |         # Skip sections with no data (type NULL or NOBITS)
197 |         if ($secHdr->{size} && $secHdr->{type} != 8)
198 |         {
199 |             seek $fh, $secHdr->{offset}, 0;
200 |             read $fh, $data, $secHdr->{size};
201 |         }
202 |         # Convert string tables to maps from byte offset to string
203 |         if ($secHdr->{type} == 3) # STRTAB
204 |         {
205 |             my $strTab = $secHdr->{StrTab} = {};
206 |             my $indx   = 0;
207 |             foreach my $str (split "\0", $data)
208 |             {
209 |                 $strTab->{$indx} = $str;
210 |                 $indx += 1 + length($str);
211 |             }
212 |         }
213 |         # Read in Symbol data
214 |         if ($secHdr->{type} == 2) # SYMTAB
215 |         {
216 |             my $offset = 0;
217 |             while ($offset < $secHdr->{size})
218 |             {
219 |                 my $symEnt = {};
220 |                 @{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize});
221 |                 $offset += $secHdr->{entSize};
222 | 
223 |                 push @{$secHdr->{SymTab}}, $symEnt;
224 |             }
225 |         }
226 |         # Cache raw data (as a hex string) for further processing and writing
227 |         $secHdr->{Data} = unpack 'H*', $data;
228 |     }
229 |     close $fh;
230 | 
231 |     # Update section headers with their names.  Map names directly to headers.
232 |     my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab};
233 |     foreach my $secHdr (@{$cubin->{secHdrs}})
234 |     {
235 |         $secHdr->{Name} = $shStrTab->{$secHdr->{name}};
236 |         $cubin->{$secHdr->{Name}} = $secHdr;
237 |     }
238 | 
239 |     # Update symbols with their names
240 |     # For the Global functions, extract kernel meta data
241 |     # Populate the kernel hash
242 |     my $strTab = $cubin->{'.strtab'}{StrTab};
243 |     foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}})
244 |     {
245 |         $symEnt->{Name} = $strTab->{$symEnt->{name}};
246 | 
247 |         # Attach symbol to section
248 |         my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}];
249 |         $secHdr->{SymbolEnt} = $symEnt;
250 | 
251 |         # Look for symbols tagged FUNC
252 |         if (($symEnt->{info} & 0x0f) == 0x02)
253 |         {
254 |             # Create a hash of kernels for output
255 |             my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr;
256 | 
257 |             # Extract local/global/weak binding info
258 |             $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4];
259 | 
260 |             # Extract the kernel instructions
261 |             $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ];
262 | 
263 |             # Extract the max barrier resource identifier used and add 1. Should be 0-16.
264 |             # If a register is used as a barrier resource id, then this value is the max of 16.
265 |             $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20;
266 | 
267 |             # Extract the number of allocated registers for this kernel.
268 |             $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24;
269 | 
270 |             # Extract the size of shared memory this kernel uses.
271 |             my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"};
272 |             $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0;
273 | 
274 |             # Attach constant0 section
275 |             $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"};
276 | 
277 |             # Extract the kernel parameter data.
278 |             my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"};
279 |             if ($paramSec)
280 |             {
281 |                 # Extract raw param data as 32-bit words
282 |                 my @data = unpack "L*", pack "H*", $paramSec->{Data};
283 | 
284 |                 $paramSec->{ParamData} = \@data;
285 |                 $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ];
286 | 
287 |                 # Find the first param delimiter
288 |                 my $idx = 0;
289 |                 $idx++ while $idx < @data && $data[$idx] != 0x00080a04;
290 | 
291 |                 my $first = $data[$idx+2] & 0xFFFF;
292 |                 #my $size  = $data[$idx+2] >> 16;
293 |                 $idx += 4;
294 | 
295 |                 my @params;
296 |                 while ($idx < @data && $data[$idx] == 0x000c1704)
297 |                 {
298 |                     # Get the ordinal, offset, size and pointer alignment for each param
299 |                     my $ord    = $data[$idx+2] & 0xFFFF;
300 |                     my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16);
301 |                     my $psize  = $data[$idx+3] >> 18;
302 |                     my $align  = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0;
303 |                     unshift @params, "$ord:$offset:$psize:$align";
304 |                     $idx += 4;
305 |                 }
306 |                 my @staticParams = @data[0 .. ($idx-1)];
307 | 
308 |                 my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize);
309 |                 while ($idx < @data)
310 |                 {
311 |                     my $code = $data[$idx] & 0xffff;
312 |                     my $size = $data[$idx] >> 16;
313 |                     $idx++;
314 | 
315 |                     # EIATTR_MAXREG_COUNT
316 |                     if ($code == 0x1b03)
317 |                     {
318 |                         $maxregCount = $size;
319 |                     }
320 |                     # EIATTR_S2RCTAID_INSTR_OFFSETS
321 |                     elsif ($code == 0x1d04)
322 |                     {
323 |                         while ($size > 0)
324 |                         {
325 |                             push @ctaidOffsets, $data[$idx++];
326 |                             $size -= 4;
327 |                         }
328 |                     }
329 |                     # EIATTR_EXIT_INSTR_OFFSETS
330 |                     elsif ($code == 0x1c04)
331 |                     {
332 |                         while ($size > 0)
333 |                         {
334 |                             push @exitOffsets, $data[$idx++];
335 |                             $size -= 4;
336 |                         }
337 |                     }
338 |                     # EIATTR_CTAIDZ_USED
339 |                     elsif ($code == 0x0401)
340 |                     {
341 |                         $ctaidzUsed = 1;
342 |                     }
343 |                     # EIATTR_REQNTID
344 |                     elsif ($code == 0x1004)
345 |                     {
346 |                         while ($size > 0)
347 |                         {
348 |                             push @reqntid, $data[$idx++];
349 |                             $size -= 4;
350 |                         }
351 |                     }
352 |                     # EIATTR_MAX_THREADS
353 |                     elsif ($code == 0x0504)
354 |                     {
355 |                         while ($size > 0)
356 |                         {
357 |                             push @maxntid, $data[$idx++];
358 |                             $size -= 4;
359 |                         }
360 |                     }
361 |                     # EIATTR_CRS_STACK_SIZE
362 |                     elsif ($code == 0x1e04)
363 |                     {
364 |                         while ($size > 0)
365 |                         {
366 |                             push @stackSize, $data[$idx++];
367 |                             $size -= 4;
368 |                         }
369 |                     }
370 |                     else
371 |                     {
372 |                         printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size;
373 |                     }
374 |                 }
375 |                 $kernelSec->{Params}   = \@params;
376 |                 $kernelSec->{ParamCnt} = scalar @params;
377 | 
378 |                 $paramSec->{StaticParams} = \@staticParams;
379 |                 $paramSec->{MAXREG_COUNT} = $maxregCount;
380 |                 $paramSec->{ExitOffsets}  = \@exitOffsets;
381 |                 $paramSec->{CTAIDOffsets} = \@ctaidOffsets;
382 |                 $paramSec->{CTAIDZUsed}   = $ctaidzUsed;
383 |                 $paramSec->{REQNTID}      = \@reqntid;
384 |                 $paramSec->{MAXNTID}      = \@maxntid;
385 |                 $paramSec->{STACKSIZE}    = \@stackSize;
386 |             }
387 |             # print Dumper($paramSec);
388 |             # exit();
389 |         }
390 |         # Note GLOBALs found in this cubin
391 |         elsif (($symEnt->{info} & 0x10) == 0x10)
392 |         {
393 |             $cubin->{Symbols}{$symEnt->{Name}} = $symEnt;
394 |         }
395 |     }
396 | 
397 |     # print "phOffset: $elfHdr->{phOffset}\n";
398 |     # print "shOffset: $elfHdr->{shOffset}\n";
399 |     # foreach my $secHdr (@{$cubin->{secHdrs}})
400 |     # {
401 |     #     print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n";
402 |     # }
403 |     # my $p = 0;
404 |     # foreach my $prgHdr (@{$cubin->{prgHdrs}})
405 |     # {
406 |     #     print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n";
407 |     #     $p++;
408 |     # }
409 |     # exit();
410 | 
411 |     # print Dumper($cubin->{prgHdrs});
412 |     # exit();
413 |     return $cubin;
414 | }
415 | sub class    # ELF class of the file in bits, as stored by new()
416 | {
417 |     return $_[0]{Class};
418 | }
419 | sub arch    # SM architecture number read from the ELF header flags
420 | {
421 |     return $_[0]{Arch};
422 | }
423 | sub address_size    # address size in bits, as stored by new()
424 | {
425 |     return $_[0]{AddressSize};
426 | }
427 | sub listKernels    # hash ref: kernel name => kernel section header
428 | {
429 |     return $_[0]{Kernels};
430 | }
431 | sub listSymbols    # hash ref: symbol name => symbol table entry
432 | {
433 |     return $_[0]{Symbols};
434 | }
435 | sub getKernel    # kernel section header for the given name, or undef when unknown
436 | {
437 |     my ($self, $name) = (shift, shift);
438 |     return $self->{Kernels}->{$name};
439 | }
440 | # modifyKernel(Kernel => ..., RegCnt => ..., ...): install new kernel code and rebuild the kernel's param section; dies when a limit check fails.
441 | sub modifyKernel
442 | {
443 |     my ($cubin, %params) = @_;
444 | 
445 |     my $kernelSec    = $params{Kernel};         # kernel section hash (ParamSec, SymbolEnt, RegCnt, BarCnt, info, flags, size are used below)
446 |     my $newReg       = $params{RegCnt};         # new register count
447 |     my $newBar       = $params{BarCnt};         # new barrier count
448 |     my $exitOffsets  = $params{ExitOffsets};    # array ref of exit offsets
449 |     my $ctaidOffsets = $params{CTAIDOffsets};   # array ref of CTAID offsets
450 |     my $ctaidzUsed   = $params{CTAIDZUsed};     # true/false flag
451 |     my $newData      = $params{KernelData};     # array ref of 64-bit words (packed with "Q*" below)
452 |     my $newSize      = @$newData * 8;           # size in bytes: 8 per 64-bit word
453 | 
454 |     die "255 register max" if $newReg > 255;
455 |     die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63;
456 |     die "16 is max barrier count" if $newBar > 16;
457 | 
458 |     my $paramSec    = $kernelSec->{ParamSec};
459 |     my $kernelName  = $kernelSec->{SymbolEnt}{Name};
460 |     my $maxregCount = $paramSec->{MAXREG_COUNT};
461 |     my $stackSize   = $paramSec->{STACKSIZE};
462 | 
463 |     # update the kernel: keep both the word list and its hex-string form
464 |     $kernelSec->{KernelData} = $newData;
465 |     $kernelSec->{Data}       = unpack "H*", pack "Q*", @$newData;
466 | 
467 |     if ($newReg != $kernelSec->{RegCnt})
468 |     {
469 |         print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n";
470 |         $kernelSec->{RegCnt} = $newReg;
471 |         $kernelSec->{info}  &= ~0xff000000;   # the register count occupies the top byte of info
472 |         $kernelSec->{info}  |= $newReg << 24;
473 |     }
474 |     if ($newBar != $kernelSec->{BarCnt})
475 |     {
476 |         print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n";
477 |         $kernelSec->{BarCnt} = $newBar;
478 |         $kernelSec->{flags} &= ~0x01f00000;   # the barrier count occupies bits 20-24 of flags
479 |         $kernelSec->{flags} |=  $newBar << 20;
480 |     }
481 |     # Rebuild the param section words: a copy of the static entries first, then each optional entry in the order below.
482 |     my @paramData = @{$paramSec->{StaticParams}};
483 | 
484 |     if (defined $maxregCount)
485 |     {
486 |         push @paramData, ($maxregCount << 16) | 0x1b03;   # one word: value in the upper 16 bits, tag 0x1b03 in the lower
487 |     }
488 |     # CTAID offsets: report a change, then emit a header word followed by the offsets.
489 |     my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets;
490 |     my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}};
491 | 
492 |     if ($newCTAIDs ne $oldCTAIDs)
493 |     {
494 |         print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n";
495 |     }
496 |     if (@$ctaidOffsets)
497 |     {
498 |         push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04;   # count << 18 == (count * 4) << 16; presumably the payload size in bytes -- confirm
499 |         push @paramData, @$ctaidOffsets;
500 |     }
501 |     # Exit offsets: same layout, with tag 0x1c04.
502 |     my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets;
503 |     my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}};
504 | 
505 |     if ($newExits ne $oldExits)
506 |     {
507 |         print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n";
508 |     }
509 |     if (@$exitOffsets)
510 |     {
511 |         push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04;
512 |         push @paramData, @$exitOffsets;
513 |     }
514 |     # CTAID.Z flag: a single tag word, emitted only when the flag is set.
515 |     if ($ctaidzUsed != $paramSec->{CTAIDZUsed})
516 |     {
517 |         print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n";
518 |     }
519 |     if ($ctaidzUsed)
520 |     {
521 |         push @paramData, 0x0401;
522 |     }
523 |     # REQNTID, MAXNTID and STACKSIZE are re-emitted unchanged from the values held in the param section hash.
524 |     if (@{$paramSec->{REQNTID}})
525 |     {
526 |         push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004;
527 |         push @paramData, @{$paramSec->{REQNTID}};
528 |     }
529 |     if (@{$paramSec->{MAXNTID}})
530 |     {
531 |         push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504;
532 |         push @paramData, @{$paramSec->{MAXNTID}};
533 |     }
534 | 
535 |     if (@$stackSize)
536 |     {
537 |         push @paramData, (scalar(@$stackSize) << 18) | 0x1e04;
538 |         push @paramData, @$stackSize;
539 |     }
540 |     # Store the rebuilt words (4 bytes each, packed with "L*") and propagate any size change.
541 |     my $newParamSize  = scalar(@paramData)*4;
542 |     $paramSec->{Data} = unpack "H*", pack "L*", @paramData;
543 |     if ($newParamSize != $paramSec->{size})
544 |     {
545 |         print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n";
546 |         $cubin->updateSize($paramSec, $newParamSize);
547 |     }
548 |     # A kernel size change also asks updateSize to adjust the matching program header (third argument).
549 |     if ($newSize != $kernelSec->{size})
550 |     {
551 |         print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n";
552 |         $cubin->updateSize($kernelSec, $newSize, 1);
553 |     }
554 | }
555 | # updateSize($sec, $newSize, $updatePrgSize): record a section's new size, then recompute all section, header-table and program-header offsets.
556 | sub updateSize
557 | {
558 |     my ($cubin, $sec, $newSize, $updatePrgSize) = @_;
559 | 
560 |     my $elfHdr = $cubin->{elfHdr};
561 |     my $class  = $elfHdr->{fileClass};   # selects the pack template / column list in @symHdrT and @symHdrC
562 | 
563 |     # update section header
564 |     my $delta = $newSize - $sec->{size};
565 |     $sec->{size} = $newSize;
566 | 
567 |     # update symtab section: patch this symbol's size, then re-serialise every symbol entry as a hex string
568 |     if ($sec->{SymbolEnt})
569 |     {
570 |         $sec->{SymbolEnt}{size} = $newSize;
571 |         my $symSection = $cubin->{'.symtab'};
572 |         $symSection->{Data} = '';
573 |         foreach my $symEnt (@{$symSection->{SymTab}})
574 |         {
575 |             $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}};
576 |         }
577 |     }
578 | 
579 |     my $pos = $elfHdr->{ehSize};   # layout cursor, starting right after the ELF header
580 |     my %sizeMap;                   # old file offset => new file offset
581 | 
582 |     # update section header offsets
583 |     foreach my $secHdr (@{$cubin->{secHdrs}})
584 |     {
585 |         # skip first header (any header whose align is 0)
586 |         next if $secHdr->{align} == 0;   # also keeps the modulo below from dividing by zero
587 | 
588 |         # NOBITS data sections are size 0
589 |         my $size = $secHdr->{type} == 8 ? 0 : $secHdr->{size};
590 | 
591 |         # Add any needed padding between sections
592 |         my $pad = $pos % $secHdr->{align};
593 |         if ($pad > 0)
594 |         {
595 |             $pos += $secHdr->{align} - $pad;
596 |         }
597 |         # map old offset to new
598 |         $sizeMap{$secHdr->{offset}} = $pos;
599 | 
600 |         # update offset
601 |         $secHdr->{offset} = $pos;
602 | 
603 |         # advance position by size
604 |         $pos += $size;
605 |     }
606 | 
607 |     # compute total section header size
608 |     my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset};   # NOTE(review): relies on the program headers directly following the section headers -- confirm
609 | 
610 |     # map old offset to new
611 |     $sizeMap{$elfHdr->{shOffset}} = $pos;
612 |     $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize;
613 | 
614 |     $elfHdr->{shOffset} = $pos;
615 |     $elfHdr->{phOffset} = $pos + $shSize;
616 | 
617 |     # update program header offsets and sizes
618 |     foreach my $prgHdr (@{$cubin->{prgHdrs}})
619 |     {
620 |         # Not sure how best to adjust these so just assume they'll track other offsets.
621 |         $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}};   # NOTE(review): becomes undef when the old offset has no %sizeMap entry
622 | 
623 |         # If the kernel sizes changes, also update the associated ProgramHeader.
624 |         # Note that this size is the kernel size plus any constant section sizes.
625 |         if ($updatePrgSize && $prgHdr->{type} == 1 &&
626 |             $sec->{offset} >= $prgHdr->{offset} &&
627 |             $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta)
628 |         {
629 |             $prgHdr->{fileSize} += $delta;
630 |             $prgHdr->{memSize}  += $delta;
631 |         }
632 |     }
633 | }
634 | 
635 | # write($file): serialise the (possibly modified) cubin to $file as ELF header, section data, section headers, then program headers. Dies if the file cannot be opened or closed.
636 | sub write
637 | {
638 |     my ($cubin, $file) = @_;
639 |     # Three-argument open: $file is always taken as a literal file name, whatever characters it contains.
640 |     open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
641 |     binmode($fh);
642 | 
643 |     my $elfHdr = $cubin->{elfHdr};
644 |     my $class  = $elfHdr->{fileClass};   # selects the pack template / column list for each header type
645 | 
646 |     # write elf header
647 |     print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
648 |     my $pos = $elfHdr->{ehSize};   # running file position, used only to compute alignment padding
649 | 
650 |     # write section data
651 |     foreach my $secHdr (@{$cubin->{secHdrs}})
652 |     {
653 |         # Skip NULL and NOBITS data sections
654 |         next if $secHdr->{size} == 0 || $secHdr->{type} == 8;
655 | 
656 |         # Add any needed padding between sections
657 |         my $pad = $pos % $secHdr->{align};
658 |         if ($pad > 0)
659 |         {
660 |             $pad = $secHdr->{align} - $pad;
661 |             print $fh "\0" x $pad;
662 |             $pos += $pad;
663 |         }
664 |         # Section data is held as a hex string; convert it back to raw bytes.
665 |         print $fh pack 'H*', $secHdr->{Data};
666 |         $pos += $secHdr->{size};
667 |     }
668 | 
669 |     # write section headers
670 |     foreach my $secHdr (@{$cubin->{secHdrs}})
671 |     {
672 |         print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
673 |     }
674 | 
675 |     #write program headers
676 |     foreach my $prgHdr (@{$cubin->{prgHdrs}})
677 |     {
678 |         print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
679 |     }
680 |     close $fh or die "Error: could not close $file: $!";   # buffered write errors surface at close
681 | }
682 | 
683 | __END__
684 | 
685 | 


--------------------------------------------------------------------------------
/microbench/microbench.cpp:
--------------------------------------------------------------------------------
  1 | // microbench.cpp : Defines the entry point for the console application.
  2 | //
  3 | 
  4 | // nvcc -l cuda -o microbench microbench.cpp
  5 | 
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <string.h>
  9 | #include <cuda.h>
 10 | #include <cudaProfiler.h>
 11 | 
 12 | CUcontext hContext = 0;
 13 | // CUDA_CHECK(fn): evaluate a driver API call once; on failure print the call text, status and error string, destroy hContext if set, and exit.
 14 | #define CUDA_CHECK( fn ) do { \
 15 | 		CUresult status = (fn); \
 16 | 		if ( CUDA_SUCCESS != status ) { \
 17 | 			const char* errstr; \
 18 | 			cuGetErrorString(status, &errstr); \
 19 | 			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
 20 | 			if (hContext) cuCtxDestroy(hContext); \
 21 | 			exit(EXIT_FAILURE); \
 22 | 		} \
 23 | 	} while (0)
 24 | 
 25 | // main: parse the optional command line arguments, launch the "microbench" kernel from microbench.cubin, and print timing results.
 26 | int main(int argc, char* argv[])
 27 | {
 28 | 	//int iTest = 2896;
 29 | 	//while (iTest < 0x7fff)
 30 | 	//{
 31 | 	//	int iResult = iTest * iTest;
 32 | 	//	float fTest = (float)iTest;
 33 | 	//	int fResult = (int)(fTest * fTest);
 34 | 
 35 | 	//	printf("i*i:%08x f*f:%08x\n", iResult, fResult);
 36 | 
 37 | 	//	iTest += 0x0800;
 38 | 	//}
 39 | 	//exit(0);
 40 | 
 41 | 	char deviceName[32];
 42 | 	int devCount, ordinal, major, minor;
 43 | 	CUdevice  hDevice;
 44 | 
 45 | 	// Initialize the Driver API and pick the first device that passes the capability test below
 46 | 	CUDA_CHECK( cuInit(0) );
 47 | 	CUDA_CHECK( cuDeviceGetCount(&devCount) );
 48 | 	for (ordinal = 0; ordinal < devCount; ordinal++)
 49 | 	{
 50 | 		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
 51 | 		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
 52 | 		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
 53 | 		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
 54 | 		if (major >= 5 && minor >= 2) // NOTE(review): minor is tested independently of major, and the message below says 5.0 -- confirm the intended minimum
 55 | 		{
 56 | 			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
 57 | 			break;
 58 | 		}
 59 | 	}
 60 | 	if (ordinal == devCount)
 61 | 	{
 62 | 		printf("No compute 5.0 device found, exiting.\n");
 63 | 		exit(EXIT_FAILURE);
 64 | 	}
 65 | 
 66 | 	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
 67 | 	int internalTiming = 1;
 68 | 	if (argc > 1)
 69 | 		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;
 70 | 
 71 | 	// Second command line arg is the number of blocks
 72 | 	int blocks = 1;
 73 | 	if (argc > 2)
 74 | 		blocks = atoi(argv[2]);
 75 | 	if (blocks < 1)
 76 | 		blocks = 1;
 77 | 
 78 | 	// Third command line arg is the number of threads
 79 | 	int threads = 128;
 80 | 	if (argc > 3)
 81 | 		threads = atoi(argv[3]);
 82 | 	if (threads > 1024 || threads < 32)
 83 | 		threads = 128;
 84 | 	threads &= -32; // round down to a multiple of 32
 85 | 
 86 | 	// Fourth command line arg (its meaning depends on the timing mode):
 87 | 	double fops = 1.0;
 88 | 	int lanes = 1;
 89 | 	if (argc > 4)
 90 | 	{
 91 | 		if (internalTiming)
 92 | 		{
 93 | 			// The number of lanes to print for each warp
 94 | 			lanes = atoi(argv[4]);
 95 | 			if (lanes > 32 || lanes < 1)
 96 | 				lanes = 1;
 97 | 		}
 98 | 		else
 99 | 			// The number of floating point operations in a full kernel launch
100 | 			fops = atof(argv[4]);
101 | 	}
102 | 
103 | 	// Fifth command line arg is the repeat count for benchmarking
104 | 	int repeat = 1;
105 | 	if (argc > 5)
106 | 		repeat = atoi(argv[5]);
107 | 	if (repeat > 1000 || repeat < 1)
108 | 		repeat = 1;
109 | 
110 | 	// Buffer size in bytes: one int per thread across all blocks
111 | 	size_t size = sizeof(int) * threads * blocks;
112 | 
113 | 	// Setup our input and output buffers
114 | 	int* dataIn  = (int*)malloc(size);
115 | 	int* dataOut = (int*)malloc(size);
116 | 	int* clocks  = (int*)malloc(size);
117 | 	memset(dataIn, 0, size);
118 | 
119 | 	CUmodule hModule;
120 | 	CUfunction hKernel;
121 | 	CUevent hStart, hStop;
122 | 	CUdeviceptr devIn, devOut, devClocks;
123 | 
124 | 	// Init our context and device memory buffers
125 | 	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
126 | 	CUDA_CHECK( cuMemAlloc(&devIn, size) );
127 | 	CUDA_CHECK( cuMemAlloc(&devOut, size) );
128 | 	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
129 | 	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
130 | 	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
131 | 	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );
132 | 
133 | 	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
134 | 	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
135 | 
136 | 	// Load our kernel
137 | 	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
138 | 	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );
139 | 
140 | 	// Setup the params (passed in the order: out, clocks, in)
141 | 	void* params[] = { &devOut, &devClocks, &devIn };
142 | 	float ms = 0;
143 | 
144 | 	// Warm up the clock (unless under nsight) with `repeat` untimed launches
145 | 	if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
146 | 		for (int i = 0; i < repeat; i++)
147 | 			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
148 | 
149 | 	// Launch the kernel once, bracketed by the start/stop events
150 | 	CUDA_CHECK( cuEventRecord(hStart, NULL) );
151 | 	//CUDA_CHECK( cuProfilerStart() );
152 | 	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
153 | 	//CUDA_CHECK( cuProfilerStop() );
154 | 	CUDA_CHECK( cuEventRecord(hStop, NULL) );
155 | 	CUDA_CHECK( cuEventSynchronize(hStop) );
156 | 	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
157 | 
158 | 	//CUDA_CHECK( cuCtxSynchronize() );
159 | 
160 | 	// Get back our results from each kernel
161 | 	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
162 | 	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );
163 | 
164 | 	// Cleanup and shutdown of cuda
165 | 	CUDA_CHECK( cuEventDestroy(hStart) );
166 | 	CUDA_CHECK( cuEventDestroy(hStop) );
167 | 	CUDA_CHECK( cuModuleUnload(hModule) );
168 | 	CUDA_CHECK( cuMemFree(devIn) );
169 | 	CUDA_CHECK( cuMemFree(devOut) );
170 | 	CUDA_CHECK( cuMemFree(devClocks) );
171 | 	CUDA_CHECK( cuCtxDestroy(hContext) );
172 | 	hContext = 0;
173 | 
174 | 	// With internal timing, print the clock and output values copied back from the device
175 | 	if (internalTiming)
176 | 	{
177 | 		int count = 0, total = 0, min = 999999, max = 0;
178 | 
179 | 		int* clocks_p  = clocks;
180 | 		int* dataOut_p = dataOut;
181 | 
182 | 		// Loop over and print results
183 | 		for (int blk = 0; blk < blocks; blk++)
184 | 		{
185 | 			float *fDataOut = reinterpret_cast<float*>(dataOut_p);
186 | 
187 | 			for(int tid = 0; tid < threads; tid += 32)
188 | 			{
189 | 				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
190 | 				for (int lane = 0; lane < lanes; lane++)
191 | 					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u
192 | 
193 | 				count++; // the statistics below sample only the first lane of each warp
194 | 				total += clocks_p[tid];
195 | 				if (clocks_p[tid] < min) min = clocks_p[tid];
196 | 				if (clocks_p[tid] > max) max = clocks_p[tid];
197 | 			}
198 | 			clocks_p  += threads;
199 | 			dataOut_p += threads;
200 | 		}
201 | 		printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
202 | 	}
203 | 	else
204 | 	{
205 | 		// Otherwise report the externally measured time and the GFLOPS derived from fops
206 | 		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
207 | 	}
208 | 	// And free up host memory
209 | 	free(dataIn); free(dataOut); free(clocks);
210 | 
211 | 	return 0;
212 | }
213 | 


--------------------------------------------------------------------------------
/microbench/microbench.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | // Note this file isn't configured to automatically compile
 3 | 
 4 | #include <device_functions.h>
 5 | #include <device_launch_parameters.h>
 6 | 
 7 | // Build:
 8 | // nvcc -l cuda -o microbench microbench.cpp
 9 | // nvcc -arch sm_50 -cubin microbench.cu
10 | 
11 | // Inspect a cubin (use nvdisasm from cuda 6.5 for best results):
12 | // maxas.pl -e microbench.cubin
13 | 
14 | // Insert new sass into cubin
15 | // maxas.pl -i microbench.sass microbench.cubin
16 | 
17 | // run it:
18 | // ./microbench
19 | 
20 | // Placeholder kernel (its code is meant to be replaced with SASS via maxas.pl -i). Use extern C so C++ doesn't mangle our kernel name
21 | extern "C" __global__ void  microbench(int *out, int *clocks, int *in)
22 | {
23 |     __shared__ int share[1024];
24 | 
25 |     int tid = threadIdx.x;
26 |     int bx  = blockIdx.x;
27 |     int by  = blockIdx.y;
28 |     // Clock samples are taken before the shared store and after the barrier.
29 |     int start = clock();
30 | 
31 |     share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ
32 | 
33 |     __syncthreads();
34 | 
35 |     int end = clock();
36 |     // Combine start >> 16 with the upper 16 bits of end into a single word.
37 |     clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start;
38 |     // Read back the slot of the thread whose id differs only in bit 0.
39 |     out[tid] = share[tid ^ 1];
40 | }
41 | 
42 | // A note about using the Cuda Runtime.
43 | // If that's your preference over the driver API then here's what you'd do:
44 | 
45 | // In your project properties in the Cuda C/C++ panel:
46 | //    -Set the "Keep Processed Files" (-keep) option
47 | //    -Add a -v manually to the command line
48 | // If compiling on command line just add -keep -v options to nvcc.
49 | // Rebuild your solution and look in the log for these lines that follow the ptxas step:
50 | 
51 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
52 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
53 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
54 | 
55 | // You just need to manually run these 3 commands (or add them to a build script)
56 | // after you've modified the cubin generated from the preceding ptxas command.
57 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you
58 | // build your project (or you could manually run the linker step as well).
59 | 
60 | // Having done that you can call your kernel normally using the <<< >>> syntax.
61 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
62 | // With fatbin you can also keep non-maxwell optimized versions of your code.
63 | 
64 | 
65 | // I just discovered this also works as a shortcut to the above:
66 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu
67 | 
68 | // The cu kernel definitions above need to have empty bodies.
69 | // And, the cu file must be compiled to a lib separately before linking.


--------------------------------------------------------------------------------
/microbench/microbench.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | 
 3 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 4 | 
 5 | <CONSTANT_MAPPING>
 6 |     blockDimX : c[0x0][0x08]
 7 |     blockDimY : c[0x0][0x0c]
 8 |     blockDimZ : c[0x0][0x10]
 9 |     gridDimX  : c[0x0][0x14]
10 |     gridDimY  : c[0x0][0x18]
11 |     gridDimZ  : c[0x0][0x1c]
12 | 
13 |     param_out[0]    : c[0x0][0x140]
14 |     param_out[1]    : c[0x0][0x144]
15 |     param_clocks[0] : c[0x0][0x148]
16 |     param_clocks[1] : c[0x0][0x14c]
17 |     param_in[0]     : c[0x0][0x150]
18 |     param_in[1]     : c[0x0][0x154]
19 | </CONSTANT_MAPPING>
20 | 
21 | <REGISTER_MAPPING>
22 | 
23 |      0-1 : out<0-1>
24 |      2-3 : clocks<0-1>
25 |      4-5 : in<0-1>
26 |     6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x
27 | 
28 | </REGISTER_MAPPING>
29 | 
30 | // Load in our params (not currently used below)
31 | --:-:-:-:1      MOV in0, param_in[0];
32 | --:-:-:-:1      MOV in1, param_in[1];
33 | 
34 | // Get the first clock value
35 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
36 | 
37 | // Get the threadId and blockId
38 | // Set the Read-After-Write dependency barrier 1 and 2
39 | --:-:1:-:1      S2R tid, SR_TID.X;
40 | // Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it
41 | --:-:2:-:2      S2R bid, SR_CTAID.X;
42 | 
43 | 
44 | // Get the second clock value
45 | // Wait on the dependency barriers that were set in the prior instruction
46 | // Stall 6 to allow CS2R time to complete before next instruction
47 | // CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks
48 | // This stall count does not factor into the time calculation at all
49 | 03:-:-:-:6      CS2R clock2, SR_CLOCKLO;
50 | 
51 | // Take the difference of clocks
52 | --:-:-:-:1      IADD clock1, clock2, -clock1;
53 | 
54 | // Setup our output addresses
55 | // Stall your pipeline dependencies properly
56 | // Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code
57 | --:-:-:-:6      XMAD offset, bid, blockDimX, tid;
58 | 
59 | // LEA is "load effective address"
60 | // The offset param is shifted left 2 and added to the pointers with 64bit math
61 | --:-:-:-:6      LEA      clocks0.CC, offset, param_clocks[0],     2;
62 | --:-:-:-:1      LEA.HI.X clocks1,    offset, param_clocks[1], RZ, 2;
63 | 
64 | --:-:-:-:6      LEA      out0.CC, offset, param_out[0],     2;
65 | --:-:-:-:1      LEA.HI.X out1,    offset, param_out[1], RZ, 2;
66 | 
67 | // Output the results.
68 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
69 | --:-:-:-:1      STG.E [clocks], clock1;
70 | --:-:-:-:1      STG.E [out],    offset; # use this to return whatever you like to inspect the results
71 | --:-:-:-:5      EXIT;
72 | 
73 | 


--------------------------------------------------------------------------------
/microbench/shared.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | use warnings;
 4 | print `maxas.pl -i shared_sts16.sass microbench.cubin`;   # insert the SASS into the cubin and echo the assembler output
 5 | # Stop with a non-zero status if the assembler step failed, so callers can detect the failure.
 6 | exit(($? >> 8) || 1) if $?;
 7 | # Run the benchmark with arguments: "i" timing mode, 1 block, 64 threads.
 8 | print `Release\\microbench.exe i 1 64`;
 9 | 
10 | 
11 | __END__
12 | 
13 | 


--------------------------------------------------------------------------------
/microbench/shared_lds.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #   ord:addr:size:align
  8 | #   0:0x140:4:0
  9 | #   1:0x144:4:0
 10 | #   2:0x148:4:0
 11 | 
 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 13 | 
 14 | <REGISTER_MAPPING>
 15 | 
 16 |     0-3 : result, a, b, c
 17 | 
 18 |     4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20>
 19 | 
 20 | </REGISTER_MAPPING>
 21 | 
 22 | // Load in our params
 23 | --:-:1:-:1      S2R tid,      SR_TID.X;
 24 | --:-:2:-:1      S2R bid,      SR_CTAID.X;
 25 | 
 26 | --:-:-:-:1      MOV result,  c[0x0][0x0];
 27 | --:-:-:-:1      MOV in,      c[0x0][0x100];
 28 | 
 29 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
 30 | --:-:-:-:1      MOV result,  c[0x0][0x13c];
 31 | --:-:-:-:1      CS2R clock2, SR_CLOCKLO;
 32 | 
 33 | --:-:-:-:1      MOV blockDim, c[0x0][0x8];
 34 | --:-:-:-:1      MOV out,      c[0x0][0x140];
 35 | --:-:-:-:1      MOV clocks,   c[0x0][0x144];
 36 | 
 37 | 
 38 | 
 39 | 
 40 | <SCHEDULE_BLOCK>
 41 | 
 42 | 03:-:-:-:1      LOP.AND tid3,   tid, 3;
 43 | --:-:-:-:1      LOP.AND tid7,   tid, 7;
 44 | --:-:-:-:1      LOP.AND tid96,  tid, 96;
 45 | --:-:-:-:1      LOP.AND tid128, tid, 128;
 46 | 
 47 | // readAs = ((tid128 >> 4) | tid7) << 4
 48 | --:-:-:-:1      SHR.U32 readAs, tid128, 4;
 49 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
 50 | --:-:-:-:1      SHL     readAs, readAs, 4;
 51 | 
 52 | // readBs  = ((tid96 >> 3) | tid3) << 4
 53 | --:-:-:-:1      SHR.U32 readBs, tid96, 3;
 54 | --:-:-:-:1      LOP.OR  readBs, readBs, tid3;
 55 | #--:-:-:-:1      SHL     readBs, readBs, 4;
 56 | #--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
 57 | 
 58 | 
 59 | </SCHEDULE_BLOCK>
 60 | 
 61 | 
 62 | 
 63 | #--:-:-:-:1      LDS.U.128 result, [readBs];
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 01:-:-:-:1      IADD clock1, clock2, -clock1;
 69 | 
 70 | 
 71 | --:-:-:-:1      XMAD tid, blockDim, bid, tid;
 72 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
 73 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
 74 | --:-:-:Y:6      SHL  tid, tid, 0x2;
 75 | 
 76 | --:-:-:-:1      IADD clocks, clocks, tid;
 77 | --:-:-:-:2      IADD out,  out,  tid;
 78 | 
 79 | --:-:-:-:1      STG [clocks], clock1;
 80 | --:-:-:-:1      STG [out],    readBs;
 81 | --:-:-:-:5      EXIT;
 82 | 
 83 | <COMMENT>
 84 | 
 85 | --:-:-:-:4      LOP.AND tid32, tid, -32;
 86 | 
 87 | --:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
 88 | 
 89 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 90 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 91 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 92 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 93 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 94 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 95 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 96 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 97 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 98 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 99 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
100 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
101 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
102 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
103 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
104 | --:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
105 | 
106 | 
107 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
108 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
109 | --:-:-:-:1      LOP.AND readAs, tid,    0x80;
110 | --:-:-:-:1      SHR.U32 readAs, readAs, 4;
111 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
112 | --:-:-:-:1      SHL     readAs, readAs, 4;
113 | 
114 | // readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
115 | --:-:-:-:1      LOP.AND tid1,   tid,    0x1;
116 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
117 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
118 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
119 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
120 | 
121 | 
122 | </COMMENT>


--------------------------------------------------------------------------------
/microbench/shared_sts16.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #   ord:addr:size:align
  8 | #   0:0x140:4:0
  9 | #   1:0x144:4:0
 10 | #   2:0x148:4:0
 11 | 
 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 13 | 
 14 | <REGISTER_MAPPING>
 15 | 
 16 |     0-3 : result, a, b, c
 17 | 
 18 |     4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20>
 19 | 
 20 | </REGISTER_MAPPING>
 21 | 
 22 | // Load in our params
 23 | --:-:1:-:1      S2R tid,      SR_TID.X;
 24 | --:-:2:-:1      S2R bid,      SR_CTAID.X;
 25 | 
 26 | //--:-:-:-:1      MOV result,  c[0x0][0x0];
 27 | //--:-:-:-:1      MOV in,      c[0x0][0x100];
 28 | --:-:-:-:1      MOV result, 1;
 29 | 
 30 | --:-:-:-:1      MOV blockDim, c[0x0][0x8];
 31 | --:-:-:-:1      MOV out,      c[0x0][0x140];
 32 | --:-:-:-:1      MOV clocks,   c[0x0][0x144];
 33 | 
 34 | 
 35 | // readAs = ((tid >> 1) & 7) << 4;
 36 | 03:-:-:-:6      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
 37 | --:-:-:-:6      SHL     readAs, readAs, 3;
 38 | 
 39 | // readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024;
 40 | --:-:-:-:6      LOP.AND tid1,   tid,    1;
 41 | --:-:-:-:6      LOP.AND readBs, tid,    0x30;
 42 | --:-:-:-:6      SHR.U32 readBs, readBs, 3;
 43 | --:-:-:-:6      LOP.OR  readBs, readBs, tid1;
 44 | --:-:-:-:6      ISCADD  readBs, readBs, 0, 3;
 45 | 
 46 | 
 47 | 
 48 | ///--:-:-:-:1      STS [tid32], result;
 49 | //--:-:-:-:1      STS.S16 [tid32 + 2x<32>], result;
 50 | //--:-:1:-:2      LDS.U.64 result, [readBs];
 51 | 
 52 | --:-:-:-:0      CS2R clock1, SR_CLOCKLO;
 53 | --:-:1:-:6      LDS.U.64 result, [readAs];
 54 | --:-:-:-:6      CS2R clock2, SR_CLOCKLO;
 55 | 
 56 | 
 57 | 01:-:-:-:1      IADD clock1, clock2, -clock1;
 58 | 
 59 | 
 60 | --:-:-:-:1      XMAD tid, blockDim, bid, tid;
 61 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
 62 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
 63 | --:-:-:Y:6      SHL  tid, tid, 0x2;
 64 | 
 65 | --:-:-:-:1      IADD clocks, clocks, tid;
 66 | --:-:-:-:2      IADD out,  out,  tid;
 67 | 
 68 | --:-:-:-:1      STG [clocks], clock1;
 69 | --:-:-:-:1      STG [out],    result;
 70 | --:-:-:-:5      EXIT;
 71 | 
 72 | <COMMENT>
 73 | 
 74 | --:-:-:-:4      LOP.AND tid32, tid, -32;
 75 | 
 76 | --:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
 77 | 
 78 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 79 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 80 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 81 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 82 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 83 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 84 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 85 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 86 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 87 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 88 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 89 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 90 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 91 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 92 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 93 | --:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 94 | 
 95 | 03:-:-:-:6      LOP.AND  tid31, tid, 31;
 96 | --:-:-:-:6      LOP.AND  tid32, tid, 32;
 97 | --:-:-:-:6      SHL  tid32, tid32, 0x2;
 98 | --:-:-:-:6      LOP.OR  tid32, tid32, tid31;
 99 | --:-:-:-:6      SHL  tid32, tid32, 0x2;
100 | 
101 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
102 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
103 | --:-:-:-:1      LOP.AND readAs, tid,    0x80;
104 | --:-:-:-:1      SHR.U32 readAs, readAs, 4;
105 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
106 | --:-:-:-:1      SHL     readAs, readAs, 4;
107 | 
108 | // readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
109 | --:-:-:-:1      LOP.AND tid1,   tid,    0x1;
110 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
111 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
112 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
113 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
114 | 
115 | 
116 | </COMMENT>


--------------------------------------------------------------------------------
/microbench/throughput.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Writes the FFMA loop kernel, assembles it into microbench.cubin with maxas.pl, then runs the benchmark executable.
 4 | my $loopSize  = 512;       # used only in the $fops estimate; writeSassFile hard-codes the matching 0 .. 511 range
 5 | my $blocks    = 32;
 6 | my $loops     = 10240000;  # substituted into the generated kernel as the `stop` value
 7 | my $fileName  = 'throughput2.sass';
 8 | 
 9 | writeSassFile($fileName, $loops);
10 | 
11 | #print `maxas.pl -p $fileName`;
12 | #exit;
13 | 
14 | print `maxas.pl -i $fileName microbench.cubin`;
15 | exit(($? >> 8) || 1) if $?;  # a bare exit would return 0 and hide the maxas.pl failure; `|| 1` covers death by signal
16 | 
17 | foreach my $thread128 (2)
18 | {
19 |     my $threads   = $thread128 * 128;
20 |     my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
21 | 
22 |     my $data = `Release\\microbench.exe e $blocks $threads $fops`;
23 |     # Take the integer that follows "GFLOPS: " in the benchmark output.
24 |     my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
25 | 
26 |     printf "%d %d %d\n", $thread128, $threads, $gflops;
27 | }
28 | 
29 | exit;
30 | 
31 | sub writeSassFile    # Writes the microbench kernel source (including its <CODE> section) to $filename.
32 | {                    # $loops is substituted for the single %d below (the MOV32I value loaded into `stop`).
33 |     my ($filename, $loops) = @_;
34 |     # Three-argument open: the mode is explicit, so nothing inside $filename can change how the file is opened.
35 |     open my $fh, '>', $filename or die "$filename: $!";
36 |     # The heredoc is the printf format: any literal percent sign added to it must be written as %%.
37 |     printf $fh <<'EOF', $loops;
38 | # Kernel: microbench
39 | 
40 | <REGISTER_MAPPING>
41 | 
42 |     0-10 : result, r1, r2, r3
43 |     20-27 ~ count, stop
44 | 
45 | </REGISTER_MAPPING>
46 | 
47 | --:-:-:-:1      MOV count, RZ;
48 | --:-:-:-:1      MOV32I stop, %d;
49 | --:-:-:-:1      MOV32I r1, 1.0;
50 | --:-:-:-:1      MOV32I r2, 1.0;
51 | --:-:-:-:4      MOV32I r3, 1.0;
52 | 
53 | LOOP:
54 | 
55 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
56 | --:-:-:-:1      IADD count, count, 1;
57 | 
58 | <CODE>
59 |     my $out;
60 | 
61 |     foreach my $i (0 .. 511)
62 |     {
63 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
64 | 
65 |         my $stall = $i == 511 ? 0 : 1;
66 | 
67 |         $out .= "--:-:-:$yield:$stall      FFMA result, r1, r2, r3;\n";
68 |     }
69 |     return $out;
70 | </CODE>
71 | 
72 | --:-:-:Y:5  @P0 BRA LOOP;
73 | --:-:-:-:5      EXIT;
74 | EOF
75 |     # Buffered write errors surface at close, so check it rather than leave a truncated file unnoticed.
76 |     close $fh or die "$filename: $!";
77 | }
78 | 
79 | __END__
80 | 
81 | 


--------------------------------------------------------------------------------
/microbench/throughput.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | # InsCnt: 18
 3 | # RegCnt: 5
 4 | # SharedSize: 4096
 5 | # BarCnt: 1
 6 | # Params(3):
 7 | #   ord:addr:size:align
 8 | #   0:0x140:4:0
 9 | #   1:0x144:4:0
10 | #   2:0x148:4:0
11 | 
12 | <REGISTER_MAPPING>
13 | 
14 |     8-20 : count
15 | 
16 | </REGISTER_MAPPING>
17 | 
18 | --:-:-:-:1      MOV R0, RZ;
19 | --:-:-:-:1      MOV R1, RZ;
20 | --:-:-:-:1      MOV R2, RZ;
21 | --:-:-:-:1      MOV R3, RZ;
22 | --:-:-:-:1      MOV R4, RZ;
23 | --:-:-:-:1      MOV R5, RZ;
24 | --:-:-:-:1      MOV R6, RZ;
25 | --:-:-:-:1      MOV R7, RZ;
26 | --:-:-:-:1      MOV R8, RZ;
27 | --:-:-:Y:6      MOV count, RZ;
28 | 
29 | // This loop is capable of running at 1700 GFlops on GM107.
30 | // You can tweak it to see how register bank conflicts or different control codes
31 | // affect performance.
32 | // With throughput.pl you can pass params to this code and do some autotuning.
33 | LOOP:
34 | 
35 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, 0x19000, PT;
36 | --:-:-:-:1      IADD count, count, 0x1;
37 | 
38 | <CODE>
39 |     my $out;
40 | 
41 |     foreach my $i (0..511) #511
42 |     {
43 |         my $y = ($i + 32) & 63 ? '-' : 'Y';
44 | 
45 |         $out .= qq|
46 | --:-:-:$y:1      FFMA R0, R1, R2, R3;|; #c[0x0][$c]
47 |     }
48 |     return $out;
49 | </CODE>
50 | 
51 | --:-:-:Y:5  @P0 BRA LOOP;
52 | 
53 | --:-:-:-:5      EXIT;
54 | 
55 | <COMMENT>
56 | 
57 | 
58 |     open my $fh, 'params.txt';
59 |     my $line = <$fh>;
60 |     close $fh;
61 |     my ($r1, $r2, $r3) = split "\t", $line;
62 | 
63 |     80-95 : out, clocks, in, tid, clock1, clock2, result
64 | 
65 | 
66 | --:-:1:-:1      S2R tid,   SR_TID.X;
67 | --:-:-:-:1      MOV out,    c[0x0][0x140];
68 | --:-:-:-:1      MOV clocks, c[0x0][0x144];
69 | 01:-:-:-:1      MOV in,     c[0x0][0x148];
70 | 
71 | 
72 | 
73 | --:-:-:-:1      MOV32I f0, 0x3f800000;
74 | --:-:-:-:1      MOV32I f1, 0x3f800000;
75 | --:-:-:-:1      MOV32I f2, 0x3f800000;
76 | --:-:-:-:5      MOV32I f3, 0x3f800000;
77 | 
78 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
79 | 
80 | 
81 | --:-:-:-:1      CS2R clock2, SR_CLOCKLO;
82 | 
83 | --:-:-:-:6      MOV32I result, 0x457;
84 | --:-:-:-:1      IADD clock1, clock2, -clock1;
85 | 
86 | 
87 | --:-:-:-:6      SHL  tid, tid, 0x2;
88 | --:-:-:-:1      IADD clocks, clocks, tid;
89 | --:-:-:-:1      IADD out,  out,  tid;
90 | 
91 | --:-:-:-:1      STG [clocks], clock1;
92 | --:-:-:-:1      STG [out],    R24;
93 | 
94 | 
95 | </COMMENT>


--------------------------------------------------------------------------------
/microbench/throughput2.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | my %p;
  4 | 
  5 | $p{N}         = 8192;
  6 | $p{blocking}  = 8;
  7 | $p{unroll}    = 8;
  8 | $p{threads}   = 64;   #256
  9 | 
 10 | $p{csize}     = $p{blocking} * $p{blocking};
 11 | $p{loopSize}  = $p{unroll} * $p{csize};
 12 | $p{width}     = sqrt($p{csize} * $p{threads});
 13 | $p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
 14 | $p{loops}     = $p{N} / $p{unroll};
 15 | $p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
 16 | 
 17 | my $fileName  = 'throughput2.sass';
 18 | 
 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
 20 | 
 21 | #print join("\t", @params), "\n";
 22 | #print join("\t", @p{@params}), "\n";
 23 | 
 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
 25 | 
 26 | writeSassFile($fileName, $p{loopSize}, $p{loops});
 27 | 
 28 | #print `maxas.pl -p $fileName`;
 29 | #exit;
 30 | 
 31 | print `maxas.pl -i $fileName microbench.cubin`;
 32 | 
 33 | exit if $?;
 34 | 
 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
 36 | 
 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 38 | 
 39 | print $data;
 40 | 
 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
 42 | 
 43 | 
 44 | 
 45 | 
 46 | sub writeSassFile
 47 | {
 48 |     my ($filename, $loopSize, $loops) = @_;
 49 | 
 50 |     open my $fh, ">$filename" or die "$filename: $!";
 51 | 
 52 |     printf $fh <<'END_SASS', $loops;
 53 | # Kernel: microbench
 54 | 
 55 | <REGISTER_MAPPING>
 56 | 
 57 |      3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
 58 |      7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
 59 |      1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
 60 |      5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
 61 |     35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
 62 |     39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
 63 |     33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
 64 |     37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
 65 | 
 66 |     64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67>
 67 |     80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67>
 68 | 
 69 |     0-127 : r<0-127>
 70 | 
 71 |     100-101 : count, stop
 72 | 
 73 |     //102-112 ~ readAs, readBs, writeS
 74 | 
 75 | </REGISTER_MAPPING>
 76 | 
 77 | --:-:-:-:1      MOV count, RZ;
 78 | --:-:-:-:1      MOV32I stop, %d;
 79 | //--:-:-:-:1      MOV writeS, RZ;
 80 | //--:-:-:-:1      MOV readAs, RZ;
 81 | //--:-:-:-:1      MOV readBs, RZ;
 82 | 
 83 | <CODE>
 84 |     return join '', map "--:-:-:-:1      MOV32I r$_, 1.0;\n", 0..95;
 85 | </CODE>
 86 | 
 87 | LOOP:
 88 | 
 89 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 90 | --:-:-:-:1      IADD count, count, 1;
 91 | 
 92 | <CODE>
 93 |     my $out;
 94 | 
 95 | 
 96 |     my @cOrder;
 97 |     #my @swirl = ([0,1],[0,0],[2,0],[2,1]);
 98 |     my @swirl = ([2,0],[2,1],[0,1],[0,0]);
 99 |     #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
100 |     my @xVals = (0,1,64,65);
101 |     #my @xVals = (0,2,64,66);
102 | 
103 |     my @yVals = (0,2,64,66);
104 | 
105 |     foreach my $y (@yVals)
106 |     {
107 |         foreach my $x (@xVals)
108 |         {
109 |             push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl;
110 |         }
111 |         @xVals = reverse @xVals;
112 |     }
113 | 
114 |     foreach my $j (0..7)
115 |     {
116 |         my $odd  = $j & 1;
117 |         my $nOdd = !$odd + 0;
118 | 
119 | 		my %%insert;
120 | 
121 |         #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
122 | 
123 |         $insert{c62} =
124 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
125 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
126 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
127 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
128 |                 "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
129 | 
130 |         foreach my $c (0 .. 63)
131 |         {
132 |             my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/;
133 |             my $ins    = $insert{"c$c"} || '';
134 |             my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
135 |             my $yield  = $c == 32 ? 'Y' : '-';
136 |             my $wait   = '--'; #$c ? '--' : '01';
137 | 
138 |             $out .= "$wait:-:-:$yield:$stall      FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins";
139 |         }
140 |     }
141 |     return $out;
142 | </CODE>
143 | 
144 | --:-:-:Y:5  @P0 BRA LOOP;
145 | --:-:-:-:5      EXIT;
146 | END_SASS
147 | 
148 |     close $fh;
149 | }
150 | 
151 | __END__
152 | 
153 |         my %%insert = (
154 |             c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
155 |             c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
156 |             c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
157 |             c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
158 |         );


--------------------------------------------------------------------------------
/microbench/throughput2.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | 
 3 | <REGISTER_MAPPING>
 4 | 
 5 |     0-10 : result, r1, r2, r3
 6 |     20-27 ~ count, stop
 7 | 
 8 | </REGISTER_MAPPING>
 9 | 
10 | --:-:-:-:1      MOV count, RZ;
11 | --:-:-:-:1      MOV32I stop, 102400;
12 | --:-:-:-:1      MOV32I r1, 1.0;
13 | --:-:-:-:1      MOV32I r2, 1.0;
14 | --:-:-:-:4      MOV32I r3, 1.0;
15 | 
16 | LOOP:
17 | 
18 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
19 | --:-:-:-:1      IADD count, count, 1;
20 | 
21 | <CODE>
22 |     my $out;
23 | 
24 |     foreach my $i (0 .. 511)
25 |     {
26 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
27 | 
28 |         my $stall = $i == 511 ? 0 : 1;
29 | 
30 |         #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
31 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
32 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
33 |         #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
34 |         #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
35 | 
36 |         #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
37 |         #$out .= "--:-:-:-:1      MOV result, RZ;\n";
38 | 
39 |         $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
40 |         #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
41 |         #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
42 |     }
43 |     return $out;
44 | </CODE>
45 | 
46 | --:-:-:Y:5  @P0 BRA LOOP;
47 | --:-:-:-:5      EXIT;
48 | 


--------------------------------------------------------------------------------
/microbench/throughput3.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Sweeps thread count (1..8 x 128) and loop size (8..16 x 64), benchmarks each combination, then prints a table.
 4 | my %data;                  # loop size => list of GFLOPS values, appended in $thread128 order
 5 | 
 6 | foreach my $thread128 (1 .. 8)
 7 | {
 8 |     foreach my $size64 (8 .. 16)
 9 |     {
10 |         my $loopSize  = $size64 * 64;
11 |         my $loops     = int(2 * 1638400 / ($size64 * $thread128));
12 | 
13 |         my $blocks    = 16;
14 |         my $threads   = $thread128 * 128;
15 |         my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
16 |         my $fileName  = 'throughput2.sass';
17 | 
18 |         #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops;
19 |         #next;
20 | 
21 |         writeSassFile($fileName, $loopSize, $loops);
22 | 
23 |         `maxas.pl -i $fileName microbench.cubin`;
24 |         # A bare exit would return status 0 and hide the assembler failure; `|| 1` covers death by signal.
25 |         exit(($? >> 8) || 1) if $?;
26 | 
27 |         my $data = `Release\\microbench.exe e $blocks $threads $fops`;
28 |         # Take the integer that follows "GFLOPS: " in the benchmark output.
29 |         my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
30 | 
31 |         printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
32 | 
33 |         push @{$data{$loopSize}}, $gflops;
34 |     }
35 | }
36 | print join("\t", 'size', 1 .. 8), "\n";   # header: one column per $thread128 value
37 | foreach my $loopSize (sort {$a <=> $b} keys %data)
38 | {
39 |     print join("\t", $loopSize, @{$data{$loopSize}}), "\n";
40 | }
41 | 
42 | exit;
43 | 
44 | sub writeSassFile    # Writes a kernel whose loop body is $loopSize FFMA instructions to $filename.
45 | {                    # $loops fills the first %d (the `stop` value); the other two %d are derived from $loopSize.
46 |     my ($filename, $loopSize, $loops) = @_;
47 |     # Three-argument open keeps the mode separate from the file name.
48 |     open my $fh, '>', $filename or die "$filename: $!";
49 |     # The range is 0 .. N inclusive, so pass $loopSize - 1 to emit exactly $loopSize FFMAs (what the caller's $fops assumes).
50 |     printf $fh <<'EOF', $loops, $loopSize - 1, $loopSize;
51 | # Kernel: microbench
52 | 
53 | <REGISTER_MAPPING>
54 | 
55 |     0-10 : result, r1, r2, r3, count, stop
56 | 
57 | </REGISTER_MAPPING>
58 | 
59 | --:-:-:-:1      MOV count, RZ;
60 | --:-:-:-:1      MOV32I stop, %d;
61 | --:-:-:-:1      MOV32I r1, 1.0;
62 | --:-:-:-:1      MOV32I r2, 1.0;
63 | --:-:-:-:4      MOV32I r3, 1.0;
64 | 
65 | LOOP:
66 | 
67 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
68 | --:-:-:-:1      IADD count, count, 1;
69 | 
70 | <CODE>
71 |     my $out;
72 | 
73 |     foreach my $i (0 .. %d)
74 |     {
75 |         my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y';
76 | 
77 |         $out .= "--:-:-:$y:1      FFMA result, r1, r2, r3;\n";
78 |     }
79 |     return $out;
80 | </CODE>
81 | 
82 | --:-:-:Y:5  @P0 BRA LOOP;
83 | --:-:-:-:5      EXIT;
84 | EOF
85 |     # Buffered write errors are only reported at close, so check it.
86 |     close $fh or die "$filename: $!";
87 | }
88 | 
89 | __END__
90 | 
91 | 


--------------------------------------------------------------------------------
/microbench/throughput4.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | # Writes the test kernel, assembles it into microbench.cubin with maxas.pl, runs ./microbench and reports GFLOPS.
  4 | my $loopSize  = 512;       # used only in the $fops estimate; writeSassFile hard-codes the matching 0 .. 511 range
  5 | my $blocks    = 64;
  6 | my $loops     = 102400;    # substituted into the generated kernel as the `stop` value
  7 | my $fileName  = 'throughput2.sass';
  8 | 
  9 | writeSassFile($fileName, $loops);
 10 | 
 11 | #print `maxas.pl -p $fileName`;
 12 | #exit;
 13 | 
 14 | print `maxas.pl -i $fileName microbench.cubin`;
 15 | exit(($? >> 8) || 1) if $?;  # a bare exit would return 0 and hide the maxas.pl failure; `|| 1` covers death by signal
 16 | 
 17 | foreach my $thread128 (4)
 18 | {
 19 |     my $threads   = $thread128 * 128;
 20 |     my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
 21 | 
 22 |     print "./microbench e $blocks $threads $fops\n\n";
 23 |     my $data = `./microbench e $blocks $threads $fops`;
 24 |     exit(($? >> 8) || 1) if $?;  # $? is a wait status: exit($?) would truncate e.g. 256 to 0 and report success
 25 | 
 26 |     my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 27 |     # The last column is the measured figure as a percentage of 3050.
 28 |     printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0;
 29 | }
 30 | 
 31 | exit;
 32 | 
 33 | sub writeSassFile    # Writes the microbench kernel source (including its <CODE> section) to $filename.
 34 | {                    # $loops is substituted for the single %d below (the MOV32I value loaded into `stop`).
 35 |     my ($filename, $loops) = @_;
 36 |     # Three-argument open: the mode is explicit, so nothing inside $filename can change how the file is opened.
 37 |     open my $fh, '>', $filename or die "$filename: $!";
 38 |     # The heredoc is the printf format: any literal percent sign added to it must be written as %%.
 39 |     printf $fh <<'EOF', $loops;
 40 | # Kernel: microbench
 41 | 
 42 | <REGISTER_MAPPING>
 43 | 
 44 |     0-10 : result, r1, r2, r3
 45 |     20-27 ~ count, stop
 46 | 
 47 | </REGISTER_MAPPING>
 48 | 
 49 | --:-:-:-:1      MOV count, RZ;
 50 | --:-:-:-:1      MOV32I stop, %d;
 51 | --:-:-:-:1      MOV32I r1, 1.0;
 52 | --:-:-:-:1      MOV32I r2, 1.0;
 53 | --:-:-:-:4      MOV32I r3, 1.0;
 54 | 
 55 | LOOP:
 56 | 
 57 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 58 | --:-:-:-:1      IADD count, count, 1;
 59 | 
 60 | <CODE>
 61 |     my $out;
 62 | 
 63 |     foreach my $i (0 .. 511)
 64 |     {
 65 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
 66 | 
 67 |         my $stall = $i == 511 ? 0 : 1;
 68 | 
 69 |         #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
 70 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
 71 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
 72 |         #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
 73 |         #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
 74 | 
 75 |         #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
 76 |         #$out .= "--:-:-:-:1      MOV result, RZ;\n";
 77 | 
 78 |         $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
 79 |         #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
 80 |         #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
 81 |     }
 82 |     return $out;
 83 | </CODE>
 84 | 
 85 | --:-:-:Y:5  @P0 BRA LOOP;
 86 | --:-:-:-:5      EXIT;
 87 | EOF
 88 |     # Buffered write errors surface at close, so check it rather than leave a truncated file unnoticed.
 89 |     close $fh or die "$filename: $!";
 90 | }
 91 | 
 92 | __END__
 93 | 
 94 | VMAD.U8.U8
 95 | 
 96 | dddd 2655 / 4968 = 53.4%
 97 | 1d1d 4594 / 4968 = 92.4%
 98 | 11d  4746 / 4968 = 95.5%
 99 | 111d 4841 / 4968 = 97.4%
100 | 
101 | block context switches are a little more expensive than thread context switches
102 | 
103 | stall codes:
104 | 
105 | f : 13 clocks
106 | e :  8 clocks
107 | d :  6 clocks
108 | c :  8 clocks, no yield
109 | b : 11 clocks
110 | a : 10 clocks
111 | 9 :  9 clocks
112 | 8 :  8 clocks
113 | 7 :  7 clocks
114 | 6 :  6 clocks
115 | 5 :  5 clocks
116 | 4 :  4 clocks
117 | 3 :  3 clocks
118 | 2 :  2 clocks
119 | 1 :  1 clocks,  no yield
120 | 0 :  0 clocks,  no yield, dual issue


--------------------------------------------------------------------------------
/microbench/throughput5.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | my %p;
  4 | 
  5 | $p{N}         = 8192;
  6 | $p{blocking}  = 8;
  7 | $p{unroll}    = 8;
  8 | $p{threads}   = 64;   #256
  9 | 
 10 | $p{csize}     = $p{blocking} * $p{blocking};
 11 | $p{loopSize}  = $p{unroll} * $p{csize};
 12 | $p{width}     = sqrt($p{csize} * $p{threads});
 13 | $p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
 14 | $p{loops}     = $p{N} / $p{unroll};
 15 | $p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
 16 | 
 17 | my $fileName  = 'throughput2.sass';
 18 | 
 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
 20 | 
 21 | #print join("\t", @params), "\n";
 22 | #print join("\t", @p{@params}), "\n";
 23 | 
 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
 25 | 
 26 | writeSassFile($fileName, $p{loopSize}, $p{loops});
 27 | 
 28 | #print `maxas.pl -p $fileName`;
 29 | #exit;
 30 | 
 31 | print `maxas.pl -i $fileName microbench.cubin`;
 32 | 
 33 | exit if $?;
 34 | 
 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
 36 | 
 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 38 | 
 39 | print $data;
 40 | 
 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
 42 | 
 43 | 
 44 | 
 45 | 
 46 | sub writeSassFile
 47 | {
 48 |     my ($filename, $loopSize, $loops) = @_;
 49 | 
 50 |     open my $fh, ">$filename" or die "$filename: $!";
 51 | 
 52 |     printf $fh <<'END_SASS', $loops;
 53 | # Kernel: microbench
 54 | 
 55 | <REGISTER_MAPPING>
 56 | 
 57 |      1, 9, 2,10,17,25,18,26 : cy0x<0-7>
 58 |      5,13, 6,14,21,29,22,30 : cy1x<0-7>
 59 |      3,11, 0, 8,19,27,16,24 : cy2x<0-7>
 60 |      7,15, 4,12,23,31,20,28 : cy3x<0-7>
 61 |     35,43,32,40,51,59,48,56 : cy4x<0-7>
 62 |     39,47,36,44,55,63,52,60 : cy5x<0-7>
 63 |     33,41,34,42,49,57,50,58 : cy6x<0-7>
 64 |     37,45,38,46,53,61,54,62 : cy7x<0-7>
 65 | 
 66 |     64-71   : j0Ax<0-3>, j0By<0-3>
 67 |     72-79   : j1Ax<0-3>, j1By<0-3>
 68 | 
 69 |     0-79 : r<0-79>
 70 | 
 71 |     100-101 : count, stop
 72 | 
 73 |     //102-112 ~ readAs, readBs, writeS
 74 | 
 75 | </REGISTER_MAPPING>
 76 | 
 77 | --:-:-:-:1      MOV count, RZ;
 78 | --:-:-:-:1      MOV32I stop, %d;
 79 | //--:-:-:-:1      MOV writeS, RZ;
 80 | //--:-:-:-:1      MOV readAs, RZ;
 81 | //--:-:-:-:1      MOV readBs, RZ;
 82 | 
 83 | <CODE>
 84 |     return join '', map "--:-:-:-:1      MOV r$_, RZ;\n", 0..63;
 85 | </CODE>
 86 | 
 87 | <CODE>
 88 |     return join '', map "--:-:-:-:1      MOV32I r$_, 0x00010001;\n", 64..79;
 89 | </CODE>
 90 | 
 91 | LOOP:
 92 | 
 93 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 94 | --:-:-:-:1      IADD count, count, 1;
 95 | 
 96 | <CODE>
 97 |     my $out;
 98 | 
 99 |     my @swirl1 = ([0,0],[0,4],[4,4],[4,0]);
100 |     my @swirl2 = ([0,0],[1,0],[1,1],[0,1]);
101 |     my @swirl3 = ([0,2],[2,2],[2,0],[0,0]);
102 | 
103 |     my @cOrder;
104 |     foreach my $s1 (@swirl1)
105 |     {
106 |         foreach my $s2 (@swirl2)
107 |         {
108 |             foreach my $s3 (@swirl3)
109 |             {
110 |                 push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]];
111 |             }
112 |         }
113 |     }
114 | 
115 |     foreach my $j (0..7)
116 |     {
117 |         my $odd  = $j & 1;
118 |         my $nOdd = !$odd + 0;
119 | 
120 |         my %%insert;
121 | 
122 |         #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
123 | 
124 |         $insert{c62} =
125 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
126 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
127 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
128 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
129 |                 "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
130 | 
131 |         foreach my $c (0 .. 63)
132 |         {
133 |             my ($x,$y) = @{$cOrder[$c]};
134 |             my $ins    = $insert{"c$c"} || '';
135 |             my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
136 |             my $yield  = $c == 32 ? 'Y' : '-';
137 |             my $wait   = '--'; #$c ? '--' : '01';
138 | 
139 |             my $xReg  = $x >> 1;
140 |             my $yReg  = $y >> 1;
141 |             my $xPart = $x & 1 ? '.H1' : '';
142 |             my $yPart = $y & 1 ? '.H1' : '';
143 | 
144 |             $out .= sprintf "$wait:-:-:$yield:$stall      XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x,  $odd,$xReg,$xPart,  $odd,$yReg,$yPart,  $y,$x,  $ins;
145 |         }
146 |     }
147 |     return $out;
148 | </CODE>
149 | 
150 | --:-:-:Y:5  @P0 BRA LOOP;
151 | --:-:-:-:5      EXIT;
152 | END_SASS
153 | 
154 |     close $fh;
155 | }
156 | 
157 | __END__
158 | 
159 |         my %%insert = (
160 |             c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
161 |             c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
162 |             c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
163 |             c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
164 |         );


--------------------------------------------------------------------------------
/microbench/xmad.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Assemble xmad2.sass into microbench.cubin, then run the benchmark only if assembly succeeded.
 4 | print `maxas.pl -i xmad2.sass microbench.cubin`;
 5 | 
 6 | exit(($? >> 8) || 1) if $?;  # propagate the failure; a bare exit would report status 0
 7 | 
 8 | print `./microbench i 1 128`;
 9 | 
10 | 
11 | __END__
12 | 
13 | 


--------------------------------------------------------------------------------
/microbench/xmad2.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #	ord:addr:size:align
  8 | #	0:0x140:8:0
  9 | #	1:0x148:8:0
 10 | #	2:0x150:8:0
 11 | #
 12 | # Instructions:
 13 | 
 14 | <CONSTANT_MAPPING>
 15 |     blockDimX : c[0x0][0x8]
 16 |     blockDimY : c[0x0][0xc]
 17 |     blockDimZ : c[0x0][0x10]
 18 |     gridDimX : c[0x0][0x14]
 19 |     gridDimY : c[0x0][0x18]
 20 |     gridDimZ : c[0x0][0x1c]
 21 | 
 22 |     param_out[0] : c[0x0][0x140]
 23 |     param_out[1] : c[0x0][0x144]
 24 |     param_clocks[0] : c[0x0][0x148]
 25 |     param_clocks[1] : c[0x0][0x14c]
 26 |     param_in[0] : c[0x0][0x150]
 27 |     param_in[1] : c[0x0][0x154]
 28 | </CONSTANT_MAPPING>
 29 | 
 30 | <REGISTER_MAPPING>
 31 | 
 32 | 	0-1 : out<0-1>
 33 | 	2-3 : clocks<0-1>
 34 |     4-15  : result, result2, tid, bid, blockDim, clock1, clock2, scale, s
 35 |     16-24 : a, b, c, x
 36 | 
 37 | </REGISTER_MAPPING>
 38 | 
 39 | // Load in our params
 40 | --:-:-:-:1      MOV out0,      param_out[0];
 41 | --:-:-:-:1      MOV out1,      param_out[1];
 42 | --:-:-:-:1      MOV clocks0,   param_clocks[0];
 43 | --:-:-:-:1      MOV clocks1,   param_clocks[1];
 44 | //--:-:-:-:1      MOV in,       c[0x0][0x148];
 45 | --:-:-:-:1      MOV blockDim, blockDimX;
 46 | 
 47 | --:-:-:-:1      PSETP.AND.AND P0, PT, !PT, PT, PT;
 48 | 
 49 | --:-:-:-:6      MOV32I result,  0xffffffff;
 50 | --:-:-:-:6      MOV32I result2, 0x0;
 51 | --:-:-:-:1      MOV32I a, 1;
 52 | --:-:-:-:1      MOV32I b, 1;
 53 | --:-:-:-:6      MOV32I c, 0x0;
 54 | 
 55 | // (127 - scale) << 23
 56 | //--:-:-:-:6      MOV32I scale, 28;
 57 | //--:-:-:-:6      IADD scale, -scale, 127;
 58 | //--:-:-:-:6      SHL  scale, scale, 23;
 59 | 
 60 | 
 61 | //--:-:-:-:6      MOV32I c, 0x4f765432;
 62 | 
 63 | //--:-:1:-:2      LDG.CI.128 a, [in];
 64 | 
 65 | //01:-:-:-:6      VMAD.S16.S16 result, a, b, c;
 66 | 
 67 | //--:-:-:-:6      MOV result, a;
 68 | 
 69 | // a >> 16 | (b & 0xffff0000)
 70 | 
 71 | //--:-:-:-:6      SHR.U32 result, a, 16;
 72 | //--:-:-:-:6      LOP3.LUT result, result, b, c, 0xf8;
 73 | 
 74 | //--:-:-:-:6      I2I.S32.S16 result, a.H1;
 75 | 
 76 | //--:-:-:Y:d      IADD result.CC, a, -c;
 77 | //--:-:-:Y:2      IADD.X result2, b, -RZ;
 78 | 
 79 | //--:-:-:-:6      SHR result, a, 1;
 80 | 
 81 | //--:-:-:-:6      BFI result, b, 0x1010, a;
 82 | 
 83 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
 84 | 
 85 | //--:-:-:-:6      XMAD.S16.S16 c, a, b, RZ;
 86 | //--:-:-:-:6      ISET.LT.AND s, c, RZ, PT;
 87 | //--:-:-:-:6      IADD result.CC, c, result;
 88 | //--:-:-:-:6      IADD.X result2, s, result2;
 89 | 
 90 | //--:-:-:-:6      XMAD.S16.S16 result.CC, a, b, result;
 91 | //--:-:-:-:6      IADD.X result2, result2, RZ;
 92 | 
 93 | //--:-:-:-:6      SHF.R.S64 result, result, 1, result2;
 94 | //--:-:-:-:6      MOV32I result2, 0;
 95 | 
 96 | --:-:-:-:f      LOP.AND.NZ P0, RZ, result, 1;
 97 | 
 98 | --:-:-:-:6  @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result;
 99 | 
100 | //--:-:1:-:d      I2F.F32.S32 result2, a;
101 | //01:-:-:-:6      FMUL result2, result2, scale;
102 | //01:-:2:-:d      F2I.S32.F32 result, result2;
103 | 
104 | 02:-:-:-:6      CS2R clock2, SR_CLOCKLO;
105 | 
106 | //F2I   = "^$pred?F2I$ftz$x2x$round $r0, $cr20;"
107 | //I2F   = "^$pred?I2F$x2x$rnd $r0, $cr20;"
108 | //x2x   = "\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"
109 | //rnd   = "(?:\.(?<rnd>RN|RM|RP|RZ))?"
110 | //round = "(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?"
111 | //r8    = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B1|B2|B3))?(?<reuse1>\.reuse)?"
112 | //r20   = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B1|B2|B3))?(?<reuse2>\.reuse)?"
113 | 
114 | 
115 | //--:-:-:-:1      XMAD.MRG x, a, b.H1, RZ;
116 | //--:-:-:-:6      XMAD result, a.H1, b.H1, c;
117 | //--:-:-:-:1      XMAD.PSL.CBCC result, a.H1, x.H1, result;
118 | 
119 | // Get the first clock value
120 | 
121 | --:-:1:-:1      S2R tid, SR_TID.X;
122 | --:-:2:-:2      S2R bid, SR_CTAID.X;
123 | 
124 | 
125 | 
126 | // Take the difference of clocks
127 | --:-:-:-:1      IADD clock1, clock2, -clock1;
128 | 
129 | // Setup our output addresses
130 | // Stall your pipeline dependencies properly
131 | 03:-:-:-:1      XMAD tid, blockDim, bid, tid;
132 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
133 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
134 | --:-:-:Y:6      SHL  tid, tid, 0x2;
135 | 
136 | --:-:-:-:1      IADD clocks, clocks, tid;
137 | --:-:-:-:1      IADD out,  out,  tid;
138 | 
139 | // Output the results.
140 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
141 | --:-:-:-:1      STG.E [clocks], result2;
142 | --:-:-:-:1      STG.E [out],    result;
143 | --:-:-:-:5      EXIT;
144 | 
145 | 


--------------------------------------------------------------------------------
/sgemm/batched_gemm.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NervanaSystems/maxas/54eda7af086a46c9dae1688b691968235d560164/sgemm/batched_gemm.xlsx


--------------------------------------------------------------------------------
/sgemm/cublas_sgemm.ptx:
--------------------------------------------------------------------------------
 1 | .version 4.1
 2 | .target sm_50
 3 | .address_size 64
 4 | 
 5 | // ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx
 6 | 
 7 | // You can use maxas to insert cublas_device.lib code into a cubin built from this ptx:
 8 | 
 9 | // From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib
10 | 
11 | // cuobjdump -lelf cublas_device.lib | find "sm_50"
12 | 
13 | // cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib
14 | 
15 | // maxas -l maxwell_sgemm.asm.sm_50.cubin
16 | 
17 | // maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass
18 | // maxas -e -k maxwell_sgemm_128x64_nt  maxwell_sgemm_128x64_nt.sass
19 | 
20 | // maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin
21 | // maxas -i maxwell_sgemm_128x64_nt.sass  cublas_sgemm.cubin
22 | 
23 | // The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas.
24 | 
25 | .visible .entry maxwell_sgemm_128x128_nt(
26 | 	.param .u64 .ptr.global.align 8 param_A,
27 | 	.param .u64 .ptr.global.align 8 param_B,
28 | 	.param .u64 .ptr.global.align 8 param_C,
29 | 	.param .s32 param_lda,
30 | 	.param .s32 param_ldb,
31 | 	.param .s32 param_ldc,
32 | 	.param .s32 param_k,
33 | 	.param .u64 .ptr.global.align 8 param_Alpha,
34 | 	.param .u64 .ptr.global.align 8 param_Beta,
35 | 	.param .s32 param_alpha,
36 | 	.param .s32 param_beta,
37 | 	.param .s32 param_flag
38 | )
39 | .reqntid 256
40 | {
41 | 	.shared .align 16 .b8 share[16384];
42 | 
43 | 	ret;
44 | }
45 | 
46 | .visible .entry maxwell_sgemm_128x64_nt(
47 | 	.param .u64 .ptr.global.align 8 param_A,
48 | 	.param .u64 .ptr.global.align 8 param_B,
49 | 	.param .u64 .ptr.global.align 8 param_C,
50 | 	.param .s32 param_lda,
51 | 	.param .s32 param_ldb,
52 | 	.param .s32 param_ldc,
53 | 	.param .s32 param_k,
54 | 	.param .u64 .ptr.global.align 8 param_Alpha,
55 | 	.param .u64 .ptr.global.align 8 param_Beta,
56 | 	.param .s32 param_alpha,
57 | 	.param .s32 param_beta,
58 | 	.param .s32 param_flag
59 | )
60 | .reqntid 128
61 | {
62 | 	.shared .align 16 .b8 share[12288];
63 | 
64 | 	ret;
65 | }
66 | 


--------------------------------------------------------------------------------
/sgemm/sgemm.cpp:
--------------------------------------------------------------------------------
  1 | // sgemm.cpp : Defines the entry point for the console application.
  2 | //
  3 | 
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | #include <time.h>
  8 | #include <cuda.h>
  9 | #include <cublas_v2.h>
 10 | 
 11 | CUcontext      hContext = 0; // driver context; 0 until created, tested by the *_CHECK macros before teardown
 12 | cublasHandle_t hCublas  = 0; // cuBLAS handle; tested by the *_CHECK macros before teardown
 13 | 
 14 | float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat = 1, int printVars = 0);
 15 | float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat);
 16 | void gflops(const char* ident, int N, float ms, int repeat);
 17 | void test(float* C, float* T, int N, size_t size);
 18 | 
 19 | #define REPEAT_BLOCK 2000 // maximum number of kernel launches timed between one pair of start/stop events
 20 | 
 21 | #define CUDA_CHECK( fn ) do { /* Run a driver-API call once; on failure print it, destroy cuBLAS/context if set, and exit. */ \
 22 | 		CUresult status = (fn); \
 23 | 		if ( CUDA_SUCCESS != status ) { \
 24 | 			const char* errstr = NULL; \
 25 | 			if ( CUDA_SUCCESS != cuGetErrorString(status, &errstr) || !errstr ) errstr = "unknown error"; /* never hand NULL to %s */ \
 26 | 			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
 27 | 			if (hCublas)  cublasDestroy(hCublas); \
 28 | 			if (hContext) cuCtxDestroy(hContext); \
 29 | 			exit(EXIT_FAILURE); \
 30 | 		} \
 31 | 	} while (0)
 32 | 
 33 | #define CUBLAS_CHECK( fn ) do { /* Run a cuBLAS call once; on failure print it, destroy cuBLAS/context if set, and exit. */ \
 34 | 		cublasStatus_t status = (fn); \
 35 | 		if ( CUBLAS_STATUS_SUCCESS != status ) { \
 36 | 			printf("Cublas Failure (line %d of file %s):\n\t%s returned %d\n", __LINE__, __FILE__, #fn, status); \
 37 | 			if (hCublas)  cublasDestroy(hCublas); \
 38 | 			if (hContext) cuCtxDestroy(hContext); \
 39 | 			exit(EXIT_FAILURE); \
 40 | 		} \
 41 | 	} while (0)
 42 | // Benchmark driver. Args: [N/128 (1-64)] [repeat (1-10000)] [CUarray_format of A/B] [printVars (1-100)]; out-of-range values fall back to defaults.
 43 | int main(int argc, char* argv[])
 44 | {
 45 | 	char deviceName[32];
 46 | 	int count, ordinal, major, minor;
 47 | 	CUdevice  hDevice;
 48 | 	CUevent hStart, hStop;
 49 | 	CUdeviceptr devA, devB, devC, devT, otherDevA, otherDevB;
 50 | 
 51 | 	// Initialize the Driver API and find a device
 52 | 	CUDA_CHECK( cuInit(0) );
 53 | 	CUDA_CHECK( cuDeviceGetCount(&count) );
 54 | 	for (ordinal = 0; ordinal < count; ordinal++)
 55 | 	{
 56 | 		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
 57 | 		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
 58 | 		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
 59 | 		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
 60 | 		if (major == 5 && minor >= 2) // a cubin only loads on its own major version; `major >= 5` also accepted e.g. 6.2, 7.5, 8.6
 61 | 		{
 62 | 			//printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
 63 | 			break;
 64 | 		}
 65 | 	}
 66 | 	if (ordinal == count)
 67 | 	{
 68 | 		printf("No compute 5.2 device found, exiting.\n");
 69 | 		exit(EXIT_FAILURE);
 70 | 	}
 71 | 
 72 | 	// First command line arg is the size of N divided by 128
 73 | 	int thread128 = 64;
 74 | 	if (argc > 1)
 75 | 		thread128 = atoi(argv[1]);
 76 | 	if (thread128 > 64 || thread128 < 1)
 77 | 		thread128 = 64;
 78 | 
 79 | 	// Second command line arg is the repeat count for benchmarking
 80 | 	int repeat = 1;
 81 | 	if (argc > 2)
 82 | 		repeat = atoi(argv[2]);
 83 | 	if (repeat > 10000 || repeat < 1)
 84 | 		repeat = 1;
 85 | 
 86 | 	// Third command line arg is the normalized float size
 87 | 	CUarray_format format = CU_AD_FORMAT_FLOAT;
 88 | 	if (argc > 3)
 89 | 		format = (CUarray_format)atoi(argv[3]);
 90 | 	if (format != CU_AD_FORMAT_FLOAT && format != CU_AD_FORMAT_UNSIGNED_INT16 && format != CU_AD_FORMAT_UNSIGNED_INT8)
 91 | 		format = CU_AD_FORMAT_FLOAT;
 92 | 
 93 | 	// Fourth command line arg is for printf debugging
 94 | 	int printVars = 0;
 95 | 	if (argc > 4)
 96 | 		printVars = atoi(argv[4]);
 97 | 	if (printVars > 100 || printVars < 1)
 98 | 		printVars = 0;
 99 | 
100 | 	int N = thread128 * 128;
101 | 	float alpha = 1, beta = 0, ms = 1;
102 | 	size_t sizeOther = N * N;
103 | 	size_t sizeFloat = sizeOther * 4;
104 | 
105 | 	float* A = (float*)malloc(sizeFloat);
106 | 	float* B = (float*)malloc(sizeFloat);
107 | 	float* C = (float*)malloc(sizeFloat);
108 | 	float* T = (float*)malloc(sizeFloat);
109 | 	float *otherA, *otherB;
110 | 	if (!A || !B || !C || !T) { printf("Out of host memory, exiting.\n"); exit(EXIT_FAILURE); }
111 | 	//int counter = 0;
112 | 	//srand((unsigned int)time(0));
113 | 	for(int i = 0; i < N * N; i++) //
114 | 	{
115 | 		//A[i] = (float)rand() / (float)RAND_MAX;
116 | 		//B[i] = (float)rand() / (float)RAND_MAX;
117 | 		A[i] = B[i] = 1.0f; // * (i & 3) + 1.0f;
118 | 		//A[i] = 1.0f;
119 | 		//B[i * N + counter++] = 1.0f; // identity matrix
120 | 	}
121 | 
122 | 	if (format == CU_AD_FORMAT_FLOAT)
123 | 	{
124 | 		sizeOther *= 4;
125 | 		otherA = A;
126 | 		otherB = B;
127 | 	}
128 | 	else if (format == CU_AD_FORMAT_UNSIGNED_INT16)
129 | 	{
130 | 		sizeOther *= 2;
131 | 		unsigned short* othera = (unsigned short*)malloc(sizeOther);
132 | 		unsigned short* otherb = (unsigned short*)malloc(sizeOther);
133 | 		if (!othera || !otherb) { printf("Out of host memory, exiting.\n"); exit(EXIT_FAILURE); }
134 | 		for(int i = 0; i < N * N; i++)
135 | 			othera[i] = otherb[i] = 65535;
136 | 		otherA = reinterpret_cast<float*>(othera);
137 | 		otherB = reinterpret_cast<float*>(otherb);
138 | 	}
139 | 	else // (format == CU_AD_FORMAT_UNSIGNED_INT8)
140 | 	{
141 | 		otherA = (float*)malloc(sizeOther);
142 | 		otherB = (float*)malloc(sizeOther);
143 | 		if (!otherA || !otherB) { printf("Out of host memory, exiting.\n"); exit(EXIT_FAILURE); }
144 | 		memset(otherA, 255, sizeOther); memset(otherB, 255, sizeOther);
145 | 	}
146 | 
147 | 	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
148 | 	//CUBLAS_CHECK( cublasCreate(&hCublas) );
149 | 	
150 | 	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT 
151 | 	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
152 | 
153 | 	CUDA_CHECK( cuMemAlloc(&devA, sizeFloat) );
154 | 	CUDA_CHECK( cuMemAlloc(&devB, sizeFloat) );
155 | 	CUDA_CHECK( cuMemAlloc(&devC, sizeFloat) );
156 | 	CUDA_CHECK( cuMemAlloc(&devT, sizeFloat) );
157 | 	
158 | 	CUDA_CHECK( cuMemcpyHtoD(devA, A, sizeFloat) );
159 | 	CUDA_CHECK( cuMemcpyHtoD(devB, B, sizeFloat) );
160 | 	CUDA_CHECK( cuMemsetD8(devC, 0, sizeFloat) );
161 | 	CUDA_CHECK( cuMemsetD8(devT, 0, sizeFloat) );
162 | 
163 | 	if (format == CU_AD_FORMAT_FLOAT)
164 | 	{
165 | 		otherDevA = devA;
166 | 		otherDevB = devB;
167 | 	}
168 | 	else
169 | 	{
170 | 		CUDA_CHECK( cuMemAlloc(&otherDevA, sizeOther) );
171 | 		CUDA_CHECK( cuMemAlloc(&otherDevB, sizeOther) );
172 | 		CUDA_CHECK( cuMemcpyHtoD(otherDevA, otherA, sizeOther) );
173 | 		CUDA_CHECK( cuMemcpyHtoD(otherDevB, otherB, sizeOther) );
174 | 	}
175 | 
176 | 	// Warm up the clock (unless under nsight)
177 | 	//if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER 
178 | 	//	for (int i = 0; i < 3; i++)
179 | 	//		CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast<float*>(devA), N, reinterpret_cast<float*>(devB), N, &beta, reinterpret_cast<float*>(devT), N) );
180 | 
181 | 	// Launch our kernel
182 | 	ms = assemblySgemm("sgemm_kernel_64", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars);
183 | 	gflops("Max64 ", N, ms, repeat);
184 | 
185 | 	ms = assemblySgemm("sgemm_kernel_128", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars);
186 | 	gflops("Max128", N, ms, repeat);
187 | 
188 | 	//ms = cublasSgemm("maxwell_sgemm_128x64_nt", devT, devA, devB, N, hStart, hStop, repeat);
189 | 	//gflops("Cub64 ", N, ms, repeat);
190 | 
191 | 	//ms = cublasSgemm("maxwell_sgemm_128x128_nt", devT, devA, devB, N, hStart, hStop, repeat);
192 | 	//gflops("Cub128", N, ms, repeat);
193 | 
194 | 	// Run cublas again for the same repeat count for comparison
195 | 	//CUDA_CHECK( cuEventRecord(hStart, NULL) );
196 | 	//for (int i = 0; i < repeat; i++)
197 | 	//	CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast<float*>(devA), N, reinterpret_cast<float*>(devB), N, &beta, reinterpret_cast<float*>(devT), N) );
198 | 	//CUDA_CHECK( cuEventRecord(hStop, NULL) );
199 | 	//CUDA_CHECK( cuEventSynchronize(hStop) );
200 | 	//CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
201 | 	//gflops("Cublas", N, ms, repeat);
202 | 
203 | 	// Get back our results from each kernel (devT is still all zeros while the cublas runs above stay commented out)
204 | 	CUDA_CHECK( cuMemcpyDtoH(C, devC, sizeFloat) );
205 | 	CUDA_CHECK( cuMemcpyDtoH(T, devT, sizeFloat) );
206 | 	
207 | 	// Cleanup and shutdown of cuda
208 | 	CUDA_CHECK( cuMemFree(devA) );
209 | 	CUDA_CHECK( cuMemFree(devB) );
210 | 	CUDA_CHECK( cuMemFree(devC) );
211 | 	CUDA_CHECK( cuMemFree(devT) );
212 | 	if (format != CU_AD_FORMAT_FLOAT)
213 | 	{
214 | 		CUDA_CHECK( cuMemFree(otherDevA) );
215 | 		CUDA_CHECK( cuMemFree(otherDevB) );
216 | 	}
217 | 
218 | 	CUDA_CHECK( cuEventDestroy(hStart) );
219 | 	CUDA_CHECK( cuEventDestroy(hStop) );
220 | 
221 | 	//CUBLAS_CHECK( cublasDestroy(hCublas) );
222 | 	//hCublas  = 0;
223 | 	CUDA_CHECK( cuCtxDestroy(hContext) );
224 | 	hContext = 0;
225 | 
226 | 	// compare C and T for accuracy
227 | 	test(C, T, N, sizeFloat);
228 | 
229 | 	// And free up host memory
230 | 	free(A); free(B); free(C); free(T);
231 | 
232 | 	if (format != CU_AD_FORMAT_FLOAT) // in the float case otherA/otherB alias A/B, which were freed above
233 | 	{
234 | 		free(otherA); 
235 | 		free(otherB);
236 | 	}
237 | 
238 | 	return 0;
239 | }
240 | 
241 | // Our kernel wrapper: loads sgemm.cubin, binds texA/texB to devA/devB, launches `kernel` `repeat` times and returns the total elapsed ms.
242 | float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat, int printVars)
243 | {
244 | 	// Configure our x and y grid dimensions (assume nice square matrixes).
245 | 	// For sgemm_kernel_128: each block gets 128 tracks from A and 128 tracks from B.
246 | 	// Each of the 256 threads calculates 64 elements of that 128x128 sub matrix of C.
247 | 	// See Figure 2 here to get the gist of things (we use a different mapping to maximize LDS.128 usage):
248 | 	// http://icl.cs.utk.edu/projectsfiles/magma/pubs/fermi_gemm.pdf
249 | 
250 | 	int threads, width;
251 | 	if (strcmp(kernel, "sgemm_kernel_64") == 0)
252 | 	{
253 | 		threads = 64;
254 | 		width   = 64;
255 | 	}
256 | 	else
257 | 	{
258 | 		threads = 256;
259 | 		width   = 128;
260 | 	}
261 | 
262 | 	int gridDimXY = N / width + (N % width != 0);
263 | 	int blocks    = gridDimXY * gridDimXY;
264 | 
265 | 	// Set up our debug printf output buffer: printVars ints per thread
266 | 	CUdeviceptr devD = 0; // CUdeviceptr is an integer type, so use 0 rather than NULL
267 | 	int* D = NULL;
268 | 	int  sizeD = 0;
269 | 
270 | 	if (printVars)
271 | 	{
272 | 		sizeD = blocks * threads * printVars * sizeof(int);
273 | 		D = (int*)malloc(sizeD);
274 | 		if (!D) { printf("Out of host memory for debug buffer\n"); if (hContext) cuCtxDestroy(hContext); exit(EXIT_FAILURE); }
275 | 		CUDA_CHECK( cuMemAlloc(&devD, sizeD) );
276 | 		CUDA_CHECK( cuMemsetD8(devD, 0, sizeD) );
277 | 	}
278 | 
279 | 	// Load the cubin
280 | 	CUmodule hModule;
281 | 	CUDA_CHECK( cuModuleLoad(&hModule, "sgemm.cubin") );
282 | 
283 | 	// Load the textures
284 | 	CUtexref texA, texB;
285 | 	CUDA_CHECK( cuModuleGetTexRef(&texA, hModule, "texA") );
286 | 	CUDA_CHECK( cuModuleGetTexRef(&texB, hModule, "texB") );
287 | 
288 | 	// Configure the textures
289 | 	CUDA_CHECK( cuTexRefSetFormat(texA, format, 4) );
290 | 	CUDA_CHECK( cuTexRefSetFormat(texB, format, 4) );
291 | 
292 | 	CUDA_CHECK( cuTexRefSetAddress(NULL, texA, devA, size) );
293 | 	CUDA_CHECK( cuTexRefSetAddress(NULL, texB, devB, size) );
294 | 
295 | 	// Load the kernel function
296 | 	CUfunction hKernel;
297 | 	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) );
298 | 
299 | 	// Setup the params (order matches the kernel signature in sgemm.cu: C, m, n, k, lda, ldb, ldc, alpha, D)
300 | 	float alpha = 1.0f;
301 | 	void* params[] = { &devC, &N, &N, &N, &N, &N, &N, &alpha, &devD };
302 | 
303 | 	float totalTime = 0;
304 | 	// Launch the kernel repeat times.. but break it up into pieces so as not to lock things up.
305 | 	while (repeat > 0)
306 | 	{
307 | 		float ms;
308 | 		int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat;
309 | 		CUDA_CHECK( cuEventRecord( hStart, NULL ) );
310 | 		
311 | 		for (int i = 0; i < r; i++)
312 | 			CUDA_CHECK( cuLaunchKernel(hKernel, gridDimXY, gridDimXY, 1, threads, 1, 1, 0, 0, params, 0) );
313 | 		
314 | 		CUDA_CHECK( cuEventRecord( hStop, NULL ) );
315 | 		CUDA_CHECK( cuEventSynchronize( hStop ) );
316 | 		CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) );
317 | 		totalTime += ms;
318 | 		repeat -= r;
319 | 	}
320 | 
321 | 
322 | 	CUDA_CHECK( cuModuleUnload(hModule) );
323 | 
324 | 	// And here we print out the debug info if requested:
325 | 	if (printVars)
326 | 	{
327 | 		CUDA_CHECK( cuMemcpyDtoH(D, devD, sizeD) );
328 | 		CUDA_CHECK( cuMemFree(devD) );
329 | 		int   *iD = D; // iD/fD are only read by the commented-out printf below; kept so it can be re-enabled
330 | 		float *fD = reinterpret_cast<float*>(D);
331 | 		unsigned int *uD = reinterpret_cast<unsigned int*>(D);
332 | 
333 | 		for (int by = 0; by < gridDimXY; by++)
334 | 		{
335 | 			for (int bx = 0; bx < gridDimXY; bx++)
336 | 			{
337 | 				unsigned int clock = 0xffffffff, sm = 0;
338 | 
339 | 				for (int tid = 0; tid < threads; tid++)
340 | 				{
341 | 					//printf("by: %3d, bx: %3d, tid:%3d, rA:%5d, rB:%5d, wr:%5d, rd:%5d, cx:%5d, cy:%5d, ci:%5d, c:%.2f\n", 
342 | 					//printf("by: %3d, bx: %3d, tid:%3d, t0:%5d, end:%5d, k:%5d, tid2:%5d, tid15:%5d, ldx:%5d, t2:%5d, t4:%5d\n", 
343 | 					//	    by,      bx,      tid,     iD[0],  iD[1],   iD[2], iD[3],    iD[4],     iD[5],   iD[6],  iD[7]
344 | 					//);
345 | 					if (uD[1] < clock) clock = uD[1]; // keep the smallest word-1 value seen in this block
346 | 					sm = uD[0];                       // word 0 of the last thread's record wins
347 | 
348 | 					iD += printVars;
349 | 					fD += printVars;
350 | 					uD += printVars;
351 | 				}
352 | 				printf("%02u %08u %d %d\n", sm, clock, by, bx); // sm is unsigned, so print it with %u
353 | 			}
354 | 		}
355 | 		free(D);
356 | 	}
357 | 
358 | 	return totalTime;
359 | }
360 | // Two CUdeviceptr words passed as one kernel argument. NOTE(review): looks like it fills a .u64 pointer param in the 32-bit build - confirm.
361 | struct dPointer
362 | {
363 | 	CUdeviceptr lo;
364 | 	CUdeviceptr hi;
365 | };
366 | // Launch one of the kernels from cublas_sgemm.cubin `repeat` times on N x N inputs and return the total elapsed ms.
367 | float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat)
368 | {
369 | 	int threads, gridX, gridY;
370 | 	if (strcmp(kernel, "maxwell_sgemm_128x64_nt") == 0)
371 | 	{
372 | 		threads = 128;
373 | 		gridX = N / 128 + (N % 128 != 0);
374 | 		gridY = N / 64  + (N % 64  != 0);
375 | 	}
376 | 	else
377 | 	{
378 | 		threads = 256;
379 | 		gridX = gridY = N / 128 + (N % 128 != 0);
380 | 	}
381 | 	// (the unused `blocks` local was removed; the grid is passed to cuLaunchKernel as gridX, gridY)
382 | 
383 | 	// Load the cubin
384 | 	// See cublas_sgemm.ptx for info on how to build this.
385 | 	CUmodule hModule;
386 | 	CUDA_CHECK( cuModuleLoad(&hModule, "cublas_sgemm.cubin") );
387 | 
388 | 	// Load the kernel function
389 | 	CUfunction hKernel;
390 | 	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) );
391 | 
392 | 	// Setup the params
393 | 	// I should probably be working in 64 bits...
394 | 	dPointer dA = { devA, 0 };
395 | 	dPointer dB = { devB, 0 };
396 | 	dPointer dC = { devC, 0 };
397 | 
398 | 	int   flag  = 0;
399 | 	float alpha = 1.0;
400 | 	float beta  = 0.0;
401 | 	// NOTE(review): the two pointer slots after the four N values (param_Alpha/param_Beta in the ptx) both receive &dA - confirm this is intended.
402 | 	void* params[] = { &dA, &dB, &dC, &N, &N, &N, &N, &dA, &dA, &alpha, &beta, &flag };
403 | 
404 | 	float totalTime = 0;
405 | 	// Launch the kernel repeat times.. but break it up into pieces so as not to lock things up.
406 | 	while (repeat > 0)
407 | 	{
408 | 		float ms;
409 | 		int r = repeat > REPEAT_BLOCK ? REPEAT_BLOCK : repeat;
410 | 		CUDA_CHECK( cuEventRecord( hStart, NULL ) );
411 | 		
412 | 		for (int i = 0; i < r; i++)
413 | 			CUDA_CHECK( cuLaunchKernel(hKernel, gridX, gridY, 1, threads, 1, 1, 0, 0, params, 0) );
414 | 		
415 | 		CUDA_CHECK( cuEventRecord( hStop, NULL ) );
416 | 		CUDA_CHECK( cuEventSynchronize( hStop ) );
417 | 		CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) );
418 | 		totalTime += ms;
419 | 		repeat -= r;
420 | 	}
421 | 
422 | 
423 | 	CUDA_CHECK( cuModuleUnload(hModule) );
424 | 
425 | 	return totalTime;
426 | }
427 | // Print the GFLOPS achieved by `repeat` launches of an N x N x N sgemm that took `ms` milliseconds in total.
428 | void gflops(const char* ident, int N, float ms, int repeat)
429 | {
430 | 	const float  msPerRun  = ms / repeat;                        // average milliseconds per launch
431 | 	const double flopCount = (double)N * N * N * 2.0 + N * N;    // standard sgemm flops formula
432 | 	printf("%s GFLOPS: %.2f (size: %d, iterations: %d)\n", ident, flopCount / (msPerRun * 1000000.0), N, repeat);
433 | }
434 | // Compare our result C with the reference T (N x N floats, `size` bytes each) and print the number of mismatching elements.
435 | // For N <= 512 a per-element map is also written to data.txt: "=" for a match, "c!t" for a mismatch.
436 | void test(float* C, float* T, int N, size_t size)
437 | {
438 | 	// Fast path: bitwise-identical buffers need no element scan.
439 | 	if (memcmp(C, T, size) == 0)
440 | 	{
441 | 		printf("%d errors\n", 0);
442 | 		return;
443 | 	}
444 | 
445 | 	if (N > 512) // The data.txt dump gets too big and slow for large N, so only count.
446 | 	{
447 | 		// memcmp's return value is a sign, not a count, so count the differing elements explicitly.
448 | 		int errors = 0;
449 | 		size_t count = size / sizeof(float);
450 | 		for (size_t i = 0; i < count; ++i)
451 | 			if (C[i] != T[i])
452 | 				errors++;
453 | 		printf("%d errors\n", errors);
454 | 		return;
455 | 	}
456 | 
457 | 	FILE* file = fopen("data.txt", "w"); // portable fopen instead of the MSVC-only fopen_s
458 | 	if (!file)
459 | 	{
460 | 		printf("Cannot open data.txt for writing\n");
461 | 		return;
462 | 	}
463 | 
464 | 	int errors = 0;
465 | 	for (int y = 0; y < N; ++y)
466 | 	{
467 | 		for (int x = 0; x < N; ++x)
468 | 		{
469 | 			float c = C[x*N + y];
470 | 			float t = T[x*N + y];
471 | 			if (c != t)
472 | 			{
473 | 				errors++;
474 | 				fprintf(file, "%.8f!%.8f\t", c , t);
475 | 			}
476 | 			else
477 | 				fprintf(file, "=");
478 | 		}
479 | 		fprintf(file, "\n");
480 | 	}
481 | 	fclose(file);
482 | 	printf("%d errors\n", errors);
483 | }


--------------------------------------------------------------------------------
/sgemm/sgemm.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | // Note this file isn't configured to automatically compile.
  3 | // Here's how:
  4 | 
  5 | // If you want to look at the ptx first:
  6 | // nvcc -arch sm_50 -m 32 -ptx sgemm.cu
  7 | 
  8 | // Manually compile your kernel to a cubin.
  9 | // You should only have to do this once, unless you change params or shared size or globals:
 10 | // nvcc -arch sm_50 -m 32 -cubin sgemm.cu
 11 | 
 12 | // If tweaking a kernel or writing a new one based on this shell code you would then do this:
 13 | // maxas.pl -e kernel.cubin kernel.sass
 14 | 
 15 | // I've already included a modified kernel (sgemm.sass) so the next step is..
 16 | 
 17 | // Splice the manually assembled code back into the cubin:
 18 | // maxas.pl -i sgemm.sass sgemm.cubin
 19 | 
 20 | #include <device_functions.h>
 21 | #include <device_launch_parameters.h>
 22 | #include <cuda_texture_types.h>
 23 | #include <texture_fetch_functions.h>
 24 | 
 25 | typedef texture<float4, cudaTextureType1D, cudaReadModeElementType> floatTex; // 1D texture reference type with float4 elements
 26 | 
 27 | floatTex  texA(0, cudaFilterModePoint, cudaAddressModeBorder); // file-scope texture references named texA / texB
 28 | floatTex  texB(0, cudaFilterModePoint, cudaAddressModeBorder);
 29 | 
 30 | // Use extern C so C++ doesn't mangle our kernel name
 31 | extern "C"
 32 | // This kernel requires 256x1x1 threads per block
 33 | __global__ void __launch_bounds__(256) sgemm_kernel_128(
 34 | 	float *C,
 35 | 	const int m,   const int n,   const int k,
 36 | 	const int lda, const int ldb, const int ldc,
 37 | 	float alpha, int *D)
 38 | {
 39 | 	// Declare any shared memory your kernel requires
 40 | 	// Or you could just pass the amount in as a param to cuLaunchKernel
 41 | 	__shared__ float4 share[1024];
 42 | 
 43 | 	int tid = threadIdx.x;
 44 | 
 45 | 	// If you use indirect texture references, they will be passed as params at the end of the param list
 46 | 	// So set that up here to make sure they're available in your kernel
 47 | 	floatTex tex = tid > 127 ? texB : texA;
 48 | 
 49 | 	// Make use of shared and your textures so it doesn't get optimized away
 50 | 	share[tid] = tex1Dfetch(tex, tid);
 51 | 
 52 | 	__syncthreads();
 53 | 
 54 | 	// output something so your setup isn't optimized away.
 55 | 	C[tid] = share[255-tid].x;
 56 | }
 57 | 
 58 | extern "C" // placeholder shell with the same parameter list as sgemm_kernel_128, bounded to 64 threads per block
 59 | __global__ void __launch_bounds__(64) sgemm_kernel_64(
 60 | 	float *C,
 61 | 	const int m,   const int n,   const int k,
 62 | 	const int lda, const int ldb, const int ldc,
 63 | 	float alpha, int *D)
 64 | {
 65 | 	__shared__ float4 share[512]; // 512 float4 elements = 8192 bytes
 66 | 
 67 | 	int tid = threadIdx.x;
 68 | 
 69 | 	floatTex tex = tid > 127 ? texB : texA; // NOTE(review): with at most 64 threads, tid > 127 never holds at run time
 70 | 
 71 | 	share[tid] = tex1Dfetch(tex, tid);
 72 | 
 73 | 	__syncthreads();
 74 | 
 75 | 	C[tid] = share[255-tid].x; // NOTE(review): for tid < 64 this reads share[192..255], which this body never writes - presumably fine only because the body is replaced by the sass; confirm
 76 | }
 77 | 
 78 | // A note about using the Cuda Runtime.
 79 | // If that's your preference over the driver API then here's what you'd do:
 80 | 
 81 | // In your project properties in the Cuda C/C++ panel:
 82 | //    -Set the "Keep Processed Files" (-keep) option
 83 | //    -Add a -v manually to the command line
 84 | // If compiling on command line just add -keep -v options to nvcc.
 85 | // Rebuild your solution and look in the log for these lines that follow the ptxas step:
 86 | 
 87 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
 88 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
 89 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
 90 | 
 91 | // You just need to manually run these 3 commands (or add them to a build script)
 92 | // after you've modified the cubin generated from the preceding ptxas command.
 93 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you
 94 | // build your project (or you could manually run the linker step as well).
 95 | 
 96 | // Having done that you can call your kernel normally using the <<< >>> syntax.
 97 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
 98 | // With fatbin you can also keep non-maxwell optimized versions of your code.
 99 | 
100 | 
101 | // I just discovered this also works as a shortcut to the above:
102 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu
103 | 
104 | // The cu kernel definitions above need to have empty bodies.
105 | // And, the cu file must be compiled to a lib separately before linking.


--------------------------------------------------------------------------------
/sgemm/sgemm.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | use warnings;
  4 | my $CU_AD_FORMAT_UNSIGNED_INT8  = 0x01;   # texture formats passed as the third argument of sgemm.exe
  5 | my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02;
  6 | my $CU_AD_FORMAT_FLOAT          = 0x20;
  7 | 
  8 | if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9])   # rebuild when the source is newer (mtime)
  9 | {
 10 |     print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`;
 11 |     exit 1 if $?;   # report the failure through a non-zero exit status
 12 |     print `maxas.pl -i sgemm128.sass sgemm.cubin`;
 13 |     exit 1 if $?;
 14 |     print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`;
 15 | }
 16 | if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9])
 17 | {
 18 |     print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`;
 19 |     exit 1 if $?;
 20 |     print `maxas.pl -i sgemm64.sass sgemm.cubin`;
 21 |     exit 1 if $?;
 22 |     print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`;
 23 | }
 24 | 
 25 | #print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2);
 26 | 
 27 | `Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`;   # run once with float inputs; the output is discarded
 28 | 
 29 | print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`;
 30 | exit;   # NOTE: everything below this line is currently unreachable
 31 | 
 32 | my %data;
 33 | foreach my $thread128 (4 .. 64)
 34 | {
 35 |     my $N = $thread128 * 128;
 36 | 
 37 |     my $iterations = int(20 * (64 * 128)**3 / $N**3);   # scale the repeat count inversely with N**3
 38 |     $iterations = 10000 if $iterations > 10000;
 39 | 
 40 |     print "$N $iterations\n";
 41 | 
 42 |     my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`;
 43 | 
 44 |     foreach my $bench (split "\n", $data)
 45 |     {
 46 |         if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /)
 47 |         {
 48 |             push @{$data{$N}}, $2;
 49 |             print "$1 $2\n";
 50 |         }
 51 |     }
 52 | }
 53 | print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n";
 54 | 
 55 | foreach my $N (sort { $a <=> $b } keys %data)
 56 | {
 57 |     print join("\t", $N, @{$data{$N}}), "\n";   # lead each row with N so it lines up with the "size" header column
 58 | }
 59 | 
 60 | 
 61 | #print $data;
 62 | 
 63 | __END__
 64 | 
 65 | 
 66 | 64 * 128 * 16 * 1.620 * .931 / 520
 67 | 
 68 | Max64  GFLOPS: 1377.38 (size: 256, iterations: 2000)
 69 | Max128 GFLOPS: 973.70 (size: 256, iterations: 2000)
 70 | Cub64  GFLOPS: 1272.42 (size: 256, iterations: 2000)
 71 | Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000)
 72 | 
 73 | my @data = grep /\S/, split "\n", $data;
 74 | 
 75 | my $min;
 76 | my %smData;
 77 | my @sdata;
 78 | foreach (@data)
 79 | {
 80 |     next if /GFLOPS/;
 81 | 
 82 |     my ($sm, $clock, $by, $bx) = split /\s+/;
 83 | 
 84 |     $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm};
 85 | 
 86 |     $min = $clock if !$min || $clock < $min;
 87 | 
 88 |     push @sdata, [$sm, $clock, $by, $bx];
 89 | }
 90 | 
 91 | foreach (@sdata)
 92 | {
 93 |     $_->[1] -= $smData{$_->[0]};
 94 | }
 95 | 
 96 | foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata)
 97 | {
 98 |     printf "%02d %8u  by: %2d bx: %2d\n", @$_;
 99 | 
100 | }
101 | 
102 | 
103 | 


--------------------------------------------------------------------------------
/sgemm/sgemm.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 11.00
 3 | # Visual Studio 2010
 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}"
 5 | EndProject
 6 | Global
 7 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 8 | 		Debug|Win32 = Debug|Win32
 9 | 		Release|Win32 = Release|Win32
10 | 	EndGlobalSection
11 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
12 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32
13 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32
14 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32
15 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32
16 | 	EndGlobalSection
17 | 	GlobalSection(SolutionProperties) = preSolution
18 | 		HideSolutionNode = FALSE
19 | 	EndGlobalSection
20 | EndGlobal
21 | 


--------------------------------------------------------------------------------
/sgemm/sgemm.vcxproj:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup Label="ProjectConfigurations">
 4 |     <ProjectConfiguration Include="Debug|Win32">
 5 |       <Configuration>Debug</Configuration>
 6 |       <Platform>Win32</Platform>
 7 |     </ProjectConfiguration>
 8 |     <ProjectConfiguration Include="Release|Win32">
 9 |       <Configuration>Release</Configuration>
10 |       <Platform>Win32</Platform>
11 |     </ProjectConfiguration>
12 |   </ItemGroup>
13 |   <PropertyGroup Label="Globals">
14 |     <ProjectGuid>{D571379D-3653-43CB-BE83-A6C68D392A05}</ProjectGuid>
15 |     <Keyword>Win32Proj</Keyword>
16 |     <RootNamespace>sgemm</RootNamespace>
17 |   </PropertyGroup>
18 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
19 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
20 |     <ConfigurationType>Application</ConfigurationType>
21 |     <UseDebugLibraries>true</UseDebugLibraries>
22 |     <CharacterSet>Unicode</CharacterSet>
23 |   </PropertyGroup>
24 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
25 |     <ConfigurationType>Application</ConfigurationType>
26 |     <UseDebugLibraries>false</UseDebugLibraries>
27 |     <WholeProgramOptimization>true</WholeProgramOptimization>
28 |     <CharacterSet>Unicode</CharacterSet>
29 |   </PropertyGroup>
30 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
31 |   <ImportGroup Label="ExtensionSettings">
32 |   </ImportGroup>
33 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
34 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
35 |   </ImportGroup>
36 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
37 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
38 |   </ImportGroup>
39 |   <PropertyGroup Label="UserMacros" />
40 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
41 |     <LinkIncremental>true</LinkIncremental>
42 |   </PropertyGroup>
43 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
44 |     <LinkIncremental>false</LinkIncremental>
45 |   </PropertyGroup>
46 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
47 |     <ClCompile>
48 |       <PrecompiledHeader>
49 |       </PrecompiledHeader>
50 |       <WarningLevel>Level3</WarningLevel>
51 |       <Optimization>Disabled</Optimization>
52 |       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
53 |       <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
54 |     </ClCompile>
55 |     <Link>
56 |       <SubSystem>Console</SubSystem>
57 |       <GenerateDebugInformation>true</GenerateDebugInformation>
58 |       <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
59 |       <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
60 |     </Link>
61 |   </ItemDefinitionGroup>
62 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
63 |     <ClCompile>
64 |       <WarningLevel>Level3</WarningLevel>
65 |       <PrecompiledHeader>
66 |       </PrecompiledHeader>
67 |       <Optimization>MaxSpeed</Optimization>
68 |       <FunctionLevelLinking>true</FunctionLevelLinking>
69 |       <IntrinsicFunctions>true</IntrinsicFunctions>
70 |       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
71 |       <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
72 |     </ClCompile>
73 |     <Link>
74 |       <SubSystem>Console</SubSystem>
75 |       <GenerateDebugInformation>true</GenerateDebugInformation>
76 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
77 |       <OptimizeReferences>true</OptimizeReferences>
78 |       <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
79 |       <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
80 |     </Link>
81 |   </ItemDefinitionGroup>
82 |   <ItemGroup>
83 |     <ClCompile Include="sgemm.cpp" />
84 |   </ItemGroup>
85 |   <ItemGroup>
86 |     <None Include="sgemm128.sass" />
87 |     <None Include="sgemm64.sass" />
88 |   </ItemGroup>
89 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
90 |   <ImportGroup Label="ExtensionTargets">
91 |   </ImportGroup>
92 | </Project>


--------------------------------------------------------------------------------
/sgemm/sgemm128.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: sgemm_kernel_128
  2 | #
  3 | # SharedSize: 16384
  4 | # Params(8):
  5 | #   0:0x140:4:4 param_C,
  6 | #   1:0x144:4:0 param_m,
  7 | #   2:0x148:4:0 param_n,
  8 | #   3:0x14c:4:0 param_k,
  9 | #   4:0x150:4:0 param_lda,
 10 | #   5:0x154:4:0 param_ldb,
 11 | #   6:0x158:4:0 param_ldc
 12 | #   7:0x15c:4:0 param_alpha
 13 | #   8:0x160:4:4 param_D // for diagnostic printf output
 14 | #
 15 | # Globals:
 16 | #   c[0x0][0x164]: texA (the value is 1)
 17 | #   c[0x0][0x168]: texB (the value is 0)
 18 | 
 19 | <REGISTER_MAPPING>
 20 | 
 21 |     // Temporary registers to calculate the state registers. Reuse the C output registers.
 22 |     // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts.
 23 |     0-63    ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy
 24 | 
 25 |     // Aliases for the C registers we use for initializing C (used as vectors)
 26 |     0-63    : cz<00-63>
 27 | 
 28 |     // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers
 29 |     80      : zOffset
 30 | 
 31 |     // 64 C matrix output registers.
 32 |     // Use special mapping to avoid register bank conflicts between these registers and the blocking registers.
 33 |      3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
 34 |      7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
 35 |      1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
 36 |      5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
 37 |     35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
 38 |     39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
 39 |     33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
 40 |     37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
 41 | 
 42 |     // Double buffered register blocking used in vector loads.
 43 |     // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags
 44 |     64-79   : j0Ax<00-03|64-67>, j0By<00-03|64-67>
 45 |     80-95   : j1Ax<00-03|64-67>, j1By<00-03|64-67>
 46 | 
 47 |     // Registers to load A or B
 48 |     96-103  : loadX<0-7>
 49 | 
 50 |     // Key global state registers for main loop and some we reuse for outputting C.
 51 |     // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of
 52 |     // delayed bank conflicts between memory operations and ffmas.
 53 |     // The array index bracket notation can be used to request a bank in a dynamically allocated range.
 54 |     104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs
 55 | 
 56 |     // Registers to store the results back to global memory. Reuse any register not needed after the main loop.
 57 |     // Statically allocate cs0-7 because they're vector registers.
 58 |     64-71   : cs<0-7>
 59 | 
 60 |     // dynamically allocated C output registers(~)
 61 |     72-103  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX
 62 | 
 63 | </REGISTER_MAPPING>
 64 | 
 65 | // Note the absence of the loading of the stack pointer into R1.
 66 | // No idea why ptxas does that anyway when it's not used for register spilling.
 67 | // Such a waste of a perfectly good register.
 68 | 
 69 | // Scheduler doesn't handle the dependency flags yet,
 70 | // so move these first instructions outside the block that's auto scheduled
 71 | //--:-:-:-:1      CS2R clock, SR_CLOCKLO;
 72 | //--:-:-:-:1      S2R smId, SR_VIRTID;
 73 | //--:-:-:-:1      S2R nSMs, SR_VIRTCFG;
 74 | --:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
 75 | --:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
 76 | --:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
 77 | 
 78 | // Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies.
 79 | // Memory dependencies are left up to the author to deal with manually for now.
 80 | <SCHEDULE_BLOCK>
 81 | 
 82 | // First 128 threads load A to shared, 2nd 128 loads B to shared
 83 | // Note this technique is not possible in cuda or ptx as there's no way to
 84 | // efficiently specify a warp-uniform predicate for a memory op.
 85 | // Compile sgemm.cu and inspect the sass to see what I'm talking about.
 86 | 
 87 | // blk = tid >= 128 ? by   : bx;
 88 | // ldx = tid >= 128 ? ldb  : lda;
 89 | // tex = tid >= 128 ? texB : texA;
 90 | 01:-:-:Y:1      ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1
 91 | 06:-:-:-:1      SEL blk, by, bx, P0;               // Wait Dep 2 & 3
 92 | --:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150];
 93 | --:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154];
 94 | --:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
 95 | --:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
 96 | 
 97 | // Initialize the portion of shared we use to zero our C registers
 98 | // Give each warp its own address to write to.
 99 | // All threads write to the same address, but we don't care because only one needs to take.
100 | // There is no bank conflict on writing to the same address, just indeterminacy in which thread will get its value stored.
101 | --:-:-:-:1      LOP.AND zOffset, tid, -32;
102 | --:-:-:-:1      STS.128 [zOffset + 4x<16*128>], RZ;
103 | 
104 | // tid4   = (tid >> 5) & 3
105 | // tid31  = tid & 31
106 | // tid96  = tid & 96
107 | // tid128 = tid & 128
108 | --:-:-:-:1      BFE.U32 tid4,   tid, 0x205; // 2 bits at position 5
109 | --:-:-:-:1      LOP.AND tid31,  tid, 31;
110 | --:-:-:-:1      LOP.AND tid96,  tid, 96;
111 | --:-:-:-:1      LOP.AND tid128, tid, 128;
112 | 
113 | // ldx4  = ldx * 4;
114 | // ldx8  = ldx * 8;
115 | --:-:-:-:1      SHR.U32 ldx, ldx4, 2;
116 | --:-:-:-:1      IADD ldx8, ldx4, ldx4;
117 | 
118 | // track0 = blk*128/4 + tid31 + (ldx * tid4)
119 | --:-:-:-:1      ISCADD  track0, blk, tid31, 5;
120 | --:-:-:-:1      XMAD.LO track0, ldx, tid4,  track0, xmad_t0; // XMAD.LO is a macro that is expanded out into the 3 XMADs
121 | --:-:-:-:1      IADD track4, track0, ldx4;
122 | 
123 | // writeS  = tid31*4*4 + tid4*128*4
124 | // writeS += 4096 if tid >= 128
125 | --:-:-:-:1      SHL    tid31_4, tid31, 4;
126 | --:-:-:-:1      ISCADD writeS, tid4, tid31_4, 9;
127 | --:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*128>;
128 | 
129 | // int end = track0 + (k-8)*ldx;
130 | --:-:-:-:1      MOV k, c[0x0][0x14c];
131 | --:-:-:-:1      IADD k, k, -8;
132 | --:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;
133 | 
134 | // readAs and readBs are carefully constructed to avoid any bank conflicts while loading from shared
135 | // readAs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4;
136 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301; // 3 bits at position 1
137 | --:-:-:-:1      SHR.U32 readAs, tid128, 4;
138 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
139 | --:-:-:-:1      SHL     readAs, readAs, 4;
140 | 
141 | // readBs  = ((((tid & 0x70) >> 3) | (tid & 1)) << 4) + 4096;
142 | --:-:-:-:1      LOP.AND tid1,   tid,    1;
143 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
144 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
145 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
146 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<8*128>, 4;
147 | 
148 | // Preload the first 8 lines from texture memory
149 | // Keep these instructions in this order (but allow others to interleave).
150 | // Normally the scheduler tries to preserve source order by default, but this demonstrates how you enforce
151 | // an ordering if you need to.
152 | // Note: these are the 4 element vector load versions (last param: 0xf=vec4, 0x3=vec2, 0x1=single)
153 | <ORDERED>
154 | --:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
155 | --:-:2:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2
156 | </ORDERED>
157 | 
158 | </SCHEDULE_BLOCK>
159 | 
160 | // Initialize C registers to zero
161 | // Using LDS.U.128 is a neat trick to save a few clock cycles
162 | // (when you have enough warps to hide the latency.)
163 | <CODE>
164 |     return join '', map sprintf("--:-:3:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*128>];\n", $_ * 4), 0..15;
165 | </CODE>
166 | 
167 | // These instructions need to occur after the texture loads so put them in a new block
168 | // that starts with a dependency barrier wait.
169 | <SCHEDULE_BLOCK>
170 | 
171 | 01:-:-:-:1      STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1
172 | 02:-:-:-:1      STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2
173 | 
174 | // Increment tracks after the loads are complete to avoid needing write-after-read dependencies
175 | --:-:-:-:1      IADD track0, track0, ldx8;
176 | --:-:-:-:1      IADD track4, track4, ldx8;
177 | 
178 | // Wait for all threads to finish loading shared
179 | 04:-:-:-:5      BAR.SYNC 0;
180 | 
181 | </SCHEDULE_BLOCK>
182 | 
183 | // The next store to shared goes to high area.
184 | // Having 2 shared buffers allows us to eliminate a bar.sync in the main loop.
185 | // This way we don't have to wait for all threads to arrive before writing fresh data to shared.
186 | // Other threads can continue reading from the last batch while the new data is being written.
187 | --:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*128>;
188 | 
189 | // Preload the first lines of A and B from shared
190 | --:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>];
191 | --:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>];
192 | --:-:-:-:1      LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>];
193 | --:-:1:-:1      LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1
194 | 
195 | 
196 | // The main loop
197 | // While calculating the first line, load in the next line from shared.
198 | // Shared memory stores enough to do this 8 times per loop.
199 | // Also pull in the next block of memory from global and store it to shared.
200 | 
201 | // Efficiency:
202 | // ffma: 512
203 | // lds:  32 dual issued
204 | // sts:  2  dual issued
205 | // tex:  2  dual issued
206 | // add:  2
207 | // xor:  3
208 | // setp: 1
209 | // bar:  1  dual issued
210 | // bra:  1  dual issued
211 | // Total: 524 (512/518 = 98.8% FFMA)
212 | 
213 | // Memory Throughput Upper Bound:
214 | // 2 * 4 * 4 bytes per thread per 518 clocks
215 | // 128 threads per SM
216 | // 16 SM's (GM204)
217 | // 1640Mhz (boost overclock)
218 | // .931 GiB/GB  (1000^3 / 1024^3)
219 | // 193 GiB/sec
220 | // Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz)
221 | 
222 | LOOP:
223 | 
224 | // Loop end condition
225 | --:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT;
226 | 
227 | <CODE>
228 | 
229 |     # We eliminated bank conflicts with our C registers and the blocking registers,
230 |     # but there are still 16 bank conflicts between the blocking registers themselves.
231 |     # By ordering the FFMA's in a swirling zigzag pattern we can completely hide those conflicts
232 |     # behind register reuse.  This pattern also maximizes that reuse (47%) and minimizes the bandwidth
233 |     # out of the register bank, thereby reducing power consumption and allowing the chip to
234 |     # stay at a higher sustained clock speed.  One other constraint is that we want each successive
235 |     # instruction to pull its third operand from alternating banks.  We space the swirl by 2 in the x
236 |     # direction to achieve this.  This has the effect of making it easier to avoid delayed bank conflicts
237 |     # with the memory operations.  Finally, for the very first ffma, don't choose one of the 16 bank conflicts
238 |     # as we have no way of hiding that conflict behind a reuse (cublas makes this mistake).
239 | 
240 |     # Alternating banks (1320 MHz, full speed)
241 |     my @swirl = ([2,0],[2,1],[0,1],[0,0]);
242 |     my @xVals = (0,1,64,65);
243 | 
244 |     # Repeating banks (1320 MHz, 83 Gflops slower, but lower power draw probably because of increased stalls)
245 |     # Only explanation I can think of is increased delayed register bank conflicts with memory ops.
246 |     #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
247 |     #my @xVals = (0,2,64,66);
248 | 
249 |     my @cOrder;
250 |     foreach my $y (0,2,64,66)
251 |     {
252 |         # apply the swirl
253 |         foreach my $x (@xVals)
254 |         {
255 |             push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
256 |         }
257 |         # apply the zigzag
258 |         @xVals = reverse @xVals;
259 |     }
260 | 
261 |     # This ordering (a simple zigzag) eliminates the bank conflicts but only achieves 39% reuse.
262 |     # It runs 20 GFlops slower since the register bank draws more power and the clock slows down to 1306 MHz.
263 |     # There may be more delayed bank conflicts with memory operations as the slowdown is 4 GFlops more than
264 |     # the reduced clock accounts for.
265 |     #my @cOrder2;
266 |     #my @xVals = (0..3,64..67);
267 |     #foreach my $y (0..3,64..67)
268 |     #{
269 |     #    @xVals = reverse @xVals;
270 |     #    push @cOrder2, [$_, $y] foreach @xVals;
271 |     #}
272 |     #@cOrder = @cOrder2;
273 | 
274 |     my %insert =
275 |     (
276 |         # Don't start the first TLD before 12 to let ISETP write P0
277 |         # These global reads and shared writes we put exactly in the middle of the LDS ops
278 |         # This is to not overwhelm the memory units with instructions (and because these were tested faster here).
279 |         # The 4 spacing seems to work best for vec4 instructions.
280 |         # It's odd that these two textures loads can drive 512 FFMA's all by themselves.. but 256 threads can load 8 128 F32 wide lines.
281 |         # So we only need 2 to get 8 lines from both matrices.
282 | 
283 |         j0c31 => "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2\n",
284 |         j0c33 => "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3\n",
285 | 
286 |         j6c30 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2\n",
287 |         j6c34 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3\n",
288 | 
289 |         # We need one barrier in the main loop after writing shared memory.
290 |         # The barrier is needed even if this is our last loop because we need to protect the warp shuffle step.
291 |         # Note, BAR.SYNCs do not sync memory read access automatically, you still need to flag the barriers (writes are sync'd).
292 |         # After the BAR, swap our share buffer location.  We don't need an additional barrier because of these swaps.
293 |         # Note, this doubles our shared memory usage but this kernel's occupancy is entirely bound by registers.
294 |         # LOP.XOR readAs needs to be 4 clocks prior to the LDS.U.128 for readAs (but push this as far down as possible)
295 |         j6c62 =>
296 |                 "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n" .
297 |                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*128>;\n" .
298 |                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*128>;\n" .
299 |                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*128>;\n",
300 | 
301 |         # Note having 2 IADDs slightly hits our FFMA performance (1/518 = .2%), but TLD doesn't take an offset.
302 |         # LDG.CI doesn't have this issue, but doesn't give you the nice features of texture loads:
303 |         #   -Boundary Clamping:  simplifies our matrix load logic so we don't need to worry about loading out of bounds
304 |         #   -Normalized Floats: if we don't need full 32 bits of precision we could store our matrices using 16 or 8 bit values
305 |         j7c63 =>
306 |                 "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n" .
307 |                 "--:-:-:-:0  \@P0 IADD track4, track4, ldx8;\n" .
308 |                 "--:-:-:Y:5  \@P0 BRA LOOP;\n",
309 |     );
310 | 
311 |     my $out;
312 |     # We unroll our main loop 8 iterations.
313 |     # This gives us a loop instruction count of 556.  Add the control instructions and that makes it 741 opcodes sized 8 bytes.
314 |     # This is 5928 bytes, nicely fitting inside the 8kb instruction cache.  Going to the next biggest size would be 12 lines.
315 |     # That would be 768 ffmas and not leaving enough room for the other instructions and control codes.
316 |     # So by staying inside the instruction cache size, we avoid hitting any instruction fetch latencies.
317 |     foreach my $j (0 .. 7)
318 |     {
319 |         my $odd      = $j & 1;
320 |         my $nOdd     = !$odd + 0;
321 |         # Our rolling blocking registers stay one load ahead of the FFMA's (rs: read share)
322 |         my $rsOffset = ($j + 1) % 8;
323 |         # No need to load on last loop iteration
324 |         my $rsPred   = $j == 7 ? '@P0' : '   ';
325 | 
326 |         # You can experiment here with different vector load sizes
327 |         my $vec = 128;
328 | 
329 |         if ($vec == 128)
330 |         {
331 |             # Roll up our LDS ops here to keep them easier to manage and tune
332 |             # Space at every other clock to maximize throughput.
333 |             $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
334 |             $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
335 |             $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
336 |             $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBy64, [readBs + 4x<%d*128 + 64>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
337 |         }
338 |         elsif ($vec == 64)
339 |         {
340 |             # LDS.64 runs about 22 Gflops slower than LDS.128 (GM107).  Not a huge difference since our latencies are so well hidden.
341 |             # I think LDS.128 is implemented internally as a pair of LDS.64 ops which could be another reason for the comparable performance.
342 |             # I think the big benefit with 128 is being able to issue all our LDS ops earlier, allowing more FFMA's prior to reading out the results.
343 |             # There could also be additional opportunity for delayed bank conflicts.
344 |             $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
345 |             $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
346 |             $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
347 |             $insert{"j${j}c6"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
348 |             $insert{"j${j}c8"}  = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
349 |             $insert{"j${j}c10"} = sprintf "--:-:-:-:1  %s LDS.U.64 j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
350 |             $insert{"j${j}c12"} = sprintf "--:-:-:-:1  %s LDS.U.64 j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
351 |             $insert{"j${j}c14"} = sprintf "--:-:1:-:1  %s LDS.U.64 j%dBy66, [readBs + 4x<%d*128 + 66>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
352 |         }
353 |         else
354 |         {
355 |             # This one drops performance by over 200 Gflops.  So you want to at least use LDS.64 if you can.
356 |             # We don't even have room to properly space these at half throughput.
357 |             $insert{"j${j}c0"}  = sprintf "--:-:-:-:1  %s LDS j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
358 |             $insert{"j${j}c1"}  = sprintf "--:-:-:-:1  %s LDS j%dAx01, [readAs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset;
359 |             $insert{"j${j}c2"}  = sprintf "--:-:-:-:1  %s LDS j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
360 |             $insert{"j${j}c3"}  = sprintf "--:-:-:-:1  %s LDS j%dAx03, [readAs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset;
361 |             $insert{"j${j}c4"}  = sprintf "--:-:-:-:1  %s LDS j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset;
362 |             $insert{"j${j}c5"}  = sprintf "--:-:-:-:1  %s LDS j%dBy01, [readBs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset;
363 |             $insert{"j${j}c6"}  = sprintf "--:-:-:-:1  %s LDS j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset;
364 |             $insert{"j${j}c7"}  = sprintf "--:-:-:-:1  %s LDS j%dBy03, [readBs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset;
365 |             $insert{"j${j}c8"}  = sprintf "--:-:-:-:1  %s LDS j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
366 |             $insert{"j${j}c9"}  = sprintf "--:-:-:-:1  %s LDS j%dAx65, [readAs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset;
367 |             $insert{"j${j}c10"} = sprintf "--:-:-:-:1  %s LDS j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
368 |             $insert{"j${j}c11"} = sprintf "--:-:-:-:1  %s LDS j%dAx67, [readAs + 4x<%d*128 + 67>];\n", $rsPred, $nOdd, $rsOffset;
369 |             $insert{"j${j}c12"} = sprintf "--:-:-:-:1  %s LDS j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset;
370 |             $insert{"j${j}c13"} = sprintf "--:-:-:-:1  %s LDS j%dBy65, [readBs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset;
371 |             $insert{"j${j}c14"} = sprintf "--:-:-:-:1  %s LDS j%dBy66, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset;
372 |             $insert{"j${j}c15"} = sprintf "--:-:1:-:1  %s LDS j%dBy67, [readBs + 4x<%d*128 + 67>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
373 |         }
374 |         foreach my $c (0 .. 63)
375 |         {
376 |             my ($x,$y) = @{$cOrder[$c]};
377 | 
378 |             # Grab an instruction for insertion if one exists for this j and c combination
379 |             my $ins    = $insert{"j${j}c$c"} || '';
380 | 
381 |             # Scatter some yields in there to better balance the workload and reduce sync stalls
382 |             # Don't pair a yield with the dual issued ffmas as that kills performance for some reason
383 |             ##### This no longer offers extra performance on GM204 as it did on GM107.  It still does for the 64 thread version. Keeping since it doesn't hurt. ####
384 |             my $yield  = $c == 32 ? 'Y' : '-';
385 | 
386 |             # The first FFMA needs to wait on the prior loop's LDS.U.128 ops to finish (except if the barrier does the wait for us)
387 |             my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--','');
388 | 
389 |             # Dual issue these ops
390 |             my $stall  = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1;
391 | 
392 |             my $ctrl   = "$wait:-:-:$yield:$stall";
393 | 
394 |             # output our FFMA and also any inserted ops
395 |             $out .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $comment,  $ins;
396 |         }
397 |     }
398 |     return $out;
399 | 
400 | </CODE>
401 | 
402 | // Main loop is done, time to write C to global memory.
403 | <SCHEDULE_BLOCK>
404 | 
405 | // Remove the high bits if present from the last loop's xor.
406 | // Also remove the 4096 added onto readBs.
407 | // This gives us the x and y coordinates of the start of this thread's data in C.
408 | --:-:-:-:1      LOP.AND readAs, readAs, 0xfff;
409 | --:-:-:-:1      LOP.AND readBs, readBs, 0xfff;
410 | 
411 | // Remap readAs and readBs onto writeCs so we can shuffle the output for coalesced global writes.
412 | // readAs stays constant, readBs collapses down from stride 4 to 1
413 | // writeCs = (readBs / 4) * 128 + readAs;
414 | --:-:-:-:1      ISCADD  writeCs, readBs, readAs, 5;
415 | 
416 | // Read out the C values from shared in a simple tid mapped pattern but
417 | // offset by the position of this warp's collapsed data in shared.
418 | 
419 | // cx = tid31 | (tid128 >> 2);
420 | --:-:-:-:1      SHR.U32  cx, tid128, 2;
421 | --:-:-:-:1      LOP.OR   cx, tid31,  cx;
422 | 
423 | // readCs = ((tid96 << 4) | cx) << 2;
424 | --:-:-:-:1      SHL      readCs, tid96,  4;
425 | --:-:-:-:1      LOP.OR   readCs, readCs, cx;
426 | --:-:-:-:1      SHL      readCs, readCs, 2;
427 | 
428 | // cx += bx*128;
429 | --:-:-:-:1      ISCADD  cx, bx, cx, 7;
430 | 
431 | // cy = by*128 + (tid96 >> 1)
432 | --:-:-:-:1      SHR.U32 cy00, tid96, 1;
433 | --:-:-:-:1      ISCADD  cy00, by, cy00, 7;
434 | 
435 | // C += (cy*ldc + cx) * 4;
436 | --:-:-:-:1      MOV ldc, c[0x0][0x158];
437 | --:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
438 | --:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;
439 | 
440 | // When writing in assembly, being able to 'printf' is sometimes easier than stepping through the debugger.
441 | // Here's how it's done.  Drop something like this in your code. Then modify the c code to accept this
442 | // many params per thread to printf (see assemblySgemm function).
443 | 
444 | //--:-:-:-:1      SHR.U32  smId, smId, 20;
445 | 
446 | // D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
447 | // D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
448 | //--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
449 | //--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
450 | //--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmad_D;
451 | //--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmad_D;
452 | //--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 3; // 4 bytes * 2 vars = 8 or shift 3
453 | 
454 | //--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
455 | //--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
456 | //--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
457 | //--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
458 | //--:-:-:-:1      STG.CS [D + 4x<4>], cx;
459 | //--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
460 | //--:-:-:-:1      STG.CS [D + 4x<6>], ci;
461 | //--:-:-:-:1      STG.CS [D + 4x<7>], cx67y67;
462 | 
463 | //--:-:-:-:1      STG.CS [D + 4x<0>], smId;
464 | //--:-:-:-:1      STG.CS [D + 4x<1>], clock;
465 | 
466 | 
467 | // Setup our matrix bounds checking vars and preds
468 | // Bounds checking is what allows this code to work on matrix sizes not a multiple of 128
469 | --:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
470 | --:-:-:-:1      IADD cx, cx, 64;
471 | --:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m
472 | 
473 | --:-:-:-:1      IADD cy00, cy00, -1;
474 | --:-:-:-:1      IADD cy04, cy00,  4;
475 | --:-:-:-:1      IADD cy08, cy00,  8;
476 | --:-:-:-:1      IADD cy12, cy00,  12;
477 | 
478 | // Setup our C output addresses and increments.
479 | --:-:-:-:1      SHL  ldc1,  ldc, 2;
480 | --:-:-:-:1      SHL  ldc4,  ldc, 4;
481 | --:-:-:-:1      SHL  ldc8,  ldc, 5;
482 | --:-:-:-:1      ISCADD ldc60, ldc, -ldc4, 8;
483 | 
484 | // Load the first set of the STORE_C subroutine params in the scheduled block.
485 | # This is also a good time to apply alpha.
486 | --:-:-:-:1      MOV alpha, c[0x0][0x15c];
487 | 
488 | --:-:-:-:1      FMUL cs0, cx00y00, alpha;
489 | --:-:-:-:1      FMUL cs1, cx01y00, alpha;
490 | --:-:-:-:1      FMUL cs2, cx02y00, alpha;
491 | --:-:-:-:1      FMUL cs3, cx03y00, alpha;
492 | --:-:-:-:1      FMUL cs4, cx64y00, alpha;
493 | --:-:-:-:1      FMUL cs5, cx65y00, alpha;
494 | --:-:-:-:1      FMUL cs6, cx66y00, alpha;
495 | --:-:-:-:1      FMUL cs7, cx67y00, alpha;
496 | 
497 | // We pre-increment the output addresses so they can be dual issued with memory ops
498 | // So start with a -1 instead of 0 value.
499 | --:-:-:-:1      IADD Cy00, Cy00, -ldc1;
500 | --:-:-:-:1      IADD Cy04, Cy00, ldc4;
501 | --:-:-:-:1      IADD Cy08, Cy00, ldc8;
502 | --:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
503 | 
504 | </SCHEDULE_BLOCK>
505 | 
506 | // There's nothing yet in place to handle dependencies with subroutines.
507 | // So don't schedule this block.
508 | <CODE>
509 | 
510 |     my $out;
511 |     foreach my $y (0..3, 64..67)
512 |     {
513 |         my ($wait, $comment) = $y == 64 ? ('--', '') : ('02',' // Wait Dep 2');
514 | 
515 |         # Jump ahead 60 units (to get to the values at y=64)
516 |         $out .=
517 |             "--:-:-:-:1      IADD cy00, cy00, 60;\n" .
518 |             "--:-:-:-:1      IADD cy04, cy04, 60;\n" .
519 |             "--:-:-:-:1      IADD cy08, cy08, 60;\n" .
520 |             "--:-:-:-:1      IADD cy12, cy12, 60;\n\n" .
521 | 
522 |             "02:-:-:-:1      IADD Cy00, Cy00, ldc60; // Wait Dep 2\n" .
523 |             "--:-:-:-:1      IADD Cy04, Cy04, ldc60;\n" .
524 |             "--:-:-:-:1      IADD Cy08, Cy08, ldc60;\n" .
525 |             "--:-:-:-:1      IADD Cy12, Cy12, ldc60;\n\n"  if $y == 64;
526 | 
527 |         # We need to move the C values to the param registers of the STORE_C subroutine.
528 |         # This is also a good time to apply alpha. The y == 0 set was already computed in the preceding scheduled block, hence the trailing 'if $y'.
529 |         $out .= sprintf(
530 |             "%s:-:-:-:1      FMUL cs0, cx00y%02d, alpha;%s\n" .
531 |             "--:-:-:-:1      FMUL cs1, cx01y%02d, alpha;\n" .
532 |             "--:-:-:-:1      FMUL cs2, cx02y%02d, alpha;\n" .
533 |             "--:-:-:-:1      FMUL cs3, cx03y%02d, alpha;\n" .
534 |             "--:-:-:-:1      FMUL cs4, cx64y%02d, alpha;\n" .
535 |             "--:-:-:-:1      FMUL cs5, cx65y%02d, alpha;\n" .
536 |             "--:-:-:-:1      FMUL cs6, cx66y%02d, alpha;\n" .
537 |             "--:-:-:-:0      FMUL cs7, cx67y%02d, alpha; // Dual Issue\n",
538 |             $wait, $y, $comment, ($y) x 7) if $y;
539 | 
540 |         # Call the subroutine.
541 |         $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
542 |     }
543 |     return $out;
544 | 
545 | </CODE>
546 | 
547 | // And we're done.  The remainder is the STORE_C subroutine that's defined at the end of the kernel.
548 | --:-:-:-:5      EXIT;
549 | 
550 | // This routine does warp synchronous shuffling of our output data so as to be able
551 | // to have coalesced writes to global memory.  This is actually faster because the shared
552 | // memory latencies can be hidden by other warps and we're only adding a few extra clocks
553 | // to this thread.  Global memory here is the bottleneck and being able to halve the needed
554 | // bandwidth at the expense of a few clocks is a modest win.  This also keeps power lower
555 | // and our chip running faster.
556 | 
557 | // Note, the SHFL instruction doesn't help us here because we're swapping different registers
558 | // from different threads.
559 | STORE_C:
560 | 
561 | <SCHEDULE_BLOCK>
562 | 
563 | // Each warp writes to its own region of memory so we don't need to bar.sync the access.
564 | // There are some bank conflicts here on the STS.128s but no way to avoid them, and the hit just means a few extra clocks.
565 | // Note here that the scheduler is able to handle the dependencies between vector and non-vector instructions.
566 | // It knows from the instruction type and the register map that cs0 here includes cs1, cs2 and cs3 as well.
567 | --:-:-:-:1      STS.128 [writeCs+4x<00>], cs0;
568 | --:-:-:-:1      STS.128 [writeCs+4x<64>], cs4;
569 | 
570 | // In a single warp, loads naturally occur after the store to shared completes, no sync required.
571 | --:-:-:-:1      LDS cs0, [readCs + 4x<0*128 + 00>];
572 | --:-:-:-:1      LDS cs1, [readCs + 4x<0*128 + 64>];
573 | --:-:-:-:1      LDS cs2, [readCs + 4x<1*128 + 00>];
574 | --:-:-:-:1      LDS cs3, [readCs + 4x<1*128 + 64>];
575 | --:-:-:-:1      LDS cs4, [readCs + 4x<2*128 + 00>];
576 | --:-:-:-:1      LDS cs5, [readCs + 4x<2*128 + 64>];
577 | --:-:-:-:1      LDS cs6, [readCs + 4x<3*128 + 00>];
578 | --:-:1:-:1      LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1
579 | 
580 | --:-:-:-:1      IADD cy00, cy00, 1;
581 | --:-:-:-:1      IADD cy04, cy04, 1;
582 | --:-:-:-:1      IADD cy08, cy08, 1;
583 | --:-:-:-:1      IADD cy12, cy12, 1;
584 | 
585 | --:-:-:-:1      IADD Cy00, Cy00, ldc1;
586 | --:-:-:-:1      IADD Cy04, Cy04, ldc1;
587 | --:-:-:-:1      IADD Cy08, Cy08, ldc1;
588 | --:-:-:-:1      IADD Cy12, Cy12, ldc1;
589 | 
590 | --:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
591 | --:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m
592 | --:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
593 | --:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m
594 | 
595 | 01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
596 | --:-:-:-:1  @P1 STG.CG [Cy00 + 4x<64>], cs1;
597 | --:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
598 | --:-:-:-:1  @P3 STG.CG [Cy04 + 4x<64>], cs3;
599 | 
600 | --:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
601 | --:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m
602 | --:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
603 | --:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m
604 | 
605 | --:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
606 | --:-:-:-:1  @P1 STG.CG [Cy08 + 4x<64>], cs5;
607 | --:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
608 | --:2:-:-:1  @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2
609 | 
610 | </SCHEDULE_BLOCK>
611 | 
612 | --:-:-:-:5      RET;
613 | 
614 | 


--------------------------------------------------------------------------------
/sgemm/sgemm64.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: sgemm_kernel_64
  2 | #
  3 | # SharedSize: 8192
  4 | # Params(9):
  5 | #   0:0x140:4:4 param_C,
  6 | #   1:0x144:4:0 param_m,
  7 | #   2:0x148:4:0 param_n,
  8 | #   3:0x14c:4:0 param_k,
  9 | #   4:0x150:4:0 param_lda,
 10 | #   5:0x154:4:0 param_ldb,
 11 | #   6:0x158:4:0 param_ldc
 12 | #   7:0x15c:4:0 param_alpha
 13 | #   8:0x160:4:4 param_D // for diagnostic printf output
 14 | #
 15 | # Globals:
 16 | #   c[0x0][0x164]: texA (the value is 1)
 17 | #   c[0x0][0x168]: texB (the value is 0)
 18 | 
 19 | <REGISTER_MAPPING>
 20 | 
 21 |     0-63    ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end
 22 | 
 23 |     80      : zOffset
 24 |     0-63    : cz<00-63>
 25 | 
 26 |      3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35>
 27 |      7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35>
 28 |      1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35>
 29 |      5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35>
 30 |     35,34,43,42,51,50,59,58 : cx32y<00-03|32-35>
 31 |     39,38,47,46,55,54,63,62 : cx33y<00-03|32-35>
 32 |     33,32,41,40,49,48,57,56 : cx34y<00-03|32-35>
 33 |     37,36,45,44,53,52,61,60 : cx35y<00-03|32-35>
 34 | 
 35 |     64-79   : j0Ax<00-03|32-35>, j0By<00-03|32-35>
 36 |     80-95   : j1Ax<00-03|32-35>, j1By<00-03|32-35>
 37 | 
 38 |     64-71   : cs<0-7>
 39 | 
 40 |     96-111  : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3>
 41 | 
 42 |     112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32
 43 | 
 44 |     72-111  ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX
 45 | 
 46 | </REGISTER_MAPPING>
 47 | 
 48 | --:-:1:-:1      S2R tid, SR_TID.X;   // Set Dep 1
 49 | --:-:2:-:1      S2R bx,  SR_CTAID.X; // Set Dep 2
 50 | --:-:3:-:1      S2R by,  SR_CTAID.Y; // Set Dep 3
 51 | 
 52 | <SCHEDULE_BLOCK>
 53 | 
 54 | // blk  = tid >= 32 ? by   : bx;
 55 | // ldx4 = tid >= 32 ? ldb  : lda;
 56 | // tex  = tid >= 32 ? texB : texA;
 57 | 01:-:-:-:1      ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1
 58 | 06:-:-:-:1      SEL blk, by, bx, P0;              // Wait Dep 2 & 3
 59 | --:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150];
 60 | --:-:-:-:1  @P0 MOV ldx4, c[0x0][0x154];
 61 | --:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA
 62 | --:-:-:-:1  @P0 MOV32I tex, 0x80000000; // texB
 63 | 
 64 | --:-:-:-:1      LOP.AND zOffset, tid, -32; // zOffset = tid & -32 (clears the low 5 bits of tid)
 65 | --:-:-:-:1      STS.128 [zOffset + 4x<16*64>], RZ; // Store 128 bits of zero; read back into the cz registers by the LDS.U.128 loads generated below
 66 | 
 67 | // tid2   = (tid >> 4) & 1
 68 | // tid15  = tid & 15
 69 | // tid31 = tid & 31
 70 | // tid32 = tid & 32
 71 | --:-:-:-:1      BFE.U32 tid2,  tid, 0x104; // 1 bit at position 4
 72 | --:-:-:-:1      LOP.AND tid15, tid, 15;
 73 | --:-:-:-:1      LOP.AND tid31, tid, 31;
 74 | --:-:-:-:1      LOP.AND tid32, tid, 32;
 75 | 
 76 | // ldx   = ldx4 / 4;
 77 | // ldx8  = ldx4 * 2;  (= ldx * 8)
 78 | --:-:-:-:1      SHR.U32 ldx, ldx4, 2;
 79 | --:-:-:-:1      IADD ldx8, ldx4, ldx4;
 80 | 
 81 | // track0 = blk*64/4 + tid15 + (ldx * tid2)
 82 | --:-:-:-:1      ISCADD  track0, blk, tid15, 4;
 83 | --:-:-:-:1      XMAD.LO track0, ldx, tid2,  track0, xmad_t0;
 84 | --:-:-:-:1      IADD3 track2, track0, ldx, ldx; // track2 = track0 + 2*ldx
 85 | --:-:-:-:1      IADD track4, track0, ldx4;      // track4 = track0 + 4*ldx
 86 | --:-:-:-:1      IADD track6, track2, ldx4;      // track6 = track0 + 6*ldx
 87 | 
 88 | // writeS = tid15*4*4 + tid2*64*4
 89 | --:-:-:-:1      SHL    tid15_4, tid15, 4;
 90 | --:-:-:-:1      ISCADD writeS, tid2, tid15_4, 8;
 91 | 
 92 | // writeS += 2048 if tid >= 32
 93 | --:-:-:-:1  @P0 IADD   writeS, writeS, 4x<8*64>;
 94 | 
 95 | // int end = track0 + (k-8)*ldx;
 96 | --:-:-:-:1      MOV k, c[0x0][0x14c];
 97 | --:-:-:-:1      IADD k, k, -8;
 98 | --:-:-:-:1      XMAD.LO end, k, ldx, track0, xmad_end;
 99 | 
100 | // readAs = ((tid >> 1) & 7) << 4;
101 | --:-:-:-:1      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
102 | --:-:-:-:1      SHL     readAs, readAs, 4;
103 | 
104 | // readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048;
105 | --:-:-:-:1      LOP.AND tid1,   tid,    1;
106 | --:-:-:-:1      LOP.AND readBs, tid,    0x30;
107 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
108 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
109 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<8*64>, 4;
110 | 
111 | <ORDERED>
112 | --:-:1:-:1      TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1
113 | --:-:2:-:1      TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2
114 | --:-:3:-:1      TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3
115 | --:-:4:-:1      TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4
116 | </ORDERED>
117 | 
118 | </SCHEDULE_BLOCK>
119 | 
120 | <CODE>
121 |     return join '', map sprintf("--:-:5:-:1      LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $_ * 4), 0..15; # Emits 16 loads (cz00, cz04 .. cz60), all from the address where RZ was stored above, each setting Dep 5
122 | </CODE>
123 | 
124 | <SCHEDULE_BLOCK>
125 | 
126 | 01:-:-:-:1      STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1
127 | 02:-:-:-:1      STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2
128 | 04:-:-:-:1      STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3
129 | 08:-:-:-:1      STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4
130 | 
131 | --:-:-:-:1      IADD track0, track0, ldx8; // Advance all four track offsets by ldx8 for the next set of texture loads
132 | --:-:-:-:1      IADD track2, track2, ldx8;
133 | --:-:-:-:1      IADD track4, track4, ldx8;
134 | --:-:-:-:1      IADD track6, track6, ldx8;
135 | 
136 | 10:-:-:-:5      BAR.SYNC 0; // Wait Dep 5 (set by the generated cz loads above)
137 | 
138 | </SCHEDULE_BLOCK>
139 | 
140 | --:-:-:-:0      LOP.XOR writeS, writeS, 4x<16*64>; // Dual Issue; toggles writeS by 4096 bytes (SharedSize is 8192)
141 | 
142 | --:-:-:-:1      LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>];
143 | --:-:-:-:1      LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>];
144 | --:-:-:-:1      LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>];
145 | --:-:1:-:1      LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1
146 | 
147 | // Efficiency:
148 | // ffma: 512
149 | // lds:  32 dual issued
150 | // sts:  4  dual issued
151 | // tex:  4  dual issued
152 | // add:  4
153 | // xor:  3
154 | // setp: 1
155 | // bar:  1  dual issued
156 | // bra:  1  dual issued
157 | // Total: 520 (512/520 = 98.5% FFMA)
158 | 
159 | LOOP:
160 | 
161 | // Loop end condition
162 | --:-:-:-:1      ISETP.LE.AND P0, PT, track0, end, PT; // P0 = track0 <= end; predicates the loads, stores and branch inserted below
163 | 
164 | <CODE>
165 | 
166 |     my @cOrder; # [x, y] accumulator coordinates, in the order the 64 FFMAs of one j step are issued
167 |     my @swirl = ([2,0],[2,1],[0,1],[0,0]); # [x, y] offsets visited around each base coordinate
168 |     my @x = (0,1,32,33);
169 |     foreach my $y (0,2,32,34)
170 |     {
171 |         foreach my $x (@x)
172 |         {
173 |             push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
174 |         }
175 |         @x = reverse @x; # walk the x bases in the opposite direction for the next y base
176 |     }
177 | 
178 |     my %insert = # instruction text emitted right after FFMA number c of step j, keyed "j<j>c<c>"
179 |     (
180 |         j0c31 => "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n",
181 |         j0c33 => "--:-:2:-:1  \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n",
182 | 
183 |         j1c31 => "--:-:-:-:1  \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n",
184 |         j1c33 => "--:-:3:-:1  \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n",
185 | 
186 |         j5c30 => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n",
187 |         j5c34 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n",
188 | 
189 |         j6c30 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n",
190 |         j6c34 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n",
191 | 
192 |         j6c62 =>
193 |                 "01:-:-:-:5      BAR.SYNC 0;                            // Wait Dep 1\n" .
194 |                 "--:-:-:-:1  \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n" .
195 |                 "--:-:-:-:1  \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n" .
196 |                 "--:-:-:-:1  \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n",
197 | 
198 |         j7c63 =>
199 |                 "--:-:-:-:1  \@P0 IADD track0, track0, ldx8;\n" .
200 |                 "--:-:-:-:1  \@P0 IADD track2, track2, ldx8;\n" .
201 |                 "--:-:-:-:1  \@P0 IADD track4, track4, ldx8;\n" .
202 |                 "--:-:-:-:0  \@P0 IADD track6, track6, ldx8;\n" .
203 |                 "--:-:-:Y:5  \@P0 BRA LOOP;\n",
204 |     );
205 | 
206 |     my $out; # accumulated instruction text returned to the assembler
207 |     foreach my $j (0 .. 7) # eight unrolled steps per loop iteration
208 |     {
209 |         my $odd      = $j & 1;        # register set (j0* or j1*) read by this step's FFMAs
210 |         my $nOdd     = !$odd + 0;     # the other register set, loaded during this step for the next one
211 |         my $rsOffset = ($j + 1) % 8;  # shared-memory row read for the next step; wraps to 0 on the last step
212 |         my $rsPred   = $j == 7 ? '@P0' : '   '; # on the last step the loads for the next iteration only run when P0 is set
213 | 
214 |         $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx00, [readAs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
215 |         $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBy00, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
216 |         $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAx32, [readAs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;
217 |         $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBy32, [readBs + 4x<%d*64 + 32>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset;
218 | 
219 |         foreach my $c (0 .. 63)
220 |         {
221 |             my ($x,$y) = @{$cOrder[$c]};
222 | 
223 |             my $ins    = $insert{"j${j}c$c"} || ''; # extra instruction(s) to emit after this FFMA, if any
224 | 
225 |             my $yield  = $c == 32 ? 'Y' : '-'; # yield flag is set on FFMA 32 of every step
226 | 
227 |             my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); # first FFMA of steps 0-6 waits on the LDS barrier; for step 7 the BAR.SYNC inserted at j6c62 already waits on Dep 1
228 | 
229 |             my $stall  = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1; # stall 0 (dual issue) when a load, store or barrier follows this FFMA
230 | 
231 |             my $ctrl   = "$wait:-:-:$yield:$stall"; # control code fields: wait mask, read dep, write dep, yield, stall
232 | 
233 |             $out .= sprintf "%s      FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $comment,  $ins;
234 |         }
235 |     }
236 |     return $out;
237 | 
238 | </CODE>
239 | 
240 | <SCHEDULE_BLOCK>
241 | 
242 | --:-:-:-:1      LOP.AND readAs, readAs, 0x7ff; // Keep the low 11 bits: drops the 4x<16*64> toggle applied by the loop's LOP.XOR
243 | --:-:-:-:1      LOP.AND readBs, readBs, 0x7ff; // Same mask; also drops the 4x<8*64> offset that was added to readBs
244 | 
245 | // writeCs = (readBs / 4) * 64 + readAs;
246 | --:-:-:-:1      ISCADD  writeCs, readBs, readAs, 4;
247 | 
248 | // readCs = ((tid32 << 3) + tid31) << 2;
249 | --:-:-:-:1      ISCADD  readCs, tid32,  tid31, 3;
250 | --:-:-:-:1      SHL     readCs, readCs, 2;
251 | 
252 | // cx = bx*64 + tid31;
253 | --:-:-:-:1      ISCADD  cx, bx, tid31, 6;
254 | 
255 | // cy = by*64 + (tid32 >> 1)
256 | --:-:-:-:1      SHR.U32 cy00, tid32, 1;
257 | --:-:-:-:1      ISCADD  cy00, by, cy00, 6;
258 | 
259 | // C += (cy*ldc + cx) * 4;
260 | --:-:-:-:1      MOV ldc, c[0x0][0x158];
261 | --:-:-:-:1      XMAD.LO ci, cy00, ldc, cx, xmad_ci;
262 | --:-:-:-:1      ISCADD  Cy00, ci, c[0x0][0x140], 2;
263 | 
264 | --:-:-:-:1      ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx +  0 < m
265 | --:-:-:-:1      IADD cx, cx, 32;
266 | --:-:-:-:1      ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m
267 | 
268 | // D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4
269 | // D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4
270 | //--:-:-:-:1      MOV gridDimX, c[0x0][0x14];
271 | //--:-:-:-:1      MOV blckDimX, c[0x0][0x8];
272 | //--:-:-:-:1      XMAD.LO D, by, gridDimX, bx, xmadD;
273 | //--:-:-:-:1      XMAD.LO D, D, blckDimX, tid, xmadD;
274 | //--:-:-:-:1      ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5
275 | 
276 | //--:-:-:-:1      STG.CS [D + 4x<0>], readAs;
277 | //--:-:-:-:1      STG.CS [D + 4x<1>], readBs;
278 | //--:-:-:-:1      STG.CS [D + 4x<2>], writeCs;
279 | //--:-:-:-:1      STG.CS [D + 4x<3>], readCs;
280 | //--:-:-:-:1      STG.CS [D + 4x<4>], cx;
281 | //--:-:-:-:1      STG.CS [D + 4x<5>], cy00;
282 | //--:-:-:-:1      STG.CS [D + 4x<6>], ci;
283 | //--:-:-:-:1      STG.CS [D + 4x<7>], cx35y35;
284 | 
285 | --:-:-:-:1      IADD cy00, cy00, -1;  // Start one below: STORE_C adds 1 to every cy counter before its bounds checks
286 | --:-:-:-:1      IADD cy04, cy00,  4;
287 | --:-:-:-:1      IADD cy08, cy00,  8;
288 | --:-:-:-:1      IADD cy12, cy00,  12;
289 | 
290 | --:-:-:-:1      SHL  ldc1,  ldc, 2; // ldc1  = ldc * 4
291 | --:-:-:-:1      SHL  ldc4,  ldc, 4; // ldc4  = ldc * 16  (4 * ldc1)
292 | --:-:-:-:1      SHL  ldc8,  ldc, 5; // ldc8  = ldc * 32  (8 * ldc1)
293 | --:-:-:-:1      ISCADD ldc28, ldc, -ldc4, 7; // ldc28 = ldc * 128 - ldc4 = ldc * 112  (28 * ldc1)
294 | 
295 | --:-:-:-:1      MOV alpha, c[0x0][0x15c];
296 | --:-:-:-:1      FMUL cs0, cx00y00, alpha; // Scale the y00 outputs here; the generated code below handles the remaining y values
297 | --:-:-:-:1      FMUL cs1, cx01y00, alpha;
298 | --:-:-:-:1      FMUL cs2, cx02y00, alpha;
299 | --:-:-:-:1      FMUL cs3, cx03y00, alpha;
300 | --:-:-:-:1      FMUL cs4, cx32y00, alpha;
301 | --:-:-:-:1      FMUL cs5, cx33y00, alpha;
302 | --:-:-:-:1      FMUL cs6, cx34y00, alpha;
303 | --:-:-:-:1      FMUL cs7, cx35y00, alpha;
304 | 
305 | --:-:-:-:1      IADD Cy00, Cy00, -ldc1; // Start one ldc1 step back: STORE_C adds ldc1 to every Cy address before storing
306 | --:-:-:-:1      IADD Cy04, Cy00, ldc4;
307 | --:-:-:-:1      IADD Cy08, Cy00, ldc8;
308 | --:-:-:-:0      IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering)
309 | 
310 | </SCHEDULE_BLOCK>
311 | 
312 | <CODE>
313 | 
314 |     my $out; # accumulated instruction text returned to the assembler
315 |     foreach my $y (0..3, 32..35) # one CAL STORE_C per y value, eight calls in total
316 |     {
317 |         my ($wait, $comment) = $y == 32 ? ('--', '') : ('02',' // Wait Dep 2'); # for y == 32 the Dep 2 wait is carried by the IADD Cy00 line emitted below instead
318 | 
319 |         $out .=
320 |             "--:-:-:-:1      IADD cy00, cy00, 28;\n" .
321 |             "--:-:-:-:1      IADD cy04, cy04, 28;\n" .
322 |             "--:-:-:-:1      IADD cy08, cy08, 28;\n" .
323 |             "--:-:-:-:1      IADD cy12, cy12, 28;\n\n" .
324 | 
325 |             "02:-:-:-:1      IADD Cy00, Cy00, ldc28; // Wait Dep 2\n" .
326 |             "--:-:-:-:1      IADD Cy04, Cy04, ldc28;\n" .
327 |             "--:-:-:-:1      IADD Cy08, Cy08, ldc28;\n" .
328 |             "--:-:-:-:1      IADD Cy12, Cy12, ldc28;\n\n"  if $y == 32; # jump from y 03 to y 32: four STORE_C calls have each stepped by 1, so add the remaining 28
329 | 
330 |         $out .= sprintf(
331 |             "%s:-:-:-:1      FMUL cs0, cx00y%02d, alpha;%s\n" .
332 |             "--:-:-:-:1      FMUL cs1, cx01y%02d, alpha;\n" .
333 |             "--:-:-:-:1      FMUL cs2, cx02y%02d, alpha;\n" .
334 |             "--:-:-:-:1      FMUL cs3, cx03y%02d, alpha;\n" .
335 |             "--:-:-:-:1      FMUL cs4, cx32y%02d, alpha;\n" .
336 |             "--:-:-:-:1      FMUL cs5, cx33y%02d, alpha;\n" .
337 |             "--:-:-:-:1      FMUL cs6, cx34y%02d, alpha;\n" .
338 |             "--:-:-:-:0      FMUL cs7, cx35y%02d, alpha; // Dual Issue\n",
339 |             $wait, $y, $comment, ($y) x 7) if $y; # skipped for y == 0, whose FMULs are already in the schedule block above
340 | 
341 |         $out .= "--:-:-:-:5      CAL STORE_C;\n\n";
342 |     }
343 |     return $out;
344 | 
345 | </CODE>
346 | 
347 | --:-:-:-:5      EXIT;
348 | 
349 | STORE_C:
350 | 
351 | <SCHEDULE_BLOCK>
352 | 
353 | --:-:-:-:1      STS.128 [writeCs+4x<00>], cs0; // 128-bit store: writes cs0-cs3
354 | --:-:-:-:1      STS.128 [writeCs+4x<32>], cs4; // 128-bit store: writes cs4-cs7
355 | 
356 | --:-:-:-:1      LDS cs0, [readCs + 4x<0*64 + 00>]; // Read the staged values back one at a time through readCs
357 | --:-:-:-:1      LDS cs1, [readCs + 4x<0*64 + 32>];
358 | --:-:-:-:1      LDS cs2, [readCs + 4x<1*64 + 00>];
359 | --:-:-:-:1      LDS cs3, [readCs + 4x<1*64 + 32>];
360 | --:-:-:-:1      LDS cs4, [readCs + 4x<2*64 + 00>];
361 | --:-:-:-:1      LDS cs5, [readCs + 4x<2*64 + 32>];
362 | --:-:-:-:1      LDS cs6, [readCs + 4x<3*64 + 00>];
363 | --:-:1:-:1      LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1
364 | 
365 | --:-:-:-:1      IADD cy00, cy00, 1; // Step each of the four cy counters by one before the bounds checks below
366 | --:-:-:-:1      IADD cy04, cy04, 1;
367 | --:-:-:-:1      IADD cy08, cy08, 1;
368 | --:-:-:-:1      IADD cy12, cy12, 1;
369 | 
370 | --:-:-:-:1      IADD Cy00, Cy00, ldc1; // Step each of the four Cy store addresses by ldc1
371 | --:-:-:-:1      IADD Cy04, Cy04, ldc1;
372 | --:-:-:-:1      IADD Cy08, Cy08, ldc1;
373 | --:-:-:-:1      IADD Cy12, Cy12, ldc1;
374 | 
375 | --:-:-:-:1      ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx +  0 < m
376 | --:-:-:-:1      ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m
377 | --:-:-:-:1      ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx +  0 < m
378 | --:-:-:-:1      ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m
379 | 
380 | 01:-:-:-:1  @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1
381 | --:-:-:-:1  @P1 STG.CG [Cy00 + 4x<32>], cs1;
382 | --:-:-:-:1  @P2 STG.CG [Cy04 + 4x<00>], cs2;
383 | --:-:-:-:1  @P3 STG.CG [Cy04 + 4x<32>], cs3;
384 | 
385 | --:-:-:-:1      ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx +  0 < m
386 | --:-:-:-:1      ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m
387 | --:-:-:-:1      ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx +  0 < m
388 | --:-:-:-:1      ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m
389 | 
390 | --:-:-:-:1  @P0 STG.CG [Cy08 + 4x<00>], cs4;
391 | --:-:-:-:1  @P1 STG.CG [Cy08 + 4x<32>], cs5;
392 | --:-:-:-:1  @P2 STG.CG [Cy12 + 4x<00>], cs6;
393 | --:2:-:-:1  @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2
394 | 
395 | </SCHEDULE_BLOCK>
396 | 
397 | --:-:-:-:5      RET;
398 | 
399 | 


--------------------------------------------------------------------------------
/sgemm/sgemm_final_128.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: sgemm_kernel_128
  2 | # Arch: sm_50
  3 | # InsCnt: 770
  4 | # RegCnt: 118
  5 | # SharedSize: 16384
  6 | # BarCnt: 1
  7 | # Params(9):
  8 | #	ord:addr:size:align
  9 | #	0:0x140:4:0
 10 | #	1:0x144:4:0
 11 | #	2:0x148:4:0
 12 | #	3:0x14c:4:0
 13 | #	4:0x150:4:0
 14 | #	5:0x154:4:0
 15 | #	6:0x158:4:0
 16 | #	7:0x15c:4:0
 17 | #	8:0x160:4:0
 18 | #
 19 | # Instructions:
 20 | 
 21 | --:-:1:-:1      S2R R112, SR_TID.X;
 22 | --:-:2:-:1      S2R R113, SR_CTAID.X;
 23 | --:-:3:-:1      S2R R114, SR_CTAID.Y;
 24 | 01:-:-:Y:1      ISETP.GE.AND P0, PT, R112.reuse, 0x80, PT;
 25 | --:-:-:-:1      LOP.AND R117, R112.reuse, 0x1f;
 26 | --:-:-:-:1      BFE.U32 R9, R112.reuse, 0x205;
 27 | --:-:-:-:1      MOV R13, c[0x0][0x14c];
 28 | --:-:-:-:1      BFE.U32 R4, R112.reuse, 0x301;
 29 | --:-:-:-:1      LOP.AND R115, R112.reuse, 0x80;
 30 | --:-:-:-:1      LOP.AND R107, R112.reuse, 0x70;
 31 | --:-:-:-:1      SHL R16, R117, 0x4;
 32 | --:-:-:-:1      LOP.AND R0, R112.reuse, 0x1;
 33 | --:-:-:-:1      IADD R13, R13, -0x8;
 34 | --:-:-:-:1      LOP.AND R80, R112.reuse, -0x20;
 35 | --:-:-:-:1      SHR.U32 R106, R115, 0x4;
 36 | --:-:-:-:1      LOP.AND R116, R112, 0x60;
 37 | --:-:-:-:1      SHR.U32 R107, R107, 0x3;
 38 | --:-:-:-:0 @!P0 MOV R1, c[0x0][0x150];
 39 | --:-:-:-:1      STS.128 [R80+0x2000], RZ;
 40 | --:-:-:-:1  @P0 MOV R1, c[0x0][0x154];
 41 | --:-:-:-:1      ISCADD R111, R9, R16, 0x9;
 42 | 06:-:-:-:1      SEL R12, R114, R113, P0;
 43 | --:-:-:-:1 @!P0 MOV32I R110, 0x80000001;
 44 | --:-:-:-:1  @P0 MOV32I R110, 0x80000000;
 45 | --:-:-:-:1      LOP.OR R106, R106, R4;
 46 | --:-:-:-:1      SHR.U32 R8, R1.reuse, 0x2;
 47 | --:-:-:-:1      LOP.OR R107, R107, R0;
 48 | --:-:-:-:1      ISCADD R104, R12, R117, 0x5;
 49 | --:-:-:-:1      IADD R109, R1, R1;
 50 | --:-:-:-:1  @P0 IADD R111, R111, 0x1000;
 51 | --:-:-:-:1      SHL R106, R106, 0x4;
 52 | --:-:-:-:1      XMAD.MRG R5, R8.reuse, R9.H1.reuse, RZ;
 53 | --:-:-:-:1      ISCADD R107, R107, 0x1000, 0x4;
 54 | --:-:-:-:1      XMAD R104, R8.reuse, R9, R104;
 55 | --:-:-:Y:5      XMAD.MRG R20, R13.reuse, R8.H1.reuse, RZ;
 56 | --:-:-:-:2      XMAD.PSL.CBCC R104, R8.H1, R5.H1, R104;
 57 | --:-:1:-:4      TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf;
 58 | --:-:-:-:1      IADD R108, R104, R1;
 59 | --:-:-:-:1      XMAD R105, R13.reuse, R8, R104;
 60 | --:-:2:Y:5      TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf;
 61 | --:-:-:-:1      XMAD.PSL.CBCC R105, R13.H1, R20.H1, R105;
 62 | --:-:3:-:1      LDS.U.128 R0, [R80+0x2000];
 63 | --:-:3:-:1      LDS.U.128 R4, [R80+0x2000];
 64 | --:-:3:-:1      LDS.U.128 R8, [R80+0x2000];
 65 | --:-:3:-:1      LDS.U.128 R12, [R80+0x2000];
 66 | --:-:3:-:1      LDS.U.128 R16, [R80+0x2000];
 67 | --:-:3:-:1      LDS.U.128 R20, [R80+0x2000];
 68 | --:-:3:-:1      LDS.U.128 R24, [R80+0x2000];
 69 | --:-:3:-:1      LDS.U.128 R28, [R80+0x2000];
 70 | --:-:3:-:1      LDS.U.128 R32, [R80+0x2000];
 71 | --:-:3:-:1      LDS.U.128 R36, [R80+0x2000];
 72 | --:-:3:-:1      LDS.U.128 R40, [R80+0x2000];
 73 | --:-:3:-:1      LDS.U.128 R44, [R80+0x2000];
 74 | --:-:3:-:1      LDS.U.128 R48, [R80+0x2000];
 75 | --:-:3:-:1      LDS.U.128 R52, [R80+0x2000];
 76 | --:-:3:-:1      LDS.U.128 R56, [R80+0x2000];
 77 | --:-:3:-:1      LDS.U.128 R60, [R80+0x2000];
 78 | 01:-:-:-:1      STS.128 [R111], R96;
 79 | --:-:-:-:0      IADD R104, R104, R109.reuse;
 80 | 02:-:-:-:1      STS.128 [R111+0x800], R100;
 81 | --:-:-:-:0      IADD R108, R108, R109;
 82 | 04:-:-:-:5      BAR.SYNC 0x0;
 83 | --:-:-:-:0      LOP.XOR R111, R111, 0x2000;
 84 | --:-:-:-:1      LDS.U.128 R64, [R106];
 85 | --:-:-:-:1      LDS.U.128 R72, [R107];
 86 | --:-:-:-:1      LDS.U.128 R68, [R106+0x100];
 87 | --:-:1:-:1      LDS.U.128 R76, [R107+0x100];
 88 | TARGET1:
 89 | --:-:-:-:1      ISETP.LE.AND P0, PT, R104, R105, PT;
 90 | 01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
 91 | --:-:-:-:1      LDS.U.128 R80, [R106+0x200];
 92 | --:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
 93 | --:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
 94 | --:-:-:-:1      LDS.U.128 R88, [R107+0x200];
 95 | --:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
 96 | --:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
 97 | --:-:-:-:1      LDS.U.128 R84, [R106+0x300];
 98 | --:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
 99 | --:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
100 | --:-:1:-:1      LDS.U.128 R92, [R107+0x300];
101 | --:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
102 | --:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
103 | --:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
104 | --:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
105 | --:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
106 | --:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
107 | --:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
108 | --:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
109 | --:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
110 | --:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
111 | --:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
112 | --:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
113 | --:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
114 | --:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
115 | --:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
116 | --:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
117 | --:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
118 | --:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
119 | --:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
120 | --:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
121 | --:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
122 | --:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
123 | --:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
124 | --:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
125 | --:-:-:-:0      FFMA R11, R64.reuse, R74, R11;
126 | --:-:2:-:1  @P0 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf;
127 | --:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
128 | --:-:-:-:0      FFMA R16, R66, R77.reuse, R16;
129 | --:-:3:-:1  @P0 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf;
130 | --:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
131 | --:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
132 | --:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
133 | --:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
134 | --:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
135 | --:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
136 | --:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
137 | --:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
138 | --:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
139 | --:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
140 | --:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
141 | --:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
142 | --:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
143 | --:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
144 | --:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
145 | --:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
146 | --:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
147 | --:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
148 | --:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
149 | --:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
150 | --:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
151 | --:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
152 | --:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
153 | --:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
154 | --:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
155 | --:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
156 | --:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
157 | --:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
158 | --:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
159 | --:-:-:-:1      FFMA R27, R64, R78, R27;
160 | 01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
161 | --:-:-:-:1      LDS.U.128 R64, [R106+0x400];
162 | --:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
163 | --:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
164 | --:-:-:-:1      LDS.U.128 R72, [R107+0x400];
165 | --:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
166 | --:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
167 | --:-:-:-:1      LDS.U.128 R68, [R106+0x500];
168 | --:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
169 | --:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
170 | --:-:1:-:1      LDS.U.128 R76, [R107+0x500];
171 | --:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
172 | --:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
173 | --:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
174 | --:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
175 | --:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
176 | --:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
177 | --:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
178 | --:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
179 | --:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
180 | --:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
181 | --:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
182 | --:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
183 | --:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
184 | --:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
185 | --:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
186 | --:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
187 | --:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
188 | --:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
189 | --:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
190 | --:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
191 | --:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
192 | --:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
193 | --:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
194 | --:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
195 | --:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
196 | --:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
197 | --:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
198 | --:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
199 | --:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
200 | --:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
201 | --:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
202 | --:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
203 | --:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
204 | --:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
205 | --:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
206 | --:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
207 | --:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
208 | --:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
209 | --:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
210 | --:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
211 | --:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
212 | --:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
213 | --:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
214 | --:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
215 | --:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
216 | --:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
217 | --:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
218 | --:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
219 | --:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
220 | --:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
221 | --:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
222 | --:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
223 | --:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
224 | --:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
225 | --:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
226 | --:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
227 | --:-:-:-:1      FFMA R27, R80, R94, R27;
228 | 01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
229 | --:-:-:-:1      LDS.U.128 R80, [R106+0x600];
230 | --:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
231 | --:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
232 | --:-:-:-:1      LDS.U.128 R88, [R107+0x600];
233 | --:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
234 | --:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
235 | --:-:-:-:1      LDS.U.128 R84, [R106+0x700];
236 | --:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
237 | --:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
238 | --:-:1:-:1      LDS.U.128 R92, [R107+0x700];
239 | --:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
240 | --:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
241 | --:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
242 | --:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
243 | --:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
244 | --:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
245 | --:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
246 | --:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
247 | --:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
248 | --:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
249 | --:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
250 | --:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
251 | --:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
252 | --:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
253 | --:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
254 | --:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
255 | --:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
256 | --:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
257 | --:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
258 | --:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
259 | --:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
260 | --:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
261 | --:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
262 | --:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
263 | --:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
264 | --:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
265 | --:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
266 | --:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
267 | --:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
268 | --:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
269 | --:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
270 | --:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
271 | --:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
272 | --:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
273 | --:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
274 | --:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
275 | --:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
276 | --:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
277 | --:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
278 | --:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
279 | --:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
280 | --:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
281 | --:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
282 | --:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
283 | --:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
284 | --:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
285 | --:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
286 | --:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
287 | --:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
288 | --:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
289 | --:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
290 | --:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
291 | --:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
292 | --:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
293 | --:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
294 | --:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
295 | --:-:-:-:1      FFMA R27, R64, R78, R27;
296 | 01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
297 | --:-:-:-:1      LDS.U.128 R64, [R106+0x800];
298 | --:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
299 | --:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
300 | --:-:-:-:1      LDS.U.128 R72, [R107+0x800];
301 | --:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
302 | --:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
303 | --:-:-:-:1      LDS.U.128 R68, [R106+0x900];
304 | --:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
305 | --:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
306 | --:-:1:-:1      LDS.U.128 R76, [R107+0x900];
307 | --:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
308 | --:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
309 | --:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
310 | --:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
311 | --:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
312 | --:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
313 | --:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
314 | --:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
315 | --:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
316 | --:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
317 | --:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
318 | --:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
319 | --:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
320 | --:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
321 | --:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
322 | --:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
323 | --:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
324 | --:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
325 | --:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
326 | --:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
327 | --:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
328 | --:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
329 | --:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
330 | --:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
331 | --:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
332 | --:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
333 | --:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
334 | --:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
335 | --:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
336 | --:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
337 | --:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
338 | --:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
339 | --:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
340 | --:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
341 | --:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
342 | --:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
343 | --:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
344 | --:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
345 | --:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
346 | --:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
347 | --:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
348 | --:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
349 | --:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
350 | --:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
351 | --:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
352 | --:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
353 | --:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
354 | --:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
355 | --:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
356 | --:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
357 | --:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
358 | --:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
359 | --:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
360 | --:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
361 | --:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
362 | --:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
363 | --:-:-:-:1      FFMA R27, R80, R94, R27;
364 | 01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
365 | --:-:-:-:1      LDS.U.128 R80, [R106+0xa00];
366 | --:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
367 | --:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
368 | --:-:-:-:1      LDS.U.128 R88, [R107+0xa00];
369 | --:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
370 | --:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
371 | --:-:-:-:1      LDS.U.128 R84, [R106+0xb00];
372 | --:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
373 | --:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
374 | --:-:1:-:1      LDS.U.128 R92, [R107+0xb00];
375 | --:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
376 | --:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
377 | --:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
378 | --:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
379 | --:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
380 | --:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
381 | --:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
382 | --:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
383 | --:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
384 | --:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
385 | --:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
386 | --:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
387 | --:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
388 | --:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
389 | --:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
390 | --:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
391 | --:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
392 | --:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
393 | --:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
394 | --:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
395 | --:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
396 | --:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
397 | --:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
398 | --:-:-:-:1      FFMA R10, R64.reuse, R75, R10;
399 | --:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
400 | --:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
401 | --:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
402 | --:-:-:-:1      FFMA R18, R64.reuse, R77.reuse, R18;
403 | --:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
404 | --:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
405 | --:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
406 | --:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
407 | --:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
408 | --:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
409 | --:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
410 | --:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
411 | --:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
412 | --:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
413 | --:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
414 | --:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
415 | --:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
416 | --:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
417 | --:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
418 | --:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
419 | --:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
420 | --:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
421 | --:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
422 | --:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
423 | --:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
424 | --:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
425 | --:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
426 | --:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
427 | --:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
428 | --:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
429 | --:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
430 | --:-:-:-:1      FFMA R26, R64.reuse, R79, R26;
431 | --:-:-:-:1      FFMA R27, R64, R78, R27;
432 | 01:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
433 | --:-:-:-:1      LDS.U.128 R64, [R106+0xc00];
434 | --:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
435 | --:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
436 | --:-:-:-:1      LDS.U.128 R72, [R107+0xc00];
437 | --:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
438 | --:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
439 | --:-:-:-:1      LDS.U.128 R68, [R106+0xd00];
440 | --:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
441 | --:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
442 | --:-:1:-:1      LDS.U.128 R76, [R107+0xd00];
443 | --:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
444 | --:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
445 | --:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
446 | --:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
447 | --:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
448 | --:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
449 | --:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
450 | --:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
451 | --:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
452 | --:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
453 | --:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
454 | --:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
455 | --:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
456 | --:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
457 | --:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
458 | --:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
459 | --:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
460 | --:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
461 | --:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
462 | --:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
463 | --:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
464 | --:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
465 | --:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
466 | --:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
467 | --:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
468 | --:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
469 | --:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
470 | --:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
471 | --:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
472 | --:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
473 | --:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
474 | --:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
475 | --:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
476 | --:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
477 | --:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
478 | --:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
479 | --:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
480 | --:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
481 | --:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
482 | --:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
483 | --:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
484 | --:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
485 | --:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
486 | --:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
487 | --:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
488 | --:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
489 | --:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
490 | --:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
491 | --:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
492 | --:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
493 | --:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
494 | --:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
495 | --:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
496 | --:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
497 | --:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
498 | --:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
499 | --:-:-:-:1      FFMA R27, R80, R94, R27;
500 | 01:-:-:-:0      FFMA R1, R66.reuse, R72.reuse, R1;
501 | --:-:-:-:1      LDS.U.128 R80, [R106+0xe00];
502 | --:-:-:-:1      FFMA R0, R66, R73.reuse, R0;
503 | --:-:-:-:0      FFMA R2, R64.reuse, R73.reuse, R2;
504 | --:-:-:-:1      LDS.U.128 R88, [R107+0xe00];
505 | --:-:-:-:1      FFMA R3, R64, R72.reuse, R3;
506 | --:-:-:-:0      FFMA R5, R67.reuse, R72.reuse, R5;
507 | --:-:-:-:1      LDS.U.128 R84, [R106+0xf00];
508 | --:-:-:-:1      FFMA R4, R67, R73.reuse, R4;
509 | --:-:-:-:0      FFMA R6, R65.reuse, R73.reuse, R6;
510 | --:-:1:-:1      LDS.U.128 R92, [R107+0xf00];
511 | --:-:-:-:1      FFMA R7, R65, R72.reuse, R7;
512 | --:-:-:-:1      FFMA R33, R70.reuse, R72.reuse, R33;
513 | --:-:-:-:1      FFMA R32, R70, R73.reuse, R32;
514 | --:-:-:-:1      FFMA R34, R68.reuse, R73.reuse, R34;
515 | --:-:-:-:1      FFMA R35, R68, R72.reuse, R35;
516 | --:-:-:-:1      FFMA R37, R71.reuse, R72.reuse, R37;
517 | --:-:-:-:1      FFMA R36, R71.reuse, R73.reuse, R36;
518 | --:-:-:-:1      FFMA R38, R69.reuse, R73, R38;
519 | --:-:-:-:1      FFMA R39, R69.reuse, R72, R39;
520 | --:-:-:-:1      FFMA R45, R71.reuse, R74.reuse, R45;
521 | --:-:-:-:1      FFMA R44, R71, R75.reuse, R44;
522 | --:-:-:-:1      FFMA R46, R69.reuse, R75.reuse, R46;
523 | --:-:-:-:1      FFMA R47, R69, R74.reuse, R47;
524 | --:-:-:-:1      FFMA R41, R70.reuse, R74.reuse, R41;
525 | --:-:-:-:1      FFMA R40, R70, R75.reuse, R40;
526 | --:-:-:-:1      FFMA R42, R68.reuse, R75.reuse, R42;
527 | --:-:-:-:1      FFMA R43, R68, R74.reuse, R43;
528 | --:-:-:-:1      FFMA R13, R67.reuse, R74.reuse, R13;
529 | --:-:-:-:1      FFMA R12, R67, R75.reuse, R12;
530 | --:-:-:-:1      FFMA R14, R65.reuse, R75.reuse, R14;
531 | --:-:-:-:1      FFMA R15, R65, R74.reuse, R15;
532 | --:-:-:-:1      FFMA R9, R66.reuse, R74.reuse, R9;
533 | --:-:-:-:1      FFMA R8, R66.reuse, R75.reuse, R8;
534 | --:-:-:-:0      FFMA R10, R64.reuse, R75, R10;
535 | 02:-:-:-:1  @P0 STS.128 [R111], R96;
536 | --:-:-:-:1      FFMA R11, R64.reuse, R74, R11;
537 | --:-:-:Y:1      FFMA R17, R66.reuse, R76.reuse, R17;
538 | --:-:-:-:1      FFMA R16, R66, R77.reuse, R16;
539 | --:-:-:-:0      FFMA R18, R64.reuse, R77.reuse, R18;
540 | 04:-:-:-:1  @P0 STS.128 [R111+0x800], R100;
541 | --:-:-:-:1      FFMA R19, R64, R76.reuse, R19;
542 | --:-:-:-:1      FFMA R21, R67.reuse, R76.reuse, R21;
543 | --:-:-:-:1      FFMA R20, R67, R77.reuse, R20;
544 | --:-:-:-:1      FFMA R22, R65.reuse, R77.reuse, R22;
545 | --:-:-:-:1      FFMA R23, R65, R76.reuse, R23;
546 | --:-:-:-:1      FFMA R49, R70.reuse, R76.reuse, R49;
547 | --:-:-:-:1      FFMA R48, R70, R77.reuse, R48;
548 | --:-:-:-:1      FFMA R50, R68.reuse, R77.reuse, R50;
549 | --:-:-:-:1      FFMA R51, R68, R76.reuse, R51;
550 | --:-:-:-:1      FFMA R53, R71.reuse, R76.reuse, R53;
551 | --:-:-:-:1      FFMA R52, R71.reuse, R77.reuse, R52;
552 | --:-:-:-:1      FFMA R54, R69.reuse, R77, R54;
553 | --:-:-:-:1      FFMA R55, R69.reuse, R76, R55;
554 | --:-:-:-:1      FFMA R61, R71.reuse, R78.reuse, R61;
555 | --:-:-:-:1      FFMA R60, R71, R79.reuse, R60;
556 | --:-:-:-:1      FFMA R62, R69.reuse, R79.reuse, R62;
557 | --:-:-:-:1      FFMA R63, R69, R78.reuse, R63;
558 | --:-:-:-:1      FFMA R57, R70.reuse, R78.reuse, R57;
559 | --:-:-:-:1      FFMA R56, R70, R79.reuse, R56;
560 | --:-:-:-:1      FFMA R58, R68.reuse, R79.reuse, R58;
561 | --:-:-:-:1      FFMA R59, R68, R78.reuse, R59;
562 | --:-:-:-:1      FFMA R29, R67.reuse, R78.reuse, R29;
563 | --:-:-:-:1      FFMA R28, R67, R79.reuse, R28;
564 | --:-:-:-:1      FFMA R30, R65.reuse, R79.reuse, R30;
565 | --:-:-:-:1      FFMA R31, R65, R78.reuse, R31;
566 | --:-:-:-:1      FFMA R25, R66.reuse, R78.reuse, R25;
567 | --:-:-:-:1      FFMA R24, R66, R79.reuse, R24;
568 | --:-:-:-:0      FFMA R26, R64.reuse, R79, R26;
569 | 01:-:-:-:5      BAR.SYNC 0x0;
570 | --:-:-:-:1  @P0 LOP.XOR R106, R106, 0x2000;
571 | --:-:-:-:1  @P0 LOP.XOR R107, R107, 0x2000;
572 | --:-:-:-:1  @P0 LOP.XOR R111, R111, 0x2000;
573 | --:-:-:-:1      FFMA R27, R64, R78, R27;
574 | --:-:-:-:0      FFMA R1, R82.reuse, R88.reuse, R1;
575 | --:-:-:-:1  @P0 LDS.U.128 R64, [R106];
576 | --:-:-:-:1      FFMA R0, R82, R89.reuse, R0;
577 | --:-:-:-:0      FFMA R2, R80.reuse, R89.reuse, R2;
578 | --:-:-:-:1  @P0 LDS.U.128 R72, [R107];
579 | --:-:-:-:1      FFMA R3, R80, R88.reuse, R3;
580 | --:-:-:-:0      FFMA R5, R83.reuse, R88.reuse, R5;
581 | --:-:-:-:1  @P0 LDS.U.128 R68, [R106+0x100];
582 | --:-:-:-:1      FFMA R4, R83, R89.reuse, R4;
583 | --:-:-:-:0      FFMA R6, R81.reuse, R89.reuse, R6;
584 | --:-:1:-:1  @P0 LDS.U.128 R76, [R107+0x100];
585 | --:-:-:-:1      FFMA R7, R81, R88.reuse, R7;
586 | --:-:-:-:1      FFMA R33, R86.reuse, R88.reuse, R33;
587 | --:-:-:-:1      FFMA R32, R86, R89.reuse, R32;
588 | --:-:-:-:1      FFMA R34, R84.reuse, R89.reuse, R34;
589 | --:-:-:-:1      FFMA R35, R84, R88.reuse, R35;
590 | --:-:-:-:1      FFMA R37, R87.reuse, R88.reuse, R37;
591 | --:-:-:-:1      FFMA R36, R87.reuse, R89.reuse, R36;
592 | --:-:-:-:1      FFMA R38, R85.reuse, R89, R38;
593 | --:-:-:-:1      FFMA R39, R85.reuse, R88, R39;
594 | --:-:-:-:1      FFMA R45, R87.reuse, R90.reuse, R45;
595 | --:-:-:-:1      FFMA R44, R87, R91.reuse, R44;
596 | --:-:-:-:1      FFMA R46, R85.reuse, R91.reuse, R46;
597 | --:-:-:-:1      FFMA R47, R85, R90.reuse, R47;
598 | --:-:-:-:1      FFMA R41, R86.reuse, R90.reuse, R41;
599 | --:-:-:-:1      FFMA R40, R86, R91.reuse, R40;
600 | --:-:-:-:1      FFMA R42, R84.reuse, R91.reuse, R42;
601 | --:-:-:-:1      FFMA R43, R84, R90.reuse, R43;
602 | --:-:-:-:1      FFMA R13, R83.reuse, R90.reuse, R13;
603 | --:-:-:-:1      FFMA R12, R83, R91.reuse, R12;
604 | --:-:-:-:1      FFMA R14, R81.reuse, R91.reuse, R14;
605 | --:-:-:-:1      FFMA R15, R81, R90.reuse, R15;
606 | --:-:-:-:1      FFMA R9, R82.reuse, R90.reuse, R9;
607 | --:-:-:-:1      FFMA R8, R82.reuse, R91.reuse, R8;
608 | --:-:-:-:1      FFMA R10, R80.reuse, R91, R10;
609 | --:-:-:-:1      FFMA R11, R80.reuse, R90, R11;
610 | --:-:-:Y:1      FFMA R17, R82.reuse, R92.reuse, R17;
611 | --:-:-:-:1      FFMA R16, R82, R93.reuse, R16;
612 | --:-:-:-:1      FFMA R18, R80.reuse, R93.reuse, R18;
613 | --:-:-:-:1      FFMA R19, R80, R92.reuse, R19;
614 | --:-:-:-:1      FFMA R21, R83.reuse, R92.reuse, R21;
615 | --:-:-:-:1      FFMA R20, R83, R93.reuse, R20;
616 | --:-:-:-:1      FFMA R22, R81.reuse, R93.reuse, R22;
617 | --:-:-:-:1      FFMA R23, R81, R92.reuse, R23;
618 | --:-:-:-:1      FFMA R49, R86.reuse, R92.reuse, R49;
619 | --:-:-:-:1      FFMA R48, R86, R93.reuse, R48;
620 | --:-:-:-:1      FFMA R50, R84.reuse, R93.reuse, R50;
621 | --:-:-:-:1      FFMA R51, R84, R92.reuse, R51;
622 | --:-:-:-:1      FFMA R53, R87.reuse, R92.reuse, R53;
623 | --:-:-:-:1      FFMA R52, R87.reuse, R93.reuse, R52;
624 | --:-:-:-:1      FFMA R54, R85.reuse, R93, R54;
625 | --:-:-:-:1      FFMA R55, R85.reuse, R92, R55;
626 | --:-:-:-:1      FFMA R61, R87.reuse, R94.reuse, R61;
627 | --:-:-:-:1      FFMA R60, R87, R95.reuse, R60;
628 | --:-:-:-:1      FFMA R62, R85.reuse, R95.reuse, R62;
629 | --:-:-:-:1      FFMA R63, R85, R94.reuse, R63;
630 | --:-:-:-:1      FFMA R57, R86.reuse, R94.reuse, R57;
631 | --:-:-:-:1      FFMA R56, R86, R95.reuse, R56;
632 | --:-:-:-:1      FFMA R58, R84.reuse, R95.reuse, R58;
633 | --:-:-:-:1      FFMA R59, R84, R94.reuse, R59;
634 | --:-:-:-:1      FFMA R29, R83.reuse, R94.reuse, R29;
635 | --:-:-:-:1      FFMA R28, R83, R95.reuse, R28;
636 | --:-:-:-:1      FFMA R30, R81.reuse, R95.reuse, R30;
637 | --:-:-:-:1      FFMA R31, R81, R94.reuse, R31;
638 | --:-:-:-:1      FFMA R25, R82.reuse, R94.reuse, R25;
639 | --:-:-:-:1      FFMA R24, R82, R95.reuse, R24;
640 | --:-:-:-:1      FFMA R26, R80.reuse, R95, R26;
641 | --:-:-:-:1      FFMA R27, R80, R94, R27;
642 | --:-:-:-:1  @P0 IADD R104, R104, R109.reuse;
643 | --:-:-:-:0  @P0 IADD R108, R108, R109;
644 | --:-:-:Y:5  @P0 BRA TARGET1;
645 | --:-:-:-:1      SHR.U32 R84, R115, 0x2;
646 | --:-:-:-:1      MOV R77, c[0x0][0x158];
647 | --:-:-:-:1      SHR.U32 R80, R116.reuse, 0x1;
648 | --:-:-:-:1      MOV R72, c[0x0][0x15c];
649 | --:-:-:-:1      SHL R89, R116, 0x4;
650 | --:-:-:-:1      LOP.AND R106, R106, 0xfff;
651 | --:-:-:-:1      LOP.OR R84, R117, R84;
652 | --:-:-:-:1      SHL R81, R77.reuse, 0x2;
653 | --:-:-:-:1      LOP.AND R107, R107, 0xfff;
654 | --:-:-:-:1      ISCADD R80, R114, R80, 0x7;
655 | --:-:-:-:1      FMUL R64, R3, R72.reuse;
656 | --:-:-:-:1      SHL R74, R77.reuse, 0x4;
657 | --:-:-:-:1      LOP.OR R89, R89, R84;
658 | --:-:-:-:1      ISCADD R84, R113, R84, 0x7;
659 | --:-:-:-:1      FMUL R65, R7, R72.reuse;
660 | --:-:-:-:1      SHL R88, R77, 0x5;
661 | --:-:-:-:1      XMAD.MRG R78, R80.reuse, R77.H1.reuse, RZ;
662 | --:-:-:-:1      ISCADD R90, R107, R106, 0x5;
663 | --:-:-:-:1      FMUL R66, R1, R72.reuse;
664 | --:-:-:-:1      SHL R89, R89, 0x2;
665 | --:-:-:-:1      XMAD R73, R80, R77, R84;
666 | --:-:-:-:1      ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT;
667 | --:-:-:-:1      IADD R84, R84, 0x40;
668 | --:-:-:-:1      ISCADD R85, R77, -R74, 0x8;
669 | --:-:-:-:1      FMUL R67, R5, R72.reuse;
670 | --:-:-:-:1      FMUL R68, R35, R72.reuse;
671 | --:-:-:-:1      XMAD.PSL.CBCC R73, R80.H1, R78.H1, R73;
672 | --:-:-:-:1      IADD R80, R80, -0x1;
673 | --:-:-:-:1      ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT;
674 | --:-:-:-:1      FMUL R69, R39, R72.reuse;
675 | --:-:-:-:1      FMUL R70, R33, R72.reuse;
676 | --:-:-:-:1      FMUL R71, R37, R72;
677 | --:-:-:-:1      ISCADD R76, R73, c[0x0][0x140], 0x2;
678 | --:-:-:-:1      IADD R83, R80.reuse, 0x4;
679 | --:-:-:-:1      IADD R86, R80.reuse, 0x8;
680 | --:-:-:-:3      IADD R87, R80, 0xc;
681 | --:-:-:Y:6      IADD R76, R76, -R81;
682 | --:-:-:-:1      IADD R75, R76.reuse, R74;
683 | --:-:-:Y:5      IADD R79, R76, R88.reuse;
684 | --:-:-:-:0      IADD R82, R75, R88;
685 | --:-:-:-:5      CAL TARGET2;
686 | 02:-:-:-:1      FMUL R64, R2, R72.reuse;
687 | --:-:-:-:1      FMUL R65, R6, R72.reuse;
688 | --:-:-:-:1      FMUL R66, R0, R72.reuse;
689 | --:-:-:-:1      FMUL R67, R4, R72.reuse;
690 | --:-:-:-:1      FMUL R68, R34, R72.reuse;
691 | --:-:-:-:1      FMUL R69, R38, R72.reuse;
692 | --:-:-:-:1      FMUL R70, R32, R72.reuse;
693 | --:-:-:-:0      FMUL R71, R36, R72;
694 | --:-:-:-:5      CAL TARGET2;
695 | 02:-:-:-:1      FMUL R64, R11, R72.reuse;
696 | --:-:-:-:1      FMUL R65, R15, R72.reuse;
697 | --:-:-:-:1      FMUL R66, R9, R72.reuse;
698 | --:-:-:-:1      FMUL R67, R13, R72.reuse;
699 | --:-:-:-:1      FMUL R68, R43, R72.reuse;
700 | --:-:-:-:1      FMUL R69, R47, R72.reuse;
701 | --:-:-:-:1      FMUL R70, R41, R72.reuse;
702 | --:-:-:-:0      FMUL R71, R45, R72;
703 | --:-:-:-:5      CAL TARGET2;
704 | 02:-:-:-:1      FMUL R64, R10, R72.reuse;
705 | --:-:-:-:1      FMUL R65, R14, R72.reuse;
706 | --:-:-:-:1      FMUL R66, R8, R72.reuse;
707 | --:-:-:-:1      FMUL R67, R12, R72.reuse;
708 | --:-:-:-:1      FMUL R68, R42, R72.reuse;
709 | --:-:-:-:1      FMUL R69, R46, R72.reuse;
710 | --:-:-:-:1      FMUL R70, R40, R72.reuse;
711 | --:-:-:-:0      FMUL R71, R44, R72;
712 | --:-:-:-:5      CAL TARGET2;
713 | --:-:-:-:1      IADD R80, R80, 0x3c;
714 | --:-:-:-:1      IADD R83, R83, 0x3c;
715 | --:-:-:-:1      IADD R86, R86, 0x3c;
716 | --:-:-:-:1      IADD R87, R87, 0x3c;
717 | 02:-:-:-:1      IADD R76, R76, R85.reuse;
718 | --:-:-:-:1      IADD R75, R75, R85.reuse;
719 | --:-:-:-:1      IADD R79, R79, R85.reuse;
720 | --:-:-:-:1      IADD R82, R82, R85;
721 | --:-:-:-:1      FMUL R64, R19, R72.reuse;
722 | --:-:-:-:1      FMUL R65, R23, R72.reuse;
723 | --:-:-:-:1      FMUL R66, R17, R72.reuse;
724 | --:-:-:-:1      FMUL R67, R21, R72.reuse;
725 | --:-:-:-:1      FMUL R68, R51, R72.reuse;
726 | --:-:-:-:1      FMUL R69, R55, R72.reuse;
727 | --:-:-:-:1      FMUL R70, R49, R72.reuse;
728 | --:-:-:-:0      FMUL R71, R53, R72;
729 | --:-:-:-:5      CAL TARGET2;
730 | 02:-:-:-:1      FMUL R64, R18, R72.reuse;
731 | --:-:-:-:1      FMUL R65, R22, R72.reuse;
732 | --:-:-:-:1      FMUL R66, R16, R72.reuse;
733 | --:-:-:-:1      FMUL R67, R20, R72.reuse;
734 | --:-:-:-:1      FMUL R68, R50, R72.reuse;
735 | --:-:-:-:1      FMUL R69, R54, R72.reuse;
736 | --:-:-:-:1      FMUL R70, R48, R72.reuse;
737 | --:-:-:-:0      FMUL R71, R52, R72;
738 | --:-:-:-:5      CAL TARGET2;
739 | 02:-:-:-:1      FMUL R64, R27, R72.reuse;
740 | --:-:-:-:1      FMUL R65, R31, R72.reuse;
741 | --:-:-:-:1      FMUL R66, R25, R72.reuse;
742 | --:-:-:-:1      FMUL R67, R29, R72.reuse;
743 | --:-:-:-:1      FMUL R68, R59, R72.reuse;
744 | --:-:-:-:1      FMUL R69, R63, R72.reuse;
745 | --:-:-:-:1      FMUL R70, R57, R72.reuse;
746 | --:-:-:-:0      FMUL R71, R61, R72;
747 | --:-:-:-:5      CAL TARGET2;
748 | 02:-:-:-:1      FMUL R64, R26, R72.reuse;
749 | --:-:-:-:1      FMUL R65, R30, R72.reuse;
750 | --:-:-:-:1      FMUL R66, R24, R72.reuse;
751 | --:-:-:-:1      FMUL R67, R28, R72.reuse;
752 | --:-:-:-:1      FMUL R68, R58, R72.reuse;
753 | --:-:-:-:1      FMUL R69, R62, R72.reuse;
754 | --:-:-:-:1      FMUL R70, R56, R72.reuse;
755 | --:-:-:-:0      FMUL R71, R60, R72;
756 | --:-:-:-:5      CAL TARGET2;
757 | --:-:-:-:5      EXIT;
758 | TARGET2:  // Store subroutine (entered via CAL): stages R64-R71 through shared memory, then writes eight words to global memory under predicates P0-P3.
759 | --:-:-:-:0      IADD R80, R80, 0x1;  // bump the four indices R80/R83/R86/R87 by 1 per call (callers set R83/R86/R87 = R80 + 4/8/0xc)
760 | --:-:-:-:1      STS.128 [R90], R64;  // stage R64-R67 in shared memory at R90
761 | --:-:-:-:0      IADD R83, R83, 0x1;  // stall field 0 on these IADDs: presumably dual-issued with the following memory op (maxas control notation) - confirm
762 | --:-:-:-:1      STS.128 [R90+0x100], R68;  // stage R68-R71 at R90+0x100
763 | --:-:-:-:0      IADD R86, R86, 0x1;
764 | --:-:-:-:1      LDS R64, [R89];  // read back eight single words from R89 in 0x100-byte steps into R64..R71. NOTE(review): no BAR.SYNC between the STS and LDS - presumably the exchange stays within one warp; confirm
765 | --:-:-:-:0      IADD R87, R87, 0x1;
766 | --:-:-:-:1      LDS R65, [R89+0x100];
767 | --:-:-:-:0      IADD R76, R76, R81.reuse;  // advance the four output pointers R76/R75/R79/R82 by R81 (callers set R81 = c[0x0][0x158] << 2)
768 | --:-:-:-:1      LDS R66, [R89+0x200];
769 | --:-:-:-:0      IADD R75, R75, R81.reuse;
770 | --:-:-:-:1      LDS R67, [R89+0x300];
771 | --:-:-:-:0      IADD R79, R79, R81.reuse;
772 | --:-:-:-:1      LDS R68, [R89+0x400];
773 | --:-:-:-:0      IADD R82, R82, R81;
774 | --:-:-:-:1      LDS R69, [R89+0x500];
775 | --:-:-:-:1      ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5;  // P0 = (R80 < c[0x0][0x148]) && P5; presumably a bounds check - confirm
776 | --:-:-:-:1      LDS R70, [R89+0x600];
777 | --:-:-:-:1      ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6;  // P1 = same comparison, combined with P6 instead of P5
778 | --:-:1:-:1      LDS R71, [R89+0x700];  // third control field = 1: sets barrier 1, which the first STG below waits on
779 | --:-:-:-:2      ISETP.LT.AND P2, PT, R83.reuse, c[0x0][0x148], P5;  // P2/P3: same test for index R83
780 | --:-:-:Y:7      ISETP.LT.AND P3, PT, R83, c[0x0][0x148], P6;
781 | 01:-:-:-:1  @P0 STG.CG [R76], R64;  // leading 01 = wait on barrier 1 (set by the last LDS) before storing the reloaded words
782 | --:-:-:-:1      ISETP.LT.AND P0, PT, R86.reuse, c[0x0][0x148], P5;  // P0 is recomputed for index R86 once the store it guarded has been issued
783 | --:-:-:-:1  @P1 STG.CG [R76+0x100], R65;
784 | --:-:-:-:1      ISETP.LT.AND P1, PT, R86, c[0x0][0x148], P6;  // P1 recomputed for index R86
785 | --:-:-:-:1  @P2 STG.CG [R75], R66;
786 | --:-:-:-:1      ISETP.LT.AND P2, PT, R87.reuse, c[0x0][0x148], P5;  // P2/P3 recomputed for index R87
787 | --:-:-:-:1  @P3 STG.CG [R75+0x100], R67;
788 | --:-:-:Y:7      ISETP.LT.AND P3, PT, R87, c[0x0][0x148], P6;
789 | --:-:-:-:2  @P0 STG.CG [R79], R68;  // second half: stores for indices R86 (R79) and R87 (R82)
790 | --:-:-:-:2  @P1 STG.CG [R79+0x100], R69;
791 | --:-:-:-:2  @P2 STG.CG [R82], R70;
792 | --:2:-:-:1  @P3 STG.CG [R82+0x100], R71;  // second control field = 2: sets barrier 2; the callers' next FMUL into R64 carries wait mask 02
793 | --:-:-:-:5      RET;  // return to the instruction after the CAL
794 | 


--------------------------------------------------------------------------------
/t/MaxAs-MaxAs.t:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | 
4 | use Test::More;    # the plan is declared in the BEGIN block below, before the first test runs
5 | BEGIN { plan( tests => 1 ); use_ok('MaxAs::MaxAs'); }    # smoke test: the module must load at compile time
6 | 


--------------------------------------------------------------------------------