├── .gitignore ├── Changes ├── LICENSE ├── MANIFEST ├── Makefile.PL ├── README.md ├── bin └── maxas.pl ├── cpanfile ├── lib └── MaxAs │ ├── Cubin.pm │ ├── MaxAs.pm │ └── MaxAsGrammar.pm ├── microbench ├── microbench.cpp ├── microbench.cu ├── microbench.sass ├── shared.pl ├── shared_lds.sass ├── shared_sts16.sass ├── throughput.pl ├── throughput.sass ├── throughput2.pl ├── throughput2.sass ├── throughput3.pl ├── throughput4.pl ├── throughput5.pl ├── xmad.pl └── xmad2.sass ├── sgemm ├── batched_gemm.xlsx ├── cublas_sgemm.ptx ├── sgemm.cpp ├── sgemm.cu ├── sgemm.pl ├── sgemm.sln ├── sgemm.vcxproj ├── sgemm128.sass ├── sgemm64.sass ├── sgemm_final_128.sass ├── sgemm_final_64.sass ├── sgemm_pre_128.sass └── sgemm_pre_64.sass └── t └── MaxAs-MaxAs.t /.gitignore: -------------------------------------------------------------------------------- 1 | Makefile 2 | Makefile.old 3 | pm_to_blib 4 | blib 5 | MYMETA.* 6 | -------------------------------------------------------------------------------- /Changes: -------------------------------------------------------------------------------- 1 | Revision history for Perl extension MaxAs::MaxAs. 
2 | 3 | 1.01 Thu Mar 26 17:09:57 2015 4 | - original Perl packaged version 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Scott Gray 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | bin/maxas.pl 2 | Changes 3 | lib/MaxAs/Cubin.pm 4 | lib/MaxAs/MaxAs.pm 5 | lib/MaxAs/MaxAsGrammar.pm 6 | LICENSE 7 | Makefile.PL 8 | MANIFEST 9 | microbench/microbench.cpp 10 | microbench/microbench.cu 11 | microbench/microbench.sass 12 | microbench/shared.pl 13 | microbench/shared_lds.sass 14 | microbench/shared_sts16.sass 15 | microbench/throughput.pl 16 | microbench/throughput.sass 17 | microbench/throughput2.pl 18 | microbench/throughput2.sass 19 | microbench/throughput3.pl 20 | microbench/throughput4.pl 21 | microbench/throughput5.pl 22 | microbench/xmad.pl 23 | microbench/xmad2.sass 24 | README.md 25 | sgemm/batched_gemm.xlsx 26 | sgemm/cublas_sgemm.ptx 27 | sgemm/sgemm.cpp 28 | sgemm/sgemm.cu 29 | sgemm/sgemm.pl 30 | sgemm/sgemm.sln 31 | sgemm/sgemm.vcxproj 32 | sgemm/sgemm128.sass 33 | sgemm/sgemm64.sass 34 | sgemm/sgemm_final_128.sass 35 | sgemm/sgemm_final_64.sass 36 | sgemm/sgemm_pre_128.sass 37 | sgemm/sgemm_pre_64.sass 38 | t/MaxAs-MaxAs.t 39 | -------------------------------------------------------------------------------- /Makefile.PL: -------------------------------------------------------------------------------- 1 | require 5.10.0; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'MaxAs::MaxAs', 7 | VERSION_FROM => 'lib/MaxAs/MaxAs.pm', # finds $VERSION 8 | EXE_FILES => ['bin/maxas.pl'], 9 | PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, 10 | LICENSE => 'MIT', 11 | ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 12 | (ABSTRACT_FROM => 'lib/MaxAs/MaxAs.pm', # retrieve abstract from module 13 | AUTHOR => 'Scott Gray ') : ()), 14 | ); 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 5 | # MaxAs 6 | Assembler for NVIDIA Maxwell architecture 7 | 8 | To install (system-wide): 9 | 10 | sudo cpanm git://github.com/NervanaSystems/maxas.git 11 | 12 | or 13 | 14 | perl Makefile.PL 15 | make 16 | sudo make install 17 | 18 | 19 | See wiki pages for more information: 20 | 21 | - [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction) 22 | - [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started) 23 | - [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes) 24 | - [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM) 25 | 26 | Related work with lots of additional shader assembly (sass) examples: 27 | 28 | - [Nervana Neon](https://github.com/NervanaSystems/neon) 29 | 30 | This project is released under the [MIT License](http://opensource.org/licenses/MIT). 
#!/usr/bin/perl
# maxas.pl - command line front end for the MaxAs assembler.
#
# The first argument selects the mode (list, test, extract, sass extract,
# insert, preprocess, version); the remaining arguments are consumed in a
# fixed order by each mode.  Run without arguments for the usage text.
use strict;
use MaxAs::Cubin;
use MaxAs::MaxAs;
use Data::Dumper;
use File::Spec;

require 5.10.0;

$Data::Dumper::Sortkeys = 1;

my $mode = shift;
$mode = '' unless defined $mode;

# List cubin contents
if ($mode =~ /^\-?\-l/i)
{
    my $cubinFile = shift or usage();

    my $cubin = MaxAs::Cubin->new($cubinFile);

    my $arch    = $cubin->arch;
    my $class   = $cubin->class;
    my $asize   = $cubin->address_size;
    my $kernels = $cubin->listKernels;
    my $symbols = $cubin->listSymbols;

    printf "%s: arch:sm_%d machine:%dbit address_size:%dbit\n", $cubinFile, $arch, $class, $asize;

    foreach my $ker (sort keys %$kernels)
    {
        printf "Kernel: %s (Linkage: %s, Params: %d, Size: %d, Registers: %d, SharedMem: %d, Barriers: %d)\n",
            $ker, @{$kernels->{$ker}}{qw(Linkage ParamCnt size RegCnt SharedSize BarCnt)};
    }
    foreach my $sym (sort keys %$symbols)
    {
        printf "Symbol: %s\n", $sym;
    }
}
# Test that the assembler can reproduce the op codes this cubin or sass contains
elsif ($mode =~ /^\-?\-t/i)
{
    # Flags are only recognised in this order: --reg then --all.
    my $reg  = takeFlag(qr/^\-?\-r/i);
    my $all  = takeFlag(qr/^\-?\-a/i);
    my $file = shift or usage();
    my $fh;
    # sass file
    if (-T $file)
    {
        $fh = openInput($file);
    }
    # cubin file
    else
    {
        my $cubin = MaxAs::Cubin->new($file);
        $fh = openCuobjdump($cubin->arch, $file);
    }
    exit(MaxAs::MaxAs::Test($fh, $reg, $all) ? 1 : 0);
}
# Extract an asm file containing the desired kernel
elsif ($mode =~ /^\-?\-e/i)
{
    my $kernelName;
    if (takeFlag(qr/^\-?\-k/i))
    {
        $kernelName = shift or usage();
    }
    my $cubinFile = shift or usage();
    my $asmFile   = shift;
    my $cubin     = MaxAs::Cubin->new($cubinFile);
    my $arch      = $cubin->arch;
    my $kernels   = $cubin->listKernels;

    # default the kernel name if not specified.
    $kernelName ||= (sort keys %$kernels)[0];

    my $kernel = $kernels->{$kernelName} or die "bad kernel: $kernelName";

    my $in  = openCuobjdump($arch, $cubinFile, $kernelName);
    my $out = openOutput($asmFile);

    print $out "# Kernel: $kernelName\n# Arch: sm_$arch\n";

    print $out "# $_: $kernel->{$_}\n" foreach (qw(InsCnt RegCnt SharedSize BarCnt));

    print $out "# Params($kernel->{ParamCnt}):\n#\tord:addr:size:align\n";

    print $out join('', map "#\t$_\n", @{$kernel->{Params}}) if $kernel->{Params};

    print $out "#\n# Instructions:\n\n";

    MaxAs::MaxAs::Extract($in, $out, $kernel->{Params});

    close $out if $asmFile;
    close $in;
}
# Extract a kernel from a sass dump
elsif ($mode =~ /^\-?\-s/i)
{
    my $sassFile = shift or usage();
    my $asmFile  = shift;

    my $in  = openInput($sassFile);
    my $out = openOutput($asmFile);

    MaxAs::MaxAs::Extract($in, $out, []);

    close $out if $asmFile;
    close $in;
}
# Insert the kernel asm back into the cubin:
elsif ($mode =~ /^\-?\-i/i)
{
    # Options are only recognised in this order: -w, -k <name>, -n, -D<name> <value> ...
    my $nowarn = takeFlag(qr/^\-?\-w/i);
    my $kernelName;
    if (takeFlag(qr/^\-?\-k/i))
    {
        $kernelName = shift or usage();
    }
    my $noReuse = takeFlag(qr/^\-?\-n/i);
    defineCodeVars();

    my $asmFile   = shift or usage();
    my $cubinFile = shift or usage();
    my $newCubin  = shift || $cubinFile;

    my $file;
    {
        my $fh = openInput($asmFile);
        local $/;
        $file = <$fh>;
        close $fh;
    }

    my ($vol,$dir) = File::Spec->splitpath($asmFile);
    my $include = [$vol, $dir];

    # extract the kernel name from the file
    ($kernelName) = $file =~ /^# Kernel: (\w+)/ unless $kernelName;
    die "asm file missing kernel name or is badly formatted" unless $kernelName;

    my $kernel = MaxAs::MaxAs::Assemble($file, $include, !$noReuse, $nowarn);

    my $cubin = MaxAs::Cubin->new($cubinFile);
    $kernel->{Kernel} = $cubin->getKernel($kernelName) or die "cubin does not contain kernel: $kernelName";

    $cubin->modifyKernel(%$kernel);

    $cubin->write($newCubin);

    # The kernel name is passed as an argument (not interpolated into the
    # format) and the literal percent sign is written as %%.
    printf "Kernel: %s, Instructions: %d, Register Count: %d, Bank Conflicts: %d, Reuse: %.1f%% (%d/%d)\n",
        $kernelName, @{$kernel}{qw(InsCnt RegCnt ConflictCnt ReusePct ReuseCnt ReuseTot)};
}
# Preprocessing:
elsif ($mode =~ /^\-?\-p/i)
{
    defineCodeVars();
    my $debug    = takeFlag(qr/^\-?\-d/i);
    my $asmFile  = shift or usage();
    my $asmFile2 = shift;

    die "source and destination probably shouldn't be the same file\n" if $asmFile eq $asmFile2;

    my $fh = openInput($asmFile);
    # Slurp mode deliberately stays in effect for the rest of this branch,
    # exactly as before.
    local $/;
    my $file = <$fh>;
    close $fh;

    my ($vol,$dir) = File::Spec->splitpath($asmFile);
    my $include = [$vol, $dir];

    $fh = openOutput($asmFile2);
    print $fh MaxAs::MaxAs::Preprocess($file, $include, $debug);
    close $fh;
}
# get version information
elsif ($mode =~ /^\-?\-v/i)
{
    print "$MaxAs::MaxAs::VERSION\n";
}
else
{
    print "$mode\n";
    usage();
}

exit(0);


# Consume and return the next command line argument if it matches $pattern.
# Returns undef (and leaves @ARGV untouched) otherwise.  This replaces the
# "my $x = shift if COND" construct, whose behaviour perlsyn leaves undefined.
sub takeFlag
{
    my ($pattern) = @_;
    return unless @ARGV && $ARGV[0] =~ $pattern;
    return shift @ARGV;
}

# Consume every leading "-D<name> <value>" pair from @ARGV and store the value
# in the package variable $MaxAs::MaxAs::CODE::<name>.  The symbol table is
# assigned directly so values containing quotes cannot break (or inject into)
# a string eval.  <name> is restricted to \w+ by the match.
sub defineCodeVars
{
    while (@ARGV && $ARGV[0] =~ /^\-?\-D(\w+)/)
    {
        my $name = $1;
        shift @ARGV;
        my $value = shift @ARGV;
        $value = '' unless defined $value;

        no strict 'refs';
        ${"MaxAs::MaxAs::CODE::$name"} = $value;
    }
}

# Open $path for reading with an explicit mode.  "-" still means STDIN, as it
# did with the previous two-argument open.
sub openInput
{
    my ($path) = @_;
    return \*STDIN if $path eq '-';
    open my $fh, '<', $path or die "$path: $!";
    return $fh;
}

# Open $path for writing, or return STDOUT when no path was given.
sub openOutput
{
    my ($path) = @_;
    return \*STDOUT unless $path;
    open my $fh, '>', $path or die "$path: $!";
    return $fh;
}

# Start cuobjdump on a cubin (optionally limited to one kernel) and return the
# read handle positioned after the first output line.  Prints that line and
# exits with status 1 when cuobjdump reports a fatal error.
# NOTE(review): the command still goes through the shell because the list form
# of a piped open is not available on every platform/perl this script targets;
# file and kernel names containing shell metacharacters are not supported.
sub openCuobjdump
{
    my ($arch, $cubinFile, $kernelName) = @_;

    my $cmd = "cuobjdump -arch sm_$arch -sass";
    $cmd .= " -fun $kernelName" if defined $kernelName;
    $cmd .= " $cubinFile";

    open my $fh, "$cmd |" or die "$cmd: $!";
    my $first = <$fh>;
    if ($first =~ /cuobjdump fatal/)
    {
        print $first;
        exit(1);
    }
    return $fh;
}

# Print the usage text and exit with status 1.
sub usage
{
    print <<'EOF';
Usage:

  List kernels and symbols:

    maxas.pl --list|-l <cubin_file>

  Test a cubin or sass file to see if the assembler can reproduce all of the contained opcodes.
  Also useful for extending the missing grammar rules.  Defaults to only showing failures without --all.
  With the --reg flag it will show register bank conflicts not hidden by reuse flags.

    maxas.pl --test|-t [--reg|-r] [--all|-a] <cubin_file | cuobjdump_sass_file>

  Extract a single kernel into an asm file from a cubin.
  Works much like cuobjdump but outputs in a format that can be re-assembled back into the cubin.

    maxas.pl --extract|-e [--kernel|-k kernel_name] <cubin_file> [asm_file]

  Preprocess the asm: expand CODE sections, perform scheduling. Mainly used for debugging purposes.
  Include the debug flag to print out detailed scheduler info.

    maxas.pl --pre|-p [--debug|-d] <asm_file> [new_asm_file]

  Insert the kernel asm back into the cubin.  Overwrite existing or create new cubin.
  Optionally you can skip register reuse flag auto insertion.  This allows you to observe
  performance without any reuse or you can use it to set the flags manually in your sass.

    maxas.pl --insert|-i [--noreuse|-n] <asm_file> <cubin_file> [new_cubin_file]

  Display version information and exit:

    maxas.pl --version|-v

EOF
    exit(1);
}

__END__
275 | 276 | maxas.pl --insert|-i [--noreuse|-n] [new_cubin_file] 277 | 278 | Display version information and exit: 279 | 280 | maxas.pl --version|-v 281 | 282 | EOF 283 | exit(1); 284 | } 285 | 286 | __END__ 287 | -------------------------------------------------------------------------------- /cpanfile: -------------------------------------------------------------------------------- 1 | requires 'perl', '5.10.0'; 2 | 3 | requires 'Carp', '1.29'; 4 | requires 'Data::Dumper', '2.145'; 5 | -------------------------------------------------------------------------------- /lib/MaxAs/Cubin.pm: -------------------------------------------------------------------------------- 1 | package MaxAs::Cubin; 2 | 3 | use strict; 4 | use Data::Dumper; 5 | 6 | my @Elf32_Hdr = qw( 7 | H8 magic 8 | C fileClass 9 | C encoding 10 | C fileVersion 11 | H18 padding 12 | S type 13 | S machine 14 | L version 15 | L entry 16 | L phOffset 17 | L shOffset 18 | L flags 19 | S ehSize 20 | S phEntSize 21 | S phNum 22 | S shEntSize 23 | S shNum 24 | S shStrIndx 25 | ); 26 | my @Elf64_Hdr = qw( 27 | H8 magic 28 | C fileClass 29 | C encoding 30 | C fileVersion 31 | H18 padding 32 | S type 33 | S machine 34 | L version 35 | Q entry 36 | Q phOffset 37 | Q shOffset 38 | L flags 39 | S ehSize 40 | S phEntSize 41 | S phNum 42 | S shEntSize 43 | S shNum 44 | S shStrIndx 45 | ); 46 | my @Elf32_PrgHdr = qw( 47 | L type 48 | L offset 49 | L vaddr 50 | L paddr 51 | L fileSize 52 | L memSize 53 | L flags 54 | L align 55 | ); 56 | my @Elf64_PrgHdr = qw( 57 | L type 58 | L flags 59 | Q offset 60 | Q vaddr 61 | Q paddr 62 | Q fileSize 63 | Q memSize 64 | Q align 65 | ); 66 | my @Elf32_SecHdr = qw( 67 | L name 68 | L type 69 | L flags 70 | L addr 71 | L offset 72 | L size 73 | L link 74 | L info 75 | L align 76 | L entSize 77 | ); 78 | my @Elf64_SecHdr = qw( 79 | L name 80 | L type 81 | Q flags 82 | Q addr 83 | Q offset 84 | Q size 85 | L link 86 | L info 87 | Q align 88 | Q entSize 89 | ); 90 | my @Elf32_SymEnt = 
qw( 91 | L name 92 | L value 93 | L size 94 | C info 95 | C other 96 | S shIndx 97 | ); 98 | my @Elf64_SymEnt = qw( 99 | L name 100 | C info 101 | C other 102 | S shIndx 103 | Q value 104 | Q size 105 | ); 106 | my @symBind = qw(LOCAL GLOBAL WEAK); 107 | 108 | # Split the Elf Header defs into template strings (T) and corresponding hash keys columns (C) 109 | my (@elfHdrT, @prgHdrT, @secHdrT, @symHdrT, @elfHdrC, @prgHdrC, @secHdrC, @symHdrC); 110 | 111 | $elfHdrT[1] = join '', grep { length($_) <= 3} @Elf32_Hdr; 112 | $prgHdrT[1] = join '', grep { length($_) <= 3} @Elf32_PrgHdr; 113 | $secHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SecHdr; 114 | $symHdrT[1] = join '', grep { length($_) <= 3} @Elf32_SymEnt; 115 | 116 | $elfHdrT[2] = join '', grep { length($_) <= 3} @Elf64_Hdr; 117 | $prgHdrT[2] = join '', grep { length($_) <= 3} @Elf64_PrgHdr; 118 | $secHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SecHdr; 119 | $symHdrT[2] = join '', grep { length($_) <= 3} @Elf64_SymEnt; 120 | 121 | $elfHdrC[1] = [ grep { length($_) > 3} @Elf32_Hdr ]; 122 | $prgHdrC[1] = [ grep { length($_) > 3} @Elf32_PrgHdr ]; 123 | $secHdrC[1] = [ grep { length($_) > 3} @Elf32_SecHdr ]; 124 | $symHdrC[1] = [ grep { length($_) > 3} @Elf32_SymEnt ]; 125 | 126 | $elfHdrC[2] = [ grep { length($_) > 3} @Elf64_Hdr ]; 127 | $prgHdrC[2] = [ grep { length($_) > 3} @Elf64_PrgHdr ]; 128 | $secHdrC[2] = [ grep { length($_) > 3} @Elf64_SecHdr ]; 129 | $symHdrC[2] = [ grep { length($_) > 3} @Elf64_SymEnt ]; 130 | 131 | # Load a cubin ELF file 132 | sub new 133 | { 134 | my ($package, $file) = @_; 135 | 136 | my $cubin = bless { fileName => $file }, $package; 137 | 138 | open my $fh, $file or die "$file: $!"; 139 | binmode($fh); 140 | 141 | # Read in assuming 32 bit header 142 | my $data; 143 | read $fh, $data, 0x34; 144 | my $elfHdr = $cubin->{elfHdr} = {}; 145 | @{$elfHdr}{@{$elfHdrC[1]}} = unpack $elfHdrT[1], $data; 146 | 147 | # 1: 32bit, 2: 64bit 148 | my $class = $elfHdr->{fileClass}; 149 | 150 
| # re-read in with 64 bit header if needed 151 | if ($class == 2) 152 | { 153 | seek $fh, 0, 0; 154 | read $fh, $data, 0x46; 155 | @{$elfHdr}{@{$elfHdrC[$class]}} = unpack $elfHdrT[$class], $data; 156 | 157 | $cubin->{Class} = 64; 158 | } 159 | else 160 | { 161 | $cubin->{Class} = 32; 162 | } 163 | 164 | # verify sm_50 cubin 165 | $cubin->{Arch} = $elfHdr->{flags} & 0xFF; 166 | die "Cubin not in sm_50 or greater format. Found: sm_$cubin->{Arch}\n" if $cubin->{Arch} < 50; 167 | 168 | $cubin->{AddressSize} = $elfHdr->{flags} & 0x400 ? 64 : 32; 169 | 170 | # Read in Program Headers 171 | seek $fh, $elfHdr->{phOffset}, 0; 172 | foreach (1 .. $elfHdr->{phNum}) 173 | { 174 | read $fh, $data, $elfHdr->{phEntSize}; 175 | 176 | my %prgHdr = (Indx => $_ - 1); 177 | @prgHdr{@{$prgHdrC[$class]}} = unpack $prgHdrT[$class], $data; 178 | push @{$cubin->{prgHdrs}}, \%prgHdr; 179 | } 180 | 181 | # Read in Section Headers 182 | seek $fh, $elfHdr->{shOffset}, 0; 183 | foreach (1 .. $elfHdr->{shNum}) 184 | { 185 | read $fh, $data, $elfHdr->{shEntSize}; 186 | 187 | my %secHdr = (Indx => $_ - 1); 188 | @secHdr{@{$secHdrC[$class]}} = unpack $secHdrT[$class], $data; 189 | push @{$cubin->{secHdrs}}, \%secHdr; 190 | } 191 | 192 | # Read in Section data 193 | foreach my $secHdr (@{$cubin->{secHdrs}}) 194 | { 195 | $data = ''; 196 | # Skip sections with no data (type NULL or NOBITS) 197 | if ($secHdr->{size} && $secHdr->{type} != 8) 198 | { 199 | seek $fh, $secHdr->{offset}, 0; 200 | read $fh, $data, $secHdr->{size}; 201 | } 202 | # Convert string tables to maps 203 | if ($secHdr->{type} == 3) # STRTAB 204 | { 205 | my $strTab = $secHdr->{StrTab} = {}; 206 | my $indx = 0; 207 | foreach my $str (split "\0", $data) 208 | { 209 | $strTab->{$indx} = $str; 210 | $indx += 1 + length($str); 211 | } 212 | } 213 | # Read in Symbol data 214 | if ($secHdr->{type} == 2) # SYMTAB 215 | { 216 | my $offset = 0; 217 | while ($offset < $secHdr->{size}) 218 | { 219 | my $symEnt = {}; 220 | 
@{$symEnt}{@{$symHdrC[$class]}} = unpack $symHdrT[$class], substr($data, $offset, $secHdr->{entSize}); 221 | $offset += $secHdr->{entSize}; 222 | 223 | push @{$secHdr->{SymTab}}, $symEnt; 224 | } 225 | } 226 | # Cache raw data for further processing and writing 227 | $secHdr->{Data} = unpack 'H*', $data; 228 | } 229 | close $fh; 230 | 231 | # Update section headers with their names. Map names directly to headers. 232 | my $shStrTab = $cubin->{secHdrs}[$elfHdr->{shStrIndx}]{StrTab}; 233 | foreach my $secHdr (@{$cubin->{secHdrs}}) 234 | { 235 | $secHdr->{Name} = $shStrTab->{$secHdr->{name}}; 236 | $cubin->{$secHdr->{Name}} = $secHdr; 237 | } 238 | 239 | # Update symbols with their names 240 | # For the Global functions, extract kernel meta data 241 | # Populate the kernel hash 242 | my $strTab = $cubin->{'.strtab'}{StrTab}; 243 | foreach my $symEnt (@{$cubin->{'.symtab'}{SymTab}}) 244 | { 245 | $symEnt->{Name} = $strTab->{$symEnt->{name}}; 246 | 247 | # Attach symbol to section 248 | my $secHdr = $cubin->{secHdrs}[$symEnt->{shIndx}]; 249 | $secHdr->{SymbolEnt} = $symEnt; 250 | 251 | # Look for symbols tagged FUNC 252 | if (($symEnt->{info} & 0x0f) == 0x02) 253 | { 254 | # Create a hash of kernels for output 255 | my $kernelSec = $cubin->{Kernels}{$symEnt->{Name}} = $secHdr; 256 | 257 | # Extract local/global/weak binding info 258 | $kernelSec->{Linkage} = $symBind[($symEnt->{info} & 0xf0) >> 4]; 259 | 260 | # Extract the kernel instructions 261 | $kernelSec->{KernelData} = [ unpack "Q*", pack "H*", $kernelSec->{Data} ]; 262 | 263 | # Extract the max barrier resource identifier used and add 1. Should be 0-16. 264 | # If a register is used as a barrier resource id, then this value is the max of 16. 265 | $kernelSec->{BarCnt} = ($kernelSec->{flags} & 0x01f00000) >> 20; 266 | 267 | # Extract the number of allocated registers for this kernel. 
268 | $kernelSec->{RegCnt} = ($kernelSec->{info} & 0xff000000) >> 24; 269 | 270 | # Extract the size of shared memory this kernel uses. 271 | my $sharedSec = $kernelSec->{SharedSec} = $cubin->{".nv.shared.$symEnt->{Name}"}; 272 | $kernelSec->{SharedSize} = $sharedSec ? $sharedSec->{size} : 0; 273 | 274 | # Attach constant0 section 275 | $kernelSec->{ConstantSec} = $cubin->{".nv.constant0.$symEnt->{Name}"}; 276 | 277 | # Extract the kernel parameter data. 278 | my $paramSec = $kernelSec->{ParamSec} = $cubin->{".nv.info.$symEnt->{Name}"}; 279 | if ($paramSec) 280 | { 281 | # Extract raw param data 282 | my @data = unpack "L*", pack "H*", $paramSec->{Data}; 283 | 284 | $paramSec->{ParamData} = \@data; 285 | $paramSec->{ParamHex} = [ map { sprintf '0x%08x', $_ } @data ]; 286 | 287 | # Find the first param delimiter 288 | my $idx = 0; 289 | $idx++ while $idx < @data && $data[$idx] != 0x00080a04; 290 | 291 | my $first = $data[$idx+2] & 0xFFFF; 292 | #my $size = $data[$idx+2] >> 16; 293 | $idx += 4; 294 | 295 | my @params; 296 | while ($idx < @data && $data[$idx] == 0x000c1704) 297 | { 298 | # Get the ordinal, offset, size and pointer alignment for each param 299 | my $ord = $data[$idx+2] & 0xFFFF; 300 | my $offset = sprintf '0x%02x', $first + ($data[$idx+2] >> 16); 301 | my $psize = $data[$idx+3] >> 18; 302 | my $align = $data[$idx+3] & 0x400 ? 1 << ($data[$idx+3] & 0x3ff) : 0; 303 | unshift @params, "$ord:$offset:$psize:$align"; 304 | $idx += 4; 305 | } 306 | my @staticParams = @data[0 .. 
($idx-1)]; 307 | 308 | my ($maxregCount, @exitOffsets, @ctaidOffsets, $ctaidzUsed, @reqntid, @maxntid, @stackSize); 309 | while ($idx < @data) 310 | { 311 | my $code = $data[$idx] & 0xffff; 312 | my $size = $data[$idx] >> 16; 313 | $idx++; 314 | 315 | # EIATTR_MAXREG_COUNT 316 | if ($code == 0x1b03) 317 | { 318 | $maxregCount = $size; 319 | } 320 | # EIATTR_S2RCTAID_INSTR_OFFSETS 321 | elsif ($code == 0x1d04) 322 | { 323 | while ($size > 0) 324 | { 325 | push @ctaidOffsets, $data[$idx++]; 326 | $size -= 4; 327 | } 328 | } 329 | # EIATTR_EXIT_INSTR_OFFSETS 330 | elsif ($code == 0x1c04) 331 | { 332 | while ($size > 0) 333 | { 334 | push @exitOffsets, $data[$idx++]; 335 | $size -= 4; 336 | } 337 | } 338 | # EIATTR_CTAIDZ_USED 339 | elsif ($code == 0x0401) 340 | { 341 | $ctaidzUsed = 1; 342 | } 343 | # EIATTR_REQNTID 344 | elsif ($code == 0x1004) 345 | { 346 | while ($size > 0) 347 | { 348 | push @reqntid, $data[$idx++]; 349 | $size -= 4; 350 | } 351 | } 352 | # EIATTR_MAX_THREADS 353 | elsif ($code == 0x0504) 354 | { 355 | while ($size > 0) 356 | { 357 | push @maxntid, $data[$idx++]; 358 | $size -= 4; 359 | } 360 | } 361 | # EIATTR_CRS_STACK_SIZE 362 | elsif ($code == 0x1e04) 363 | { 364 | while ($size > 0) 365 | { 366 | push @stackSize, $data[$idx++]; 367 | $size -= 4; 368 | } 369 | } 370 | else 371 | { 372 | printf STDERR "Unknown Code 0x%02x (size:%d)\n", $code, $size; 373 | } 374 | } 375 | $kernelSec->{Params} = \@params; 376 | $kernelSec->{ParamCnt} = scalar @params; 377 | 378 | $paramSec->{StaticParams} = \@staticParams; 379 | $paramSec->{MAXREG_COUNT} = $maxregCount; 380 | $paramSec->{ExitOffsets} = \@exitOffsets; 381 | $paramSec->{CTAIDOffsets} = \@ctaidOffsets; 382 | $paramSec->{CTAIDZUsed} = $ctaidzUsed; 383 | $paramSec->{REQNTID} = \@reqntid; 384 | $paramSec->{MAXNTID} = \@maxntid; 385 | $paramSec->{STACKSIZE} = \@stackSize; 386 | } 387 | # print Dumper($paramSec); 388 | # exit(); 389 | } 390 | # Note GLOBALs found in this cubin 391 | elsif 
(($symEnt->{info} & 0x10) == 0x10) 392 | { 393 | $cubin->{Symbols}{$symEnt->{Name}} = $symEnt; 394 | } 395 | } 396 | 397 | # print "phOffset: $elfHdr->{phOffset}\n"; 398 | # print "shOffset: $elfHdr->{shOffset}\n"; 399 | # foreach my $secHdr (@{$cubin->{secHdrs}}) 400 | # { 401 | # print "secHdr($secHdr->{Indx}): $secHdr->{offset}, $secHdr->{size}, $secHdr->{align} ($secHdr->{Name})\n"; 402 | # } 403 | # my $p = 0; 404 | # foreach my $prgHdr (@{$cubin->{prgHdrs}}) 405 | # { 406 | # print "prgHdr($p): type: $prgHdr->{type}, offset: $prgHdr->{offset}, fileSize: $prgHdr->{fileSize}, memSize: $prgHdr->{memSize}, align: $prgHdr->{align}\n"; 407 | # $p++; 408 | # } 409 | # exit(); 410 | 411 | # print Dumper($cubin->{prgHdrs}); 412 | # exit(); 413 | return $cubin; 414 | } 415 | sub class 416 | { 417 | return shift()->{Class}; 418 | } 419 | sub arch 420 | { 421 | return shift()->{Arch}; 422 | } 423 | sub address_size 424 | { 425 | return shift()->{AddressSize}; 426 | } 427 | sub listKernels 428 | { 429 | return shift()->{Kernels}; 430 | } 431 | sub listSymbols 432 | { 433 | return shift()->{Symbols}; 434 | } 435 | sub getKernel 436 | { 437 | my ($cubin, $kernel) = @_; 438 | return $cubin->{Kernels}{$kernel}; 439 | } 440 | 441 | sub modifyKernel 442 | { 443 | my ($cubin, %params) = @_; 444 | 445 | my $kernelSec = $params{Kernel}; 446 | my $newReg = $params{RegCnt}; 447 | my $newBar = $params{BarCnt}; 448 | my $exitOffsets = $params{ExitOffsets}; 449 | my $ctaidOffsets = $params{CTAIDOffsets}; 450 | my $ctaidzUsed = $params{CTAIDZUsed}; 451 | my $newData = $params{KernelData}; 452 | my $newSize = @$newData * 8; 453 | 454 | die "255 register max" if $newReg > 255; 455 | die "new kernel size must be multiple of 8 instructions (64 bytes)" if $newSize & 63; 456 | die "16 is max barrier count" if $newBar > 16; 457 | 458 | my $paramSec = $kernelSec->{ParamSec}; 459 | my $kernelName = $kernelSec->{SymbolEnt}{Name}; 460 | my $maxregCount = $paramSec->{MAXREG_COUNT}; 461 | my 
$stackSize = $paramSec->{STACKSIZE}; 462 | 463 | # update the kernel 464 | $kernelSec->{KernelData} = $newData; 465 | $kernelSec->{Data} = unpack "H*", pack "Q*", @$newData; 466 | 467 | if ($newReg != $kernelSec->{RegCnt}) 468 | { 469 | print "Modified $kernelName RegCnt: $kernelSec->{RegCnt} => $newReg\n"; 470 | $kernelSec->{RegCnt} = $newReg; 471 | $kernelSec->{info} &= ~0xff000000; 472 | $kernelSec->{info} |= $newReg << 24; 473 | } 474 | if ($newBar != $kernelSec->{BarCnt}) 475 | { 476 | print "Modified $kernelName BarCnt: $kernelSec->{BarCnt} => $newBar\n"; 477 | $kernelSec->{BarCnt} = $newBar; 478 | $kernelSec->{flags} &= ~0x01f00000; 479 | $kernelSec->{flags} |= $newBar << 20; 480 | } 481 | 482 | my @paramData = @{$paramSec->{StaticParams}}; 483 | 484 | if (defined $maxregCount) 485 | { 486 | push @paramData, ($maxregCount << 16) | 0x1b03; 487 | } 488 | 489 | my $newCTAIDs = join ',', map { sprintf '%04x', $_ } @$ctaidOffsets; 490 | my $oldCTAIDs = join ',', map { sprintf '%04x', $_ } @{$paramSec->{CTAIDOffsets}}; 491 | 492 | if ($newCTAIDs ne $oldCTAIDs) 493 | { 494 | print "Modified $kernelName CTAID Offsets: '$oldCTAIDs' => '$newCTAIDs'\n"; 495 | } 496 | if (@$ctaidOffsets) 497 | { 498 | push @paramData, (scalar(@$ctaidOffsets) << 18) | 0x1d04; 499 | push @paramData, @$ctaidOffsets; 500 | } 501 | 502 | my $newExits = join ',', map { sprintf '%04x', $_ } @$exitOffsets; 503 | my $oldExits = join ',', map { sprintf '%04x', $_ } @{$paramSec->{ExitOffsets}}; 504 | 505 | if ($newExits ne $oldExits) 506 | { 507 | print "Modified $kernelName Exit Offsets: '$oldExits' => '$newExits'\n"; 508 | } 509 | if (@$exitOffsets) 510 | { 511 | push @paramData, (scalar(@$exitOffsets) << 18) | 0x1c04; 512 | push @paramData, @$exitOffsets; 513 | } 514 | 515 | if ($ctaidzUsed != $paramSec->{CTAIDZUsed}) 516 | { 517 | print "Modified $kernelName CTAID.Z Used: '$paramSec->{CTAIDZUsed}' => '$ctaidzUsed'\n"; 518 | } 519 | if ($ctaidzUsed) 520 | { 521 | push @paramData, 0x0401; 522 | 
} 523 | 524 | if (@{$paramSec->{REQNTID}}) 525 | { 526 | push @paramData, (scalar(@{$paramSec->{REQNTID}}) << 18) | 0x1004; 527 | push @paramData, @{$paramSec->{REQNTID}}; 528 | } 529 | if (@{$paramSec->{MAXNTID}}) 530 | { 531 | push @paramData, (scalar(@{$paramSec->{MAXNTID}}) << 18) | 0x0504; 532 | push @paramData, @{$paramSec->{MAXNTID}}; 533 | } 534 | 535 | if (@$stackSize) 536 | { 537 | push @paramData, (scalar(@$stackSize) << 18) | 0x1e04; 538 | push @paramData, @$stackSize; 539 | } 540 | 541 | my $newParamSize = scalar(@paramData)*4; 542 | $paramSec->{Data} = unpack "H*", pack "L*", @paramData; 543 | if ($newParamSize != $paramSec->{size}) 544 | { 545 | print "Modified $kernelName ParamSecSize: $paramSec->{size} => $newParamSize\n"; 546 | $cubin->updateSize($paramSec, $newParamSize); 547 | } 548 | 549 | if ($newSize != $kernelSec->{size}) 550 | { 551 | print "Modified $kernelName KernelSize: $kernelSec->{size} => $newSize\n"; 552 | $cubin->updateSize($kernelSec, $newSize, 1); 553 | } 554 | } 555 | 556 | sub updateSize 557 | { 558 | my ($cubin, $sec, $newSize, $updatePrgSize) = @_; 559 | 560 | my $elfHdr = $cubin->{elfHdr}; 561 | my $class = $elfHdr->{fileClass}; 562 | 563 | # update section header 564 | my $delta = $newSize - $sec->{size}; 565 | $sec->{size} = $newSize; 566 | 567 | # update symtab section 568 | if ($sec->{SymbolEnt}) 569 | { 570 | $sec->{SymbolEnt}{size} = $newSize; 571 | my $symSection = $cubin->{'.symtab'}; 572 | $symSection->{Data} = ''; 573 | foreach my $symEnt (@{$symSection->{SymTab}}) 574 | { 575 | $symSection->{Data} .= unpack "H*", pack $symHdrT[$class], @{$symEnt}{@{$symHdrC[$class]}}; 576 | } 577 | } 578 | 579 | my $pos = $elfHdr->{ehSize}; 580 | my %sizeMap; 581 | 582 | # update section header offsets 583 | foreach my $secHdr (@{$cubin->{secHdrs}}) 584 | { 585 | # skip first header 586 | next if $secHdr->{align} == 0; 587 | 588 | # NOBITS data sections are size 0 589 | my $size = $secHdr->{type} == 8 ? 
0 : $secHdr->{size}; 590 | 591 | # Add any needed padding between sections 592 | my $pad = $pos % $secHdr->{align}; 593 | if ($pad > 0) 594 | { 595 | $pos += $secHdr->{align} - $pad; 596 | } 597 | # map old offset to new 598 | $sizeMap{$secHdr->{offset}} = $pos; 599 | 600 | # update offset 601 | $secHdr->{offset} = $pos; 602 | 603 | # advance position by size 604 | $pos += $size; 605 | } 606 | 607 | # compute total section header size 608 | my $shSize = $elfHdr->{phOffset} - $elfHdr->{shOffset}; 609 | 610 | # map old offset to new 611 | $sizeMap{$elfHdr->{shOffset}} = $pos; 612 | $sizeMap{$elfHdr->{phOffset}} = $pos + $shSize; 613 | 614 | $elfHdr->{shOffset} = $pos; 615 | $elfHdr->{phOffset} = $pos + $shSize; 616 | 617 | # update program header offsets and sizes 618 | foreach my $prgHdr (@{$cubin->{prgHdrs}}) 619 | { 620 | # Not sure how best to adjust these so just assume they'll track other offsets. 621 | $prgHdr->{offset} = $sizeMap{$prgHdr->{offset}}; 622 | 623 | # If the kernel sizes changes, also update the associated ProgramHeader. 624 | # Note that this size is the kernel size plus any constant section sizes. 625 | if ($updatePrgSize && $prgHdr->{type} == 1 && 626 | $sec->{offset} >= $prgHdr->{offset} && 627 | $sec->{offset} < $prgHdr->{offset} + $prgHdr->{fileSize} + $delta) 628 | { 629 | $prgHdr->{fileSize} += $delta; 630 | $prgHdr->{memSize} += $delta; 631 | } 632 | } 633 | } 634 | 635 | # Write out the cubin after modifying it. 
# Serialize the cubin object to $file as a binary image, in this order:
# ELF header, section data (zero-padded up to each section's alignment),
# section headers, program headers.
#
# Arguments:
#   $cubin - cubin object (hash ref) with elfHdr, secHdrs and prgHdrs
#   $file  - path of the file to create or overwrite
#
# Dies if the file cannot be opened or if closing it reports a write error.
sub write
{
    my ($cubin, $file) = @_;

    # Three-argument open: nothing in $file can be interpreted as part of the mode.
    open my $fh, '>', $file or die "Error: could not open $file for writing: $!";
    binmode($fh);

    my $elfHdr = $cubin->{elfHdr};
    my $class  = $elfHdr->{fileClass};

    # write elf header
    print $fh pack $elfHdrT[$class], @{$elfHdr}{@{$elfHdrC[$class]}};
    my $pos = $elfHdr->{ehSize};

    # write section data
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        # Skip NULL and NOBITS data sections
        next if $secHdr->{size} == 0 || $secHdr->{type} == 8;

        # Add any needed padding between sections
        my $pad = $pos % $secHdr->{align};
        if ($pad > 0)
        {
            $pad = $secHdr->{align} - $pad;
            print $fh "\0" x $pad;
            $pos += $pad;
        }

        # Section contents are kept as a hex string; convert back to raw bytes.
        print $fh pack 'H*', $secHdr->{Data};
        $pos += $secHdr->{size};
    }

    # write section headers
    foreach my $secHdr (@{$cubin->{secHdrs}})
    {
        print $fh pack $secHdrT[$class], @{$secHdr}{@{$secHdrC[$class]}};
    }

    # write program headers
    foreach my $prgHdr (@{$cubin->{prgHdrs}})
    {
        print $fh pack $prgHdrT[$class], @{$prgHdr}{@{$prgHdrC[$class]}};
    }

    # Output is buffered, so a failure to flush only shows up here; a silent
    # failure would leave a truncated cubin on disk.
    close $fh or die "Error: could not finish writing $file: $!";
}

__END__

--------------------------------------------------------------------------------
/microbench/microbench.cpp:
--------------------------------------------------------------------------------
// microbench.cpp : Defines the entry point for the console application.
2 | // 3 | 4 | // nvcc -l cuda -o microbench microbench.cpp 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | CUcontext hContext = 0; 13 | 14 | #define CUDA_CHECK( fn ) do { \ 15 | CUresult status = (fn); \ 16 | if ( CUDA_SUCCESS != status ) { \ 17 | const char* errstr; \ 18 | cuGetErrorString(status, &errstr); \ 19 | printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ 20 | if (hContext) cuCtxDestroy(hContext); \ 21 | exit(EXIT_FAILURE); \ 22 | } \ 23 | } while (0) 24 | 25 | 26 | int main(int argc, char* argv[]) 27 | { 28 | //int iTest = 2896; 29 | //while (iTest < 0x7fff) 30 | //{ 31 | // int iResult = iTest * iTest; 32 | // float fTest = (float)iTest; 33 | // int fResult = (int)(fTest * fTest); 34 | 35 | // printf("i*i:%08x f*f:%08x\n", iResult, fResult); 36 | 37 | // iTest += 0x0800; 38 | //} 39 | //exit(0); 40 | 41 | char deviceName[32]; 42 | int devCount, ordinal, major, minor; 43 | CUdevice hDevice; 44 | 45 | // Initialize the Driver API and find a device 46 | CUDA_CHECK( cuInit(0) ); 47 | CUDA_CHECK( cuDeviceGetCount(&devCount) ); 48 | for (ordinal = 0; ordinal < devCount; ordinal++) 49 | { 50 | CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); 51 | CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); 52 | CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); 53 | CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); 54 | if (major >= 5 && minor >= 2) 55 | { 56 | printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); 57 | break; 58 | } 59 | } 60 | if (ordinal == devCount) 61 | { 62 | printf("No compute 5.0 device found, exiting.\n"); 63 | exit(EXIT_FAILURE); 64 | } 65 | 66 | // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing 67 | int internalTiming = 1; 68 | if (argc > 1) 69 | internalTiming 
= strcmp(argv[1], "i") == 0 ? 1 : 0; 70 | 71 | // Second command line arg is the number of blocks 72 | int blocks = 1; 73 | if (argc > 2) 74 | blocks = atoi(argv[2]); 75 | if (blocks < 1) 76 | blocks = 1; 77 | 78 | // Third command line arg is the number of threads 79 | int threads = 128; 80 | if (argc > 3) 81 | threads = atoi(argv[3]); 82 | if (threads > 1024 || threads < 32) 83 | threads = 128; 84 | threads &= -32; 85 | 86 | // Forth command line arg: 87 | double fops = 1.0; 88 | int lanes = 1; 89 | if (argc > 4) 90 | { 91 | if (internalTiming) 92 | { 93 | // The number of lanes to print for each warp 94 | lanes = atoi(argv[4]); 95 | if (lanes > 32 || lanes < 1) 96 | lanes = 1; 97 | } 98 | else 99 | // The number of floating point operations in a full kernel launch 100 | fops = atof(argv[4]); 101 | } 102 | 103 | // Fifth command line arg is the repeat count for benchmarking 104 | int repeat = 1; 105 | if (argc > 5) 106 | repeat = atoi(argv[5]); 107 | if (repeat > 1000 || repeat < 1) 108 | repeat = 1; 109 | 110 | // threads = total number of threads 111 | size_t size = sizeof(int) * threads * blocks; 112 | 113 | // Setup our input and output buffers 114 | int* dataIn = (int*)malloc(size); 115 | int* dataOut = (int*)malloc(size); 116 | int* clocks = (int*)malloc(size); 117 | memset(dataIn, 0, size); 118 | 119 | CUmodule hModule; 120 | CUfunction hKernel; 121 | CUevent hStart, hStop; 122 | CUdeviceptr devIn, devOut, devClocks; 123 | 124 | // Init our context and device memory buffers 125 | CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); 126 | CUDA_CHECK( cuMemAlloc(&devIn, size) ); 127 | CUDA_CHECK( cuMemAlloc(&devOut, size) ); 128 | CUDA_CHECK( cuMemAlloc(&devClocks, size) ); 129 | CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); 130 | CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); 131 | CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); 132 | 133 | CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); 134 | CUDA_CHECK( cuEventCreate(&hStop, 
CU_EVENT_BLOCKING_SYNC) ); 135 | 136 | // Load our kernel 137 | CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); 138 | CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); 139 | 140 | // Setup the params 141 | void* params[] = { &devOut, &devClocks, &devIn }; 142 | float ms = 0; 143 | 144 | // Warm up the clock (unless under nsight) 145 | if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER 146 | for (int i = 0; i < repeat; i++) 147 | CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); 148 | 149 | // Launch the kernel 150 | CUDA_CHECK( cuEventRecord(hStart, NULL) ); 151 | //CUDA_CHECK( cuProfilerStart() ); 152 | CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); 153 | //CUDA_CHECK( cuProfilerStop() ); 154 | CUDA_CHECK( cuEventRecord(hStop, NULL) ); 155 | CUDA_CHECK( cuEventSynchronize(hStop) ); 156 | CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); 157 | 158 | //CUDA_CHECK( cuCtxSynchronize() ); 159 | 160 | // Get back our results from each kernel 161 | CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); 162 | CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); 163 | 164 | // Cleanup and shutdown of cuda 165 | CUDA_CHECK( cuEventDestroy(hStart) ); 166 | CUDA_CHECK( cuEventDestroy(hStop) ); 167 | CUDA_CHECK( cuModuleUnload(hModule) ); 168 | CUDA_CHECK( cuMemFree(devIn) ); 169 | CUDA_CHECK( cuMemFree(devOut) ); 170 | CUDA_CHECK( cuMemFree(devClocks) ); 171 | CUDA_CHECK( cuCtxDestroy(hContext) ); 172 | hContext = 0; 173 | 174 | // When using just one block, print out the internal timing data 175 | if (internalTiming) 176 | { 177 | int count = 0, total = 0, min = 999999, max = 0; 178 | 179 | int* clocks_p = clocks; 180 | int* dataOut_p = dataOut; 181 | 182 | // Loop over and print results 183 | for (int blk = 0; blk < blocks; blk++) 184 | { 185 | float *fDataOut = reinterpret_cast(dataOut_p); 186 | 187 | for(int tid = 0; tid < threads; tid += 32) 188 
| { 189 | // Sometimes we want data on each thread, sometimes just one sample per warp is fine 190 | for (int lane = 0; lane < lanes; lane++) 191 | printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u 192 | 193 | count++; 194 | total += clocks_p[tid]; 195 | if (clocks_p[tid] < min) min = clocks_p[tid]; 196 | if (clocks_p[tid] > max) max = clocks_p[tid]; 197 | } 198 | clocks_p += threads; 199 | dataOut_p += threads; 200 | } 201 | printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); 202 | } 203 | else 204 | { 205 | // For more than one block we're testing throughput and want external timing data 206 | printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); 207 | } 208 | // And free up host memory 209 | free(dataIn); free(dataOut); free(clocks); 210 | 211 | return 0; 212 | } 213 | -------------------------------------------------------------------------------- /microbench/microbench.cu: -------------------------------------------------------------------------------- 1 | 2 | // Note this file isn't configured to automatically compile 3 | 4 | #include 5 | #include 6 | 7 | // Build: 8 | // nvcc -l cuda -o microbench microbench.cpp 9 | // nvcc -arch sm_50 -cubin microbench.cu 10 | 11 | // Inspect a cubin (use nvdisasm from cuda 6.5 for best results): 12 | // maxas.pl -e microbench.cubin 13 | 14 | // Insert new sass into cubin 15 | // maxas.pl -i microbench.sass microbench.cubin 16 | 17 | // run it: 18 | // ./microbench 19 | 20 | // Use extern C so C++ doesn't mangle our kernel name 21 | extern "C" __global__ void microbench(int *out, int *clocks, int *in) 22 | { 23 | __shared__ int share[1024]; 24 | 25 | int tid = threadIdx.x; 26 | int bx = blockIdx.x; 27 | int by = blockIdx.y; 28 | 29 | int start = clock(); 30 | 31 | share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ 32 | 33 | __syncthreads(); 34 | 35 | 
int end = clock(); 36 | 37 | clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start; 38 | 39 | out[tid] = share[tid ^ 1]; 40 | } 41 | 42 | // A note about using the Cuda Runtime. 43 | // If that's your preference over the driver API then here's what you'd do: 44 | 45 | // In your project properties in the Cuda C/C++ panel: 46 | // -Set the "Keep Processed Files" (-keep) option 47 | // -Add a -v manually to the command line 48 | // If compiling on command line just add -keep -v options to nvcc. 49 | // Rebuild your solution and look in the log for these lines that follow the ptxas step: 50 | 51 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda 52 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" 53 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" 54 | 55 | // You just need to manually run these 3 commands (or add them to a build script) 56 | // after you've modified the cubin generated from the preceeding ptxas command. 57 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you 58 | // build your project (or you could manually run the linker step as well). 59 | 60 | // Having done that you can call your kernel normally using the <<< >>> syntax. 61 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway. 62 | // With fatbin you can also keep non-maxwell optimized versions of your code. 
63 | 64 | 65 | // I just discovered this also works as a shortcut to the above: 66 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu 67 | 68 | // The cu kernel definitions above need to have empty bodies. 69 | // And, the cu file must be compiled to a lib separately before linking. -------------------------------------------------------------------------------- /microbench/microbench.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | 3 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X 4 | 5 | 6 | blockDimX : c[0x0][0x08] 7 | blockDimY : c[0x0][0x0c] 8 | blockDimZ : c[0x0][0x10] 9 | gridDimX : c[0x0][0x14] 10 | gridDimY : c[0x0][0x18] 11 | gridDimZ : c[0x0][0x1c] 12 | 13 | param_out[0] : c[0x0][0x140] 14 | param_out[1] : c[0x0][0x144] 15 | param_clocks[0] : c[0x0][0x148] 16 | param_clocks[1] : c[0x0][0x14c] 17 | param_in[0] : c[0x0][0x150] 18 | param_in[1] : c[0x0][0x154] 19 | 20 | 21 | 22 | 23 | 0-1 : out<0-1> 24 | 2-3 : clocks<0-1> 25 | 4-5 : in<0-1> 26 | 6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x 27 | 28 | 29 | 30 | // Load in our params (not currently used below) 31 | --:-:-:-:1 MOV in0, param_in[0]; 32 | --:-:-:-:1 MOV in1, param_in[1]; 33 | 34 | // Get the first clock value 35 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 36 | 37 | // Get the threadId and blockId 38 | // Set the Read-After-Write dependency barrier 1 and 2 39 | --:-:1:-:1 S2R tid, SR_TID.X; 40 | // Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it 41 | --:-:2:-:2 S2R bid, SR_CTAID.X; 42 | 43 | 44 | // Get the second clock value 45 | // Wait on the dependency barriers that were set in the prior instruction 46 | // Stall 6 to allow CS2R time to complete before next instruction 47 | // CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks 48 | //
This stall count does not factor into the time calculation at all 49 | 03:-:-:-:6 CS2R clock2, SR_CLOCKLO; 50 | 51 | // Take the difference of clocks 52 | --:-:-:-:1 IADD clock1, clock2, -clock1; 53 | 54 | // Setup our output addresses 55 | // Stall your pipeline dependencies properly 56 | // Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code 57 | --:-:-:-:6 XMAD offset, bid, blockDimX, tid; 58 | 59 | // LEA is "load effective address" 60 | // The offset param is shifted left 2 and added to the pointers with 64bit math 61 | --:-:-:-:6 LEA clocks0.CC, offset, param_clocks[0], 2; 62 | --:-:-:-:1 LEA.HI.X clocks1, offset, param_clocks[1], RZ, 2; 63 | 64 | --:-:-:-:6 LEA out0.CC, offset, param_out[0], 2; 65 | --:-:-:-:1 LEA.HI.X out1, offset, param_out[1], RZ, 2; 66 | 67 | // Output the results. 68 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values 69 | --:-:-:-:1 STG.E [clocks], clock1; 70 | --:-:-:-:1 STG.E [out], offset; # use this to return whatever you like to inspect the results 71 | --:-:-:-:5 EXIT; 72 | 73 | -------------------------------------------------------------------------------- /microbench/shared.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | print `maxas.pl -i shared_sts16.sass microbench.cubin`; 5 | 6 | exit if $?; 7 | 8 | print `Release\\microbench.exe i 1 64`; 9 | 10 | 11 | __END__ 12 | 13 | -------------------------------------------------------------------------------- /microbench/shared_lds.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | // This is a simple micro bench to demonstrate the latency in 
loading SR_TID.X 13 | 14 | 15 | 16 | 0-3 : result, a, b, c 17 | 18 | 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20> 19 | 20 | 21 | 22 | // Load in our params 23 | --:-:1:-:1 S2R tid, SR_TID.X; 24 | --:-:2:-:1 S2R bid, SR_CTAID.X; 25 | 26 | --:-:-:-:1 MOV result, c[0x0][0x0]; 27 | --:-:-:-:1 MOV in, c[0x0][0x100]; 28 | 29 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 30 | --:-:-:-:1 MOV result, c[0x0][0x13c]; 31 | --:-:-:-:1 CS2R clock2, SR_CLOCKLO; 32 | 33 | --:-:-:-:1 MOV blockDim, c[0x0][0x8]; 34 | --:-:-:-:1 MOV out, c[0x0][0x140]; 35 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 36 | 37 | 38 | 39 | 40 | 41 | 42 | 03:-:-:-:1 LOP.AND tid3, tid, 3; 43 | --:-:-:-:1 LOP.AND tid7, tid, 7; 44 | --:-:-:-:1 LOP.AND tid96, tid, 96; 45 | --:-:-:-:1 LOP.AND tid128, tid, 128; 46 | 47 | // readAs = ((tid128 >> 4) | tid7) << 4 48 | --:-:-:-:1 SHR.U32 readAs, tid128, 4; 49 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 50 | --:-:-:-:1 SHL readAs, readAs, 4; 51 | 52 | // readBs = ((tid96 >> 3) | tid3) << 4 53 | --:-:-:-:1 SHR.U32 readBs, tid96, 3; 54 | --:-:-:-:1 LOP.OR readBs, readBs, tid3; 55 | #--:-:-:-:1 SHL readBs, readBs, 4; 56 | #--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 57 | 58 | 59 | 60 | 61 | 62 | 63 | #--:-:-:-:1 LDS.U.128 result, [readBs]; 64 | 65 | 66 | 67 | 68 | 01:-:-:-:1 IADD clock1, clock2, -clock1; 69 | 70 | 71 | --:-:-:-:1 XMAD tid, blockDim, bid, tid; 72 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 73 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 74 | --:-:-:Y:6 SHL tid, tid, 0x2; 75 | 76 | --:-:-:-:1 IADD clocks, clocks, tid; 77 | --:-:-:-:2 IADD out, out, tid; 78 | 79 | --:-:-:-:1 STG [clocks], clock1; 80 | --:-:-:-:1 STG [out], readBs; 81 | --:-:-:-:5 EXIT; 82 | 83 | 84 | 85 | --:-:-:-:4 LOP.AND tid32, tid, -32; 86 | 87 | --:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; 88 | 89 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 90 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 91 | 
--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 92 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 93 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 94 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 95 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 96 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 97 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 98 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 99 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 100 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 101 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 102 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 103 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 104 | --:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 105 | 106 | 107 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; 108 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; 109 | --:-:-:-:1 LOP.AND readAs, tid, 0x80; 110 | --:-:-:-:1 SHR.U32 readAs, readAs, 4; 111 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 112 | --:-:-:-:1 SHL readAs, readAs, 4; 113 | 114 | // readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; 115 | --:-:-:-:1 LOP.AND tid1, tid, 0x1; 116 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 117 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 118 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 119 | --:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 120 | 121 | 122 | -------------------------------------------------------------------------------- /microbench/shared_sts16.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X 13 | 14 | 15 | 16 | 0-3 : result, a, b, c 17 | 18 | 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, 
val<0-20> 19 | 20 | 21 | 22 | // Load in our params 23 | --:-:1:-:1 S2R tid, SR_TID.X; 24 | --:-:2:-:1 S2R bid, SR_CTAID.X; 25 | 26 | //--:-:-:-:1 MOV result, c[0x0][0x0]; 27 | //--:-:-:-:1 MOV in, c[0x0][0x100]; 28 | --:-:-:-:1 MOV result, 1; 29 | 30 | --:-:-:-:1 MOV blockDim, c[0x0][0x8]; 31 | --:-:-:-:1 MOV out, c[0x0][0x140]; 32 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 33 | 34 | 35 | // readAs = ((tid >> 1) & 7) << 4; 36 | 03:-:-:-:6 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 37 | --:-:-:-:6 SHL readAs, readAs, 3; 38 | 39 | // readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024; 40 | --:-:-:-:6 LOP.AND tid1, tid, 1; 41 | --:-:-:-:6 LOP.AND readBs, tid, 0x30; 42 | --:-:-:-:6 SHR.U32 readBs, readBs, 3; 43 | --:-:-:-:6 LOP.OR readBs, readBs, tid1; 44 | --:-:-:-:6 ISCADD readBs, readBs, 0, 3; 45 | 46 | 47 | 48 | ///--:-:-:-:1 STS [tid32], result; 49 | //--:-:-:-:1 STS.S16 [tid32 + 2x<32>], result; 50 | //--:-:1:-:2 LDS.U.64 result, [readBs]; 51 | 52 | --:-:-:-:0 CS2R clock1, SR_CLOCKLO; 53 | --:-:1:-:6 LDS.U.64 result, [readAs]; 54 | --:-:-:-:6 CS2R clock2, SR_CLOCKLO; 55 | 56 | 57 | 01:-:-:-:1 IADD clock1, clock2, -clock1; 58 | 59 | 60 | --:-:-:-:1 XMAD tid, blockDim, bid, tid; 61 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 62 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 63 | --:-:-:Y:6 SHL tid, tid, 0x2; 64 | 65 | --:-:-:-:1 IADD clocks, clocks, tid; 66 | --:-:-:-:2 IADD out, out, tid; 67 | 68 | --:-:-:-:1 STG [clocks], clock1; 69 | --:-:-:-:1 STG [out], result; 70 | --:-:-:-:5 EXIT; 71 | 72 | 73 | 74 | --:-:-:-:4 LOP.AND tid32, tid, -32; 75 | 76 | --:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; 77 | 78 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 79 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 80 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 81 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 82 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 83 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 84 | --:-:-:-:1 LDS.U.128 
result, [tid32 + 4x<2048>]; 85 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 86 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 87 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 88 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 89 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 90 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 91 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 92 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 93 | --:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 94 | 95 | 03:-:-:-:6 LOP.AND tid31, tid, 31; 96 | --:-:-:-:6 LOP.AND tid32, tid, 32; 97 | --:-:-:-:6 SHL tid32, tid32, 0x2; 98 | --:-:-:-:6 LOP.OR tid32, tid32, tid31; 99 | --:-:-:-:6 SHL tid32, tid32, 0x2; 100 | 101 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; 102 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; 103 | --:-:-:-:1 LOP.AND readAs, tid, 0x80; 104 | --:-:-:-:1 SHR.U32 readAs, readAs, 4; 105 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 106 | --:-:-:-:1 SHL readAs, readAs, 4; 107 | 108 | // readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; 109 | --:-:-:-:1 LOP.AND tid1, tid, 0x1; 110 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 111 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 112 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 113 | --:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 114 | 115 | 116 | -------------------------------------------------------------------------------- /microbench/throughput.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $loopSize = 512; 5 | my $blocks = 32; 6 | my $loops = 10240000; 7 | my $fileName = 'throughput2.sass'; 8 | 9 | writeSassFile($fileName, $loops); 10 | 11 | #print `maxas.pl -p $fileName`; 12 | #exit; 13 | 14 | print `maxas.pl -i $fileName microbench.cubin`; 15 | exit if $?; 16 | 17 | foreach my $thread128 (2) 18 | { 19 | my $threads = $thread128 * 128; 20 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 21 | 22 | my $data = 
`Release\\microbench.exe e $blocks $threads $fops`; 23 | 24 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 25 | 26 | printf "%d %d %d\n", $thread128, $threads, $gflops; 27 | } 28 | 29 | exit; 30 | 31 | sub writeSassFile 32 | { 33 | my ($filename, $loops) = @_; 34 | 35 | open my $fh, ">$filename" or die "$filename: $!"; 36 | 37 | printf $fh <<'EOF', $loops; 38 | # Kernel: microbench 39 | 40 | 41 | 42 | 0-10 : result, r1, r2, r3 43 | 20-27 ~ count, stop 44 | 45 | 46 | 47 | --:-:-:-:1 MOV count, RZ; 48 | --:-:-:-:1 MOV32I stop, %d; 49 | --:-:-:-:1 MOV32I r1, 1.0; 50 | --:-:-:-:1 MOV32I r2, 1.0; 51 | --:-:-:-:4 MOV32I r3, 1.0; 52 | 53 | LOOP: 54 | 55 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 56 | --:-:-:-:1 IADD count, count, 1; 57 | 58 | 59 | my $out; 60 | 61 | foreach my $i (0 .. 511) 62 | { 63 | my $yield = ($i + 32) & 63 ? '-' : 'Y'; 64 | 65 | my $stall = $i == 511 ? 0 : 1; 66 | 67 | $out .= "--:-:-:$yield:$stall FFMA result, r1, r2, r3;\n"; 68 | } 69 | return $out; 70 | 71 | 72 | --:-:-:Y:5 @P0 BRA LOOP; 73 | --:-:-:-:5 EXIT; 74 | EOF 75 | 76 | close $fh; 77 | } 78 | 79 | __END__ 80 | 81 | -------------------------------------------------------------------------------- /microbench/throughput.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | 13 | 14 | 8-20 : count 15 | 16 | 17 | 18 | --:-:-:-:1 MOV R0, RZ; 19 | --:-:-:-:1 MOV R1, RZ; 20 | --:-:-:-:1 MOV R2, RZ; 21 | --:-:-:-:1 MOV R3, RZ; 22 | --:-:-:-:1 MOV R4, RZ; 23 | --:-:-:-:1 MOV R5, RZ; 24 | --:-:-:-:1 MOV R6, RZ; 25 | --:-:-:-:1 MOV R7, RZ; 26 | --:-:-:-:1 MOV R8, RZ; 27 | --:-:-:Y:6 MOV count, RZ; 28 | 29 | // This loop is capable of running at 1700 GFlops on GM107. 
30 | // You can tweak it to see how register bank conflicts or different control codes 31 | // affect performance. 32 | // With throughput.pl you can pass params to this code and do some autotuning. 33 | LOOP: 34 | 35 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, 0x19000, PT; 36 | --:-:-:-:1 IADD count, count, 0x1; 37 | 38 | 39 | my $out; 40 | 41 | foreach my $i (0..511) #511 42 | { 43 | my $y = ($i + 32) & 63 ? '-' : 'Y'; 44 | 45 | $out .= qq| 46 | --:-:-:$y:1 FFMA R0, R1, R2, R3;|; #c[0x0][$c] 47 | } 48 | return $out; 49 | 50 | 51 | --:-:-:Y:5 @P0 BRA LOOP; 52 | 53 | --:-:-:-:5 EXIT; 54 | 55 | 56 | 57 | 58 | open my $fh, 'params.txt'; 59 | my $line = <$fh>; 60 | close $fh; 61 | my ($r1, $r2, $r3) = split "\t", $line; 62 | 63 | 80-95 : out, clocks, in, tid, clock1, clock2, result 64 | 65 | 66 | --:-:1:-:1 S2R tid, SR_TID.X; 67 | --:-:-:-:1 MOV out, c[0x0][0x140]; 68 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 69 | 01:-:-:-:1 MOV in, c[0x0][0x148]; 70 | 71 | 72 | 73 | --:-:-:-:1 MOV32I f0, 0x3f800000; 74 | --:-:-:-:1 MOV32I f1, 0x3f800000; 75 | --:-:-:-:1 MOV32I f2, 0x3f800000; 76 | --:-:-:-:5 MOV32I f3, 0x3f800000; 77 | 78 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 79 | 80 | 81 | --:-:-:-:1 CS2R clock2, SR_CLOCKLO; 82 | 83 | --:-:-:-:6 MOV32I result, 0x457; 84 | --:-:-:-:1 IADD clock1, clock2, -clock1; 85 | 86 | 87 | --:-:-:-:6 SHL tid, tid, 0x2; 88 | --:-:-:-:1 IADD clocks, clocks, tid; 89 | --:-:-:-:1 IADD out, out, tid; 90 | 91 | --:-:-:-:1 STG [clocks], clock1; 92 | --:-:-:-:1 STG [out], R24; 93 | 94 | 95 | -------------------------------------------------------------------------------- /microbench/throughput2.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | my %p; 4 | 5 | $p{N} = 8192; 6 | $p{blocking} = 8; 7 | $p{unroll} = 8; 8 | $p{threads} = 64; #256 9 | 10 | $p{csize} = $p{blocking} * $p{blocking}; 11 | $p{loopSize} = $p{unroll} * $p{csize}; 12 | $p{width} = sqrt($p{csize} * $p{threads}); 13 |
$p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); 14 | $p{loops} = $p{N} / $p{unroll}; 15 | $p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; 16 | 17 | my $fileName = 'throughput2.sass'; 18 | 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); 20 | 21 | #print join("\t", @params), "\n"; 22 | #print join("\t", @p{@params}), "\n"; 23 | 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; 25 | 26 | writeSassFile($fileName, $p{loopSize}, $p{loops}); 27 | 28 | #print `maxas.pl -p $fileName`; 29 | #exit; 30 | 31 | print `maxas.pl -i $fileName microbench.cubin`; 32 | 33 | exit if $?; 34 | 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; 36 | 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 38 | 39 | print $data; 40 | 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 42 | 43 | 44 | 45 | 46 | sub writeSassFile 47 | { 48 | my ($filename, $loopSize, $loops) = @_; 49 | 50 | open my $fh, ">$filename" or die "$filename: $!"; 51 | 52 | printf $fh <<'END_SASS', $loops; 53 | # Kernel: microbench 54 | 55 | 56 | 57 | 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> 58 | 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> 59 | 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> 60 | 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> 61 | 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> 62 | 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> 63 | 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> 64 | 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> 65 | 66 | 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> 67 | 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> 68 | 69 | 0-127 : r<0-127> 70 | 71 | 100-101 : count, stop 72 | 73 | //102-112 ~ readAs, readBs, writeS 74 | 75 | 76 | 77 | --:-:-:-:1 MOV count, RZ; 78 | --:-:-:-:1 MOV32I stop, %d; 79 | //--:-:-:-:1 MOV writeS, RZ; 80 | //--:-:-:-:1 MOV readAs, RZ; 81 | //--:-:-:-:1 MOV readBs, RZ; 82 | 83 | 84 | return join '', map "--:-:-:-:1 MOV32I r$_, 1.0;\n", 0..95; 85 | 86 
| 87 | LOOP: 88 | 89 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 90 | --:-:-:-:1 IADD count, count, 1; 91 | 92 | 93 | my $out; 94 | 95 | 96 | my @cOrder; 97 | #my @swirl = ([0,1],[0,0],[2,0],[2,1]); 98 | my @swirl = ([2,0],[2,1],[0,1],[0,0]); 99 | #my @swirl = ([0,1],[0,0],[1,0],[1,1]); 100 | my @xVals = (0,1,64,65); 101 | #my @xVals = (0,2,64,66); 102 | 103 | my @yVals = (0,2,64,66); 104 | 105 | foreach my $y (@yVals) 106 | { 107 | foreach my $x (@xVals) 108 | { 109 | push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl; 110 | } 111 | @xVals = reverse @xVals; 112 | } 113 | 114 | foreach my $j (0..7) 115 | { 116 | my $odd = $j & 1; 117 | my $nOdd = !$odd + 0; 118 | 119 | my %%insert; 120 | 121 | #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; 122 | 123 | $insert{c62} = 124 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 125 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 126 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 127 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 128 | "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; 129 | 130 | foreach my $c (0 .. 63) 131 | { 132 | my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/; 133 | my $ins = $insert{"c$c"} || ''; 134 | my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || 135 | my $yield = $c == 32 ? 'Y' : '-'; 136 | my $wait = '--'; #$c ? 
'--' : '01'; 137 | 138 | $out .= "$wait:-:-:$yield:$stall FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins"; 139 | } 140 | } 141 | return $out; 142 | 143 | 144 | --:-:-:Y:5 @P0 BRA LOOP; 145 | --:-:-:-:5 EXIT; 146 | END_SASS 147 | 148 | close $fh; 149 | } 150 | 151 | __END__ 152 | 153 | my %%insert = ( 154 | c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", 155 | c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", 156 | c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", 157 | c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", 158 | ); -------------------------------------------------------------------------------- /microbench/throughput2.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | 3 | 4 | 5 | 0-10 : result, r1, r2, r3 6 | 20-27 ~ count, stop 7 | 8 | 9 | 10 | --:-:-:-:1 MOV count, RZ; 11 | --:-:-:-:1 MOV32I stop, 102400; 12 | --:-:-:-:1 MOV32I r1, 1.0; 13 | --:-:-:-:1 MOV32I r2, 1.0; 14 | --:-:-:-:4 MOV32I r3, 1.0; 15 | 16 | LOOP: 17 | 18 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 19 | --:-:-:-:1 IADD count, count, 1; 20 | 21 | 22 | my $out; 23 | 24 | foreach my $i (0 .. 511) 25 | { 26 | my $yield = ($i + 32) & 63 ? '-' : 'Y'; 27 | 28 | my $stall = $i == 511 ? 
0 : 1; 29 | 30 | #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; 31 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 32 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 33 | #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; 34 | #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; 35 | 36 | #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; 37 | #$out .= "--:-:-:-:1 MOV result, RZ;\n"; 38 | 39 | $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; 40 | #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; 41 | #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; 42 | } 43 | return $out; 44 | 45 | 46 | --:-:-:Y:5 @P0 BRA LOOP; 47 | --:-:-:-:5 EXIT; 48 | -------------------------------------------------------------------------------- /microbench/throughput3.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my %data; 5 | 6 | foreach my $thread128 (1 .. 8) 7 | { 8 | foreach my $size64 (8 .. 16) 9 | { 10 | my $loopSize = $size64 * 64; 11 | my $loops = int(2 * 1638400 / ($size64 * $thread128)); 12 | 13 | my $blocks = 16; 14 | my $threads = $thread128 * 128; 15 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 16 | my $fileName = 'throughput2.sass'; 17 | 18 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops; 19 | #next; 20 | 21 | writeSassFile($fileName, $loopSize, $loops); 22 | 23 | `maxas.pl -i $fileName microbench.cubin`; 24 | 25 | exit if $?; 26 | 27 | my $data = `Release\\microbench.exe e $blocks $threads $fops`; 28 | 29 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 30 | 31 | printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 32 | 33 | push @{$data{$loopSize}}, $gflops; 34 | } 35 | } 36 | print join("\t", 'size', 1 .. 
# Write the maxas source for the FFMA throughput microbenchmark.
#
# Parameters:
#   $filename - path of the .sass file to create or overwrite
#   $loopSize - number of FFMA instructions wanted in the unrolled loop body
#   $loops    - value loaded into the `stop` register that bounds the loop
#
# Dies with "$filename: <reason>" if the file cannot be opened or if the
# buffered output cannot be flushed on close.
#
# NOTE(review): the template is reproduced exactly as it appears in this
# view; some of its blank lines may stand where maxas section markers belong.
# Confirm against a pristine copy before relying on the generated file.
sub writeSassFile
{
    my ($filename, $loopSize, $loops) = @_;

    # Three-argument open: the file name is never parsed for a mode prefix.
    open my $fh, '>', $filename or die "$filename: $!";

    # The here-doc is a printf format; its three %d slots are filled in order:
    #   1. $loops        -> immediate for "MOV32I stop"
    #   2. $loopSize - 1 -> upper bound of the inclusive range "0 .. N", so
    #                       exactly $loopSize FFMAs are emitted. Passing
    #                       $loopSize here produced one instruction too many
    #                       compared with the caller's flop count.
    #   3. $loopSize     -> lets the embedded code pick the yield flag: with
    #                       more than 64 instructions only every 64th one
    #                       (where ($i + 32) & 63 is zero) gets 'Y'; otherwise
    #                       every instruction gets 'Y'.
    printf $fh <<'EOF', $loops, $loopSize - 1, $loopSize;
# Kernel: microbench



0-10 : result, r1, r2, r3, count, stop



--:-:-:-:1 MOV count, RZ;
--:-:-:-:1 MOV32I stop, %d;
--:-:-:-:1 MOV32I r1, 1.0;
--:-:-:-:1 MOV32I r2, 1.0;
--:-:-:-:4 MOV32I r3, 1.0;

LOOP:

--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT;
--:-:-:-:1 IADD count, count, 1;


my $out;

foreach my $i (0 .. %d)
{
    my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y';

    $out .= "--:-:-:$y:1 FFMA result, r1, r2, r3;\n";
}
return $out;


--:-:-:Y:5 @P0 BRA LOOP;
--:-:-:-:5 EXIT;
EOF

    # Write errors on a buffered handle only surface at close time.
    close $fh or die "$filename: $!";
}
# Write the maxas source for the integer-add throughput microbenchmark.
#
# Parameters:
#   $filename - path of the .sass file to create or overwrite
#   $loops    - value loaded into the `stop` register that bounds the loop
#
# The embedded generator always emits 512 instructions (indices 0 .. 511).
# The yield flag is 'Y' on every 64th instruction (where ($i + 32) & 63 is
# zero), and the stall count is 0 on the final instruction and 1 elsewhere.
#
# Dies with "$filename: <reason>" if the file cannot be opened or if the
# buffered output cannot be flushed on close.
#
# NOTE(review): the template is reproduced exactly as it appears in this
# view; some of its blank lines may stand where maxas section markers belong.
# Confirm against a pristine copy before relying on the generated file.
sub writeSassFile
{
    my ($filename, $loops) = @_;

    # Three-argument open: the file name is never parsed for a mode prefix.
    open my $fh, '>', $filename or die "$filename: $!";

    # The here-doc is a printf format; its single %d receives $loops.
    printf $fh <<'EOF', $loops;
# Kernel: microbench



0-10 : result, r1, r2, r3
20-27 ~ count, stop



--:-:-:-:1 MOV count, RZ;
--:-:-:-:1 MOV32I stop, %d;
--:-:-:-:1 MOV32I r1, 1.0;
--:-:-:-:1 MOV32I r2, 1.0;
--:-:-:-:4 MOV32I r3, 1.0;

LOOP:

--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT;
--:-:-:-:1 IADD count, count, 1;


my $out;

foreach my $i (0 .. 511)
{
    my $yield = ($i + 32) & 63 ? '-' : 'Y';

    my $stall = $i == 511 ? 0 : 1;

    #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n";
    #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n";
    #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n";
    #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n";
    #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n";

    #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
    #$out .= "--:-:-:-:1 MOV result, RZ;\n";

    $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n";
    #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n";
    #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n";
}
return $out;


--:-:-:Y:5 @P0 BRA LOOP;
--:-:-:-:5 EXIT;
EOF

    # Write errors on a buffered handle only surface at close time.
    close $fh or die "$filename: $!";
}
# Write the maxas source for the XMAD (8x8 blocked, 8x unrolled) microbenchmark.
#
# Parameters:
#   $filename - path of the .sass file to create or overwrite
#   $loopSize - accepted for call compatibility; not referenced by this sub
#   $loops    - value loaded into the `stop` register that bounds the loop
#
# The template is a printf format, so every literal percent sign inside the
# embedded Perl (hash sigils, sprintf directives) is written as "%%".
#
# In the embedded generator, the $insert{c62} assignment is guarded by
# "$j == 8" while $j only ranges over 0 .. 7, so as written no extra
# instructions are ever inserted.
#
# Dies with "$filename: <reason>" if the file cannot be opened or if the
# buffered output cannot be flushed on close.
#
# NOTE(review): the template is reproduced exactly as it appears in this
# view; some of its blank lines may stand where maxas section markers belong.
# Confirm against a pristine copy before relying on the generated file.
sub writeSassFile
{
    my ($filename, $loopSize, $loops) = @_;

    # Three-argument open: the file name is never parsed for a mode prefix.
    open my $fh, '>', $filename or die "$filename: $!";

    # The only unescaped conversion in the template is the %d for $loops.
    printf $fh <<'END_SASS', $loops;
# Kernel: microbench



1, 9, 2,10,17,25,18,26 : cy0x<0-7>
5,13, 6,14,21,29,22,30 : cy1x<0-7>
3,11, 0, 8,19,27,16,24 : cy2x<0-7>
7,15, 4,12,23,31,20,28 : cy3x<0-7>
35,43,32,40,51,59,48,56 : cy4x<0-7>
39,47,36,44,55,63,52,60 : cy5x<0-7>
33,41,34,42,49,57,50,58 : cy6x<0-7>
37,45,38,46,53,61,54,62 : cy7x<0-7>

64-71 : j0Ax<0-3>, j0By<0-3>
72-79 : j1Ax<0-3>, j1By<0-3>

0-79 : r<0-79>

100-101 : count, stop

//102-112 ~ readAs, readBs, writeS



--:-:-:-:1 MOV count, RZ;
--:-:-:-:1 MOV32I stop, %d;
//--:-:-:-:1 MOV writeS, RZ;
//--:-:-:-:1 MOV readAs, RZ;
//--:-:-:-:1 MOV readBs, RZ;


return join '', map "--:-:-:-:1 MOV r$_, RZ;\n", 0..63;



return join '', map "--:-:-:-:1 MOV32I r$_, 0x00010001;\n", 64..79;


LOOP:

--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT;
--:-:-:-:1 IADD count, count, 1;


my $out;

my @swirl1 = ([0,0],[0,4],[4,4],[4,0]);
my @swirl2 = ([0,0],[1,0],[1,1],[0,1]);
my @swirl3 = ([0,2],[2,2],[2,0],[0,0]);

my @cOrder;
foreach my $s1 (@swirl1)
{
    foreach my $s2 (@swirl2)
    {
        foreach my $s3 (@swirl3)
        {
            push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]];
        }
    }
}

foreach my $j (0..7)
{
    my $odd = $j & 1;
    my $nOdd = !$odd + 0;

    my %%insert;

    #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6;

    $insert{c62} =
        "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" .
        "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" .
        "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" .
        "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" .
        "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8;

    foreach my $c (0 .. 63)
    {
        my ($x,$y) = @{$cOrder[$c]};
        my $ins = $insert{"c$c"} || '';
        my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
        my $yield = $c == 32 ? 'Y' : '-';
        my $wait = '--'; #$c ? '--' : '01';

        my $xReg = $x >> 1;
        my $yReg = $y >> 1;
        my $xPart = $x & 1 ? '.H1' : '';
        my $yPart = $y & 1 ? '.H1' : '';

        $out .= sprintf "$wait:-:-:$yield:$stall XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x, $odd,$xReg,$xPart, $odd,$yReg,$yPart, $y,$x, $ins;
    }
}
return $out;


--:-:-:Y:5 @P0 BRA LOOP;
--:-:-:-:5 EXIT;
END_SASS

    # Write errors on a buffered handle only surface at close time.
    close $fh or die "$filename: $!";
}
out<0-1> 33 | 2-3 : clocks<0-1> 34 | 4-15 : result, result2, tid, bid, blockDim, clock1, clock2, scale, s 35 | 16-24 : a, b, c, x 36 | 37 | 38 | 39 | // Load in our params 40 | --:-:-:-:1 MOV out0, param_out[0]; 41 | --:-:-:-:1 MOV out1, param_out[1]; 42 | --:-:-:-:1 MOV clocks0, param_clocks[0]; 43 | --:-:-:-:1 MOV clocks1, param_clocks[1]; 44 | //--:-:-:-:1 MOV in, c[0x0][0x148]; 45 | --:-:-:-:1 MOV blockDim, blockDimX; 46 | 47 | --:-:-:-:1 PSETP.AND.AND P0, PT, !PT, PT, PT; 48 | 49 | --:-:-:-:6 MOV32I result, 0xffffffff; 50 | --:-:-:-:6 MOV32I result2, 0x0; 51 | --:-:-:-:1 MOV32I a, 1; 52 | --:-:-:-:1 MOV32I b, 1; 53 | --:-:-:-:6 MOV32I c, 0x0; 54 | 55 | // (127 - scale) << 23 56 | //--:-:-:-:6 MOV32I scale, 28; 57 | //--:-:-:-:6 IADD scale, -scale, 127; 58 | //--:-:-:-:6 SHL scale, scale, 23; 59 | 60 | 61 | //--:-:-:-:6 MOV32I c, 0x4f765432; 62 | 63 | //--:-:1:-:2 LDG.CI.128 a, [in]; 64 | 65 | //01:-:-:-:6 VMAD.S16.S16 result, a, b, c; 66 | 67 | //--:-:-:-:6 MOV result, a; 68 | 69 | // a >> 16 | (b & 0xffff0000) 70 | 71 | //--:-:-:-:6 SHR.U32 result, a, 16; 72 | //--:-:-:-:6 LOP3.LUT result, result, b, c, 0xf8; 73 | 74 | //--:-:-:-:6 I2I.S32.S16 result, a.H1; 75 | 76 | //--:-:-:Y:d IADD result.CC, a, -c; 77 | //--:-:-:Y:2 IADD.X result2, b, -RZ; 78 | 79 | //--:-:-:-:6 SHR result, a, 1; 80 | 81 | //--:-:-:-:6 BFI result, b, 0x1010, a; 82 | 83 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 84 | 85 | //--:-:-:-:6 XMAD.S16.S16 c, a, b, RZ; 86 | //--:-:-:-:6 ISET.LT.AND s, c, RZ, PT; 87 | //--:-:-:-:6 IADD result.CC, c, result; 88 | //--:-:-:-:6 IADD.X result2, s, result2; 89 | 90 | //--:-:-:-:6 XMAD.S16.S16 result.CC, a, b, result; 91 | //--:-:-:-:6 IADD.X result2, result2, RZ; 92 | 93 | //--:-:-:-:6 SHF.R.S64 result, result, 1, result2; 94 | //--:-:-:-:6 MOV32I result2, 0; 95 | 96 | --:-:-:-:f LOP.AND.NZ P0, RZ, result, 1; 97 | 98 | --:-:-:-:6 @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result; 99 | 100 | //--:-:1:-:d I2F.F32.S32 result2, a; 101 | //01:-:-:-:6 FMUL result2, 
result2, scale; 102 | //01:-:2:-:d F2I.S32.F32 result, result2; 103 | 104 | 02:-:-:-:6 CS2R clock2, SR_CLOCKLO; 105 | 106 | //F2I = "^$pred?F2I$ftz$x2x$round $r0, $cr20;" 107 | //I2F = "^$pred?I2F$x2x$rnd $r0, $cr20;" 108 | //x2x = "\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)" 109 | //rnd = "(?:\.(?RN|RM|RP|RZ))?" 110 | //round = "(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?" 111 | //r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 112 | //r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 113 | 114 | 115 | //--:-:-:-:1 XMAD.MRG x, a, b.H1, RZ; 116 | //--:-:-:-:6 XMAD result, a.H1, b.H1, c; 117 | //--:-:-:-:1 XMAD.PSL.CBCC result, a.H1, x.H1, result; 118 | 119 | // Get the first clock value 120 | 121 | --:-:1:-:1 S2R tid, SR_TID.X; 122 | --:-:2:-:2 S2R bid, SR_CTAID.X; 123 | 124 | 125 | 126 | // Take the difference of clocks 127 | --:-:-:-:1 IADD clock1, clock2, -clock1; 128 | 129 | // Setup our output addresses 130 | // Stall your pipeline dependencies properly 131 | 03:-:-:-:1 XMAD tid, blockDim, bid, tid; 132 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 133 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 134 | --:-:-:Y:6 SHL tid, tid, 0x2; 135 | 136 | --:-:-:-:1 IADD clocks, clocks, tid; 137 | --:-:-:-:1 IADD out, out, tid; 138 | 139 | // Output the results. 
140 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values 141 | --:-:-:-:1 STG.E [clocks], result2; 142 | --:-:-:-:1 STG.E [out], result; 143 | --:-:-:-:5 EXIT; 144 | 145 | -------------------------------------------------------------------------------- /sgemm/batched_gemm.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/maxas/54eda7af086a46c9dae1688b691968235d560164/sgemm/batched_gemm.xlsx -------------------------------------------------------------------------------- /sgemm/cublas_sgemm.ptx: -------------------------------------------------------------------------------- 1 | .version 4.1 2 | .target sm_50 3 | .address_size 64 4 | 5 | // ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx 6 | 7 | // You can use maxas to insert cublas_device.lib code into a cubin built from this ptx: 8 | 9 | // From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib 10 | 11 | // cuobjdump -lelf cublas_device.lib | find "sm_50" 12 | 13 | // cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib 14 | 15 | // maxas -l maxwell_sgemm.asm.sm_50.cubin 16 | 17 | // maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass 18 | // maxas -e -k maxwell_sgemm_128x64_nt maxwell_sgemm_128x64_nt.sass 19 | 20 | // maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin 21 | // maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin 22 | 23 | // The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas. 
24 | 25 | .visible .entry maxwell_sgemm_128x128_nt( 26 | .param .u64 .ptr.global.align 8 param_A, 27 | .param .u64 .ptr.global.align 8 param_B, 28 | .param .u64 .ptr.global.align 8 param_C, 29 | .param .s32 param_lda, 30 | .param .s32 param_ldb, 31 | .param .s32 param_ldc, 32 | .param .s32 param_k, 33 | .param .u64 .ptr.global.align 8 param_Alpha, 34 | .param .u64 .ptr.global.align 8 param_Beta, 35 | .param .s32 param_alpha, 36 | .param .s32 param_beta, 37 | .param .s32 param_flag 38 | ) 39 | .reqntid 256 40 | { 41 | .shared .align 16 .b8 share[16384]; 42 | 43 | ret; 44 | } 45 | 46 | .visible .entry maxwell_sgemm_128x64_nt( 47 | .param .u64 .ptr.global.align 8 param_A, 48 | .param .u64 .ptr.global.align 8 param_B, 49 | .param .u64 .ptr.global.align 8 param_C, 50 | .param .s32 param_lda, 51 | .param .s32 param_ldb, 52 | .param .s32 param_ldc, 53 | .param .s32 param_k, 54 | .param .u64 .ptr.global.align 8 param_Alpha, 55 | .param .u64 .ptr.global.align 8 param_Beta, 56 | .param .s32 param_alpha, 57 | .param .s32 param_beta, 58 | .param .s32 param_flag 59 | ) 60 | .reqntid 128 61 | { 62 | .shared .align 16 .b8 share[12288]; 63 | 64 | ret; 65 | } 66 | -------------------------------------------------------------------------------- /sgemm/sgemm.cpp: -------------------------------------------------------------------------------- 1 | // sgemm.cpp : Defines the entry point for the console application. 
2 | // 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | CUcontext hContext = 0; 12 | cublasHandle_t hCublas = 0; 13 | 14 | float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat = 1, int printVars = 0); 15 | float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat); 16 | void gflops(const char* ident, int N, float ms, int repeat); 17 | void test(float* C, float* T, int N, size_t size); 18 | 19 | #define REPEAT_BLOCK 2000 20 | 21 | #define CUDA_CHECK( fn ) do { \ 22 | CUresult status = (fn); \ 23 | if ( CUDA_SUCCESS != status ) { \ 24 | const char* errstr; \ 25 | cuGetErrorString(status, &errstr); \ 26 | printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ 27 | if (hCublas) cublasDestroy(hCublas); \ 28 | if (hContext) cuCtxDestroy(hContext); \ 29 | exit(EXIT_FAILURE); \ 30 | } \ 31 | } while (0) 32 | 33 | #define CUBLAS_CHECK( fn ) do { \ 34 | cublasStatus_t status = (fn); \ 35 | if ( CUBLAS_STATUS_SUCCESS != status ) { \ 36 | printf("Cublas Failure (line %d of file %s):\n\t%s returned %d\n", __LINE__, __FILE__, #fn, status); \ 37 | if (hCublas) cublasDestroy(hCublas); \ 38 | if (hContext) cuCtxDestroy(hContext); \ 39 | exit(EXIT_FAILURE); \ 40 | } \ 41 | } while (0) 42 | 43 | int main(int argc, char* argv[]) 44 | { 45 | char deviceName[32]; 46 | int count, ordinal, major, minor; 47 | CUdevice hDevice; 48 | CUevent hStart, hStop; 49 | CUdeviceptr devA, devB, devC, devT, otherDevA, otherDevB; 50 | 51 | // Initialize the Driver API and find a device 52 | CUDA_CHECK( cuInit(0) ); 53 | CUDA_CHECK( cuDeviceGetCount(&count) ); 54 | for (ordinal = 0; ordinal < count; ordinal++) 55 | { 56 | CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); 57 | CUDA_CHECK( 
cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); 58 | CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); 59 | CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); 60 | if (major >= 5 && minor >= 2) 61 | { 62 | //printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); 63 | break; 64 | } 65 | } 66 | if (ordinal == count) 67 | { 68 | printf("No compute 5.0 device found, exiting.\n"); 69 | exit(EXIT_FAILURE); 70 | } 71 | 72 | // First command line arg is the size of N divided by 128 73 | int thread128 = 64; 74 | if (argc > 1) 75 | thread128 = atoi(argv[1]); 76 | if (thread128 > 64 || thread128 < 1) 77 | thread128 = 64; 78 | 79 | // Second command line arg is the repeat count for benchmarking 80 | int repeat = 1; 81 | if (argc > 2) 82 | repeat = atoi(argv[2]); 83 | if (repeat > 10000 || repeat < 1) 84 | repeat = 1; 85 | 86 | // Third command line arg is the normalized float size 87 | CUarray_format format = CU_AD_FORMAT_FLOAT; 88 | if (argc > 3) 89 | format = (CUarray_format)atoi(argv[3]); 90 | if (format != CU_AD_FORMAT_FLOAT && format != CU_AD_FORMAT_UNSIGNED_INT16 && format != CU_AD_FORMAT_UNSIGNED_INT8) 91 | format = CU_AD_FORMAT_FLOAT; 92 | 93 | // Forth command line arg is for printf debugging 94 | int printVars = 0; 95 | if (argc > 4) 96 | printVars = atoi(argv[4]); 97 | if (printVars > 100 || printVars < 1) 98 | printVars = 0; 99 | 100 | int N = thread128 * 128; 101 | float alpha = 1, beta = 0, ms = 1; 102 | size_t sizeOther = N * N; 103 | size_t sizeFloat = sizeOther * 4; 104 | 105 | float* A = (float*)malloc(sizeFloat); 106 | float* B = (float*)malloc(sizeFloat); 107 | float* C = (float*)malloc(sizeFloat); 108 | float* T = (float*)malloc(sizeFloat); 109 | float *otherA, *otherB; 110 | 111 | //int counter = 0; 112 | //srand((unsigned int)time(0)); 113 | for(int i = 0; i < N * N; i++) // 114 | { 115 | //A[i] = (float)rand() / 
(float)RAND_MAX; 116 | //B[i] = (float)rand() / (float)RAND_MAX; 117 | A[i] = B[i] = 1.0f; // * (i & 3) + 1.0f; 118 | //A[i] = 1.0f; 119 | //B[i * N + counter++] = 1.0f; // identity matrix 120 | } 121 | 122 | if (format == CU_AD_FORMAT_FLOAT) 123 | { 124 | sizeOther *= 4; 125 | otherA = A; 126 | otherB = B; 127 | } 128 | else if (format == CU_AD_FORMAT_UNSIGNED_INT16) 129 | { 130 | sizeOther *= 2; 131 | unsigned short* othera = (unsigned short*)malloc(sizeOther); 132 | unsigned short* otherb = (unsigned short*)malloc(sizeOther); 133 | for(int i = 0; i < N * N; i++) 134 | othera[i] = otherb[i] = 65535; 135 | 136 | otherA = reinterpret_cast(othera); 137 | otherB = reinterpret_cast(otherb); 138 | } 139 | else // (format == CU_AD_FORMAT_UNSIGNED_INT8) 140 | { 141 | otherA = (float*)malloc(sizeOther); 142 | otherB = (float*)malloc(sizeOther); 143 | memset(otherA, 255, sizeOther); 144 | memset(otherB, 255, sizeOther); 145 | } 146 | 147 | CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); 148 | //CUBLAS_CHECK( cublasCreate(&hCublas) ); 149 | 150 | CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); // CU_EVENT_DEFAULT 151 | CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); 152 | 153 | CUDA_CHECK( cuMemAlloc(&devA, sizeFloat) ); 154 | CUDA_CHECK( cuMemAlloc(&devB, sizeFloat) ); 155 | CUDA_CHECK( cuMemAlloc(&devC, sizeFloat) ); 156 | CUDA_CHECK( cuMemAlloc(&devT, sizeFloat) ); 157 | 158 | CUDA_CHECK( cuMemcpyHtoD(devA, A, sizeFloat) ); 159 | CUDA_CHECK( cuMemcpyHtoD(devB, B, sizeFloat) ); 160 | CUDA_CHECK( cuMemsetD8(devC, 0, sizeFloat) ); 161 | CUDA_CHECK( cuMemsetD8(devT, 0, sizeFloat) ); 162 | 163 | if (format == CU_AD_FORMAT_FLOAT) 164 | { 165 | otherDevA = devA; 166 | otherDevB = devB; 167 | } 168 | else 169 | { 170 | CUDA_CHECK( cuMemAlloc(&otherDevA, sizeOther) ); 171 | CUDA_CHECK( cuMemAlloc(&otherDevB, sizeOther) ); 172 | CUDA_CHECK( cuMemcpyHtoD(otherDevA, otherA, sizeOther) ); 173 | CUDA_CHECK( cuMemcpyHtoD(otherDevB, otherB, sizeOther) ); 174 
| } 175 | 176 | // Warm up the clock (unless under nsight) 177 | //if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER 178 | // for (int i = 0; i < 3; i++) 179 | // CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast(devA), N, reinterpret_cast(devB), N, &beta, reinterpret_cast(devT), N) ); 180 | 181 | // Launch our kernel 182 | ms = assemblySgemm("sgemm_kernel_64", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars); 183 | gflops("Max64 ", N, ms, repeat); 184 | 185 | ms = assemblySgemm("sgemm_kernel_128", format, sizeOther, devC, otherDevA, otherDevB, N, hStart, hStop, repeat, printVars); 186 | gflops("Max128", N, ms, repeat); 187 | 188 | //ms = cublasSgemm("maxwell_sgemm_128x64_nt", devT, devA, devB, N, hStart, hStop, repeat); 189 | //gflops("Cub64 ", N, ms, repeat); 190 | 191 | //ms = cublasSgemm("maxwell_sgemm_128x128_nt", devT, devA, devB, N, hStart, hStop, repeat); 192 | //gflops("Cub128", N, ms, repeat); 193 | 194 | // Run cublas again for the same repeat count for comparison 195 | //CUDA_CHECK( cuEventRecord(hStart, NULL) ); 196 | //for (int i = 0; i < repeat; i++) 197 | // CUBLAS_CHECK( cublasSgemm(hCublas, CUBLAS_OP_N, CUBLAS_OP_T, N, N, N, &alpha, reinterpret_cast(devA), N, reinterpret_cast(devB), N, &beta, reinterpret_cast(devT), N) ); 198 | //CUDA_CHECK( cuEventRecord(hStop, NULL) ); 199 | //CUDA_CHECK( cuEventSynchronize(hStop) ); 200 | //CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); 201 | //gflops("Cublas", N, ms, repeat); 202 | 203 | // Get back our results from each kernel 204 | CUDA_CHECK( cuMemcpyDtoH(C, devC, sizeFloat) ); 205 | CUDA_CHECK( cuMemcpyDtoH(T, devT, sizeFloat) ); 206 | 207 | // Cleanup and shutdown of cuda 208 | CUDA_CHECK( cuMemFree(devA) ); 209 | CUDA_CHECK( cuMemFree(devB) ); 210 | CUDA_CHECK( cuMemFree(devC) ); 211 | CUDA_CHECK( cuMemFree(devT) ); 212 | if (format != CU_AD_FORMAT_FLOAT) 213 | { 214 | CUDA_CHECK( 
cuMemFree(otherDevA) ); 215 | CUDA_CHECK( cuMemFree(otherDevB) ); 216 | } 217 | 218 | CUDA_CHECK( cuEventDestroy(hStart) ); 219 | CUDA_CHECK( cuEventDestroy(hStop) ); 220 | 221 | //CUBLAS_CHECK( cublasDestroy(hCublas) ); 222 | //hCublas = 0; 223 | CUDA_CHECK( cuCtxDestroy(hContext) ); 224 | hContext = 0; 225 | 226 | // compare C and T for accuracy 227 | test(C, T, N, sizeFloat); 228 | 229 | // And free up host memory 230 | free(A); free(B); free(C); free(T); 231 | 232 | if (format != CU_AD_FORMAT_FLOAT) 233 | { 234 | free(otherA); 235 | free(otherB); 236 | } 237 | 238 | return 0; 239 | } 240 | 241 | // Our kernel wrapper function 242 | float assemblySgemm(const char* kernel, CUarray_format format, size_t size, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat, int printVars) 243 | { 244 | // Configure our x and y grid dimensions (assume nice square matrixes). 245 | // Each block gets 128 tracks from A and 128 tracks from B. 246 | // Each of the 256 threads calculates 64 elements of that 128x128 sub matrix of C. 
247 | // See Figure 2 here to get the gist of things (we use a different mapping to maximize LDS.128 usage): 248 | // http://icl.cs.utk.edu/projectsfiles/magma/pubs/fermi_gemm.pdf 249 | 250 | int threads, width; 251 | if (strcmp(kernel, "sgemm_kernel_64") == 0) 252 | { 253 | threads = 64; 254 | width = 64; 255 | } 256 | else 257 | { 258 | threads = 256; 259 | width = 128; 260 | } 261 | 262 | int gridDimXY = N / width + (N % width != 0); 263 | int blocks = gridDimXY * gridDimXY; 264 | 265 | // Setup out debug printf output buffer 266 | CUdeviceptr devD = NULL; 267 | int* D = NULL; 268 | int sizeD = 0; 269 | 270 | if (printVars) 271 | { 272 | sizeD = blocks * threads * printVars * sizeof(int); 273 | D = (int*)malloc(sizeD); 274 | 275 | CUDA_CHECK( cuMemAlloc(&devD, sizeD) ); 276 | CUDA_CHECK( cuMemsetD8(devD, 0, sizeD) ); 277 | } 278 | 279 | // Load the cubin 280 | CUmodule hModule; 281 | CUDA_CHECK( cuModuleLoad(&hModule, "sgemm.cubin") ); 282 | 283 | // Load the textures 284 | CUtexref texA, texB; 285 | CUDA_CHECK( cuModuleGetTexRef(&texA, hModule, "texA") ); 286 | CUDA_CHECK( cuModuleGetTexRef(&texB, hModule, "texB") ); 287 | 288 | // Configure the textures 289 | CUDA_CHECK( cuTexRefSetFormat(texA, format, 4) ); 290 | CUDA_CHECK( cuTexRefSetFormat(texB, format, 4) ); 291 | 292 | CUDA_CHECK( cuTexRefSetAddress(NULL, texA, devA, size) ); 293 | CUDA_CHECK( cuTexRefSetAddress(NULL, texB, devB, size) ); 294 | 295 | // Load the kernel function 296 | CUfunction hKernel; 297 | CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) ); 298 | 299 | // Setup the params 300 | float alpha = 1.0f; 301 | void* params[] = { &devC, &N, &N, &N, &N, &N, &N, &alpha, &devD }; 302 | 303 | float totalTime = 0; 304 | // Launch the kernel repeat times.. but break it up into pieces so as not to lock things up. 305 | while (repeat > 0) 306 | { 307 | float ms; 308 | int r = repeat > REPEAT_BLOCK ? 
REPEAT_BLOCK : repeat; 309 | CUDA_CHECK( cuEventRecord( hStart, NULL ) ); 310 | 311 | for (int i = 0; i < r; i++) 312 | CUDA_CHECK( cuLaunchKernel(hKernel, gridDimXY, gridDimXY, 1, threads, 1, 1, 0, 0, params, 0) ); 313 | 314 | CUDA_CHECK( cuEventRecord( hStop, NULL ) ); 315 | CUDA_CHECK( cuEventSynchronize( hStop ) ); 316 | CUDA_CHECK( cuEventElapsedTime( &ms, hStart, hStop ) ); 317 | totalTime += ms; 318 | repeat -= r; 319 | } 320 | 321 | 322 | CUDA_CHECK( cuModuleUnload(hModule) ); 323 | 324 | // And here we print out the debug info if requested: 325 | if (printVars) 326 | { 327 | CUDA_CHECK( cuMemcpyDtoH(D, devD, sizeD) ); 328 | CUDA_CHECK( cuMemFree(devD) ); 329 | int *iD = D; 330 | float *fD = reinterpret_cast(D); 331 | unsigned int *uD = reinterpret_cast(D); 332 | 333 | for (int by = 0; by < gridDimXY; by++) 334 | { 335 | for (int bx = 0; bx < gridDimXY; bx++) 336 | { 337 | unsigned int clock = 0xffffffff, sm = 0; 338 | 339 | for (int tid = 0; tid < threads; tid++) 340 | { 341 | //printf("by: %3d, bx: %3d, tid:%3d, rA:%5d, rB:%5d, wr:%5d, rd:%5d, cx:%5d, cy:%5d, ci:%5d, c:%.2f\n", 342 | //printf("by: %3d, bx: %3d, tid:%3d, t0:%5d, end:%5d, k:%5d, tid2:%5d, tid15:%5d, ldx:%5d, t2:%5d, t4:%5d\n", 343 | // by, bx, tid, iD[0], iD[1], iD[2], iD[3], iD[4], iD[5], iD[6], iD[7] 344 | //); 345 | if (uD[1] < clock) clock = uD[1]; 346 | sm = uD[0]; 347 | 348 | iD += printVars; 349 | fD += printVars; 350 | uD += printVars; 351 | } 352 | printf("%02d %08u %d %d\n", sm, clock, by, bx); 353 | } 354 | } 355 | free(D); 356 | } 357 | 358 | return totalTime; 359 | } 360 | 361 | typedef struct dPointer 362 | { 363 | CUdeviceptr lo; 364 | CUdeviceptr hi; 365 | } dPointer; 366 | 367 | float cublasSgemm(const char* kernel, CUdeviceptr devC, CUdeviceptr devA, CUdeviceptr devB, int N, CUevent hStart, CUevent hStop, int repeat) 368 | { 369 | int threads, gridX, gridY; 370 | if (strcmp(kernel, "maxwell_sgemm_128x64_nt") == 0) 371 | { 372 | threads = 128; 373 | gridX = N / 128 + (N % 
128 != 0); 374 | gridY = N / 64 + (N % 64 != 0); 375 | } 376 | else 377 | { 378 | threads = 256; 379 | gridX = gridY = N / 128 + (N % 128 != 0); 380 | } 381 | int blocks = gridX * gridY; 382 | 383 | // Load the cubin 384 | // See cublas_sgemm.ptx for info on how to build this. 385 | CUmodule hModule; 386 | CUDA_CHECK( cuModuleLoad(&hModule, "cublas_sgemm.cubin") ); 387 | 388 | // Load the kernel function 389 | CUfunction hKernel; 390 | CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, kernel) ); 391 | 392 | // Setup the params 393 | // I should probably be working in 64 bits... 394 | dPointer dA = { devA, 0 }; 395 | dPointer dB = { devB, 0 }; 396 | dPointer dC = { devC, 0 }; 397 | 398 | int flag = 0; 399 | float alpha = 1.0; 400 | float beta = 0.0; 401 | 402 | void* params[] = { &dA, &dB, &dC, &N, &N, &N, &N, &dA, &dA, &alpha, &beta, &flag }; 403 | 404 | float totalTime = 0; 405 | // Launch the kernel repeat times.. but break it up into pieces so as not to lock things up. 406 | while (repeat > 0) 407 | { 408 | float ms; 409 | int r = repeat > REPEAT_BLOCK ? 
// Compare the kernel output C against the reference result T and report
// how many elements differ.
//
//   C, T  host buffers holding N*N floats each
//   N     matrix dimension
//   size  size of each buffer in bytes, used for the quick memcmp pre-check
//
// Always prints "<count> errors" to stdout, except when the dump file cannot
// be opened, in which case only that failure is reported. When the buffers
// differ and N <= 512, a per-element map is also written to data.txt: "=" for
// a matching element, "c!t<TAB>" for a mismatch, one text row per y index.
void test(float* C, float* T, int N, size_t size)
{
    // Byte-identical buffers need no element-wise scan.
    if (memcmp(C, T, size) == 0)
    {
        printf("%d errors\n", 0);
        return;
    }

    int errors = 0;

    if (N > 512)
    {
        // The dump gets too big and slow for large N, so only count here.
        // This path used to print memcmp's return value (a sign indicator)
        // as if it were the number of errors.
        size_t total = (size_t)N * (size_t)N;
        for (size_t i = 0; i < total; ++i)
        {
            if (C[i] != T[i])
                errors++;
        }
        printf("%d errors\n", errors);
        return;
    }

    FILE* file = NULL;
#ifdef _MSC_VER
    // fopen_s is the form MSVC accepts without a deprecation diagnostic.
    if (fopen_s(&file, "data.txt", "w") != 0)
        file = NULL;
#else
    file = fopen("data.txt", "w");
#endif
    if (file == NULL)
    {
        printf("Cannot open data.txt for writing\n");
        return;
    }

    for (int y = 0; y < N; ++y)
    {
        for (int x = 0; x < N; ++x)
        {
            float c = C[x*N + y];
            float t = T[x*N + y];
            if (c != t)
            {
                errors++;
                fprintf(file, "%.8f!%.8f\t", c , t);
                //fprintf(file, "%.0f!", c);
                //fprintf(file, "!");
            }
            else
            {
                //fprintf(file, "%.0f=%.0f\t", c , t);
                //fprintf(file, "%.0f=", c);
                fprintf(file, "=");
            }
        }
        fprintf(file, "\n");
    }
    fclose(file);
    printf("%d errors\n", errors);
}
-------------------------------------------------------------------------------- 1 | 2 | // Note this file isn't configured to automatically compile. 3 | // Here's how: 4 | 5 | // If you want to look at the ptx first: 6 | // nvcc -arch sm_50 -m 32 -ptx sgemm.cu 7 | 8 | // Manually compile your kernel to a cubin. 9 | // You should only have to do this once, unless you change params or shared size or globals: 10 | // nvcc -arch sm_50 -m 32 -cubin sgemm.cu 11 | 12 | // If tweaking a kernel or writing a new one based on this shell code you would then do this: 13 | // maxas.pl -e kernel.cubin kernel.sass 14 | 15 | // I've already included a modified kernel (sgemm.sass) so the next step is.. 16 | 17 | // Splice the manually assembled code back into the cubin: 18 | // maxas.pl -i sgemm.sass sgemm.cubin 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | typedef texture floatTex; 26 | 27 | floatTex texA(0, cudaFilterModePoint, cudaAddressModeBorder); 28 | floatTex texB(0, cudaFilterModePoint, cudaAddressModeBorder); 29 | 30 | // Use extern C so C++ doesn't mangle our kernel name 31 | extern "C" 32 | // This kernel requires 256x1x1 threads per block 33 | __global__ void __launch_bounds__(256) sgemm_kernel_128( 34 | float *C, 35 | const int m, const int n, const int k, 36 | const int lda, const int ldb, const int ldc, 37 | float alpha, int *D) 38 | { 39 | // Declare any shared memory your kernel requires 40 | // Or you could just pass the amount in as a param to cuLaunchKernel 41 | __shared__ float4 share[1024]; 42 | 43 | int tid = threadIdx.x; 44 | 45 | // If you use indirect texture references, they will be passed as params at the end of the param list 46 | // So set that up here to make sure they're available in your kernel 47 | floatTex tex = tid > 127 ? 
texB : texA; 48 | 49 | // Make use of shared and your textures so it doesn't get optimized away 50 | share[tid] = tex1Dfetch(tex, tid); 51 | 52 | __syncthreads(); 53 | 54 | // output something so your setup isn't optimized away. 55 | C[tid] = share[255-tid].x; 56 | } 57 | 58 | extern "C" 59 | __global__ void __launch_bounds__(64) sgemm_kernel_64( 60 | float *C, 61 | const int m, const int n, const int k, 62 | const int lda, const int ldb, const int ldc, 63 | float alpha, int *D) 64 | { 65 | __shared__ float4 share[512]; 66 | 67 | int tid = threadIdx.x; 68 | 69 | floatTex tex = tid > 127 ? texB : texA; 70 | 71 | share[tid] = tex1Dfetch(tex, tid); 72 | 73 | __syncthreads(); 74 | 75 | C[tid] = share[255-tid].x; 76 | } 77 | 78 | // A note about using the Cuda Runtime. 79 | // If that's your preference over the driver API then here's what you'd do: 80 | 81 | // In your project properties in the Cuda C/C++ panel: 82 | // -Set the "Keep Processed Files" (-keep) option 83 | // -Add a -v manually to the command line 84 | // If compiling on command line just add -keep -v options to nvcc. 85 | // Rebuild your solution and look in the log for these lines that follow the ptxas step: 86 | 87 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda 88 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" 89 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" 90 | 91 | // You just need to manually run these 3 commands (or add them to a build script) 92 | // after you've modified the cubin generated from the preceding ptxas command. 
93 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you 94 | // build your project (or you could manually run the linker step as well). 95 | 96 | // Having done that you can call your kernel normally using the <<< >>> syntax. 97 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway. 98 | // With fatbin you can also keep non-maxwell optimized versions of your code. 99 | 100 | 101 | // I just discovered this also works as a shortcut to the above: 102 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu 103 | 104 | // The cu kernel definitions above need to have empty bodies. 105 | // And, the cu file must be compiled to a lib separately before linking. -------------------------------------------------------------------------------- /sgemm/sgemm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $CU_AD_FORMAT_UNSIGNED_INT8 = 0x01; 5 | my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02; 6 | my $CU_AD_FORMAT_FLOAT = 0x20; 7 | 8 | if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9]) 9 | { 10 | print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`; 11 | exit if $?; 12 | print `maxas.pl -i sgemm128.sass sgemm.cubin`; 13 | exit if $?; 14 | print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`; 15 | } 16 | if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9]) 17 | { 18 | print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`; 19 | exit if $?; 20 | print `maxas.pl -i sgemm64.sass sgemm.cubin`; 21 | exit if $?; 22 | print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`; 23 | } 24 | 25 | #print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2); 26 | 27 | `Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`; 28 | 29 | print `Release\\sgemm.exe 64 20 
$CU_AD_FORMAT_UNSIGNED_INT8`; 30 | exit; 31 | 32 | my %data; 33 | foreach my $thread128 (4 .. 64) 34 | { 35 | my $N = $thread128 * 128; 36 | 37 | my $iterations = int(20 * (64 * 128)**3 / $N**3); 38 | $iterations = 10000 if $iterations > 10000; 39 | 40 | print "$N $iterations\n"; 41 | 42 | my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`; 43 | 44 | foreach my $bench (split "\n", $data) 45 | { 46 | if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /) 47 | { 48 | push @{$data{$N}}, $2; 49 | print "$1 $2\n"; 50 | } 51 | } 52 | } 53 | print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n"; 54 | # One row per matrix size, in ascending numeric order. Lead each row with the size itself so the values line up under the 'size' header column. 55 | foreach my $N (sort { $a <=> $b } keys %data) 56 | { 57 | print join("\t", $N, @{$data{$N}}), "\n"; 58 | } 59 | 60 | 61 | #print $data; 62 | 63 | __END__ 64 | 65 | 66 | 64 * 128 * 16 * 1.620 * .931 / 520 67 | 68 | Max64 GFLOPS: 1377.38 (size: 256, iterations: 2000) 69 | Max128 GFLOPS: 973.70 (size: 256, iterations: 2000) 70 | Cub64 GFLOPS: 1272.42 (size: 256, iterations: 2000) 71 | Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000) 72 | 73 | my @data = grep /\S/, split "\n", $data; 74 | 75 | my $min; 76 | my %smData; 77 | my @sdata; 78 | foreach (@data) 79 | { 80 | next if /GFLOPS/; 81 | 82 | my ($sm, $clock, $by, $bx) = split /\s+/; 83 | 84 | $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm}; 85 | 86 | $min = $clock if !$min || $clock < $min; 87 | 88 | push @sdata, [$sm, $clock, $by, $bx]; 89 | } 90 | 91 | foreach (@sdata) 92 | { 93 | $_->[1] -= $smData{$_->[0]}; 94 | } 95 | 96 | foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata) 97 | { 98 | printf "%02d %8u by: %2d bx: %2d\n", @$_; 99 | 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /sgemm/sgemm.sln: -------------------------------------------------------------------------------- 1 | ﻿ 2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | 
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Release|Win32 = Release|Win32 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32 13 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32 14 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32 15 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /sgemm/sgemm.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {D571379D-3653-43CB-BE83-A6C68D392A05} 15 | Win32Proj 16 | sgemm 17 | 18 | 19 | 20 | Application 21 | true 22 | Unicode 23 | 24 | 25 | Application 26 | false 27 | true 28 | Unicode 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | true 42 | 43 | 44 | false 45 | 46 | 47 | 48 | 49 | 50 | Level3 51 | Disabled 52 | _CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 53 | $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) 54 | 55 | 56 | Console 57 | true 58 | $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) 59 | cuda.lib;cublas.lib;%(AdditionalDependencies) 60 | 61 | 62 | 63 | 64 | Level3 65 | 66 | 67 | MaxSpeed 68 | true 69 | true 70 | _CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 71 | $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) 72 | 73 | 74 | Console 75 | true 76 | true 77 | true 78 | 
$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) 79 | cuda.lib;cublas.lib;%(AdditionalDependencies) 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /sgemm/sgemm128.sass: -------------------------------------------------------------------------------- 1 | # Kernel: sgemm_kernel_128 2 | # 3 | # SharedSize: 16384 4 | # Params(8): 5 | # 0:0x140:4:4 param_C, 6 | # 1:0x144:4:0 param_m, 7 | # 2:0x148:4:0 param_n, 8 | # 3:0x14c:4:0 param_k, 9 | # 4:0x150:4:0 param_lda, 10 | # 5:0x154:4:0 param_ldb, 11 | # 6:0x158:4:0 param_ldc 12 | # 7:0x15c:4:0 param_alpha 13 | # 8:0x160:4:4 param_D // for diagnostic printf output 14 | # 15 | # Globals: 16 | # c[0x0][0x164]: texA (the value is 1) 17 | # c[0x0][0x168]: texB (the value is 0) 18 | 19 | 20 | 21 | // Temporary registers to calculate the state registers. Reuse the C output registers. 22 | // These can be dynamically allocated (~) in the available register space to eliminate any register bank conflicts. 23 | 0-63 ~ blk, ldx, ldx2, ldx4, k, tid1, tid4, tid7, tid31_4, xmad_t0, xmad_end, bxOrig, byOrig, loy 24 | 25 | // Aliases for the C registers we use for initializing C (used as vectors) 26 | 0-63 : cz<00-63> 27 | 28 | // The offset we store our zero value for initializing C. Reuse a register from the second blocking registers 29 | 80 : zOffset 30 | 31 | // 64 C matrix output registers. 32 | // Use special mapping to avoid register bank conflicts between these registers and the blocking registers. 
33 | 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> 34 | 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> 35 | 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> 36 | 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> 37 | 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> 38 | 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> 39 | 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> 40 | 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> 41 | 42 | // Double buffered register blocking used in vector loads. 43 | // Any bank conflicts that we can't avoid in these registers we can hide with .reuse flags 44 | 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> 45 | 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> 46 | 47 | // Registers to load A or B 48 | 96-103 : loadX<0-7> 49 | 50 | // Key global state registers for main loop and some we reuse for outputting C. 51 | // Note, tweaking the register banks of track<0|4>, tex, writeS, readBs, readAs impacts performance because of 52 | // delayed bank conflicts between memory operations and ffmas. 53 | // The array index bracket notation can be used to request a bank in a dynamically allocated range. 54 | 104-127 ~ track<0|4>[0], tex[2], readAs[2], readBs[3], writeS[3], end, ldx8, tid, bx, by, tid31, tid96, tid128 //, clock, smId, nSMs 55 | 56 | // Registers to store the results back to global memory. Reuse any register not needed after the main loop. 57 | // Statically allocate cs0-7 because they're vector registers. 58 | 64-71 : cs<0-7> 59 | 60 | // dynamically allocated C output registers(~) 61 | 72-103 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc60, writeCs, readCs, cx, ci, alpha, xmad_ci //, xmad_D, D, blckDimX, gridDimX 62 | 63 | 64 | 65 | // Note the absence of the loading of the stack pointer into R1. 66 | // No idea why ptxas does that anyway when it's not used for register spilling. 67 | // Such a waste of a perfectly good register. 
68 | 69 | // Scheduler doesn't handle the dependency flags yet, 70 | // so move these first instructions outside the block that's auto scheduled 71 | //--:-:-:-:1 CS2R clock, SR_CLOCKLO; 72 | //--:-:-:-:1 S2R smId, SR_VIRTID; 73 | //--:-:-:-:1 S2R nSMs, SR_VIRTCFG; 74 | --:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 75 | --:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 76 | --:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 77 | 78 | // Instructions in a SCHEDULE_BLOCK are automatically reordered and appropriately stalled for simple dependencies 79 | // Memory dependencies are left up to the author to deal with manually for now. 80 | 81 | 82 | // First 128 threads load A to shared, 2nd 128 loads B to shared 83 | // Note this technique is not possible in cuda or ptx as there's no way to 84 | // efficiently specify a warp-uniform predicate for a memory op. 85 | // Compile sgemm.cu and inspect the sass to see what I'm talking about. 86 | 87 | // blk = tid >= 128 ? by : bx; 88 | // ldx = tid >= 128 ? ldb : lda; 89 | // tex = tid >= 128 ? texB : texA; 90 | 01:-:-:Y:1 ISETP.GE.AND P0, PT, tid, 128, PT; // Wait Dep 1 91 | 06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 92 | --:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; 93 | --:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; 94 | --:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA 95 | --:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB 96 | 97 | // Initialize the portion of shared we use to zero our C registers 98 | // Give each warp its own address to write to. 99 | // All threads write to the same address, but we don't care because only one needs to take. 100 | // There is no bank conflict on writing to the same address, just indeterminacy in which thread will get its value stored. 
101 | --:-:-:-:1 LOP.AND zOffset, tid, -32; 102 | --:-:-:-:1 STS.128 [zOffset + 4x<16*128>], RZ; 103 | 104 | // tid4 = (tid >> 5) & 3 105 | // tid31 = tid & 31 106 | // tid96 = tid & 96 107 | // tid128 = tid & 128 108 | --:-:-:-:1 BFE.U32 tid4, tid, 0x205; // 2 bits at position 5 109 | --:-:-:-:1 LOP.AND tid31, tid, 31; 110 | --:-:-:-:1 LOP.AND tid96, tid, 96; 111 | --:-:-:-:1 LOP.AND tid128, tid, 128; 112 | 113 | // ldx4 = ldx * 4; 114 | // ldx8 = ldx * 8; 115 | --:-:-:-:1 SHR.U32 ldx, ldx4, 2; 116 | --:-:-:-:1 IADD ldx8, ldx4, ldx4; 117 | 118 | // track0 = blk*128/4 + tid31 + (ldx * tid4) 119 | --:-:-:-:1 ISCADD track0, blk, tid31, 5; 120 | --:-:-:-:1 XMAD.LO track0, ldx, tid4, track0, xmad_t0; // XMAD.LO is a macro that is expanded out into the 3 XMADs 121 | --:-:-:-:1 IADD track4, track0, ldx4; 122 | 123 | // writeS = tid31*4*4 + tid4*128*4 124 | // writeS += 4096 if tid >= 128 125 | --:-:-:-:1 SHL tid31_4, tid31, 4; 126 | --:-:-:-:1 ISCADD writeS, tid4, tid31_4, 9; 127 | --:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*128>; 128 | 129 | // int end = track0 + (k-8)*ldx; 130 | --:-:-:-:1 MOV k, c[0x0][0x14c]; 131 | --:-:-:-:1 IADD k, k, -8; 132 | --:-:-:-:1 XMAD.LO end, k, ldx, track0, xmad_end; 133 | 134 | // readAs and readBs are carefully constructed to avoid any bank conflicts while loading from shared 135 | // readAs = ((tid128 >> 4) | ((tid >> 1) & 7)) << 4; 136 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; // 3 bits at position 1 137 | --:-:-:-:1 SHR.U32 readAs, tid128, 4; 138 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 139 | --:-:-:-:1 SHL readAs, readAs, 4; 140 | 141 | // readBs = (((tid & 0x70) >> 3) | (tid & 1)) << 4 + 4096; 142 | --:-:-:-:1 LOP.AND tid1, tid, 1; 143 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 144 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 145 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 146 | --:-:-:-:1 ISCADD readBs, readBs, 4x<8*128>, 4; 147 | 148 | // Preload the first 8 lines from texture memory 149 | // Keep these instructions in this order (but allow 
others to interleave). 150 | // Normally the scheduler tries to preserve source order by default, but this demonstrates how you enforce 151 | // an ordering if you need to. 152 | // Note: these are the 4 element vector load versions (last param: 0xf=vec4, 0x3=vec2, 0x1=single) 153 | 154 | --:-:1:-:1 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 155 | --:-:2:-:1 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 2 156 | 157 | 158 | 159 | 160 | // Initialize C registers to zero 161 | // Using LDS.U.128 is a neat trick to save a few clock cycles 162 | // (when you have enough warps to hide the latency.) 163 | 164 | return join '', map sprintf("--:-:3:-:1 LDS.U.128 cz%02d, [zOffset + 4x<16*128>];\n", $_ * 4), 0..15; 165 | 166 | 167 | // These instructions need to occur after the textures load so put them in a new block 168 | // that starts with a dependency barrier wait. 169 | 170 | 171 | 01:-:-:-:1 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 1 172 | 02:-:-:-:1 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 2 173 | 174 | // Increment tracks after the loads are complete to avoid needing write-after-read dependencies 175 | --:-:-:-:1 IADD track0, track0, ldx8; 176 | --:-:-:-:1 IADD track4, track4, ldx8; 177 | 178 | // Wait for all threads to finish loading shared 179 | 04:-:-:-:5 BAR.SYNC 0; 180 | 181 | 182 | 183 | // The next store to shared goes to high area. 184 | // Having 2 share buffers allows us to eliminate a bar.sync in the main loop. 185 | // This way we don't have to wait for all threads to arrive before writing fresh data to shared. 186 | // Other threads can continue reading from the last batch while the new data is being written. 
187 | --:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*128>; 188 | 189 | // Preload the first lines of A and B from shared 190 | --:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*128 + 00>]; 191 | --:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*128 + 00>]; 192 | --:-:-:-:1 LDS.U.128 j0Ax64, [readAs + 4x<0*128 + 64>]; 193 | --:-:1:-:1 LDS.U.128 j0By64, [readBs + 4x<0*128 + 64>]; // Set Dep 1 194 | 195 | 196 | // The main loop 197 | // While calculating the first line, load in the next line from shared. 198 | // Shared memory stores enough to do this 8 times per loop. 199 | // Also pull in the next block of memory from global and store it to shared. 200 | 201 | // Efficiency: 202 | // ffma: 512 203 | // lds: 32 dual issued 204 | // sts: 2 dual issued 205 | // tex: 2 dual issued 206 | // add: 2 207 | // xor: 3 208 | // setp: 1 209 | // bar: 1 dual issued 210 | // bra: 1 dual issued 211 | // Total: 524 (512/518 = 98.8% FFMA) 212 | 213 | // Memory Throughput Upper Bound: 214 | // 2 * 4 * 4 bytes per thread per 518 clocks 215 | // 128 threads per SM 216 | // 16 SM's (GM204) 217 | // 1640Mhz (boost overclock) 218 | // .931 GiB/GB (1000^3 / 1024^3) 219 | // 193 GiB/sec 220 | // Available: 224 GiB/sec (or 256 GiB/sec overclocked at 8GHz) 221 | 222 | LOOP: 223 | 224 | // Loop end condition 225 | --:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; 226 | 227 | 228 | 229 | # We eliminated bank conflicts with our C registers and the blocking registers, 230 | # but there are still 16 bank conflicts between the blocking registers themselves. 231 | # By ordering the FFMA's in a swirling zigzag pattern we can completely hide those conflicts 232 | # behind register reuse. This pattern also maximizes that reuse (47%) and minimizes the bandwidth 233 | # out of the register bank, thereby reducing power consumption and allowing the chip to 234 | # stay at a higher sustained clock speed. One other constraint is that we want each successive 235 | # instruction to pull its third operand from alternating banks. 
We space the swirl by 2 in the x 236 | # direction to achieve this. This has the effect of making it easier to avoid delayed bank conflicts 237 | # with the memory operations. Finally, for the very first ffma, don't choose one of the 16 bank conflicts 238 | # as we have no way of hiding that conflict behind a reuse (cublas makes this mistake). 239 | 240 | # Alternating banks (1320 MHz, full speed) 241 | my @swirl = ([2,0],[2,1],[0,1],[0,0]); 242 | my @xVals = (0,1,64,65); 243 | 244 | # Repeating banks (1320 MHz, 83 Gflops slower, but lower power draw probably because of increased stalls) 245 | # Only explanation I can think of is increased delayed register bank conflicts with memory ops. 246 | #my @swirl = ([0,1],[0,0],[1,0],[1,1]); 247 | #my @xVals = (0,2,64,66); 248 | 249 | my @cOrder; 250 | foreach my $y (0,2,64,66) 251 | { 252 | # apply the swirl 253 | foreach my $x (@xVals) 254 | { 255 | push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; 256 | } 257 | # apply the zigzag 258 | @xVals = reverse @xVals; 259 | } 260 | 261 | # This ordering (a simple zigzag) eliminates the bank conflicts but only achieves 39% reuse. 262 | # It runs 20 GFlops slower since the register bank draws more power and the clock slows down to 1306 MHz. 263 | # There may be more delayed bank conflicts with memory operations as the slowdown is 4 GFlops more than 264 | # the reduced clock accounts for. 265 | #my @cOrder2; 266 | #my @xVals = (0..3,64..67); 267 | #foreach my $y (0..3,64..67) 268 | #{ 269 | # @xVals = reverse @xVals; 270 | # push @cOrder2, [$_, $y] foreach @xVals; 271 | #} 272 | #@cOrder = @cOrder2; 273 | 274 | my %insert = 275 | ( 276 | # Don't start the first TLD before 12 to let ISETP to write P0 277 | # These global reads and shared writes we put exactly in the middle of the LDS ops 278 | # This is to not overwhelm the memory units with instructions (and because these were tested faster here). 279 | # The 4 spacing seems to work best for vec4 instructions. 
280 | # It's odd that these two textures loads can drive 512 FFMA's all by themselves.. but 256 threads can load 8 128 F32 wide lines. 281 | # So we only need 2 to get 8 lines from both matrices. 282 | 283 | j0c31 => "--:-:2:-:1 \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 2\n", 284 | j0c33 => "--:-:3:-:1 \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3\n", 285 | 286 | j6c30 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*128>], loadX0; // Wait Dep 2\n", 287 | j6c34 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*128>], loadX4; // Wait Dep 3\n", 288 | 289 | # We need one barrier in the main loop after writing shared memory. 290 | # The barrier is needed even if this is our last loop because we need to protect the warp shuffle step. 291 | # Note, BAR.SYNCs do not sync memory read access automatically, you still need to flag the barriers (writes are sync'd). 292 | # After the BAR, swap our share buffer location. We don't need an additional barrier because of these swaps. 293 | # Note, this doubles our shared memory usage but this kernel's occupancy is entirely bound by registers. 294 | # LOP.XOR readAs needs to be 4 clocks prior to the LDS.U.128 for readAs (but push this as far down as possible) 295 | j6c62 => 296 | "01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1\n" . 297 | "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<16*128>;\n" . 298 | "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<16*128>;\n" . 299 | "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<16*128>;\n", 300 | 301 | # Note having 2 IADDs slightly hits our FFMA performance (1/518 = .2%), but TLD doesn't take an offset. 
302 | # LDG.CI doesn't have this issue, but doesn't give you the nice features of texture loads: 303 | # -Boundary Clamping: simplifies our matrix load logic so we don't need to worry about loading out of bounds 304 | # -Normalized Floats: if we don't need full 32 bits of precision we could store our matrices using 16 or 8 bit values 305 | j7c63 => 306 | "--:-:-:-:1 \@P0 IADD track0, track0, ldx8;\n" . 307 | "--:-:-:-:0 \@P0 IADD track4, track4, ldx8;\n" . 308 | "--:-:-:Y:5 \@P0 BRA LOOP;\n", 309 | ); 310 | 311 | my $out; 312 | # We unroll our main loop 8 iterations. 313 | # This gives us a loop instruction count of 556. Add the control instructions and that makes it 741 opcodes sized 8 bytes. 314 | # This is 5928 bytes, nicely fitting inside the 8kb instruction cache. Going to the next biggest size would be 12 lines. 315 | # That would be 768 ffmas and not leaving enough room for the other instructions and control codes. 316 | # So by staying inside the instruction cache size, we avoid hitting any instruction fetch latencies. 317 | foreach my $j (0 .. 7) 318 | { 319 | my $odd = $j & 1; 320 | my $nOdd = !$odd + 0; 321 | # Our rolling blocking registers stay one load ahead of the FFMA's (rs: read share) 322 | my $rsOffset = ($j + 1) % 8; 323 | # No need to load on last loop iteration 324 | my $rsPred = $j == 7 ? '@P0' : ' '; 325 | 326 | # You can experiment here with different vector load sizes 327 | my $vec = 128; 328 | 329 | if ($vec == 128) 330 | { 331 | # Roll up our LDS ops here to keep them easier to manage and tune 332 | # Space at every other clock to maximize throughput. 
333 | $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; 334 | $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; 335 | $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; 336 | $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBy64, [readBs + 4x<%d*128 + 64>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; 337 | } 338 | elsif ($vec == 64) 339 | { 340 | # LDS.64 runs about 22 Gflops slower than LDS.128 (GM107). Not a huge difference since our latencies are so well hidden. 341 | # I think LDS.128 is implemented internally as a pair of LDS.64 ops which could be another reason for the comparable performance. 342 | # I think the big benefit with 128 is being able to issue all our LDS ops earlier, allowing more FFMA's prior to reading out the results. 343 | # There could also be additional opportunity for delayed bank conflicts. 
344 | $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; 345 | $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; 346 | $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; 347 | $insert{"j${j}c6"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; 348 | $insert{"j${j}c8"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; 349 | $insert{"j${j}c10"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; 350 | $insert{"j${j}c12"} = sprintf "--:-:-:-:1 %s LDS.U.64 j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; 351 | $insert{"j${j}c14"} = sprintf "--:-:1:-:1 %s LDS.U.64 j%dBy66, [readBs + 4x<%d*128 + 66>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; 352 | } 353 | else 354 | { 355 | # This one drops performance by over 200 Gflops. So you want to at least use LDS.64 if you can. 356 | # We don't even have room to properly space these at half throuput. 
357 | $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS j%dAx00, [readAs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; 358 | $insert{"j${j}c1"} = sprintf "--:-:-:-:1 %s LDS j%dAx01, [readAs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset; 359 | $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS j%dAx02, [readAs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; 360 | $insert{"j${j}c3"} = sprintf "--:-:-:-:1 %s LDS j%dAx03, [readAs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset; 361 | $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS j%dBy00, [readBs + 4x<%d*128 + 00>];\n", $rsPred, $nOdd, $rsOffset; 362 | $insert{"j${j}c5"} = sprintf "--:-:-:-:1 %s LDS j%dBy01, [readBs + 4x<%d*128 + 01>];\n", $rsPred, $nOdd, $rsOffset; 363 | $insert{"j${j}c6"} = sprintf "--:-:-:-:1 %s LDS j%dBy02, [readBs + 4x<%d*128 + 02>];\n", $rsPred, $nOdd, $rsOffset; 364 | $insert{"j${j}c7"} = sprintf "--:-:-:-:1 %s LDS j%dBy03, [readBs + 4x<%d*128 + 03>];\n", $rsPred, $nOdd, $rsOffset; 365 | $insert{"j${j}c8"} = sprintf "--:-:-:-:1 %s LDS j%dAx64, [readAs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; 366 | $insert{"j${j}c9"} = sprintf "--:-:-:-:1 %s LDS j%dAx65, [readAs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset; 367 | $insert{"j${j}c10"} = sprintf "--:-:-:-:1 %s LDS j%dAx66, [readAs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; 368 | $insert{"j${j}c11"} = sprintf "--:-:-:-:1 %s LDS j%dAx67, [readAs + 4x<%d*128 + 67>];\n", $rsPred, $nOdd, $rsOffset; 369 | $insert{"j${j}c12"} = sprintf "--:-:-:-:1 %s LDS j%dBy64, [readBs + 4x<%d*128 + 64>];\n", $rsPred, $nOdd, $rsOffset; 370 | $insert{"j${j}c13"} = sprintf "--:-:-:-:1 %s LDS j%dBy65, [readBs + 4x<%d*128 + 65>];\n", $rsPred, $nOdd, $rsOffset; 371 | $insert{"j${j}c14"} = sprintf "--:-:-:-:1 %s LDS j%dBy66, [readBs + 4x<%d*128 + 66>];\n", $rsPred, $nOdd, $rsOffset; 372 | $insert{"j${j}c15"} = sprintf "--:-:1:-:1 %s LDS j%dBy67, [readBs + 4x<%d*128 + 67>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; 373 | } 374 | foreach my $c 
(0 .. 63) 375 | { 376 | my ($x,$y) = @{$cOrder[$c]}; 377 | 378 | # Grab an instruction for insertion if one exists for this j and c combination 379 | my $ins = $insert{"j${j}c$c"} || ''; 380 | 381 | # Scatter some yields in there to better balance the workload and reduce sync stalls 382 | # Don't pair a yield with the dual issued ffmas as that kills performance for some reason 383 | ##### This no longer offers extra performance on GM204 as it did on GM107. It still does for the 64 thread version. Keeping since it doesn't hurt. #### 384 | my $yield = $c == 32 ? 'Y' : '-'; 385 | 386 | # The first FFMA needs to wait on the prior loop's LDS.U.128 ops to finish (except if the barrier does the wait for us) 387 | my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); 388 | 389 | # Dual issue these ops 390 | my $stall = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1; 391 | 392 | my $ctrl = "$wait:-:-:$yield:$stall"; 393 | 394 | # output our FFMA and also any inserted ops 395 | $out .= sprintf "%s FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $comment, $ins; 396 | } 397 | } 398 | return $out; 399 | 400 | 401 | 402 | // Main loop is done, time to write C to global memory. 403 | 404 | 405 | // Remove the high bits if present from the last loop's xor. 406 | // Also remove the 4096 added onto readBs. 407 | // This gives us the x and y coordinates of the start of this thread's data in C. 408 | --:-:-:-:1 LOP.AND readAs, readAs, 0xfff; 409 | --:-:-:-:1 LOP.AND readBs, readBs, 0xfff; 410 | 411 | // Remap readAs and readBs onto writeCs so we can shuffle the output for coalesced global writes. 412 | // readAs stays constant, readBs collapses down from stride 4 to 1 413 | // writeCs = (readBs / 4) * 128 + readAs; 414 | --:-:-:-:1 ISCADD writeCs, readBs, readAs, 5; 415 | 416 | // Read out the C values from shared in a simple tid mapped pattern but 417 | // offset by the position of this warp's collapsed data in shared. 
418 | 419 | // cx = tid31 | (tid128 >> 2); 420 | --:-:-:-:1 SHR.U32 cx, tid128, 2; 421 | --:-:-:-:1 LOP.OR cx, tid31, cx; 422 | 423 | // readCs = ((tid96 << 4) | cx) << 2; 424 | --:-:-:-:1 SHL readCs, tid96, 4; 425 | --:-:-:-:1 LOP.OR readCs, readCs, cx; 426 | --:-:-:-:1 SHL readCs, readCs, 2; 427 | 428 | // cx += bx*128; 429 | --:-:-:-:1 ISCADD cx, bx, cx, 7; 430 | 431 | // cy = by*128 + (tid96 >> 1) 432 | --:-:-:-:1 SHR.U32 cy00, tid96, 1; 433 | --:-:-:-:1 ISCADD cy00, by, cy00, 7; 434 | 435 | // C += (cy*ldc + cx) * 4; 436 | --:-:-:-:1 MOV ldc, c[0x0][0x158]; 437 | --:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_ci; 438 | --:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; 439 | 440 | // When writing in assembly, being able to 'printf' is sometimes easier than stepping through the debugger. 441 | // Here's how it's done. Drop something like this in your code. Then modify the c code to accept this 442 | // many params per thread to printf (see assemblySgemm function). 443 | 444 | //--:-:-:-:1 SHR.U32 smId, smId, 20; 445 | 446 | // D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4 447 | // D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4 448 | //--:-:-:-:1 MOV gridDimX, c[0x0][0x14]; 449 | //--:-:-:-:1 MOV blckDimX, c[0x0][0x8]; 450 | //--:-:-:-:1 XMAD.LO D, by, gridDimX, bx, xmad_D; 451 | //--:-:-:-:1 XMAD.LO D, D, blckDimX, tid, xmad_D; 452 | //--:-:-:-:1 ISCADD D, D, c[0x0][0x160], 3; // 4 bytes * 2 vars = 8 or shift 3 453 | 454 | //--:-:-:-:1 STG.CS [D + 4x<0>], readAs; 455 | //--:-:-:-:1 STG.CS [D + 4x<1>], readBs; 456 | //--:-:-:-:1 STG.CS [D + 4x<2>], writeCs; 457 | //--:-:-:-:1 STG.CS [D + 4x<3>], readCs; 458 | //--:-:-:-:1 STG.CS [D + 4x<4>], cx; 459 | //--:-:-:-:1 STG.CS [D + 4x<5>], cy00; 460 | //--:-:-:-:1 STG.CS [D + 4x<6>], ci; 461 | //--:-:-:-:1 STG.CS [D + 4x<7>], cx67y67; 462 | 463 | //--:-:-:-:1 STG.CS [D + 4x<0>], smId; 464 | //--:-:-:-:1 STG.CS [D + 4x<1>], clock; 465 | 466 | 467 | // Setup our matrix bounds 
checking vars and preds 468 | // Bounds checking is what allows this code to work on matrix sizes not a multiple of 128 469 | --:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m 470 | --:-:-:-:1 IADD cx, cx, 64; 471 | --:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 64 < m 472 | 473 | --:-:-:-:1 IADD cy00, cy00, -1; 474 | --:-:-:-:1 IADD cy04, cy00, 4; 475 | --:-:-:-:1 IADD cy08, cy00, 8; 476 | --:-:-:-:1 IADD cy12, cy00, 12; 477 | 478 | // Setup our C output addresses and increments. 479 | --:-:-:-:1 SHL ldc1, ldc, 2; 480 | --:-:-:-:1 SHL ldc4, ldc, 4; 481 | --:-:-:-:1 SHL ldc8, ldc, 5; 482 | --:-:-:-:1 ISCADD ldc60, ldc, -ldc4, 8; 483 | 484 | // Load the first set of the STORE_C subroutine params in the scheduled block. 485 | # This is also a good time to apply alpha. 486 | --:-:-:-:1 MOV alpha, c[0x0][0x15c]; 487 | 488 | --:-:-:-:1 FMUL cs0, cx00y00, alpha; 489 | --:-:-:-:1 FMUL cs1, cx01y00, alpha; 490 | --:-:-:-:1 FMUL cs2, cx02y00, alpha; 491 | --:-:-:-:1 FMUL cs3, cx03y00, alpha; 492 | --:-:-:-:1 FMUL cs4, cx64y00, alpha; 493 | --:-:-:-:1 FMUL cs5, cx65y00, alpha; 494 | --:-:-:-:1 FMUL cs6, cx66y00, alpha; 495 | --:-:-:-:1 FMUL cs7, cx67y00, alpha; 496 | 497 | // We pre-increment the output addresses so they can be dual issued with memory ops 498 | // So start with a -1 instead of 0 value. 499 | --:-:-:-:1 IADD Cy00, Cy00, -ldc1; 500 | --:-:-:-:1 IADD Cy04, Cy00, ldc4; 501 | --:-:-:-:1 IADD Cy08, Cy00, ldc8; 502 | --:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) 503 | 504 | 505 | 506 | // There's nothing yet in place to handle dependencies with subroutines. 507 | // So don't schedule this block. 508 | 509 | 510 | my $out; 511 | foreach my $y (0..3, 64..67) 512 | { 513 | my ($wait, $comment) = $y == 64 ? ('--', '') : ('02',' // Wait Dep 2'); 514 | 515 | # Jump ahead 60 units (to get to the values at y=64) 516 | $out .= 517 | "--:-:-:-:1 IADD cy00, cy00, 60;\n" .
518 | "--:-:-:-:1 IADD cy04, cy04, 60;\n" . 519 | "--:-:-:-:1 IADD cy08, cy08, 60;\n" . 520 | "--:-:-:-:1 IADD cy12, cy12, 60;\n\n" . 521 | 522 | "02:-:-:-:1 IADD Cy00, Cy00, ldc60; // Wait Dep 2\n" . 523 | "--:-:-:-:1 IADD Cy04, Cy04, ldc60;\n" . 524 | "--:-:-:-:1 IADD Cy08, Cy08, ldc60;\n" . 525 | "--:-:-:-:1 IADD Cy12, Cy12, ldc60;\n\n" if $y == 64; 526 | 527 | # We need to move the C values to the param registers of the STORE_C subroutine. 528 | # This is also a good time to apply alpha. 529 | $out .= sprintf( 530 | "%s:-:-:-:1 FMUL cs0, cx00y%02d, alpha;%s\n" . 531 | "--:-:-:-:1 FMUL cs1, cx01y%02d, alpha;\n" . 532 | "--:-:-:-:1 FMUL cs2, cx02y%02d, alpha;\n" . 533 | "--:-:-:-:1 FMUL cs3, cx03y%02d, alpha;\n" . 534 | "--:-:-:-:1 FMUL cs4, cx64y%02d, alpha;\n" . 535 | "--:-:-:-:1 FMUL cs5, cx65y%02d, alpha;\n" . 536 | "--:-:-:-:1 FMUL cs6, cx66y%02d, alpha;\n" . 537 | "--:-:-:-:0 FMUL cs7, cx67y%02d, alpha; // Dual Issue\n", 538 | $wait, $y, $comment, ($y) x 7) if $y; 539 | 540 | # Call the subroutine. 541 | $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; 542 | } 543 | return $out; 544 | 545 | 546 | 547 | // And we're done. The remainder is the STORE_C subroutine that's defined at the end of the kernel. 548 | --:-:-:-:5 EXIT; 549 | 550 | // This routine does warp synchronous shuffling of our output data so as to be able 551 | // to have coalesced writes to global memory. This is actually faster because the shared 552 | // memory latencies can be hidden by other warps and we're only adding a few extra clocks 553 | // to this thread. Global memory here is the bottleneck and being able to halve the needed 554 | // bandwidth at the expense of a few clocks is a modest win. This also keeps power lower 555 | // and our chip running faster. 556 | 557 | // Note, the SHFL instruction doesn't help us here because we're swapping different registers 558 | // from different threads.
559 | STORE_C: 560 | 561 | 562 | 563 | // Each warp writes to its own region of memory so we don't need to bar.sync the access. 564 | // There are some bank conflicts here on the STS.128s but no way to avoid them, and the hit just means a few extra clocks. 565 | // Note here that the scheduler is able to handle the dependencies between vector and non-vector instructions. 566 | // It knows from the instruction type and the register map that cs0 here includes cs1, cs2 and cs3 as well. 567 | --:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; 568 | --:-:-:-:1 STS.128 [writeCs+4x<64>], cs4; 569 | 570 | // In a single warp, loads naturally occur after the store to shared completes, no sync required. 571 | --:-:-:-:1 LDS cs0, [readCs + 4x<0*128 + 00>]; 572 | --:-:-:-:1 LDS cs1, [readCs + 4x<0*128 + 64>]; 573 | --:-:-:-:1 LDS cs2, [readCs + 4x<1*128 + 00>]; 574 | --:-:-:-:1 LDS cs3, [readCs + 4x<1*128 + 64>]; 575 | --:-:-:-:1 LDS cs4, [readCs + 4x<2*128 + 00>]; 576 | --:-:-:-:1 LDS cs5, [readCs + 4x<2*128 + 64>]; 577 | --:-:-:-:1 LDS cs6, [readCs + 4x<3*128 + 00>]; 578 | --:-:1:-:1 LDS cs7, [readCs + 4x<3*128 + 64>]; // Set Dep 1 579 | 580 | --:-:-:-:1 IADD cy00, cy00, 1; 581 | --:-:-:-:1 IADD cy04, cy04, 1; 582 | --:-:-:-:1 IADD cy08, cy08, 1; 583 | --:-:-:-:1 IADD cy12, cy12, 1; 584 | 585 | --:-:-:-:1 IADD Cy00, Cy00, ldc1; 586 | --:-:-:-:1 IADD Cy04, Cy04, ldc1; 587 | --:-:-:-:1 IADD Cy08, Cy08, ldc1; 588 | --:-:-:-:1 IADD Cy12, Cy12, ldc1; 589 | 590 | --:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m 591 | --:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 64 < m 592 | --:-:-:-:1 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m 593 | --:-:-:-:1 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 64 < m 594 | 595 | 01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 596 | --:-:-:-:1 @P1 STG.CG [Cy00 + 4x<64>], cs1; 597 | --:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; 598 | 
--:-:-:-:1 @P3 STG.CG [Cy04 + 4x<64>], cs3; 599 | 600 | --:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m 601 | --:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 64 < m 602 | --:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m 603 | --:-:-:-:1 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 64 < m 604 | 605 | --:-:-:-:1 @P0 STG.CG [Cy08 + 4x<00>], cs4; 606 | --:-:-:-:1 @P1 STG.CG [Cy08 + 4x<64>], cs5; 607 | --:-:-:-:1 @P2 STG.CG [Cy12 + 4x<00>], cs6; 608 | --:2:-:-:1 @P3 STG.CG [Cy12 + 4x<64>], cs7; // Set Dep 2 609 | 610 | 611 | 612 | --:-:-:-:5 RET; 613 | 614 | -------------------------------------------------------------------------------- /sgemm/sgemm64.sass: -------------------------------------------------------------------------------- 1 | # Kernel: sgemm_kernel_64 2 | # 3 | # SharedSize: 8192 4 | # Params(8): 5 | # 0:0x140:4:4 param_C, 6 | # 1:0x144:4:0 param_m, 7 | # 2:0x148:4:0 param_n, 8 | # 3:0x14c:4:0 param_k, 9 | # 4:0x150:4:0 param_lda, 10 | # 5:0x154:4:0 param_ldb, 11 | # 6:0x158:4:0 param_ldc 12 | # 7:0x15c:4:0 param_alpha 13 | # 8:0x160:4:4 param_D // for diagnostic printf output 14 | # 15 | # Globals: 16 | # c[0x0][0x164]: texA (the value is 1) 17 | # c[0x0][0x168]: texB (the value is 0) 18 | 19 | 20 | 21 | 0-63 ~ blk, ldx, ldx4, k, tid1, tid2, tid15, tid15_4, xmad_t0, xmad_end 22 | 23 | 80 : zOffset 24 | 0-63 : cz<00-63> 25 | 26 | 3, 2,11,10,19,18,27,26 : cx00y<00-03|32-35> 27 | 7, 6,15,14,23,22,31,30 : cx01y<00-03|32-35> 28 | 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|32-35> 29 | 5, 4,13,12,21,20,29,28 : cx03y<00-03|32-35> 30 | 35,34,43,42,51,50,59,58 : cx32y<00-03|32-35> 31 | 39,38,47,46,55,54,63,62 : cx33y<00-03|32-35> 32 | 33,32,41,40,49,48,57,56 : cx34y<00-03|32-35> 33 | 37,36,45,44,53,52,61,60 : cx35y<00-03|32-35> 34 | 35 | 64-79 : j0Ax<00-03|32-35>, j0By<00-03|32-35> 36 | 80-95 : j1Ax<00-03|32-35>, j1By<00-03|32-35> 37 | 38 | 64-71 : 
cs<0-7> 39 | 40 | 96-111 : loadX0<0-3>, loadX2<0-3>, loadX4<0-3>, loadX6<0-3> 41 | 42 | 112-127 ~ track<0|2|4|6>[0], tex[1], readAs[2], readBs[3], writeS[2], end, ldx8, tid, bx, by, tid31, tid32 43 | 44 | 72-111 ~ cy<00|04|08|12>, Cy<00|04|08|12>, ldc, ldc1, ldc4, ldc8, ldc28, writeCs, readCs, cx, ci, xmad_ci, alpha, xmadD, D, blckDimX, gridDimX 45 | 46 | 47 | 48 | --:-:1:-:1 S2R tid, SR_TID.X; // Set Dep 1 49 | --:-:2:-:1 S2R bx, SR_CTAID.X; // Set Dep 2 50 | --:-:3:-:1 S2R by, SR_CTAID.Y; // Set Dep 3 51 | 52 | 53 | 54 | // blk = tid >= 32 ? by : bx; 55 | // ldx = tid >= 32 ? ldb : lda; 56 | // tex = tid >= 32 ? texB : texA; 57 | 01:-:-:-:1 ISETP.GE.AND P0, PT, tid, 32, PT; // Wait Dep 1 58 | 06:-:-:-:1 SEL blk, by, bx, P0; // Wait Dep 2 & 3 59 | --:-:-:-:1 @!P0 MOV ldx4, c[0x0][0x150]; 60 | --:-:-:-:1 @P0 MOV ldx4, c[0x0][0x154]; 61 | --:-:-:-:1 @!P0 MOV32I tex, 0x80000001; // texA 62 | --:-:-:-:1 @P0 MOV32I tex, 0x80000000; // texB 63 | 64 | --:-:-:-:1 LOP.AND zOffset, tid, -32; 65 | --:-:-:-:1 STS.128 [zOffset + 4x<16*64>], RZ; 66 | 67 | // tid2 = (tid >> 4) & 1 68 | // tid15 = tid & 15 69 | // tid31 = tid & 31 70 | // tid32 = tid & 32 71 | --:-:-:-:1 BFE.U32 tid2, tid, 0x104; // 1 bit at position 4 72 | --:-:-:-:1 LOP.AND tid15, tid, 15; 73 | --:-:-:-:1 LOP.AND tid31, tid, 31; 74 | --:-:-:-:1 LOP.AND tid32, tid, 32; 75 | 76 | // ldx4 = ldx * 4; 77 | // ldx8 = ldx * 8; 78 | --:-:-:-:1 SHR.U32 ldx, ldx4, 2; 79 | --:-:-:-:1 IADD ldx8, ldx4, ldx4; 80 | 81 | // track0 = blk*64/4 + tid15 + (ldx * tid2) 82 | --:-:-:-:1 ISCADD track0, blk, tid15, 4; 83 | --:-:-:-:1 XMAD.LO track0, ldx, tid2, track0, xmad_t0; 84 | --:-:-:-:1 IADD3 track2, track0, ldx, ldx; 85 | --:-:-:-:1 IADD track4, track0, ldx4; 86 | --:-:-:-:1 IADD track6, track2, ldx4; 87 | 88 | // writeS = tid15*4*4 + tid2*64*4 89 | --:-:-:-:1 SHL tid15_4, tid15, 4; 90 | --:-:-:-:1 ISCADD writeS, tid2, tid15_4, 8; 91 | 92 | // writeS += 2048 if tid >= 32 93 | --:-:-:-:1 @P0 IADD writeS, writeS, 4x<8*64>; 94 | 95 
| // int end = track0 + (k-8)*ldx; 96 | --:-:-:-:1 MOV k, c[0x0][0x14c]; 97 | --:-:-:-:1 IADD k, k, -8; 98 | --:-:-:-:1 XMAD.LO end, k, ldx, track0, xmad_end; 99 | 100 | // readAs = ((tid >> 1) & 7) << 4; 101 | --:-:-:-:1 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 102 | --:-:-:-:1 SHL readAs, readAs, 4; 103 | 104 | // readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 2048; 105 | --:-:-:-:1 LOP.AND tid1, tid, 1; 106 | --:-:-:-:1 LOP.AND readBs, tid, 0x30; 107 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 108 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 109 | --:-:-:-:1 ISCADD readBs, readBs, 4x<8*64>, 4; 110 | 111 | 112 | --:-:1:-:1 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf; // Set Dep 1 113 | --:-:2:-:1 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2 114 | --:-:3:-:1 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf; // Set Dep 3 115 | --:-:4:-:1 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 4 116 | 117 | 118 | 119 | 120 | 121 | return join '', map sprintf("--:-:5:-:1 LDS.U.128 cz%02d, [zOffset + 4x<16*64>];\n", $_ * 4), 0..15; 122 | 123 | 124 | 125 | 126 | 01:-:-:-:1 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 1 127 | 02:-:-:-:1 STS.128 [writeS + 4x<2*64>], loadX2; // Wait Dep 2 128 | 04:-:-:-:1 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3 129 | 08:-:-:-:1 STS.128 [writeS + 4x<6*64>], loadX6; // Wait Dep 4 130 | 131 | --:-:-:-:1 IADD track0, track0, ldx8; 132 | --:-:-:-:1 IADD track2, track2, ldx8; 133 | --:-:-:-:1 IADD track4, track4, ldx8; 134 | --:-:-:-:1 IADD track6, track6, ldx8; 135 | 136 | 10:-:-:-:5 BAR.SYNC 0; 137 | 138 | 139 | 140 | --:-:-:-:0 LOP.XOR writeS, writeS, 4x<16*64>; 141 | 142 | --:-:-:-:1 LDS.U.128 j0Ax00, [readAs + 4x<0*64 + 00>]; 143 | --:-:-:-:1 LDS.U.128 j0By00, [readBs + 4x<0*64 + 00>]; 144 | --:-:-:-:1 LDS.U.128 j0Ax32, [readAs + 4x<0*64 + 32>]; 145 | --:-:1:-:1 LDS.U.128 j0By32, [readBs + 4x<0*64 + 32>]; // Set Dep 1 146 | 147 | // Efficiency: 148 | // ffma: 512 149 | // lds: 32 dual issued 150 | // sts:
4 dual issued 151 | // tex: 4 dual issued 152 | // add: 4 153 | // xor: 3 154 | // setp: 1 155 | // bar: 1 dual issued 156 | // bra: 1 dual issued 157 | // Total: 520 (512/520 = 98.5% FFMA) 158 | 159 | LOOP: 160 | 161 | // Loop end condition 162 | --:-:-:-:1 ISETP.LE.AND P0, PT, track0, end, PT; 163 | 164 | 165 | 166 | my @cOrder; 167 | my @swirl = ([2,0],[2,1],[0,1],[0,0]); 168 | my @x = (0,1,32,33); 169 | foreach my $y (0,2,32,34) 170 | { 171 | foreach my $x (@x) 172 | { 173 | push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl; 174 | } 175 | @x = reverse @x; 176 | } 177 | 178 | my %insert = 179 | ( 180 | j0c31 => "--:-:-:-:1 \@P0 TLD.B.LZ.P loadX0, track0, tex, 0x0, 1D, 0xf;\n", 181 | j0c33 => "--:-:2:-:1 \@P0 TLD.B.LZ.P loadX2, track2, tex, 0x0, 1D, 0xf; // Set Dep 2\n", 182 | 183 | j1c31 => "--:-:-:-:1 \@P0 TLD.B.LZ.P loadX4, track4, tex, 0x0, 1D, 0xf;\n", 184 | j1c33 => "--:-:3:-:1 \@P0 TLD.B.LZ.P loadX6, track6, tex, 0x0, 1D, 0xf; // Set Dep 3\n", 185 | 186 | j5c30 => "02:-:-:-:1 \@P0 STS.128 [writeS + 4x<0*64>], loadX0; // Wait Dep 2\n", 187 | j5c34 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<2*64>], loadX2;\n", 188 | 189 | j6c30 => "04:-:-:-:1 \@P0 STS.128 [writeS + 4x<4*64>], loadX4; // Wait Dep 3\n", 190 | j6c34 => "--:-:-:-:1 \@P0 STS.128 [writeS + 4x<6*64>], loadX6;\n", 191 | 192 | j6c62 => 193 | "01:-:-:-:5 BAR.SYNC 0; // Wait Dep 1\n" . 194 | "--:-:-:-:1 \@P0 LOP.XOR readAs, readAs, 4x<16*64>;\n" . 195 | "--:-:-:-:1 \@P0 LOP.XOR readBs, readBs, 4x<16*64>;\n" . 196 | "--:-:-:-:1 \@P0 LOP.XOR writeS, writeS, 4x<16*64>;\n", 197 | 198 | j7c63 => 199 | "--:-:-:-:1 \@P0 IADD track0, track0, ldx8;\n" . 200 | "--:-:-:-:1 \@P0 IADD track2, track2, ldx8;\n" . 201 | "--:-:-:-:1 \@P0 IADD track4, track4, ldx8;\n" . 202 | "--:-:-:-:0 \@P0 IADD track6, track6, ldx8;\n" . 203 | "--:-:-:Y:5 \@P0 BRA LOOP;\n", 204 | ); 205 | 206 | my $out; 207 | foreach my $j (0 .. 
7) 208 | { 209 | my $odd = $j & 1; 210 | my $nOdd = !$odd + 0; 211 | my $rsOffset = ($j + 1) % 8; 212 | my $rsPred = $j == 7 ? '@P0' : ' '; 213 | 214 | $insert{"j${j}c0"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx00, [readAs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; 215 | $insert{"j${j}c2"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dBy00, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset; 216 | $insert{"j${j}c4"} = sprintf "--:-:-:-:1 %s LDS.U.128 j%dAx32, [readAs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset; 217 | $insert{"j${j}c6"} = sprintf "--:-:1:-:1 %s LDS.U.128 j%dBy32, [readBs + 4x<%d*64 + 32>]; // Set Dep 1\n", $rsPred, $nOdd, $rsOffset; 218 | 219 | foreach my $c (0 .. 63) 220 | { 221 | my ($x,$y) = @{$cOrder[$c]}; 222 | 223 | my $ins = $insert{"j${j}c$c"} || ''; 224 | 225 | my $yield = $c == 32 ? 'Y' : '-'; 226 | 227 | my ($wait, $comment) = $c == 0 && $j < 7 ? ('01', ' // Wait Dep 1') : ('--',''); 228 | 229 | my $stall = $ins =~ /LDS|TLD|STS|BAR/ ? 0 : 1; 230 | 231 | my $ctrl = "$wait:-:-:$yield:$stall"; 232 | 233 | $out .= sprintf "%s FFMA cx%02dy%02d, j%dAx%02d, j%dBy%02d, cx%02dy%02d;%s\n%s", $ctrl, $x,$y, $odd,$x, $odd,$y, $x,$y, $comment, $ins; 234 | } 235 | } 236 | return $out; 237 | 238 | 239 | 240 | 241 | 242 | --:-:-:-:1 LOP.AND readAs, readAs, 0x7ff; 243 | --:-:-:-:1 LOP.AND readBs, readBs, 0x7ff; 244 | 245 | // writeCs = (readBs / 4) * 64 + readAs; 246 | --:-:-:-:1 ISCADD writeCs, readBs, readAs, 4; 247 | 248 | // readCs = ((tid32 << 3) + tid31) << 2; 249 | --:-:-:-:1 ISCADD readCs, tid32, tid31, 3; 250 | --:-:-:-:1 SHL readCs, readCs, 2; 251 | 252 | // cx = bx*64 + tid31; 253 | --:-:-:-:1 ISCADD cx, bx, tid31, 6; 254 | 255 | // cy = by*64 + (tid32 >> 1) 256 | --:-:-:-:1 SHR.U32 cy00, tid32, 1; 257 | --:-:-:-:1 ISCADD cy00, by, cy00, 6; 258 | 259 | // C += (cy*ldc + cx) * 4; 260 | --:-:-:-:1 MOV ldc, c[0x0][0x158]; 261 | --:-:-:-:1 XMAD.LO ci, cy00, ldc, cx, xmad_ci; 262 | --:-:-:-:1 ISCADD Cy00, ci, c[0x0][0x140], 2; 263 | 264 | 
--:-:-:-:1 ISETP.LT.AND P5, PT, cx, c[0x0][0x144], PT; // cx + 0 < m 265 | --:-:-:-:1 IADD cx, cx, 32; 266 | --:-:-:-:1 ISETP.LT.AND P6, PT, cx, c[0x0][0x144], PT; // cx + 32 < m 267 | 268 | // D += ((by * gridDimX * blockDimX * vars) + (bx * blockDimX * vars) + (tid * vars)) * 4 269 | // D += ((by * gridDimX + bx) * blockDimX + tid) * vars * 4 270 | //--:-:-:-:1 MOV gridDimX, c[0x0][0x14]; 271 | //--:-:-:-:1 MOV blckDimX, c[0x0][0x8]; 272 | //--:-:-:-:1 XMAD.LO D, by, gridDimX, bx, xmadD; 273 | //--:-:-:-:1 XMAD.LO D, D, blckDimX, tid, xmadD; 274 | //--:-:-:-:1 ISCADD D, D, c[0x0][0x160], 5; // 4 bytes * 8 vars = 32 or shift 5 275 | 276 | //--:-:-:-:1 STG.CS [D + 4x<0>], readAs; 277 | //--:-:-:-:1 STG.CS [D + 4x<1>], readBs; 278 | //--:-:-:-:1 STG.CS [D + 4x<2>], writeCs; 279 | //--:-:-:-:1 STG.CS [D + 4x<3>], readCs; 280 | //--:-:-:-:1 STG.CS [D + 4x<4>], cx; 281 | //--:-:-:-:1 STG.CS [D + 4x<5>], cy00; 282 | //--:-:-:-:1 STG.CS [D + 4x<6>], ci; 283 | //--:-:-:-:1 STG.CS [D + 4x<7>], cx35y35; 284 | 285 | --:-:-:-:1 IADD cy00, cy00, -1; 286 | --:-:-:-:1 IADD cy04, cy00, 4; 287 | --:-:-:-:1 IADD cy08, cy00, 8; 288 | --:-:-:-:1 IADD cy12, cy00, 12; 289 | 290 | --:-:-:-:1 SHL ldc1, ldc, 2; 291 | --:-:-:-:1 SHL ldc4, ldc, 4; 292 | --:-:-:-:1 SHL ldc8, ldc, 5; 293 | --:-:-:-:1 ISCADD ldc28, ldc, -ldc4, 7; 294 | 295 | --:-:-:-:1 MOV alpha, c[0x0][0x15c]; 296 | --:-:-:-:1 FMUL cs0, cx00y00, alpha; 297 | --:-:-:-:1 FMUL cs1, cx01y00, alpha; 298 | --:-:-:-:1 FMUL cs2, cx02y00, alpha; 299 | --:-:-:-:1 FMUL cs3, cx03y00, alpha; 300 | --:-:-:-:1 FMUL cs4, cx32y00, alpha; 301 | --:-:-:-:1 FMUL cs5, cx33y00, alpha; 302 | --:-:-:-:1 FMUL cs6, cx34y00, alpha; 303 | --:-:-:-:1 FMUL cs7, cx35y00, alpha; 304 | 305 | --:-:-:-:1 IADD Cy00, Cy00, -ldc1; 306 | --:-:-:-:1 IADD Cy04, Cy00, ldc4; 307 | --:-:-:-:1 IADD Cy08, Cy00, ldc8; 308 | --:-:-:-:0 IADD Cy12, Cy04, ldc8; // Dual Issue (last instruction after reordering) 309 | 310 | 311 | 312 | 313 | 314 | my $out; 315 | foreach my $y
(0..3, 32..35) 316 | { 317 | my ($wait, $comment) = $y == 32 ? ('--', '') : ('02',' // Wait Dep 2'); 318 | 319 | $out .= 320 | "--:-:-:-:1 IADD cy00, cy00, 28;\n" . 321 | "--:-:-:-:1 IADD cy04, cy04, 28;\n" . 322 | "--:-:-:-:1 IADD cy08, cy08, 28;\n" . 323 | "--:-:-:-:1 IADD cy12, cy12, 28;\n\n" . 324 | 325 | "02:-:-:-:1 IADD Cy00, Cy00, ldc28; // Wait Dep 2\n" . 326 | "--:-:-:-:1 IADD Cy04, Cy04, ldc28;\n" . 327 | "--:-:-:-:1 IADD Cy08, Cy08, ldc28;\n" . 328 | "--:-:-:-:1 IADD Cy12, Cy12, ldc28;\n\n" if $y == 32; 329 | 330 | $out .= sprintf( 331 | "%s:-:-:-:1 FMUL cs0, cx00y%02d, alpha;%s\n" . 332 | "--:-:-:-:1 FMUL cs1, cx01y%02d, alpha;\n" . 333 | "--:-:-:-:1 FMUL cs2, cx02y%02d, alpha;\n" . 334 | "--:-:-:-:1 FMUL cs3, cx03y%02d, alpha;\n" . 335 | "--:-:-:-:1 FMUL cs4, cx32y%02d, alpha;\n" . 336 | "--:-:-:-:1 FMUL cs5, cx33y%02d, alpha;\n" . 337 | "--:-:-:-:1 FMUL cs6, cx34y%02d, alpha;\n" . 338 | "--:-:-:-:0 FMUL cs7, cx35y%02d, alpha; // Dual Issue\n", 339 | $wait, $y, $comment, ($y) x 7) if $y; 340 | 341 | $out .= "--:-:-:-:5 CAL STORE_C;\n\n"; 342 | } 343 | return $out; 344 | 345 | 346 | 347 | --:-:-:-:5 EXIT; 348 | 349 | STORE_C: 350 | 351 | 352 | 353 | --:-:-:-:1 STS.128 [writeCs+4x<00>], cs0; 354 | --:-:-:-:1 STS.128 [writeCs+4x<32>], cs4; 355 | 356 | --:-:-:-:1 LDS cs0, [readCs + 4x<0*64 + 00>]; 357 | --:-:-:-:1 LDS cs1, [readCs + 4x<0*64 + 32>]; 358 | --:-:-:-:1 LDS cs2, [readCs + 4x<1*64 + 00>]; 359 | --:-:-:-:1 LDS cs3, [readCs + 4x<1*64 + 32>]; 360 | --:-:-:-:1 LDS cs4, [readCs + 4x<2*64 + 00>]; 361 | --:-:-:-:1 LDS cs5, [readCs + 4x<2*64 + 32>]; 362 | --:-:-:-:1 LDS cs6, [readCs + 4x<3*64 + 00>]; 363 | --:-:1:-:1 LDS cs7, [readCs + 4x<3*64 + 32>]; // Set Dep 1 364 | 365 | --:-:-:-:1 IADD cy00, cy00, 1; 366 | --:-:-:-:1 IADD cy04, cy04, 1; 367 | --:-:-:-:1 IADD cy08, cy08, 1; 368 | --:-:-:-:1 IADD cy12, cy12, 1; 369 | 370 | --:-:-:-:1 IADD Cy00, Cy00, ldc1; 371 | --:-:-:-:1 IADD Cy04, Cy04, ldc1; 372 | --:-:-:-:1 IADD Cy08, Cy08, ldc1; 373 | 
--:-:-:-:1 IADD Cy12, Cy12, ldc1; 374 | 375 | --:-:-:-:1 ISETP.LT.AND P0, PT, cy00, c[0x0][0x148], P5; // cy00 < n && cx + 0 < m 376 | --:-:-:-:1 ISETP.LT.AND P1, PT, cy00, c[0x0][0x148], P6; // cy00 < n && cx + 32 < m 377 | --:-:-:-:1 ISETP.LT.AND P2, PT, cy04, c[0x0][0x148], P5; // cy04 < n && cx + 0 < m 378 | --:-:-:-:1 ISETP.LT.AND P3, PT, cy04, c[0x0][0x148], P6; // cy04 < n && cx + 32 < m 379 | 380 | 01:-:-:-:1 @P0 STG.CG [Cy00 + 4x<00>], cs0; // Wait Dep 1 381 | --:-:-:-:1 @P1 STG.CG [Cy00 + 4x<32>], cs1; 382 | --:-:-:-:1 @P2 STG.CG [Cy04 + 4x<00>], cs2; 383 | --:-:-:-:1 @P3 STG.CG [Cy04 + 4x<32>], cs3; 384 | 385 | --:-:-:-:1 ISETP.LT.AND P0, PT, cy08, c[0x0][0x148], P5; // cy08 < n && cx + 0 < m 386 | --:-:-:-:1 ISETP.LT.AND P1, PT, cy08, c[0x0][0x148], P6; // cy08 < n && cx + 32 < m 387 | --:-:-:-:1 ISETP.LT.AND P2, PT, cy12, c[0x0][0x148], P5; // cy12 < n && cx + 0 < m 388 | --:-:-:-:1 ISETP.LT.AND P3, PT, cy12, c[0x0][0x148], P6; // cy12 < n && cx + 32 < m 389 | 390 | --:-:-:-:1 @P0 STG.CG [Cy08 + 4x<00>], cs4; 391 | --:-:-:-:1 @P1 STG.CG [Cy08 + 4x<32>], cs5; 392 | --:-:-:-:1 @P2 STG.CG [Cy12 + 4x<00>], cs6; 393 | --:2:-:-:1 @P3 STG.CG [Cy12 + 4x<32>], cs7; // Set Dep 2 394 | 395 | 396 | 397 | --:-:-:-:5 RET; 398 | 399 | -------------------------------------------------------------------------------- /sgemm/sgemm_final_128.sass: -------------------------------------------------------------------------------- 1 | # Kernel: sgemm_kernel_128 2 | # Arch: sm_50 3 | # InsCnt: 770 4 | # RegCnt: 118 5 | # SharedSize: 16384 6 | # BarCnt: 1 7 | # Params(9): 8 | # ord:addr:size:align 9 | # 0:0x140:4:0 10 | # 1:0x144:4:0 11 | # 2:0x148:4:0 12 | # 3:0x14c:4:0 13 | # 4:0x150:4:0 14 | # 5:0x154:4:0 15 | # 6:0x158:4:0 16 | # 7:0x15c:4:0 17 | # 8:0x160:4:0 18 | # 19 | # Instructions: 20 | 21 | --:-:1:-:1 S2R R112, SR_TID.X; 22 | --:-:2:-:1 S2R R113, SR_CTAID.X; 23 | --:-:3:-:1 S2R R114, SR_CTAID.Y; 24 | 01:-:-:Y:1 ISETP.GE.AND P0, PT, R112.reuse, 0x80, PT; 25 | 
--:-:-:-:1 LOP.AND R117, R112.reuse, 0x1f; 26 | --:-:-:-:1 BFE.U32 R9, R112.reuse, 0x205; 27 | --:-:-:-:1 MOV R13, c[0x0][0x14c]; 28 | --:-:-:-:1 BFE.U32 R4, R112.reuse, 0x301; 29 | --:-:-:-:1 LOP.AND R115, R112.reuse, 0x80; 30 | --:-:-:-:1 LOP.AND R107, R112.reuse, 0x70; 31 | --:-:-:-:1 SHL R16, R117, 0x4; 32 | --:-:-:-:1 LOP.AND R0, R112.reuse, 0x1; 33 | --:-:-:-:1 IADD R13, R13, -0x8; 34 | --:-:-:-:1 LOP.AND R80, R112.reuse, -0x20; 35 | --:-:-:-:1 SHR.U32 R106, R115, 0x4; 36 | --:-:-:-:1 LOP.AND R116, R112, 0x60; 37 | --:-:-:-:1 SHR.U32 R107, R107, 0x3; 38 | --:-:-:-:0 @!P0 MOV R1, c[0x0][0x150]; 39 | --:-:-:-:1 STS.128 [R80+0x2000], RZ; 40 | --:-:-:-:1 @P0 MOV R1, c[0x0][0x154]; 41 | --:-:-:-:1 ISCADD R111, R9, R16, 0x9; 42 | 06:-:-:-:1 SEL R12, R114, R113, P0; 43 | --:-:-:-:1 @!P0 MOV32I R110, 0x80000001; 44 | --:-:-:-:1 @P0 MOV32I R110, 0x80000000; 45 | --:-:-:-:1 LOP.OR R106, R106, R4; 46 | --:-:-:-:1 SHR.U32 R8, R1.reuse, 0x2; 47 | --:-:-:-:1 LOP.OR R107, R107, R0; 48 | --:-:-:-:1 ISCADD R104, R12, R117, 0x5; 49 | --:-:-:-:1 IADD R109, R1, R1; 50 | --:-:-:-:1 @P0 IADD R111, R111, 0x1000; 51 | --:-:-:-:1 SHL R106, R106, 0x4; 52 | --:-:-:-:1 XMAD.MRG R5, R8.reuse, R9.H1.reuse, RZ; 53 | --:-:-:-:1 ISCADD R107, R107, 0x1000, 0x4; 54 | --:-:-:-:1 XMAD R104, R8.reuse, R9, R104; 55 | --:-:-:Y:5 XMAD.MRG R20, R13.reuse, R8.H1.reuse, RZ; 56 | --:-:-:-:2 XMAD.PSL.CBCC R104, R8.H1, R5.H1, R104; 57 | --:-:1:-:4 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; 58 | --:-:-:-:1 IADD R108, R104, R1; 59 | --:-:-:-:1 XMAD R105, R13.reuse, R8, R104; 60 | --:-:2:Y:5 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; 61 | --:-:-:-:1 XMAD.PSL.CBCC R105, R13.H1, R20.H1, R105; 62 | --:-:3:-:1 LDS.U.128 R0, [R80+0x2000]; 63 | --:-:3:-:1 LDS.U.128 R4, [R80+0x2000]; 64 | --:-:3:-:1 LDS.U.128 R8, [R80+0x2000]; 65 | --:-:3:-:1 LDS.U.128 R12, [R80+0x2000]; 66 | --:-:3:-:1 LDS.U.128 R16, [R80+0x2000]; 67 | --:-:3:-:1 LDS.U.128 R20, [R80+0x2000]; 68 | --:-:3:-:1 LDS.U.128 R24, [R80+0x2000]; 69 | 
--:-:3:-:1 LDS.U.128 R28, [R80+0x2000]; 70 | --:-:3:-:1 LDS.U.128 R32, [R80+0x2000]; 71 | --:-:3:-:1 LDS.U.128 R36, [R80+0x2000]; 72 | --:-:3:-:1 LDS.U.128 R40, [R80+0x2000]; 73 | --:-:3:-:1 LDS.U.128 R44, [R80+0x2000]; 74 | --:-:3:-:1 LDS.U.128 R48, [R80+0x2000]; 75 | --:-:3:-:1 LDS.U.128 R52, [R80+0x2000]; 76 | --:-:3:-:1 LDS.U.128 R56, [R80+0x2000]; 77 | --:-:3:-:1 LDS.U.128 R60, [R80+0x2000]; 78 | 01:-:-:-:1 STS.128 [R111], R96; 79 | --:-:-:-:0 IADD R104, R104, R109.reuse; 80 | 02:-:-:-:1 STS.128 [R111+0x800], R100; 81 | --:-:-:-:0 IADD R108, R108, R109; 82 | 04:-:-:-:5 BAR.SYNC 0x0; 83 | --:-:-:-:0 LOP.XOR R111, R111, 0x2000; 84 | --:-:-:-:1 LDS.U.128 R64, [R106]; 85 | --:-:-:-:1 LDS.U.128 R72, [R107]; 86 | --:-:-:-:1 LDS.U.128 R68, [R106+0x100]; 87 | --:-:1:-:1 LDS.U.128 R76, [R107+0x100]; 88 | TARGET1: 89 | --:-:-:-:1 ISETP.LE.AND P0, PT, R104, R105, PT; 90 | 01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; 91 | --:-:-:-:1 LDS.U.128 R80, [R106+0x200]; 92 | --:-:-:-:1 FFMA R0, R66, R73.reuse, R0; 93 | --:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; 94 | --:-:-:-:1 LDS.U.128 R88, [R107+0x200]; 95 | --:-:-:-:1 FFMA R3, R64, R72.reuse, R3; 96 | --:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; 97 | --:-:-:-:1 LDS.U.128 R84, [R106+0x300]; 98 | --:-:-:-:1 FFMA R4, R67, R73.reuse, R4; 99 | --:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; 100 | --:-:1:-:1 LDS.U.128 R92, [R107+0x300]; 101 | --:-:-:-:1 FFMA R7, R65, R72.reuse, R7; 102 | --:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; 103 | --:-:-:-:1 FFMA R32, R70, R73.reuse, R32; 104 | --:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; 105 | --:-:-:-:1 FFMA R35, R68, R72.reuse, R35; 106 | --:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; 107 | --:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; 108 | --:-:-:-:1 FFMA R38, R69.reuse, R73, R38; 109 | --:-:-:-:1 FFMA R39, R69.reuse, R72, R39; 110 | --:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; 111 | --:-:-:-:1 FFMA R44, R71, R75.reuse, R44; 112 | --:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, 
R46; 113 | --:-:-:-:1 FFMA R47, R69, R74.reuse, R47; 114 | --:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; 115 | --:-:-:-:1 FFMA R40, R70, R75.reuse, R40; 116 | --:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; 117 | --:-:-:-:1 FFMA R43, R68, R74.reuse, R43; 118 | --:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; 119 | --:-:-:-:1 FFMA R12, R67, R75.reuse, R12; 120 | --:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; 121 | --:-:-:-:1 FFMA R15, R65, R74.reuse, R15; 122 | --:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; 123 | --:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; 124 | --:-:-:-:1 FFMA R10, R64.reuse, R75, R10; 125 | --:-:-:-:0 FFMA R11, R64.reuse, R74, R11; 126 | --:-:2:-:1 @P0 TLD.B.LZ.P R96, R104, R110, 0x0, 1D, 0xf; 127 | --:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; 128 | --:-:-:-:0 FFMA R16, R66, R77.reuse, R16; 129 | --:-:3:-:1 @P0 TLD.B.LZ.P R100, R108, R110, 0x0, 1D, 0xf; 130 | --:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; 131 | --:-:-:-:1 FFMA R19, R64, R76.reuse, R19; 132 | --:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; 133 | --:-:-:-:1 FFMA R20, R67, R77.reuse, R20; 134 | --:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; 135 | --:-:-:-:1 FFMA R23, R65, R76.reuse, R23; 136 | --:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; 137 | --:-:-:-:1 FFMA R48, R70, R77.reuse, R48; 138 | --:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; 139 | --:-:-:-:1 FFMA R51, R68, R76.reuse, R51; 140 | --:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; 141 | --:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; 142 | --:-:-:-:1 FFMA R54, R69.reuse, R77, R54; 143 | --:-:-:-:1 FFMA R55, R69.reuse, R76, R55; 144 | --:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; 145 | --:-:-:-:1 FFMA R60, R71, R79.reuse, R60; 146 | --:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; 147 | --:-:-:-:1 FFMA R63, R69, R78.reuse, R63; 148 | --:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; 149 | --:-:-:-:1 FFMA R56, R70, R79.reuse, R56; 150 | --:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; 151 | --:-:-:-:1 FFMA R59, R68, 
R78.reuse, R59; 152 | --:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; 153 | --:-:-:-:1 FFMA R28, R67, R79.reuse, R28; 154 | --:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; 155 | --:-:-:-:1 FFMA R31, R65, R78.reuse, R31; 156 | --:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; 157 | --:-:-:-:1 FFMA R24, R66, R79.reuse, R24; 158 | --:-:-:-:1 FFMA R26, R64.reuse, R79, R26; 159 | --:-:-:-:1 FFMA R27, R64, R78, R27; 160 | 01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; 161 | --:-:-:-:1 LDS.U.128 R64, [R106+0x400]; 162 | --:-:-:-:1 FFMA R0, R82, R89.reuse, R0; 163 | --:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; 164 | --:-:-:-:1 LDS.U.128 R72, [R107+0x400]; 165 | --:-:-:-:1 FFMA R3, R80, R88.reuse, R3; 166 | --:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; 167 | --:-:-:-:1 LDS.U.128 R68, [R106+0x500]; 168 | --:-:-:-:1 FFMA R4, R83, R89.reuse, R4; 169 | --:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; 170 | --:-:1:-:1 LDS.U.128 R76, [R107+0x500]; 171 | --:-:-:-:1 FFMA R7, R81, R88.reuse, R7; 172 | --:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; 173 | --:-:-:-:1 FFMA R32, R86, R89.reuse, R32; 174 | --:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; 175 | --:-:-:-:1 FFMA R35, R84, R88.reuse, R35; 176 | --:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; 177 | --:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; 178 | --:-:-:-:1 FFMA R38, R85.reuse, R89, R38; 179 | --:-:-:-:1 FFMA R39, R85.reuse, R88, R39; 180 | --:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; 181 | --:-:-:-:1 FFMA R44, R87, R91.reuse, R44; 182 | --:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; 183 | --:-:-:-:1 FFMA R47, R85, R90.reuse, R47; 184 | --:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; 185 | --:-:-:-:1 FFMA R40, R86, R91.reuse, R40; 186 | --:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; 187 | --:-:-:-:1 FFMA R43, R84, R90.reuse, R43; 188 | --:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; 189 | --:-:-:-:1 FFMA R12, R83, R91.reuse, R12; 190 | --:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; 191 | --:-:-:-:1 FFMA R15, R81, 
R90.reuse, R15; 192 | --:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; 193 | --:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; 194 | --:-:-:-:1 FFMA R10, R80.reuse, R91, R10; 195 | --:-:-:-:1 FFMA R11, R80.reuse, R90, R11; 196 | --:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; 197 | --:-:-:-:1 FFMA R16, R82, R93.reuse, R16; 198 | --:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; 199 | --:-:-:-:1 FFMA R19, R80, R92.reuse, R19; 200 | --:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; 201 | --:-:-:-:1 FFMA R20, R83, R93.reuse, R20; 202 | --:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; 203 | --:-:-:-:1 FFMA R23, R81, R92.reuse, R23; 204 | --:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; 205 | --:-:-:-:1 FFMA R48, R86, R93.reuse, R48; 206 | --:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; 207 | --:-:-:-:1 FFMA R51, R84, R92.reuse, R51; 208 | --:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; 209 | --:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; 210 | --:-:-:-:1 FFMA R54, R85.reuse, R93, R54; 211 | --:-:-:-:1 FFMA R55, R85.reuse, R92, R55; 212 | --:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; 213 | --:-:-:-:1 FFMA R60, R87, R95.reuse, R60; 214 | --:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; 215 | --:-:-:-:1 FFMA R63, R85, R94.reuse, R63; 216 | --:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; 217 | --:-:-:-:1 FFMA R56, R86, R95.reuse, R56; 218 | --:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; 219 | --:-:-:-:1 FFMA R59, R84, R94.reuse, R59; 220 | --:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; 221 | --:-:-:-:1 FFMA R28, R83, R95.reuse, R28; 222 | --:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; 223 | --:-:-:-:1 FFMA R31, R81, R94.reuse, R31; 224 | --:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; 225 | --:-:-:-:1 FFMA R24, R82, R95.reuse, R24; 226 | --:-:-:-:1 FFMA R26, R80.reuse, R95, R26; 227 | --:-:-:-:1 FFMA R27, R80, R94, R27; 228 | 01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; 229 | --:-:-:-:1 LDS.U.128 R80, [R106+0x600]; 230 | --:-:-:-:1 FFMA R0, R66, R73.reuse, R0; 231 | --:-:-:-:0 
FFMA R2, R64.reuse, R73.reuse, R2; 232 | --:-:-:-:1 LDS.U.128 R88, [R107+0x600]; 233 | --:-:-:-:1 FFMA R3, R64, R72.reuse, R3; 234 | --:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; 235 | --:-:-:-:1 LDS.U.128 R84, [R106+0x700]; 236 | --:-:-:-:1 FFMA R4, R67, R73.reuse, R4; 237 | --:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; 238 | --:-:1:-:1 LDS.U.128 R92, [R107+0x700]; 239 | --:-:-:-:1 FFMA R7, R65, R72.reuse, R7; 240 | --:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; 241 | --:-:-:-:1 FFMA R32, R70, R73.reuse, R32; 242 | --:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; 243 | --:-:-:-:1 FFMA R35, R68, R72.reuse, R35; 244 | --:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; 245 | --:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; 246 | --:-:-:-:1 FFMA R38, R69.reuse, R73, R38; 247 | --:-:-:-:1 FFMA R39, R69.reuse, R72, R39; 248 | --:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; 249 | --:-:-:-:1 FFMA R44, R71, R75.reuse, R44; 250 | --:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; 251 | --:-:-:-:1 FFMA R47, R69, R74.reuse, R47; 252 | --:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; 253 | --:-:-:-:1 FFMA R40, R70, R75.reuse, R40; 254 | --:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; 255 | --:-:-:-:1 FFMA R43, R68, R74.reuse, R43; 256 | --:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; 257 | --:-:-:-:1 FFMA R12, R67, R75.reuse, R12; 258 | --:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; 259 | --:-:-:-:1 FFMA R15, R65, R74.reuse, R15; 260 | --:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; 261 | --:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; 262 | --:-:-:-:1 FFMA R10, R64.reuse, R75, R10; 263 | --:-:-:-:1 FFMA R11, R64.reuse, R74, R11; 264 | --:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; 265 | --:-:-:-:1 FFMA R16, R66, R77.reuse, R16; 266 | --:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; 267 | --:-:-:-:1 FFMA R19, R64, R76.reuse, R19; 268 | --:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; 269 | --:-:-:-:1 FFMA R20, R67, R77.reuse, R20; 270 | --:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; 271 
| --:-:-:-:1 FFMA R23, R65, R76.reuse, R23; 272 | --:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; 273 | --:-:-:-:1 FFMA R48, R70, R77.reuse, R48; 274 | --:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; 275 | --:-:-:-:1 FFMA R51, R68, R76.reuse, R51; 276 | --:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; 277 | --:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; 278 | --:-:-:-:1 FFMA R54, R69.reuse, R77, R54; 279 | --:-:-:-:1 FFMA R55, R69.reuse, R76, R55; 280 | --:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; 281 | --:-:-:-:1 FFMA R60, R71, R79.reuse, R60; 282 | --:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; 283 | --:-:-:-:1 FFMA R63, R69, R78.reuse, R63; 284 | --:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; 285 | --:-:-:-:1 FFMA R56, R70, R79.reuse, R56; 286 | --:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; 287 | --:-:-:-:1 FFMA R59, R68, R78.reuse, R59; 288 | --:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; 289 | --:-:-:-:1 FFMA R28, R67, R79.reuse, R28; 290 | --:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; 291 | --:-:-:-:1 FFMA R31, R65, R78.reuse, R31; 292 | --:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; 293 | --:-:-:-:1 FFMA R24, R66, R79.reuse, R24; 294 | --:-:-:-:1 FFMA R26, R64.reuse, R79, R26; 295 | --:-:-:-:1 FFMA R27, R64, R78, R27; 296 | 01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; 297 | --:-:-:-:1 LDS.U.128 R64, [R106+0x800]; 298 | --:-:-:-:1 FFMA R0, R82, R89.reuse, R0; 299 | --:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; 300 | --:-:-:-:1 LDS.U.128 R72, [R107+0x800]; 301 | --:-:-:-:1 FFMA R3, R80, R88.reuse, R3; 302 | --:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; 303 | --:-:-:-:1 LDS.U.128 R68, [R106+0x900]; 304 | --:-:-:-:1 FFMA R4, R83, R89.reuse, R4; 305 | --:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; 306 | --:-:1:-:1 LDS.U.128 R76, [R107+0x900]; 307 | --:-:-:-:1 FFMA R7, R81, R88.reuse, R7; 308 | --:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; 309 | --:-:-:-:1 FFMA R32, R86, R89.reuse, R32; 310 | --:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; 311 | 
--:-:-:-:1 FFMA R35, R84, R88.reuse, R35; 312 | --:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; 313 | --:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; 314 | --:-:-:-:1 FFMA R38, R85.reuse, R89, R38; 315 | --:-:-:-:1 FFMA R39, R85.reuse, R88, R39; 316 | --:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; 317 | --:-:-:-:1 FFMA R44, R87, R91.reuse, R44; 318 | --:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; 319 | --:-:-:-:1 FFMA R47, R85, R90.reuse, R47; 320 | --:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; 321 | --:-:-:-:1 FFMA R40, R86, R91.reuse, R40; 322 | --:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; 323 | --:-:-:-:1 FFMA R43, R84, R90.reuse, R43; 324 | --:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; 325 | --:-:-:-:1 FFMA R12, R83, R91.reuse, R12; 326 | --:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; 327 | --:-:-:-:1 FFMA R15, R81, R90.reuse, R15; 328 | --:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; 329 | --:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; 330 | --:-:-:-:1 FFMA R10, R80.reuse, R91, R10; 331 | --:-:-:-:1 FFMA R11, R80.reuse, R90, R11; 332 | --:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; 333 | --:-:-:-:1 FFMA R16, R82, R93.reuse, R16; 334 | --:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; 335 | --:-:-:-:1 FFMA R19, R80, R92.reuse, R19; 336 | --:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; 337 | --:-:-:-:1 FFMA R20, R83, R93.reuse, R20; 338 | --:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; 339 | --:-:-:-:1 FFMA R23, R81, R92.reuse, R23; 340 | --:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; 341 | --:-:-:-:1 FFMA R48, R86, R93.reuse, R48; 342 | --:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; 343 | --:-:-:-:1 FFMA R51, R84, R92.reuse, R51; 344 | --:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; 345 | --:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; 346 | --:-:-:-:1 FFMA R54, R85.reuse, R93, R54; 347 | --:-:-:-:1 FFMA R55, R85.reuse, R92, R55; 348 | --:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; 349 | --:-:-:-:1 FFMA R60, R87, R95.reuse, R60; 350 | --:-:-:-:1 FFMA 
R62, R85.reuse, R95.reuse, R62; 351 | --:-:-:-:1 FFMA R63, R85, R94.reuse, R63; 352 | --:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; 353 | --:-:-:-:1 FFMA R56, R86, R95.reuse, R56; 354 | --:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; 355 | --:-:-:-:1 FFMA R59, R84, R94.reuse, R59; 356 | --:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; 357 | --:-:-:-:1 FFMA R28, R83, R95.reuse, R28; 358 | --:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; 359 | --:-:-:-:1 FFMA R31, R81, R94.reuse, R31; 360 | --:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; 361 | --:-:-:-:1 FFMA R24, R82, R95.reuse, R24; 362 | --:-:-:-:1 FFMA R26, R80.reuse, R95, R26; 363 | --:-:-:-:1 FFMA R27, R80, R94, R27; 364 | 01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; 365 | --:-:-:-:1 LDS.U.128 R80, [R106+0xa00]; 366 | --:-:-:-:1 FFMA R0, R66, R73.reuse, R0; 367 | --:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; 368 | --:-:-:-:1 LDS.U.128 R88, [R107+0xa00]; 369 | --:-:-:-:1 FFMA R3, R64, R72.reuse, R3; 370 | --:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; 371 | --:-:-:-:1 LDS.U.128 R84, [R106+0xb00]; 372 | --:-:-:-:1 FFMA R4, R67, R73.reuse, R4; 373 | --:-:-:-:0 FFMA R6, R65.reuse, R73.reuse, R6; 374 | --:-:1:-:1 LDS.U.128 R92, [R107+0xb00]; 375 | --:-:-:-:1 FFMA R7, R65, R72.reuse, R7; 376 | --:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; 377 | --:-:-:-:1 FFMA R32, R70, R73.reuse, R32; 378 | --:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; 379 | --:-:-:-:1 FFMA R35, R68, R72.reuse, R35; 380 | --:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; 381 | --:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; 382 | --:-:-:-:1 FFMA R38, R69.reuse, R73, R38; 383 | --:-:-:-:1 FFMA R39, R69.reuse, R72, R39; 384 | --:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; 385 | --:-:-:-:1 FFMA R44, R71, R75.reuse, R44; 386 | --:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; 387 | --:-:-:-:1 FFMA R47, R69, R74.reuse, R47; 388 | --:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; 389 | --:-:-:-:1 FFMA R40, R70, R75.reuse, R40; 390 | --:-:-:-:1 FFMA R42, 
R68.reuse, R75.reuse, R42; 391 | --:-:-:-:1 FFMA R43, R68, R74.reuse, R43; 392 | --:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; 393 | --:-:-:-:1 FFMA R12, R67, R75.reuse, R12; 394 | --:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; 395 | --:-:-:-:1 FFMA R15, R65, R74.reuse, R15; 396 | --:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; 397 | --:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; 398 | --:-:-:-:1 FFMA R10, R64.reuse, R75, R10; 399 | --:-:-:-:1 FFMA R11, R64.reuse, R74, R11; 400 | --:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; 401 | --:-:-:-:1 FFMA R16, R66, R77.reuse, R16; 402 | --:-:-:-:1 FFMA R18, R64.reuse, R77.reuse, R18; 403 | --:-:-:-:1 FFMA R19, R64, R76.reuse, R19; 404 | --:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; 405 | --:-:-:-:1 FFMA R20, R67, R77.reuse, R20; 406 | --:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; 407 | --:-:-:-:1 FFMA R23, R65, R76.reuse, R23; 408 | --:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; 409 | --:-:-:-:1 FFMA R48, R70, R77.reuse, R48; 410 | --:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; 411 | --:-:-:-:1 FFMA R51, R68, R76.reuse, R51; 412 | --:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; 413 | --:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; 414 | --:-:-:-:1 FFMA R54, R69.reuse, R77, R54; 415 | --:-:-:-:1 FFMA R55, R69.reuse, R76, R55; 416 | --:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; 417 | --:-:-:-:1 FFMA R60, R71, R79.reuse, R60; 418 | --:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; 419 | --:-:-:-:1 FFMA R63, R69, R78.reuse, R63; 420 | --:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; 421 | --:-:-:-:1 FFMA R56, R70, R79.reuse, R56; 422 | --:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; 423 | --:-:-:-:1 FFMA R59, R68, R78.reuse, R59; 424 | --:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; 425 | --:-:-:-:1 FFMA R28, R67, R79.reuse, R28; 426 | --:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; 427 | --:-:-:-:1 FFMA R31, R65, R78.reuse, R31; 428 | --:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; 429 | --:-:-:-:1 FFMA R24, R66, 
R79.reuse, R24; 430 | --:-:-:-:1 FFMA R26, R64.reuse, R79, R26; 431 | --:-:-:-:1 FFMA R27, R64, R78, R27; 432 | 01:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; 433 | --:-:-:-:1 LDS.U.128 R64, [R106+0xc00]; 434 | --:-:-:-:1 FFMA R0, R82, R89.reuse, R0; 435 | --:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; 436 | --:-:-:-:1 LDS.U.128 R72, [R107+0xc00]; 437 | --:-:-:-:1 FFMA R3, R80, R88.reuse, R3; 438 | --:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; 439 | --:-:-:-:1 LDS.U.128 R68, [R106+0xd00]; 440 | --:-:-:-:1 FFMA R4, R83, R89.reuse, R4; 441 | --:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; 442 | --:-:1:-:1 LDS.U.128 R76, [R107+0xd00]; 443 | --:-:-:-:1 FFMA R7, R81, R88.reuse, R7; 444 | --:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; 445 | --:-:-:-:1 FFMA R32, R86, R89.reuse, R32; 446 | --:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; 447 | --:-:-:-:1 FFMA R35, R84, R88.reuse, R35; 448 | --:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; 449 | --:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; 450 | --:-:-:-:1 FFMA R38, R85.reuse, R89, R38; 451 | --:-:-:-:1 FFMA R39, R85.reuse, R88, R39; 452 | --:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; 453 | --:-:-:-:1 FFMA R44, R87, R91.reuse, R44; 454 | --:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; 455 | --:-:-:-:1 FFMA R47, R85, R90.reuse, R47; 456 | --:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; 457 | --:-:-:-:1 FFMA R40, R86, R91.reuse, R40; 458 | --:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; 459 | --:-:-:-:1 FFMA R43, R84, R90.reuse, R43; 460 | --:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; 461 | --:-:-:-:1 FFMA R12, R83, R91.reuse, R12; 462 | --:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; 463 | --:-:-:-:1 FFMA R15, R81, R90.reuse, R15; 464 | --:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; 465 | --:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; 466 | --:-:-:-:1 FFMA R10, R80.reuse, R91, R10; 467 | --:-:-:-:1 FFMA R11, R80.reuse, R90, R11; 468 | --:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; 469 | --:-:-:-:1 FFMA R16, R82, R93.reuse, 
R16; 470 | --:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; 471 | --:-:-:-:1 FFMA R19, R80, R92.reuse, R19; 472 | --:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; 473 | --:-:-:-:1 FFMA R20, R83, R93.reuse, R20; 474 | --:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; 475 | --:-:-:-:1 FFMA R23, R81, R92.reuse, R23; 476 | --:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; 477 | --:-:-:-:1 FFMA R48, R86, R93.reuse, R48; 478 | --:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; 479 | --:-:-:-:1 FFMA R51, R84, R92.reuse, R51; 480 | --:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; 481 | --:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; 482 | --:-:-:-:1 FFMA R54, R85.reuse, R93, R54; 483 | --:-:-:-:1 FFMA R55, R85.reuse, R92, R55; 484 | --:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; 485 | --:-:-:-:1 FFMA R60, R87, R95.reuse, R60; 486 | --:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; 487 | --:-:-:-:1 FFMA R63, R85, R94.reuse, R63; 488 | --:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; 489 | --:-:-:-:1 FFMA R56, R86, R95.reuse, R56; 490 | --:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; 491 | --:-:-:-:1 FFMA R59, R84, R94.reuse, R59; 492 | --:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; 493 | --:-:-:-:1 FFMA R28, R83, R95.reuse, R28; 494 | --:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; 495 | --:-:-:-:1 FFMA R31, R81, R94.reuse, R31; 496 | --:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; 497 | --:-:-:-:1 FFMA R24, R82, R95.reuse, R24; 498 | --:-:-:-:1 FFMA R26, R80.reuse, R95, R26; 499 | --:-:-:-:1 FFMA R27, R80, R94, R27; 500 | 01:-:-:-:0 FFMA R1, R66.reuse, R72.reuse, R1; 501 | --:-:-:-:1 LDS.U.128 R80, [R106+0xe00]; 502 | --:-:-:-:1 FFMA R0, R66, R73.reuse, R0; 503 | --:-:-:-:0 FFMA R2, R64.reuse, R73.reuse, R2; 504 | --:-:-:-:1 LDS.U.128 R88, [R107+0xe00]; 505 | --:-:-:-:1 FFMA R3, R64, R72.reuse, R3; 506 | --:-:-:-:0 FFMA R5, R67.reuse, R72.reuse, R5; 507 | --:-:-:-:1 LDS.U.128 R84, [R106+0xf00]; 508 | --:-:-:-:1 FFMA R4, R67, R73.reuse, R4; 509 | --:-:-:-:0 FFMA R6, R65.reuse, 
R73.reuse, R6; 510 | --:-:1:-:1 LDS.U.128 R92, [R107+0xf00]; 511 | --:-:-:-:1 FFMA R7, R65, R72.reuse, R7; 512 | --:-:-:-:1 FFMA R33, R70.reuse, R72.reuse, R33; 513 | --:-:-:-:1 FFMA R32, R70, R73.reuse, R32; 514 | --:-:-:-:1 FFMA R34, R68.reuse, R73.reuse, R34; 515 | --:-:-:-:1 FFMA R35, R68, R72.reuse, R35; 516 | --:-:-:-:1 FFMA R37, R71.reuse, R72.reuse, R37; 517 | --:-:-:-:1 FFMA R36, R71.reuse, R73.reuse, R36; 518 | --:-:-:-:1 FFMA R38, R69.reuse, R73, R38; 519 | --:-:-:-:1 FFMA R39, R69.reuse, R72, R39; 520 | --:-:-:-:1 FFMA R45, R71.reuse, R74.reuse, R45; 521 | --:-:-:-:1 FFMA R44, R71, R75.reuse, R44; 522 | --:-:-:-:1 FFMA R46, R69.reuse, R75.reuse, R46; 523 | --:-:-:-:1 FFMA R47, R69, R74.reuse, R47; 524 | --:-:-:-:1 FFMA R41, R70.reuse, R74.reuse, R41; 525 | --:-:-:-:1 FFMA R40, R70, R75.reuse, R40; 526 | --:-:-:-:1 FFMA R42, R68.reuse, R75.reuse, R42; 527 | --:-:-:-:1 FFMA R43, R68, R74.reuse, R43; 528 | --:-:-:-:1 FFMA R13, R67.reuse, R74.reuse, R13; 529 | --:-:-:-:1 FFMA R12, R67, R75.reuse, R12; 530 | --:-:-:-:1 FFMA R14, R65.reuse, R75.reuse, R14; 531 | --:-:-:-:1 FFMA R15, R65, R74.reuse, R15; 532 | --:-:-:-:1 FFMA R9, R66.reuse, R74.reuse, R9; 533 | --:-:-:-:1 FFMA R8, R66.reuse, R75.reuse, R8; 534 | --:-:-:-:0 FFMA R10, R64.reuse, R75, R10; 535 | 02:-:-:-:1 @P0 STS.128 [R111], R96; 536 | --:-:-:-:1 FFMA R11, R64.reuse, R74, R11; 537 | --:-:-:Y:1 FFMA R17, R66.reuse, R76.reuse, R17; 538 | --:-:-:-:1 FFMA R16, R66, R77.reuse, R16; 539 | --:-:-:-:0 FFMA R18, R64.reuse, R77.reuse, R18; 540 | 04:-:-:-:1 @P0 STS.128 [R111+0x800], R100; 541 | --:-:-:-:1 FFMA R19, R64, R76.reuse, R19; 542 | --:-:-:-:1 FFMA R21, R67.reuse, R76.reuse, R21; 543 | --:-:-:-:1 FFMA R20, R67, R77.reuse, R20; 544 | --:-:-:-:1 FFMA R22, R65.reuse, R77.reuse, R22; 545 | --:-:-:-:1 FFMA R23, R65, R76.reuse, R23; 546 | --:-:-:-:1 FFMA R49, R70.reuse, R76.reuse, R49; 547 | --:-:-:-:1 FFMA R48, R70, R77.reuse, R48; 548 | --:-:-:-:1 FFMA R50, R68.reuse, R77.reuse, R50; 549 | --:-:-:-:1 
FFMA R51, R68, R76.reuse, R51; 550 | --:-:-:-:1 FFMA R53, R71.reuse, R76.reuse, R53; 551 | --:-:-:-:1 FFMA R52, R71.reuse, R77.reuse, R52; 552 | --:-:-:-:1 FFMA R54, R69.reuse, R77, R54; 553 | --:-:-:-:1 FFMA R55, R69.reuse, R76, R55; 554 | --:-:-:-:1 FFMA R61, R71.reuse, R78.reuse, R61; 555 | --:-:-:-:1 FFMA R60, R71, R79.reuse, R60; 556 | --:-:-:-:1 FFMA R62, R69.reuse, R79.reuse, R62; 557 | --:-:-:-:1 FFMA R63, R69, R78.reuse, R63; 558 | --:-:-:-:1 FFMA R57, R70.reuse, R78.reuse, R57; 559 | --:-:-:-:1 FFMA R56, R70, R79.reuse, R56; 560 | --:-:-:-:1 FFMA R58, R68.reuse, R79.reuse, R58; 561 | --:-:-:-:1 FFMA R59, R68, R78.reuse, R59; 562 | --:-:-:-:1 FFMA R29, R67.reuse, R78.reuse, R29; 563 | --:-:-:-:1 FFMA R28, R67, R79.reuse, R28; 564 | --:-:-:-:1 FFMA R30, R65.reuse, R79.reuse, R30; 565 | --:-:-:-:1 FFMA R31, R65, R78.reuse, R31; 566 | --:-:-:-:1 FFMA R25, R66.reuse, R78.reuse, R25; 567 | --:-:-:-:1 FFMA R24, R66, R79.reuse, R24; 568 | --:-:-:-:0 FFMA R26, R64.reuse, R79, R26; 569 | 01:-:-:-:5 BAR.SYNC 0x0; 570 | --:-:-:-:1 @P0 LOP.XOR R106, R106, 0x2000; 571 | --:-:-:-:1 @P0 LOP.XOR R107, R107, 0x2000; 572 | --:-:-:-:1 @P0 LOP.XOR R111, R111, 0x2000; 573 | --:-:-:-:1 FFMA R27, R64, R78, R27; 574 | --:-:-:-:0 FFMA R1, R82.reuse, R88.reuse, R1; 575 | --:-:-:-:1 @P0 LDS.U.128 R64, [R106]; 576 | --:-:-:-:1 FFMA R0, R82, R89.reuse, R0; 577 | --:-:-:-:0 FFMA R2, R80.reuse, R89.reuse, R2; 578 | --:-:-:-:1 @P0 LDS.U.128 R72, [R107]; 579 | --:-:-:-:1 FFMA R3, R80, R88.reuse, R3; 580 | --:-:-:-:0 FFMA R5, R83.reuse, R88.reuse, R5; 581 | --:-:-:-:1 @P0 LDS.U.128 R68, [R106+0x100]; 582 | --:-:-:-:1 FFMA R4, R83, R89.reuse, R4; 583 | --:-:-:-:0 FFMA R6, R81.reuse, R89.reuse, R6; 584 | --:-:1:-:1 @P0 LDS.U.128 R76, [R107+0x100]; 585 | --:-:-:-:1 FFMA R7, R81, R88.reuse, R7; 586 | --:-:-:-:1 FFMA R33, R86.reuse, R88.reuse, R33; 587 | --:-:-:-:1 FFMA R32, R86, R89.reuse, R32; 588 | --:-:-:-:1 FFMA R34, R84.reuse, R89.reuse, R34; 589 | --:-:-:-:1 FFMA R35, R84, R88.reuse, 
R35; 590 | --:-:-:-:1 FFMA R37, R87.reuse, R88.reuse, R37; 591 | --:-:-:-:1 FFMA R36, R87.reuse, R89.reuse, R36; 592 | --:-:-:-:1 FFMA R38, R85.reuse, R89, R38; 593 | --:-:-:-:1 FFMA R39, R85.reuse, R88, R39; 594 | --:-:-:-:1 FFMA R45, R87.reuse, R90.reuse, R45; 595 | --:-:-:-:1 FFMA R44, R87, R91.reuse, R44; 596 | --:-:-:-:1 FFMA R46, R85.reuse, R91.reuse, R46; 597 | --:-:-:-:1 FFMA R47, R85, R90.reuse, R47; 598 | --:-:-:-:1 FFMA R41, R86.reuse, R90.reuse, R41; 599 | --:-:-:-:1 FFMA R40, R86, R91.reuse, R40; 600 | --:-:-:-:1 FFMA R42, R84.reuse, R91.reuse, R42; 601 | --:-:-:-:1 FFMA R43, R84, R90.reuse, R43; 602 | --:-:-:-:1 FFMA R13, R83.reuse, R90.reuse, R13; 603 | --:-:-:-:1 FFMA R12, R83, R91.reuse, R12; 604 | --:-:-:-:1 FFMA R14, R81.reuse, R91.reuse, R14; 605 | --:-:-:-:1 FFMA R15, R81, R90.reuse, R15; 606 | --:-:-:-:1 FFMA R9, R82.reuse, R90.reuse, R9; 607 | --:-:-:-:1 FFMA R8, R82.reuse, R91.reuse, R8; 608 | --:-:-:-:1 FFMA R10, R80.reuse, R91, R10; 609 | --:-:-:-:1 FFMA R11, R80.reuse, R90, R11; 610 | --:-:-:Y:1 FFMA R17, R82.reuse, R92.reuse, R17; 611 | --:-:-:-:1 FFMA R16, R82, R93.reuse, R16; 612 | --:-:-:-:1 FFMA R18, R80.reuse, R93.reuse, R18; 613 | --:-:-:-:1 FFMA R19, R80, R92.reuse, R19; 614 | --:-:-:-:1 FFMA R21, R83.reuse, R92.reuse, R21; 615 | --:-:-:-:1 FFMA R20, R83, R93.reuse, R20; 616 | --:-:-:-:1 FFMA R22, R81.reuse, R93.reuse, R22; 617 | --:-:-:-:1 FFMA R23, R81, R92.reuse, R23; 618 | --:-:-:-:1 FFMA R49, R86.reuse, R92.reuse, R49; 619 | --:-:-:-:1 FFMA R48, R86, R93.reuse, R48; 620 | --:-:-:-:1 FFMA R50, R84.reuse, R93.reuse, R50; 621 | --:-:-:-:1 FFMA R51, R84, R92.reuse, R51; 622 | --:-:-:-:1 FFMA R53, R87.reuse, R92.reuse, R53; 623 | --:-:-:-:1 FFMA R52, R87.reuse, R93.reuse, R52; 624 | --:-:-:-:1 FFMA R54, R85.reuse, R93, R54; 625 | --:-:-:-:1 FFMA R55, R85.reuse, R92, R55; 626 | --:-:-:-:1 FFMA R61, R87.reuse, R94.reuse, R61; 627 | --:-:-:-:1 FFMA R60, R87, R95.reuse, R60; 628 | --:-:-:-:1 FFMA R62, R85.reuse, R95.reuse, R62; 629 | 
--:-:-:-:1 FFMA R63, R85, R94.reuse, R63; 630 | --:-:-:-:1 FFMA R57, R86.reuse, R94.reuse, R57; 631 | --:-:-:-:1 FFMA R56, R86, R95.reuse, R56; 632 | --:-:-:-:1 FFMA R58, R84.reuse, R95.reuse, R58; 633 | --:-:-:-:1 FFMA R59, R84, R94.reuse, R59; 634 | --:-:-:-:1 FFMA R29, R83.reuse, R94.reuse, R29; 635 | --:-:-:-:1 FFMA R28, R83, R95.reuse, R28; 636 | --:-:-:-:1 FFMA R30, R81.reuse, R95.reuse, R30; 637 | --:-:-:-:1 FFMA R31, R81, R94.reuse, R31; 638 | --:-:-:-:1 FFMA R25, R82.reuse, R94.reuse, R25; 639 | --:-:-:-:1 FFMA R24, R82, R95.reuse, R24; 640 | --:-:-:-:1 FFMA R26, R80.reuse, R95, R26; 641 | --:-:-:-:1 FFMA R27, R80, R94, R27; 642 | --:-:-:-:1 @P0 IADD R104, R104, R109.reuse; 643 | --:-:-:-:0 @P0 IADD R108, R108, R109; 644 | --:-:-:Y:5 @P0 BRA TARGET1; 645 | --:-:-:-:1 SHR.U32 R84, R115, 0x2; 646 | --:-:-:-:1 MOV R77, c[0x0][0x158]; 647 | --:-:-:-:1 SHR.U32 R80, R116.reuse, 0x1; 648 | --:-:-:-:1 MOV R72, c[0x0][0x15c]; 649 | --:-:-:-:1 SHL R89, R116, 0x4; 650 | --:-:-:-:1 LOP.AND R106, R106, 0xfff; 651 | --:-:-:-:1 LOP.OR R84, R117, R84; 652 | --:-:-:-:1 SHL R81, R77.reuse, 0x2; 653 | --:-:-:-:1 LOP.AND R107, R107, 0xfff; 654 | --:-:-:-:1 ISCADD R80, R114, R80, 0x7; 655 | --:-:-:-:1 FMUL R64, R3, R72.reuse; 656 | --:-:-:-:1 SHL R74, R77.reuse, 0x4; 657 | --:-:-:-:1 LOP.OR R89, R89, R84; 658 | --:-:-:-:1 ISCADD R84, R113, R84, 0x7; 659 | --:-:-:-:1 FMUL R65, R7, R72.reuse; 660 | --:-:-:-:1 SHL R88, R77, 0x5; 661 | --:-:-:-:1 XMAD.MRG R78, R80.reuse, R77.H1.reuse, RZ; 662 | --:-:-:-:1 ISCADD R90, R107, R106, 0x5; 663 | --:-:-:-:1 FMUL R66, R1, R72.reuse; 664 | --:-:-:-:1 SHL R89, R89, 0x2; 665 | --:-:-:-:1 XMAD R73, R80, R77, R84; 666 | --:-:-:-:1 ISETP.LT.AND P5, PT, R84, c[0x0][0x144], PT; 667 | --:-:-:-:1 IADD R84, R84, 0x40; 668 | --:-:-:-:1 ISCADD R85, R77, -R74, 0x8; 669 | --:-:-:-:1 FMUL R67, R5, R72.reuse; 670 | --:-:-:-:1 FMUL R68, R35, R72.reuse; 671 | --:-:-:-:1 XMAD.PSL.CBCC R73, R80.H1, R78.H1, R73; 672 | --:-:-:-:1 IADD R80, R80, -0x1; 673 | 
--:-:-:-:1 ISETP.LT.AND P6, PT, R84, c[0x0][0x144], PT; 674 | --:-:-:-:1 FMUL R69, R39, R72.reuse; 675 | --:-:-:-:1 FMUL R70, R33, R72.reuse; 676 | --:-:-:-:1 FMUL R71, R37, R72; 677 | --:-:-:-:1 ISCADD R76, R73, c[0x0][0x140], 0x2; 678 | --:-:-:-:1 IADD R83, R80.reuse, 0x4; 679 | --:-:-:-:1 IADD R86, R80.reuse, 0x8; 680 | --:-:-:-:3 IADD R87, R80, 0xc; 681 | --:-:-:Y:6 IADD R76, R76, -R81; 682 | --:-:-:-:1 IADD R75, R76.reuse, R74; 683 | --:-:-:Y:5 IADD R79, R76, R88.reuse; 684 | --:-:-:-:0 IADD R82, R75, R88; 685 | --:-:-:-:5 CAL TARGET2; 686 | 02:-:-:-:1 FMUL R64, R2, R72.reuse; 687 | --:-:-:-:1 FMUL R65, R6, R72.reuse; 688 | --:-:-:-:1 FMUL R66, R0, R72.reuse; 689 | --:-:-:-:1 FMUL R67, R4, R72.reuse; 690 | --:-:-:-:1 FMUL R68, R34, R72.reuse; 691 | --:-:-:-:1 FMUL R69, R38, R72.reuse; 692 | --:-:-:-:1 FMUL R70, R32, R72.reuse; 693 | --:-:-:-:0 FMUL R71, R36, R72; 694 | --:-:-:-:5 CAL TARGET2; 695 | 02:-:-:-:1 FMUL R64, R11, R72.reuse; 696 | --:-:-:-:1 FMUL R65, R15, R72.reuse; 697 | --:-:-:-:1 FMUL R66, R9, R72.reuse; 698 | --:-:-:-:1 FMUL R67, R13, R72.reuse; 699 | --:-:-:-:1 FMUL R68, R43, R72.reuse; 700 | --:-:-:-:1 FMUL R69, R47, R72.reuse; 701 | --:-:-:-:1 FMUL R70, R41, R72.reuse; 702 | --:-:-:-:0 FMUL R71, R45, R72; 703 | --:-:-:-:5 CAL TARGET2; 704 | 02:-:-:-:1 FMUL R64, R10, R72.reuse; 705 | --:-:-:-:1 FMUL R65, R14, R72.reuse; 706 | --:-:-:-:1 FMUL R66, R8, R72.reuse; 707 | --:-:-:-:1 FMUL R67, R12, R72.reuse; 708 | --:-:-:-:1 FMUL R68, R42, R72.reuse; 709 | --:-:-:-:1 FMUL R69, R46, R72.reuse; 710 | --:-:-:-:1 FMUL R70, R40, R72.reuse; 711 | --:-:-:-:0 FMUL R71, R44, R72; 712 | --:-:-:-:5 CAL TARGET2; 713 | --:-:-:-:1 IADD R80, R80, 0x3c; 714 | --:-:-:-:1 IADD R83, R83, 0x3c; 715 | --:-:-:-:1 IADD R86, R86, 0x3c; 716 | --:-:-:-:1 IADD R87, R87, 0x3c; 717 | 02:-:-:-:1 IADD R76, R76, R85.reuse; 718 | --:-:-:-:1 IADD R75, R75, R85.reuse; 719 | --:-:-:-:1 IADD R79, R79, R85.reuse; 720 | --:-:-:-:1 IADD R82, R82, R85; 721 | --:-:-:-:1 FMUL R64, R19, 
R72.reuse; 722 | --:-:-:-:1 FMUL R65, R23, R72.reuse; 723 | --:-:-:-:1 FMUL R66, R17, R72.reuse; 724 | --:-:-:-:1 FMUL R67, R21, R72.reuse; 725 | --:-:-:-:1 FMUL R68, R51, R72.reuse; 726 | --:-:-:-:1 FMUL R69, R55, R72.reuse; 727 | --:-:-:-:1 FMUL R70, R49, R72.reuse; 728 | --:-:-:-:0 FMUL R71, R53, R72; 729 | --:-:-:-:5 CAL TARGET2; 730 | 02:-:-:-:1 FMUL R64, R18, R72.reuse; 731 | --:-:-:-:1 FMUL R65, R22, R72.reuse; 732 | --:-:-:-:1 FMUL R66, R16, R72.reuse; 733 | --:-:-:-:1 FMUL R67, R20, R72.reuse; 734 | --:-:-:-:1 FMUL R68, R50, R72.reuse; 735 | --:-:-:-:1 FMUL R69, R54, R72.reuse; 736 | --:-:-:-:1 FMUL R70, R48, R72.reuse; 737 | --:-:-:-:0 FMUL R71, R52, R72; 738 | --:-:-:-:5 CAL TARGET2; 739 | 02:-:-:-:1 FMUL R64, R27, R72.reuse; 740 | --:-:-:-:1 FMUL R65, R31, R72.reuse; 741 | --:-:-:-:1 FMUL R66, R25, R72.reuse; 742 | --:-:-:-:1 FMUL R67, R29, R72.reuse; 743 | --:-:-:-:1 FMUL R68, R59, R72.reuse; 744 | --:-:-:-:1 FMUL R69, R63, R72.reuse; 745 | --:-:-:-:1 FMUL R70, R57, R72.reuse; 746 | --:-:-:-:0 FMUL R71, R61, R72; 747 | --:-:-:-:5 CAL TARGET2; 748 | 02:-:-:-:1 FMUL R64, R26, R72.reuse; 749 | --:-:-:-:1 FMUL R65, R30, R72.reuse; 750 | --:-:-:-:1 FMUL R66, R24, R72.reuse; 751 | --:-:-:-:1 FMUL R67, R28, R72.reuse; 752 | --:-:-:-:1 FMUL R68, R58, R72.reuse; 753 | --:-:-:-:1 FMUL R69, R62, R72.reuse; 754 | --:-:-:-:1 FMUL R70, R56, R72.reuse; 755 | --:-:-:-:0 FMUL R71, R60, R72; 756 | --:-:-:-:5 CAL TARGET2; 757 | --:-:-:-:5 EXIT; 758 | TARGET2: 759 | --:-:-:-:0 IADD R80, R80, 0x1; 760 | --:-:-:-:1 STS.128 [R90], R64; 761 | --:-:-:-:0 IADD R83, R83, 0x1; 762 | --:-:-:-:1 STS.128 [R90+0x100], R68; 763 | --:-:-:-:0 IADD R86, R86, 0x1; 764 | --:-:-:-:1 LDS R64, [R89]; 765 | --:-:-:-:0 IADD R87, R87, 0x1; 766 | --:-:-:-:1 LDS R65, [R89+0x100]; 767 | --:-:-:-:0 IADD R76, R76, R81.reuse; 768 | --:-:-:-:1 LDS R66, [R89+0x200]; 769 | --:-:-:-:0 IADD R75, R75, R81.reuse; 770 | --:-:-:-:1 LDS R67, [R89+0x300]; 771 | --:-:-:-:0 IADD R79, R79, R81.reuse; 772 |
# -----------------------------------------------------------------------------------------
# Reader notes for the two flattened dump lines around this comment.
#
# Epilogue tail (line above, up to EXIT): each group of eight FMULs multiplies eight of the
# FFMA accumulator registers by R72 into R64-R71 and then issues CAL TARGET2; EXIT follows
# the last call.
#
# TARGET2 (label on the line above, RET on the line below) is the store helper reached by
# every CAL TARGET2. In instruction order it:
#   1. issues STS.128 [R90], R64 and STS.128 [R90+0x100], R68 (two 128-bit shared stores
#      of the values the caller just placed in R64.. / R68..);
#   2. reads eight 32-bit values back with LDS R64..R71 from [R89+0x000] .. [R89+0x700] in
#      0x100 steps. The read base (R89) differs from the write base (R90), so this is
#      presumably a re-layout through shared memory -- TODO confirm;
#   3. interleaves IADDs that bump R80, R83, R86, R87 by 1 and R76, R75, R79, R82 by R81.
#      The updates persist after RET, so every call advances them for the next call;
#   4. computes P0..P3 with ISETP.LT.AND: the incremented R80 / R83 (later R86 / R87) are
#      compared against c[0x0][0x148] and ANDed with P5 (first of each pair) or P6 (second).
#      P0..P3 are recomputed for R86 / R87 after the stores they first guarded are issued;
#   5. issues eight predicated STG.CG stores of R64..R71 to [R76], [R76+0x100], [R75],
#      [R75+0x100], [R79], [R79+0x100], [R82], [R82+0x100].
#
# NOTE(review): R89, R90, R81, P5 and P6 are set up earlier in the epilogue, before the
#   first CAL (R81 = R77 << 2 with R77 = c[0x0][0x158]; P5/P6 from ISETP.LT.AND against
#   c[0x0][0x144]). Presumably c[0x0][0x144] / c[0x0][0x148] are the output bounds and R81
#   a byte stride -- confirm against the kernel parameter layout.
# NOTE(review): there is no BAR.SYNC between the STS pair and the LDS reads; this looks like
#   it relies on each warp reading back only what it wrote -- confirm before reordering.
# NOTE(review): the colon-separated prefix on every instruction is presumably the maxas
#   control field (wait mask : read barrier : write barrier : yield : stall). Here the last
#   LDS sets the third field to 1 and the first STG starts with 01; the last STG sets the
#   second field to 2 and the instruction after each CAL starts with 02 -- confirm.
#
# t/MaxAs-MaxAs.t (tail of the line below): plans exactly one test and calls
# use_ok('MaxAs::MaxAs') inside BEGIN, i.e. it only checks that the module loads.
# -----------------------------------------------------------------------------------------
--:-:-:-:1 LDS R68, [R89+0x400]; 773 | --:-:-:-:0 IADD R82, R82, R81; 774 | --:-:-:-:1 LDS R69, [R89+0x500]; 775 | --:-:-:-:1 ISETP.LT.AND P0, PT, R80.reuse, c[0x0][0x148], P5; 776 | --:-:-:-:1 LDS R70, [R89+0x600]; 777 | --:-:-:-:1 ISETP.LT.AND P1, PT, R80, c[0x0][0x148], P6; 778 | --:-:1:-:1 LDS R71, [R89+0x700]; 779 | --:-:-:-:2 ISETP.LT.AND P2, PT, R83.reuse, c[0x0][0x148], P5; 780 | --:-:-:Y:7 ISETP.LT.AND P3, PT, R83, c[0x0][0x148], P6; 781 | 01:-:-:-:1 @P0 STG.CG [R76], R64; 782 | --:-:-:-:1 ISETP.LT.AND P0, PT, R86.reuse, c[0x0][0x148], P5; 783 | --:-:-:-:1 @P1 STG.CG [R76+0x100], R65; 784 | --:-:-:-:1 ISETP.LT.AND P1, PT, R86, c[0x0][0x148], P6; 785 | --:-:-:-:1 @P2 STG.CG [R75], R66; 786 | --:-:-:-:1 ISETP.LT.AND P2, PT, R87.reuse, c[0x0][0x148], P5; 787 | --:-:-:-:1 @P3 STG.CG [R75+0x100], R67; 788 | --:-:-:Y:7 ISETP.LT.AND P3, PT, R87, c[0x0][0x148], P6; 789 | --:-:-:-:2 @P0 STG.CG [R79], R68; 790 | --:-:-:-:2 @P1 STG.CG [R79+0x100], R69; 791 | --:-:-:-:2 @P2 STG.CG [R82], R70; 792 | --:2:-:-:1 @P3 STG.CG [R82+0x100], R71; 793 | --:-:-:-:5 RET; 794 | -------------------------------------------------------------------------------- /t/MaxAs-MaxAs.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Test::More tests => 1; 5 | BEGIN { use_ok('MaxAs::MaxAs') }; 6 | --------------------------------------------------------------------------------