├── Assembler
    ├── KeplerAs
    │   ├── Install.sh
    │   ├── Install_locally.sh
    │   ├── LICENSE
    │   ├── MYMETA.json
    │   ├── MYMETA.yml
    │   ├── Makefile
    │   ├── Makefile.PL
    │   ├── README.md
    │   ├── bin
    │   │   └── KeplerAs.pl
    │   ├── blib
    │   │   ├── arch
    │   │   │   ├── .exists
    │   │   │   └── auto
    │   │   │   │   └── KeplerAs
    │   │   │   │       └── KeplerAs
    │   │   │   │           └── .exists
    │   │   ├── bin
    │   │   │   └── .exists
    │   │   ├── lib
    │   │   │   ├── KeplerAs
    │   │   │   │   ├── .exists
    │   │   │   │   ├── Cubin.pm
    │   │   │   │   ├── KeplerAs.pm
    │   │   │   │   └── KeplerAsGrammar.pm
    │   │   │   └── auto
    │   │   │   │   └── KeplerAs
    │   │   │   │       └── KeplerAs
    │   │   │   │           └── .exists
    │   │   ├── man1
    │   │   │   └── .exists
    │   │   ├── man3
    │   │   │   ├── .exists
    │   │   │   └── KeplerAs::KeplerAs.3pm
    │   │   └── script
    │   │   │   ├── .exists
    │   │   │   └── KeplerAs.pl
    │   ├── lib
    │   │   └── KeplerAs
    │   │   │   ├── Cubin.pm
    │   │   │   ├── KeplerAs.pm
    │   │   │   └── KeplerAsGrammar.pm
    │   └── pm_to_blib
    ├── MaxAs
    │   ├── Changes
    │   ├── Install.sh
    │   ├── LICENSE
    │   ├── MANIFEST
    │   ├── Makefile
    │   ├── Makefile.PL
    │   ├── README.md
    │   ├── bin
    │   │   └── maxas.pl
    │   ├── blib
    │   │   ├── arch
    │   │   │   ├── .exists
    │   │   │   └── auto
    │   │   │   │   └── MaxAs
    │   │   │   │       └── MaxAs
    │   │   │   │           └── .exists
    │   │   ├── bin
    │   │   │   └── .exists
    │   │   ├── lib
    │   │   │   ├── MaxAs
    │   │   │   │   ├── .exists
    │   │   │   │   ├── Cubin.pm
    │   │   │   │   ├── MaxAs.pm
    │   │   │   │   └── MaxAsGrammar.pm
    │   │   │   └── auto
    │   │   │   │   └── MaxAs
    │   │   │   │       └── MaxAs
    │   │   │   │           └── .exists
    │   │   ├── man1
    │   │   │   └── .exists
    │   │   ├── man3
    │   │   │   ├── .exists
    │   │   │   └── MaxAs::MaxAs.3pm
    │   │   └── script
    │   │   │   ├── .exists
    │   │   │   └── maxas.pl
    │   ├── cpanfile
    │   ├── lib
    │   │   └── MaxAs
    │   │   │   ├── Cubin.pm
    │   │   │   ├── MaxAs.pm
    │   │   │   └── MaxAsGrammar.pm
    │   ├── microbench
    │   │   ├── microbench.cpp
    │   │   ├── microbench.cu
    │   │   ├── microbench.sass
    │   │   ├── shared.pl
    │   │   ├── shared_lds.sass
    │   │   ├── shared_sts16.sass
    │   │   ├── throughput.pl
    │   │   ├── throughput.sass
    │   │   ├── throughput2.pl
    │   │   ├── throughput2.sass
    │   │   ├── throughput3.pl
    │   │   ├── throughput4.pl
    │   │   ├── throughput5.pl
    │   │   ├── xmad.pl
    │   │   └── xmad2.sass
    │   ├── pm_to_blib
    │   ├── sgemm
    │   │   ├── batched_gemm.xlsx
    │   │   ├── cublas_sgemm.ptx
    │   │   ├── new.cubin
    │   │   ├── sgemm.cpp
    │   │   ├── sgemm.cu
    │   │   ├── sgemm.pl
    │   │   ├── sgemm.sln
    │   │   ├── sgemm.vcxproj
    │   │   ├── sgemm128.sass
    │   │   ├── sgemm64.sass
    │   │   ├── sgemm_final_128.sass
    │   │   ├── sgemm_final_64.sass
    │   │   ├── sgemm_pre_128.sass
    │   │   ├── sgemm_pre_64.sass
    │   │   ├── sgemm_sm52_64.cubin
    │   │   └── sgemm_sm52_64_dump.sass
    │   └── t
    │   │   └── MaxAs-MaxAs.t
    └── PascalAs
    │   ├── Changes
    │   ├── Install.sh
    │   ├── LICENSE
    │   ├── MANIFEST
    │   ├── MYMETA.json
    │   ├── MYMETA.yml
    │   ├── Makefile
    │   ├── Makefile.PL
    │   ├── README.md
    │   ├── bin
    │       └── pascalas.pl
    │   ├── blib
    │       ├── arch
    │       │   ├── .exists
    │       │   └── auto
    │       │   │   ├── MaxAs
    │       │   │       └── MaxAs
    │       │   │       │   └── .exists
    │       │   │   └── PascalAs
    │       │   │       └── PascalAs
    │       │   │           └── .exists
    │       ├── bin
    │       │   └── .exists
    │       ├── lib
    │       │   ├── MaxAs
    │       │   │   ├── .exists
    │       │   │   ├── Cubin.pm
    │       │   │   ├── MaxAs.pm
    │       │   │   └── MaxAsGrammar.pm
    │       │   ├── PascalAs
    │       │   │   ├── .exists
    │       │   │   ├── Cubin.pm
    │       │   │   ├── PascalAs.pm
    │       │   │   └── PascalAsGrammar.pm
    │       │   └── auto
    │       │   │   ├── MaxAs
    │       │   │       └── MaxAs
    │       │   │       │   └── .exists
    │       │   │   └── PascalAs
    │       │   │       └── PascalAs
    │       │   │           └── .exists
    │       ├── man1
    │       │   └── .exists
    │       ├── man3
    │       │   ├── .exists
    │       │   ├── MaxAs::MaxAs.3pm
    │       │   └── PascalAs::PascalAs.3pm
    │       └── script
    │       │   ├── .exists
    │       │   ├── maxas.pl
    │       │   └── pascalas.pl
    │   ├── cpanfile
    │   ├── lib
    │       └── PascalAs
    │       │   ├── Cubin.pm
    │       │   ├── PascalAs.pm
    │       │   └── PascalAsGrammar.pm
    │   ├── microbench
    │       ├── microbench.cpp
    │       ├── microbench.cu
    │       ├── microbench.sass
    │       ├── shared.pl
    │       ├── shared_lds.sass
    │       ├── shared_sts16.sass
    │       ├── throughput.pl
    │       ├── throughput.sass
    │       ├── throughput2.pl
    │       ├── throughput2.sass
    │       ├── throughput3.pl
    │       ├── throughput4.pl
    │       ├── throughput5.pl
    │       ├── xmad.pl
    │       └── xmad2.sass
    │   ├── pm_to_blib
    │   ├── sgemm
    │       ├── batched_gemm.xlsx
    │       ├── cublas_sgemm.ptx
    │       ├── new.cubin
    │       ├── sgemm.cpp
    │       ├── sgemm.cu
    │       ├── sgemm.cubin
    │       ├── sgemm.pl
    │       ├── sgemm.sln
    │       ├── sgemm.vcxproj
    │       ├── sgemm128.sass
    │       ├── sgemm64.sass
    │       ├── sgemm_final_128.sass
    │       ├── sgemm_final_64.sass
    │       ├── sgemm_pre_128.sass
    │       ├── sgemm_pre_64.sass
    │       ├── sgemm_sm52_64.cubin
    │       └── sgemm_sm52_64_dump.sass
    │   └── t
    │       └── MaxAs-MaxAs.t
├── Kernel
    ├── Convolution
    │   ├── Kepler
    │   │   ├── Makefile
    │   │   ├── sconv.h
    │   │   ├── sconv_bprop.cu
    │   │   ├── sconv_bprop_C128_N128.cu
    │   │   ├── sconv_bprop_C128_N128.sass
    │   │   ├── sconv_bprop_C1_N64.cu
    │   │   ├── sconv_bprop_C1_N64.sass
    │   │   ├── sconv_bprop_C64_N64.cu
    │   │   ├── sconv_bprop_C64_N64.sass
    │   │   ├── sconv_fprop.cu
    │   │   ├── sconv_fprop_K128_N128.cu
    │   │   ├── sconv_fprop_K128_N128.sass
    │   │   ├── sconv_fprop_K64_N64.cu
    │   │   ├── sconv_fprop_K64_N64.sass
    │   │   ├── sconv_fprop_K64_N64_template.cubin
    │   │   ├── sconv_update.cu
    │   │   ├── sconv_update_C128_K128.cu
    │   │   └── sconv_update_C128_K128.sass
    │   ├── Maxwell
    │   │   ├── hconv_bprop_C1_N64.sass
    │   │   ├── hconv_updat_C128_K128.sass
    │   │   ├── hconv_updat_C128_K64.sass
    │   │   ├── hconv_xprop_X128_N128.sass
    │   │   ├── hconv_xprop_X128_N64.sass
    │   │   ├── hconv_xprop_X32_N128.sass
    │   │   ├── hconv_xprop_X64_N128.sass
    │   │   ├── hconv_xprop_X64_N64.sass
    │   │   ├── persistent_rnn_bprop.sass
    │   │   ├── persistent_rnn_fprop.sass
    │   │   ├── sconv_bprop_C1_N64.sass
    │   │   ├── sconv_updat_C128_K128.sass
    │   │   ├── sconv_updat_C128_K64.sass
    │   │   ├── sconv_xprop_X128_N128.sass
    │   │   ├── sconv_xprop_X128_N64.sass
    │   │   ├── sconv_xprop_X32_N128.sass
    │   │   ├── sconv_xprop_X64_N128.sass
    │   │   ├── sconv_xprop_X64_N64.sass
    │   │   ├── xconv_direct_updat_64x32.sass
    │   │   ├── xconv_direct_xprop_64x32.sass
    │   │   ├── xconv_winograd_2x2_3x3_32x32.sass
    │   │   ├── xconv_winograd_2x2_5x5_32x32.sass
    │   │   ├── xconv_winograd_3x3_2x2_32x32.sass
    │   │   ├── xconv_winograd_3x3_4x4_32x32.sass
    │   │   ├── xconv_winograd_4x4_3x3_32x32.sass
    │   │   ├── xconv_winograd_4x4_3x3_32x32_X.sass
    │   │   ├── xconv_winograd_4x4_3x3_32x32_common.sass
    │   │   └── xconv_xprop_common.sass
    │   └── Pascal
    │   │   ├── hconv_bprop_C1_N64.sass
    │   │   ├── hconv_updat_C128_K128.sass
    │   │   ├── hconv_updat_C128_K64.sass
    │   │   ├── hconv_xprop_X128_N128.sass
    │   │   ├── hconv_xprop_X128_N64.sass
    │   │   ├── hconv_xprop_X32_N128.sass
    │   │   ├── hconv_xprop_X64_N128.sass
    │   │   ├── hconv_xprop_X64_N64.sass
    │   │   ├── persistent_rnn_bprop.sass
    │   │   ├── persistent_rnn_fprop.sass
    │   │   ├── sconv_bprop_C1_N64.sass
    │   │   ├── sconv_updat_C128_K128.sass
    │   │   ├── sconv_updat_C128_K64.sass
    │   │   ├── sconv_xprop_X128_N128.sass
    │   │   ├── sconv_xprop_X128_N64.sass
    │   │   ├── sconv_xprop_X32_N128.sass
    │   │   ├── sconv_xprop_X64_N128.sass
    │   │   ├── sconv_xprop_X64_N64.sass
    │   │   ├── xconv_direct_updat_64x32.sass
    │   │   ├── xconv_direct_xprop_64x32.sass
    │   │   ├── xconv_winograd_2x2_3x3_32x32.sass
    │   │   ├── xconv_winograd_2x2_5x5_32x32.sass
    │   │   ├── xconv_winograd_3x3_2x2_32x32.sass
    │   │   ├── xconv_winograd_3x3_4x4_32x32.sass
    │   │   ├── xconv_winograd_4x4_3x3_32x32.sass
    │   │   ├── xconv_winograd_4x4_3x3_32x32_X.sass
    │   │   ├── xconv_winograd_4x4_3x3_32x32_common.sass
    │   │   └── xconv_xprop_common.sass
    └── SGEMM
    │   ├── Kepler
    │       ├── Makefile
    │       ├── README.md
    │       ├── sgemm_common_128x128.sass
    │       ├── sgemm_common_128x32.sass
    │       ├── sgemm_nn_128x128.cu
    │       ├── sgemm_nn_128x128.sass
    │       ├── sgemm_nn_128x128_vec.cu
    │       ├── sgemm_nn_128x128_vec.sass
    │       ├── sgemm_nt_128x128.cu
    │       ├── sgemm_nt_128x128.sass
    │       ├── sgemm_nt_128x128_vec.cu
    │       ├── sgemm_nt_128x128_vec.sass
    │       ├── sgemm_tn_128x128.cu
    │       ├── sgemm_tn_128x128.sass
    │       ├── sgemm_tn_128x128_vec.cu
    │       ├── sgemm_tn_128x128_vec.sass
    │       └── sgemm_tn_128x32.sass
    │   ├── Maxwell
    │       ├── hgemm_common_128x128.sass
    │       ├── hgemm_common_128x32.sass
    │       ├── hgemm_common_128x64.sass
    │       ├── hgemm_common_32x128.sass
    │       ├── hgemm_nn_128x128.sass
    │       ├── hgemm_nn_128x32.sass
    │       ├── hgemm_nn_128x64.sass
    │       ├── hgemm_nn_16x64.sass
    │       ├── hgemm_nn_32x128.sass
    │       ├── hgemm_nn_32x64.sass
    │       ├── hgemm_nt_128x128.sass
    │       ├── hgemm_nt_16x64.sass
    │       ├── hgemm_nt_32x128.sass
    │       ├── hgemm_nt_32x32.sass
    │       ├── hgemm_tn_128x128.sass
    │       ├── hgemm_tn_128x16.sass
    │       ├── hgemm_tn_128x32.sass
    │       ├── hgemm_tn_128x64.sass
    │       ├── sgemm_common_128x128.sass
    │       ├── sgemm_common_128x32.sass
    │       ├── sgemm_common_128x64.sass
    │       ├── sgemm_common_32x128.sass
    │       ├── sgemm_nn_128x128.sass
    │       ├── sgemm_nn_128x32.sass
    │       ├── sgemm_nn_128x64.sass
    │       ├── sgemm_nn_32x128.sass
    │       ├── sgemm_nn_rnn_128x32.sass
    │       ├── sgemm_nt_128x128.sass
    │       ├── sgemm_nt_32x128.sass
    │       ├── sgemm_rnn_bprop_common_128x32.sass
    │       ├── sgemm_rnn_common_128x32.sass
    │       ├── sgemm_tn_128x128.sass
    │       ├── sgemm_tn_128x32.sass
    │       ├── sgemm_tn_128x64.sass
    │       └── sgemm_tn_rnn_bprop_128x32.sass
    │   └── Pascal
    │       ├── hgemm_common_128x128.sass
    │       ├── hgemm_common_128x32.sass
    │       ├── hgemm_common_128x64.sass
    │       ├── hgemm_common_32x128.sass
    │       ├── hgemm_nn_128x128.sass
    │       ├── hgemm_nn_128x32.sass
    │       ├── hgemm_nn_128x64.sass
    │       ├── hgemm_nn_16x64.sass
    │       ├── hgemm_nn_32x128.sass
    │       ├── hgemm_nn_32x64.sass
    │       ├── hgemm_nt_128x128.sass
    │       ├── hgemm_nt_16x64.sass
    │       ├── hgemm_nt_32x128.sass
    │       ├── hgemm_nt_32x32.sass
    │       ├── hgemm_tn_128x128.sass
    │       ├── hgemm_tn_128x16.sass
    │       ├── hgemm_tn_128x32.sass
    │       ├── hgemm_tn_128x64.sass
    │       ├── sgemm_common_128x128.sass
    │       ├── sgemm_common_128x32.sass
    │       ├── sgemm_common_128x64.sass
    │       ├── sgemm_common_32x128.sass
    │       ├── sgemm_nn_128x128.sass
    │       ├── sgemm_nn_128x32.sass
    │       ├── sgemm_nn_128x64.sass
    │       ├── sgemm_nn_32x128.sass
    │       ├── sgemm_nn_rnn_128x32.sass
    │       ├── sgemm_nt_128x128.sass
    │       ├── sgemm_nt_32x128.sass
    │       ├── sgemm_rnn_bprop_common_128x32.sass
    │       ├── sgemm_rnn_common_128x32.sass
    │       ├── sgemm_tn_128x128.sass
    │       ├── sgemm_tn_128x32.sass
    │       ├── sgemm_tn_128x64.sass
    │       └── sgemm_tn_rnn_bprop_128x32.sass
├── README.md
└── Solver
    ├── .gitignore
    ├── README.md
    ├── bin
        ├── generate_disassemble
        ├── modifier
        ├── opcode
        └── operand
    └── src
        ├── __init__.py
        ├── dumper.py
        ├── enumerator.py
        ├── inst.py
        ├── modifier.py
        ├── opcode.py
        ├── operand.py
        ├── ptxgen.pl
        ├── test.cu
        └── unique.py


/Assembler/KeplerAs/Install.sh:
--------------------------------------------------------------------------------
1 | perl Makefile.PL
2 | make
3 | sudo make install
4 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/Install_locally.sh:
--------------------------------------------------------------------------------
1 | perl Makefile.PL
2 | make
3 | 
4 | #configure the following variables in .bashrc; then source ~/.bashrc
5 | #export PERL5LIB=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/lib/:$PERL5LIB
6 | #export PATH=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/script:$PATH
7 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Scott Gray
 4 | Copyright (c) 2015~2016 Xiuxia Zhang
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in
14 | all copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | THE SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/MYMETA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |    "abstract" : "Assembler for NVIDIA Kepler architecture",
 3 |    "author" : [
 4 |       "Xiuxia Zhang <zhangxiuxia1@gmail.com>"
 5 |    ],
 6 |    "dynamic_config" : 0,
 7 |    "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",
 8 |    "license" : [
 9 |       "mit"
10 |    ],
11 |    "meta-spec" : {
12 |       "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
13 |       "version" : "2"
14 |    },
15 |    "name" : "KeplerAs-KeplerAs",
16 |    "no_index" : {
17 |       "directory" : [
18 |          "t",
19 |          "inc"
20 |       ]
21 |    },
22 |    "prereqs" : {
23 |       "build" : {
24 |          "requires" : {
25 |             "ExtUtils::MakeMaker" : "0"
26 |          }
27 |       },
28 |       "configure" : {
29 |          "requires" : {
30 |             "ExtUtils::MakeMaker" : "0"
31 |          }
32 |       },
33 |       "runtime" : {
34 |          "requires" : {
35 |             "Carp" : "1.29",
36 |             "Data::Dumper" : "2.145"
37 |          }
38 |       }
39 |    },
40 |    "release_status" : "stable",
41 |    "version" : "1.06"
42 | }
43 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/MYMETA.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | abstract: 'Assembler for NVIDIA Kepler architecture'
 3 | author:
 4 |   - 'Xiuxia Zhang <zhangxiuxia1@gmail.com>'
 5 | build_requires:
 6 |   ExtUtils::MakeMaker: '0'
 7 | configure_requires:
 8 |   ExtUtils::MakeMaker: '0'
 9 | dynamic_config: 0
10 | generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'
11 | license: mit
12 | meta-spec:
13 |   url: http://module-build.sourceforge.net/META-spec-v1.4.html
14 |   version: '1.4'
15 | name: KeplerAs-KeplerAs
16 | no_index:
17 |   directory:
18 |     - t
19 |     - inc
20 | requires:
21 |   Carp: '1.29'
22 |   Data::Dumper: '2.145'
23 | version: '1.06'
24 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/Makefile.PL:
--------------------------------------------------------------------------------
 1 | require 5.10.0;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | WriteMakefile(
 6 |     NAME              => 'KeplerAs::KeplerAs',
 7 |     VERSION_FROM      => 'lib/KeplerAs/KeplerAs.pm', # finds $VERSION
 8 |     EXE_FILES         => ['bin/KeplerAs.pl'],
 9 |     PREREQ_PM         => {Carp => 1.29, Data::Dumper => 2.145},
10 |     LICENSE           => 'MIT',
11 |     ($] >= 5.005 ?     ## Add these new keywords supported since 5.005
12 |       (ABSTRACT_FROM  => 'lib/KeplerAs/KeplerAs.pm', # retrieve abstract from module
13 |        AUTHOR         => 'Xiuxia Zhang <zhangxiuxia1@gmail.com>') : ()),
14 | );
15 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/README.md:
--------------------------------------------------------------------------------
1 | ## Kepler GPU assembler: KeplerAs
2 | 
3 | KeplerAs is based on MaxAs (the assembler for Maxwell and Pascal GPUs).
4 | Kepler uses completely different ISA encodings from Maxwell GPUs.
5 | We use the ISA encoding information cracked by our solver.
6 | 
7 | Install.sh is the script that installs the software.
8 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/arch/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/arch/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/bin/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/bin/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/lib/KeplerAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/lib/KeplerAs/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/lib/auto/KeplerAs/KeplerAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/lib/auto/KeplerAs/KeplerAs/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/man1/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/man1/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/man3/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/man3/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/man3/KeplerAs::KeplerAs.3pm:
--------------------------------------------------------------------------------
  1 | .\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
  2 | .\"
  3 | .\" Standard preamble:
  4 | .\" ========================================================================
  5 | .de Sp \" Vertical space (when we can't use .PP)
  6 | .if t .sp .5v
  7 | .if n .sp
  8 | ..
  9 | .de Vb \" Begin verbatim text
 10 | .ft CW
 11 | .nf
 12 | .ne \\$1
 13 | ..
 14 | .de Ve \" End verbatim text
 15 | .ft R
 16 | .fi
 17 | ..
 18 | .\" Set up some character translations and predefined strings.  \*(-- will
 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
 20 | .\" double quote, and \*(R" will give a right double quote.  \*(C+ will
 21 | .\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
 22 | .\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
 23 | .\" nothing in troff, for use with C<>.
 24 | .tr \(*W-
 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
 26 | .ie n \{\
 27 | .    ds -- \(*W-
 28 | .    ds PI pi
 29 | .    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
 30 | .    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
 31 | .    ds L" ""
 32 | .    ds R" ""
 33 | .    ds C` ""
 34 | .    ds C' ""
 35 | 'br\}
 36 | .el\{\
 37 | .    ds -- \|\(em\|
 38 | .    ds PI \(*p
 39 | .    ds L" ``
 40 | .    ds R" ''
 41 | .    ds C`
 42 | .    ds C'
 43 | 'br\}
 44 | .\"
 45 | .\" Escape single quotes in literal strings from groff's Unicode transform.
 46 | .ie \n(.g .ds Aq \(aq
 47 | .el       .ds Aq '
 48 | .\"
 49 | .\" If the F register is turned on, we'll generate index entries on stderr for
 50 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
 51 | .\" entries marked with X<> in POD.  Of course, you'll have to process the
 52 | .\" output yourself in some meaningful fashion.
 53 | .\"
 54 | .\" Avoid warning from groff about undefined register 'F'.
 55 | .de IX
 56 | ..
 57 | .nr rF 0
 58 | .if \n(.g .if rF .nr rF 1
 59 | .if (\n(rF:(\n(.g==0)) \{
 60 | .    if \nF \{
 61 | .        de IX
 62 | .        tm Index:\\$1\t\\n%\t"\\$2"
 63 | ..
 64 | .        if !\nF==2 \{
 65 | .            nr % 0
 66 | .            nr F 2
 67 | .        \}
 68 | .    \}
 69 | .\}
 70 | .rr rF
 71 | .\" ========================================================================
 72 | .\"
 73 | .IX Title "KeplerAs::KeplerAs 3pm"
 74 | .TH KeplerAs::KeplerAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation"
 75 | .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
 76 | .\" way too many mistakes in technical documents.
 77 | .if n .ad l
 78 | .nh
 79 | .SH "NAME"
 80 | KeplerAs::KeplerAs \- Assembler for NVIDIA Kepler architecture
 81 | .SH "SYNOPSIS"
 82 | .IX Header "SYNOPSIS"
 83 | .Vb 1
 84 | \&    KeplerAs.pl [opts]
 85 | .Ve
 86 | .SH "DESCRIPTION"
 87 | .IX Header "DESCRIPTION"
 88 | See the documentation at: https://github.com/NervanaSystems/KeplerAs
 89 | .SH "SEE ALSO"
 90 | .IX Header "SEE ALSO"
 91 | See the documentation at: https://github.com/NervanaSystems/KeplerAs
 92 | .SH "AUTHOR"
 93 | .IX Header "AUTHOR"
 94 | Scott Gray, <sgray@nervanasys.com>
 95 | .SH "COPYRIGHT AND LICENSE"
 96 | .IX Header "COPYRIGHT AND LICENSE"
 97 | The \s-1MIT\s0 License (\s-1MIT\s0)
 98 | .PP
 99 | Copyright (c) 2014 Scott Gray
100 | .PP
101 | Permission is hereby granted, free of charge, to any person obtaining a copy
102 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal
103 | in the Software without restriction, including without limitation the rights
104 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
105 | copies of the Software, and to permit persons to whom the Software is
106 | furnished to do so, subject to the following conditions:
107 | .PP
108 | The above copyright notice and this permission notice shall be included in
109 | all copies or substantial portions of the Software.
110 | .PP
111 | \&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
112 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
113 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
114 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
115 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
116 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
117 | THE SOFTWARE.\s0
118 | 


--------------------------------------------------------------------------------
/Assembler/KeplerAs/blib/script/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/script/.exists


--------------------------------------------------------------------------------
/Assembler/KeplerAs/pm_to_blib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/pm_to_blib


--------------------------------------------------------------------------------
/Assembler/MaxAs/Changes:
--------------------------------------------------------------------------------
1 | Revision history for Perl extension MaxAs::MaxAs.
2 | 
3 | 1.01  Thu Mar 26 17:09:57 2015
4 | 	- original Perl packaged version
5 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/Install.sh:
--------------------------------------------------------------------------------
1 | perl Makefile.PL
2 | make
3 | sudo make install
4 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Scott Gray
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/MANIFEST:
--------------------------------------------------------------------------------
 1 | bin/maxas.pl
 2 | Changes
 3 | lib/MaxAs/Cubin.pm
 4 | lib/MaxAs/MaxAs.pm
 5 | lib/MaxAs/MaxAsGrammar.pm
 6 | LICENSE
 7 | Makefile.PL
 8 | MANIFEST
 9 | microbench/microbench.cpp
10 | microbench/microbench.cu
11 | microbench/microbench.sass
12 | microbench/shared.pl
13 | microbench/shared_lds.sass
14 | microbench/shared_sts16.sass
15 | microbench/throughput.pl
16 | microbench/throughput.sass
17 | microbench/throughput2.pl
18 | microbench/throughput2.sass
19 | microbench/throughput3.pl
20 | microbench/throughput4.pl
21 | microbench/throughput5.pl
22 | microbench/xmad.pl
23 | microbench/xmad2.sass
24 | README.md
25 | sgemm/batched_gemm.xlsx
26 | sgemm/cublas_sgemm.ptx
27 | sgemm/sgemm.cpp
28 | sgemm/sgemm.cu
29 | sgemm/sgemm.pl
30 | sgemm/sgemm.sln
31 | sgemm/sgemm.vcxproj
32 | sgemm/sgemm128.sass
33 | sgemm/sgemm64.sass
34 | sgemm/sgemm_final_128.sass
35 | sgemm/sgemm_final_64.sass
36 | sgemm/sgemm_pre_128.sass
37 | sgemm/sgemm_pre_64.sass
38 | t/MaxAs-MaxAs.t
39 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/Makefile.PL:
--------------------------------------------------------------------------------
 1 | require 5.10.0;
 2 | use ExtUtils::MakeMaker;
 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence
 4 | # the contents of the Makefile that is written.
 5 | WriteMakefile(
 6 |     NAME              => 'MaxAs::MaxAs',
 7 |     VERSION_FROM      => 'lib/MaxAs/MaxAs.pm', # finds $VERSION
 8 |     EXE_FILES         => ['bin/maxas.pl'],
 9 |     PREREQ_PM         => {Carp => 1.29, Data::Dumper => 2.145},
10 |     LICENSE           => 'MIT',
11 |     ($] >= 5.005 ?     ## Add these new keywords supported since 5.005
12 |       (ABSTRACT_FROM  => 'lib/MaxAs/MaxAs.pm', # retrieve abstract from module
13 |        AUTHOR         => 'Scott Gray <sgray@nervanasys.com>') : ()),
14 | );
15 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/README.md:
--------------------------------------------------------------------------------
 1 | # MaxAs
 2 | Assembler for NVIDIA Maxwell architecture
 3 | 
 4 | To install (system-wide):
 5 | 
 6 |     sudo cpanm git://github.com/NervanaSystems/maxas.git
 7 | 
 8 | or
 9 | 
10 |     perl Makefile.PL
11 |     make
12 |     sudo make install
13 | 
14 | 
15 | See wiki pages for more information:
16 | 
17 | - [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction)
18 | - [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started)
19 | - [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes)
20 | - [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM)
21 | 
22 | Related work with lots of additional shader assembly (sass) examples:
23 | 
24 | - [NervanaGPU](https://github.com/NervanaSystems/nervanagpu)
25 | 
26 | This project is released under the [MIT License](http://opensource.org/licenses/MIT).
27 | 
28 | -- Scott Gray
29 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/arch/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/arch/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/bin/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/bin/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/lib/MaxAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/lib/MaxAs/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/man1/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/man1/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/man3/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/man3/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm:
--------------------------------------------------------------------------------
  1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13)
  2 | .\"
  3 | .\" Standard preamble:
  4 | .\" ========================================================================
  5 | .de Sp \" Vertical space (when we can't use .PP)
  6 | .if t .sp .5v
  7 | .if n .sp
  8 | ..
  9 | .de Vb \" Begin verbatim text
 10 | .ft CW
 11 | .nf
 12 | .ne \\$1
 13 | ..
 14 | .de Ve \" End verbatim text
 15 | .ft R
 16 | .fi
 17 | ..
 18 | .\" Set up some character translations and predefined strings.  \*(-- will
 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
 20 | .\" double quote, and \*(R" will give a right double quote.  \*(C+ will
 21 | .\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
 22 | .\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
 23 | .\" nothing in troff, for use with C<>.
 24 | .tr \(*W-
 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
 26 | .ie n \{\
 27 | .    ds -- \(*W-
 28 | .    ds PI pi
 29 | .    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
 30 | .    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
 31 | .    ds L" ""
 32 | .    ds R" ""
 33 | .    ds C` ""
 34 | .    ds C' ""
 35 | 'br\}
 36 | .el\{\
 37 | .    ds -- \|\(em\|
 38 | .    ds PI \(*p
 39 | .    ds L" ``
 40 | .    ds R" ''
 41 | 'br\}
 42 | .\"
 43 | .\" Escape single quotes in literal strings from groff's Unicode transform.
 44 | .ie \n(.g .ds Aq \(aq
 45 | .el       .ds Aq '
 46 | .\"
 47 | .\" If the F register is turned on, we'll generate index entries on stderr for
 48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
 49 | .\" entries marked with X<> in POD.  Of course, you'll have to process the
 50 | .\" output yourself in some meaningful fashion.
 51 | .ie \nF \{\
 52 | .    de IX
 53 | .    tm Index:\\$1\t\\n%\t"\\$2"
 54 | ..
 55 | .    nr % 0
 56 | .    rr F
 57 | .\}
 58 | .el \{\
 59 | .    de IX
 60 | ..
 61 | .\}
 62 | .\"
 63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
 64 | .\" Fear.  Run.  Save yourself.  No user-serviceable parts.
 65 | .    \" fudge factors for nroff and troff
 66 | .if n \{\
 67 | .    ds #H 0
 68 | .    ds #V .8m
 69 | .    ds #F .3m
 70 | .    ds #[ \f1
 71 | .    ds #] \fP
 72 | .\}
 73 | .if t \{\
 74 | .    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
 75 | .    ds #V .6m
 76 | .    ds #F 0
 77 | .    ds #[ \&
 78 | .    ds #] \&
 79 | .\}
 80 | .    \" simple accents for nroff and troff
 81 | .if n \{\
 82 | .    ds ' \&
 83 | .    ds ` \&
 84 | .    ds ^ \&
 85 | .    ds , \&
 86 | .    ds ~ ~
 87 | .    ds /
 88 | .\}
 89 | .if t \{\
 90 | .    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
 91 | .    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
 92 | .    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
 93 | .    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
 94 | .    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
 95 | .    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
 96 | .\}
 97 | .    \" troff and (daisy-wheel) nroff accents
 98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
 99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105 | .ds ae a\h'-(\w'a'u*4/10)'e
106 | .ds Ae A\h'-(\w'A'u*4/10)'E
107 | .    \" corrections for vroff
108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110 | .    \" for low resolution devices (crt and lpr)
111 | .if \n(.H>23 .if \n(.V>19 \
112 | \{\
113 | .    ds : e
114 | .    ds 8 ss
115 | .    ds o a
116 | .    ds d- d\h'-1'\(ga
117 | .    ds D- D\h'-1'\(hy
118 | .    ds th \o'bp'
119 | .    ds Th \o'LP'
120 | .    ds ae ae
121 | .    ds Ae AE
122 | .\}
123 | .rm #[ #] #H #V #F C
124 | .\" ========================================================================
125 | .\"
126 | .IX Title "MaxAs::MaxAs 3"
127 | .TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation"
128 | .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
129 | .\" way too many mistakes in technical documents.
130 | .if n .ad l
131 | .nh
132 | .SH "NAME"
133 | MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture
134 | .SH "SYNOPSIS"
135 | .IX Header "SYNOPSIS"
136 | .Vb 1
137 | \&    maxas.pl [opts]
138 | .Ve
139 | .SH "DESCRIPTION"
140 | .IX Header "DESCRIPTION"
141 | See the documentation at: https://github.com/NervanaSystems/maxas
142 | .SH "SEE ALSO"
143 | .IX Header "SEE ALSO"
144 | See the documentation at: https://github.com/NervanaSystems/maxas
145 | .SH "AUTHOR"
146 | .IX Header "AUTHOR"
147 | Scott Gray, <sgray@nervanasys.com>
148 | .SH "COPYRIGHT AND LICENSE"
149 | .IX Header "COPYRIGHT AND LICENSE"
150 | The \s-1MIT\s0 License (\s-1MIT\s0)
151 | .PP
152 | Copyright (c) 2014 Scott Gray
153 | .PP
154 | Permission is hereby granted, free of charge, to any person obtaining a copy
155 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal
156 | in the Software without restriction, including without limitation the rights
157 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
158 | copies of the Software, and to permit persons to whom the Software is
159 | furnished to do so, subject to the following conditions:
160 | .PP
161 | The above copyright notice and this permission notice shall be included in
162 | all copies or substantial portions of the Software.
163 | .PP
164 | \&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0
165 | \&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0,
166 | \&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. \s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0
167 | \&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0
168 | \&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0,
169 | \&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0
170 | \&\s-1THE\s0 \s-1SOFTWARE\s0.
171 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/blib/script/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/script/.exists


--------------------------------------------------------------------------------
/Assembler/MaxAs/cpanfile:
--------------------------------------------------------------------------------
1 | requires 'perl', '5.10.0';
2 | 
3 | requires 'Carp', '1.29';
4 | requires 'Data::Dumper', '2.145';
5 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/microbench.cpp:
--------------------------------------------------------------------------------
  1 | // microbench.cpp : Defines the entry point for the console application.
  2 | //
  3 | 
  4 | // nvcc -l cuda -o microbench microbench.cpp
  5 | 
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <string.h>
  9 | #include <cuda.h>
 10 | #include <cudaProfiler.h>
 11 | 
 12 | CUcontext hContext = 0;
 13 | 
 14 | #define CUDA_CHECK( fn ) do { \
 15 | 		CUresult status = (fn); \
 16 | 		if ( CUDA_SUCCESS != status ) { \
 17 | 			const char* errstr; \
 18 | 			cuGetErrorString(status, &errstr); \
 19 | 			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
 20 | 			if (hContext) cuCtxDestroy(hContext); \
 21 | 			exit(EXIT_FAILURE); \
 22 | 		} \
 23 | 	} while (0)
 24 | 
 25 | 
 26 | int main(int argc, char* argv[])
 27 | {
 28 | 	// Usage: microbench [timing] [blocks] [threads] [lanes|fops] [repeat]
 29 | 	//   timing  : "i" selects internal (in-kernel clock) timing, anything else
 30 | 	//             selects external (CUDA event) timing. Default: internal.
 31 | 	//   blocks  : grid size in X (values < 1 become 1). Default: 1.
 32 | 	//   threads : block size in X, 32..1024 rounded down to a multiple of 32
 33 | 	//             (out-of-range values become 128). Default: 128.
 34 | 	//   lanes   : internal timing only - lanes printed per warp, 1..32. Default: 1.
 35 | 	//   fops    : external timing only - floating point ops per launch, used
 36 | 	//             for the GFLOPS figure. Default: 1.0.
 37 | 	//   repeat  : number of warm-up launches, 1..1000. Default: 1.
 38 | 	// Loads the "microbench" kernel from microbench.cubin in the current
 39 | 	// directory, launches it once under event timing, and prints the results.
 40 | 
 41 | 	char deviceName[32];
 42 | 	int devCount, ordinal, major, minor;
 43 | 	CUdevice  hDevice;
 44 | 
 45 | 	// Initialize the Driver API and find a device
 46 | 	CUDA_CHECK( cuInit(0) );
 47 | 	CUDA_CHECK( cuDeviceGetCount(&devCount) );
 48 | 	for (ordinal = 0; ordinal < devCount; ordinal++)
 49 | 	{
 50 | 		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
 51 | 		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
 52 | 		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
 53 | 		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
 54 | 		if (major > 5 || (major == 5 && minor >= 2)) // compare (major, minor) as a pair so 6.0, 7.0, ... are not rejected
 55 | 		{
 56 | 			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
 57 | 			break;
 58 | 		}
 59 | 	}
 60 | 	if (ordinal == devCount)
 61 | 	{
 62 | 		printf("No compute 5.2 or newer device found, exiting.\n");
 63 | 		exit(EXIT_FAILURE);
 64 | 	}
 65 | 
 66 | 	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
 67 | 	int internalTiming = 1;
 68 | 	if (argc > 1)
 69 | 		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;
 70 | 
 71 | 	// Second command line arg is the number of blocks
 72 | 	int blocks = 1;
 73 | 	if (argc > 2)
 74 | 		blocks = atoi(argv[2]);
 75 | 	if (blocks < 1)
 76 | 		blocks = 1;
 77 | 
 78 | 	// Third command line arg is the number of threads
 79 | 	int threads = 128;
 80 | 	if (argc > 3)
 81 | 		threads = atoi(argv[3]);
 82 | 	if (threads > 1024 || threads < 32)
 83 | 		threads = 128;
 84 | 	threads &= -32; // round down to a whole number of 32-thread warps
 85 | 
 86 | 	// Fourth command line arg: lanes (internal timing) or fops (external timing)
 87 | 	double fops = 1.0;
 88 | 	int lanes = 1;
 89 | 	if (argc > 4)
 90 | 	{
 91 | 		if (internalTiming)
 92 | 		{
 93 | 			// The number of lanes to print for each warp
 94 | 			lanes = atoi(argv[4]);
 95 | 			if (lanes > 32 || lanes < 1)
 96 | 				lanes = 1;
 97 | 		}
 98 | 		else
 99 | 			// The number of floating point operations in a full kernel launch
100 | 			fops = atof(argv[4]);
101 | 	}
102 | 
103 | 	// Fifth command line arg is the number of warm-up launches run before the timed launch
104 | 	int repeat = 1;
105 | 	if (argc > 5)
106 | 		repeat = atoi(argv[5]);
107 | 	if (repeat > 1000 || repeat < 1)
108 | 		repeat = 1;
109 | 
110 | 	// One int per thread across the whole grid
111 | 	size_t size = sizeof(int) * threads * blocks;
112 | 
113 | 	// Setup our input and output buffers
114 | 	int* dataIn  = (int*)calloc(1, size); // zero-filled kernel input
115 | 	int* dataOut = (int*)malloc(size);
116 | 	int* clocks  = (int*)malloc(size);
117 | 	if (!dataIn || !dataOut || !clocks) { printf("Host buffer allocation failed, exiting.\n"); exit(EXIT_FAILURE); }
118 | 
119 | 	CUmodule hModule;
120 | 	CUfunction hKernel;
121 | 	CUevent hStart, hStop;
122 | 	CUdeviceptr devIn, devOut, devClocks;
123 | 
124 | 	// Init our context and device memory buffers
125 | 	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
126 | 	CUDA_CHECK( cuMemAlloc(&devIn, size) );
127 | 	CUDA_CHECK( cuMemAlloc(&devOut, size) );
128 | 	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
129 | 	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
130 | 	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
131 | 	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );
132 | 
133 | 	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
134 | 	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
135 | 
136 | 	// Load our kernel
137 | 	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
138 | 	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );
139 | 
140 | 	// Setup the params
141 | 	void* params[] = { &devOut, &devClocks, &devIn };
142 | 	float ms = 0;
143 | 
144 | 	// Warm up the clock (unless under nsight)
145 | 	if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
146 | 		for (int i = 0; i < repeat; i++)
147 | 			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
148 | 
149 | 	// Launch the kernel
150 | 	CUDA_CHECK( cuEventRecord(hStart, NULL) );
151 | 	//CUDA_CHECK( cuProfilerStart() );
152 | 	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
153 | 	//CUDA_CHECK( cuProfilerStop() );
154 | 	CUDA_CHECK( cuEventRecord(hStop, NULL) );
155 | 	CUDA_CHECK( cuEventSynchronize(hStop) );
156 | 	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
157 | 
158 | 	//CUDA_CHECK( cuCtxSynchronize() );
159 | 
160 | 	// Get back our results from each kernel
161 | 	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
162 | 	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );
163 | 
164 | 	// Cleanup and shutdown of cuda
165 | 	CUDA_CHECK( cuEventDestroy(hStart) );
166 | 	CUDA_CHECK( cuEventDestroy(hStop) );
167 | 	CUDA_CHECK( cuModuleUnload(hModule) );
168 | 	CUDA_CHECK( cuMemFree(devIn) );
169 | 	CUDA_CHECK( cuMemFree(devOut) );
170 | 	CUDA_CHECK( cuMemFree(devClocks) );
171 | 	CUDA_CHECK( cuCtxDestroy(hContext) );
172 | 	hContext = 0; // so a later CUDA_CHECK failure would not destroy the context twice
173 | 
174 | 	// With internal timing, print out the per-warp clock data
175 | 	if (internalTiming)
176 | 	{
177 | 		int count = 0, min = 0x7fffffff, max = 0; long long total = 0; // 64-bit sum cannot overflow
178 | 
179 | 		int* clocks_p  = clocks;
180 | 		int* dataOut_p = dataOut;
181 | 
182 | 		// Loop over and print results
183 | 		for (int blk = 0; blk < blocks; blk++)
184 | 		{
185 | 			// One pass per warp: threads is a multiple of 32, so tid+lane stays inside the block
186 | 
187 | 			for(int tid = 0; tid < threads; tid += 32)
188 | 			{
189 | 				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
190 | 				for (int lane = 0; lane < lanes; lane++)
191 | 					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u
192 | 
193 | 				count++;
194 | 				total += clocks_p[tid];
195 | 				if (clocks_p[tid] < min) min = clocks_p[tid];
196 | 				if (clocks_p[tid] > max) max = clocks_p[tid];
197 | 			}
198 | 			clocks_p  += threads;
199 | 			dataOut_p += threads;
200 | 		}
201 | 		printf("average: %.3f, min %d, max: %d\n", (double)total/count, min, max);
202 | 	}
203 | 	else
204 | 	{
205 | 		// Otherwise report the external (event) timing and the derived throughput
206 | 		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
207 | 	}
208 | 	// And free up host memory
209 | 	free(dataIn); free(dataOut); free(clocks);
210 | 
211 | 	return 0;
212 | }
213 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/microbench.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | // Note this file isn't configured to automatically compile
 3 | 
 4 | #include <device_functions.h>
 5 | #include <device_launch_parameters.h>
 6 | 
 7 | // Build:
 8 | // nvcc -l cuda -o microbench microbench.cpp
 9 | // nvcc -arch sm_50 -cubin microbench.cu
10 | 
11 | // Inspect a cubin (use nvdisasm from cuda 6.5 for best results):
12 | // maxas.pl -e microbench.cubin
13 | 
14 | // Insert new sass into cubin
15 | // maxas.pl -i microbench.sass microbench.cubin
16 | 
17 | // run it:
18 | // ./microbench
19 | 
20 | // Use extern C so C++ doesn't mangle our kernel name
21 | extern "C" __global__ void  microbench(int *out, int *clocks, int *in)
22 | {   // NOTE(review): presumably only a template whose SASS is replaced via "maxas.pl -i" (see build notes above) - confirm
23 |     __shared__ int share[1024]; // per-block scratch, indexed by threadIdx.x below
24 | 
25 |     int tid = threadIdx.x;
26 |     int bx  = blockIdx.x;
27 |     int by  = blockIdx.y;
28 | 
29 |     int start = clock(); // clock sample taken before the shared-memory store
30 |     // Every thread of a block reads the same input element, selected by the block indices only.
31 |     share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ
32 | 
33 |     __syncthreads(); // block-wide barrier between the stores above and the share[] read below
34 | 
35 |     int end = clock(); // clock sample taken after the barrier
36 |     // Low half: start >> 16; high half: upper 16 bits of end (assumes start is non-negative - TODO confirm).
37 |     clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start;
38 |     // Note: clocks/out are indexed by tid only, so every block writes the same elements.
39 |     out[tid] = share[tid ^ 1]; // value stored by the thread whose id differs only in bit 0
40 | }
41 | 
42 | // A note about using the Cuda Runtime.
43 | // If that's your preference over the driver API then here's what you'd do:
44 | 
45 | // In your project properties in the Cuda C/C++ panel:
46 | //    -Set the "Keep Processed Files" (-keep) option
47 | //    -Add a -v manually to the command line
48 | // If compiling on command line just add -keep -v options to nvcc.
49 | // Rebuild your solution and look in the log for these lines that follow the ptxas step:
50 | 
51 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
52 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
53 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
54 | 
55 | // You just need to manually run these 3 commands (or add them to a build script)
56 | // after you've modified the cubin generated from the preceding ptxas command.
57 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you
58 | // build your project (or you could manually run the linker step as well).
59 | 
60 | // Having done that you can call your kernel normally using the <<< >>> syntax.
61 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
62 | // With fatbin you can also keep non-maxwell optimized versions of your code.
63 | 
64 | 
65 | // I just discovered this also works as a shortcut to the above:
66 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu
67 | 
68 | // The cu kernel definitions above need to have empty bodies.
69 | // And, the cu file must be compiled to a lib separately before linking.


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/microbench.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | 
 3 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 4 | 
 5 | <CONSTANT_MAPPING>
 6 |     blockDimX : c[0x0][0x08]
 7 |     blockDimY : c[0x0][0x0c]
 8 |     blockDimZ : c[0x0][0x10]
 9 |     gridDimX  : c[0x0][0x14]
10 |     gridDimY  : c[0x0][0x18]
11 |     gridDimZ  : c[0x0][0x1c]
12 | 
13 |     param_out[0]    : c[0x0][0x140]
14 |     param_out[1]    : c[0x0][0x144]
15 |     param_clocks[0] : c[0x0][0x148]
16 |     param_clocks[1] : c[0x0][0x14c]
17 |     param_in[0]     : c[0x0][0x150]
18 |     param_in[1]     : c[0x0][0x154]
19 | </CONSTANT_MAPPING>
20 | 
21 | <REGISTER_MAPPING>
22 | 
23 |      0-1 : out<0-1>
24 |      2-3 : clocks<0-1>
25 |      4-5 : in<0-1>
26 |     6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x
27 | 
28 | </REGISTER_MAPPING>
29 | 
30 | // Load in our params (not currently used below)
31 | --:-:-:-:1      MOV in0, param_in[0];
32 | --:-:-:-:1      MOV in1, param_in[1];
33 | 
34 | // Get the first clock value
35 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
36 | 
37 | // Get the threadId and blockId
38 | // Set the Read-After-Write dependency barrier 1 and 2
39 | --:-:1:-:1      S2R tid, SR_TID.X;
40 | // Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it
41 | --:-:2:-:2      S2R bid, SR_CTAID.X;
42 | 
43 | 
44 | // Get the second clock value
45 | // Wait on the dependency barriers that were set in the prior instructions
46 | // Stall 6 to allow CS2R time to complete before next instruction
47 | // CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks
48 | // This stall count does not factor into the time calculation at all
49 | 03:-:-:-:6      CS2R clock2, SR_CLOCKLO;
50 | 
51 | // Take the difference of clocks
52 | --:-:-:-:1      IADD clock1, clock2, -clock1;
53 | 
54 | // Setup our output addresses
55 | // Stall your pipeline dependencies properly
56 | // Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code
57 | --:-:-:-:6      XMAD offset, bid, blockDimX, tid;
58 | 
59 | // LEA is "load effective address"
60 | // The offset param is shifted left 2 and added to the pointers with 64bit math
61 | --:-:-:-:6      LEA      clocks0.CC, offset, param_clocks[0],     2;
62 | --:-:-:-:1      LEA.HI.X clocks1,    offset, param_clocks[1], RZ, 2;
63 | 
64 | --:-:-:-:6      LEA      out0.CC, offset, param_out[0],     2;
65 | --:-:-:-:1      LEA.HI.X out1,    offset, param_out[1], RZ, 2;
66 | 
67 | // Output the results.
68 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
69 | --:-:-:-:1      STG.E [clocks], clock1;
70 | --:-:-:-:1      STG.E [out],    offset; # use this to return whatever you like to inspect the results
71 | --:-:-:-:5      EXIT;
72 | 
73 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/shared.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Patch shared_sts16.sass into microbench.cubin, then run the benchmark on it.
 4 | print `maxas.pl -i shared_sts16.sass microbench.cubin`;
 5 | # Stop if maxas.pl failed, and report that through a non-zero exit status.
 6 | exit 1 if $?;
 7 | # Arguments: "i" = internal timing, 1 block, 64 threads.
 8 | print `Release\\microbench.exe i 1 64`;
 9 | 
10 | 
11 | __END__
12 | 
13 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/shared_lds.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #   ord:addr:size:align
  8 | #   0:0x140:4:0
  9 | #   1:0x144:4:0
 10 | #   2:0x148:4:0
 11 | 
 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 13 | 
 14 | <REGISTER_MAPPING>
 15 | 
 16 |     0-3 : result, a, b, c
 17 | 
 18 |     4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20>
 19 | 
 20 | </REGISTER_MAPPING>
 21 | 
 22 | // Load in our params
 23 | --:-:1:-:1      S2R tid,      SR_TID.X;
 24 | --:-:2:-:1      S2R bid,      SR_CTAID.X;
 25 | 
 26 | --:-:-:-:1      MOV result,  c[0x0][0x0];
 27 | --:-:-:-:1      MOV in,      c[0x0][0x100];
 28 | 
 29 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
 30 | --:-:-:-:1      MOV result,  c[0x0][0x13c];
 31 | --:-:-:-:1      CS2R clock2, SR_CLOCKLO;
 32 | 
 33 | --:-:-:-:1      MOV blockDim, c[0x0][0x8];
 34 | --:-:-:-:1      MOV out,      c[0x0][0x140];
 35 | --:-:-:-:1      MOV clocks,   c[0x0][0x144];
 36 | 
 37 | 
 38 | 
 39 | 
 40 | <SCHEDULE_BLOCK>
 41 | 
 42 | 03:-:-:-:1      LOP.AND tid3,   tid, 3;
 43 | --:-:-:-:1      LOP.AND tid7,   tid, 7;
 44 | --:-:-:-:1      LOP.AND tid96,  tid, 96;
 45 | --:-:-:-:1      LOP.AND tid128, tid, 128;
 46 | 
 47 | // readAs = ((tid128 >> 4) | tid7) << 4
 48 | --:-:-:-:1      SHR.U32 readAs, tid128, 4;
 49 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
 50 | --:-:-:-:1      SHL     readAs, readAs, 4;
 51 | 
 52 | // readBs  = ((tid96 >> 3) | tid3) << 4
 53 | --:-:-:-:1      SHR.U32 readBs, tid96, 3;
 54 | --:-:-:-:1      LOP.OR  readBs, readBs, tid3;
 55 | #--:-:-:-:1      SHL     readBs, readBs, 4;
 56 | #--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
 57 | 
 58 | 
 59 | </SCHEDULE_BLOCK>
 60 | 
 61 | 
 62 | 
 63 | #--:-:-:-:1      LDS.U.128 result, [readBs];
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 01:-:-:-:1      IADD clock1, clock2, -clock1;
 69 | 
 70 | 
 71 | --:-:-:-:1      XMAD tid, blockDim, bid, tid;
 72 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
 73 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
 74 | --:-:-:Y:6      SHL  tid, tid, 0x2;
 75 | 
 76 | --:-:-:-:1      IADD clocks, clocks, tid;
 77 | --:-:-:-:2      IADD out,  out,  tid;
 78 | 
 79 | --:-:-:-:1      STG [clocks], clock1;
 80 | --:-:-:-:1      STG [out],    readBs;
 81 | --:-:-:-:5      EXIT;
 82 | 
 83 | <COMMENT>
 84 | 
 85 | --:-:-:-:4      LOP.AND tid32, tid, -32;
 86 | 
 87 | --:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
 88 | 
 89 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 90 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 91 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 92 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 93 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 94 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 95 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 96 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 97 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 98 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 99 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
100 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
101 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
102 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
103 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
104 | --:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
105 | 
106 | 
107 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
108 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
109 | --:-:-:-:1      LOP.AND readAs, tid,    0x80;
110 | --:-:-:-:1      SHR.U32 readAs, readAs, 4;
111 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
112 | --:-:-:-:1      SHL     readAs, readAs, 4;
113 | 
114 | // readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
115 | --:-:-:-:1      LOP.AND tid1,   tid,    0x1;
116 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
117 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
118 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
119 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
120 | 
121 | 
122 | </COMMENT>


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/shared_sts16.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #   ord:addr:size:align
  8 | #   0:0x140:4:0
  9 | #   1:0x144:4:0
 10 | #   2:0x148:4:0
 11 | 
 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 13 | 
 14 | <REGISTER_MAPPING>
 15 | 
 16 |     0-3 : result, a, b, c
 17 | 
 18 |     4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20>
 19 | 
 20 | </REGISTER_MAPPING>
 21 | 
 22 | // Load in our params
 23 | --:-:1:-:1      S2R tid,      SR_TID.X;
 24 | --:-:2:-:1      S2R bid,      SR_CTAID.X;
 25 | 
 26 | //--:-:-:-:1      MOV result,  c[0x0][0x0];
 27 | //--:-:-:-:1      MOV in,      c[0x0][0x100];
 28 | --:-:-:-:1      MOV result, 1;
 29 | 
 30 | --:-:-:-:1      MOV blockDim, c[0x0][0x8];
 31 | --:-:-:-:1      MOV out,      c[0x0][0x140];
 32 | --:-:-:-:1      MOV clocks,   c[0x0][0x144];
 33 | 
 34 | 
 35 | // readAs = ((tid >> 1) & 7) << 3;
 36 | 03:-:-:-:6      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
 37 | --:-:-:-:6      SHL     readAs, readAs, 3;
 38 | 
 39 | // readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024;
 40 | --:-:-:-:6      LOP.AND tid1,   tid,    1;
 41 | --:-:-:-:6      LOP.AND readBs, tid,    0x30;
 42 | --:-:-:-:6      SHR.U32 readBs, readBs, 3;
 43 | --:-:-:-:6      LOP.OR  readBs, readBs, tid1;
 44 | --:-:-:-:6      ISCADD  readBs, readBs, 0, 3;
 45 | 
 46 | 
 47 | 
 48 | ///--:-:-:-:1      STS [tid32], result;
 49 | //--:-:-:-:1      STS.S16 [tid32 + 2x<32>], result;
 50 | //--:-:1:-:2      LDS.U.64 result, [readBs];
 51 | 
 52 | --:-:-:-:0      CS2R clock1, SR_CLOCKLO;
 53 | --:-:1:-:6      LDS.U.64 result, [readAs];
 54 | --:-:-:-:6      CS2R clock2, SR_CLOCKLO;
 55 | 
 56 | 
 57 | 01:-:-:-:1      IADD clock1, clock2, -clock1;
 58 | 
 59 | 
 60 | --:-:-:-:1      XMAD tid, blockDim, bid, tid;
 61 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
 62 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
 63 | --:-:-:Y:6      SHL  tid, tid, 0x2;
 64 | 
 65 | --:-:-:-:1      IADD clocks, clocks, tid;
 66 | --:-:-:-:2      IADD out,  out,  tid;
 67 | 
 68 | --:-:-:-:1      STG [clocks], clock1;
 69 | --:-:-:-:1      STG [out],    result;
 70 | --:-:-:-:5      EXIT;
 71 | 
 72 | <COMMENT>
 73 | 
 74 | --:-:-:-:4      LOP.AND tid32, tid, -32;
 75 | 
 76 | --:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
 77 | 
 78 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 79 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 80 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 81 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 82 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 83 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 84 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 85 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 86 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 87 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 88 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 89 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 90 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 91 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 92 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 93 | --:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 94 | 
 95 | 03:-:-:-:6      LOP.AND  tid31, tid, 31;
 96 | --:-:-:-:6      LOP.AND  tid32, tid, 32;
 97 | --:-:-:-:6      SHL  tid32, tid32, 0x2;
 98 | --:-:-:-:6      LOP.OR  tid32, tid32, tid31;
 99 | --:-:-:-:6      SHL  tid32, tid32, 0x2;
100 | 
101 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
102 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
103 | --:-:-:-:1      LOP.AND readAs, tid,    0x80;
104 | --:-:-:-:1      SHR.U32 readAs, readAs, 4;
105 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
106 | --:-:-:-:1      SHL     readAs, readAs, 4;
107 | 
108 | // readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
109 | --:-:-:-:1      LOP.AND tid1,   tid,    0x1;
110 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
111 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
112 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
113 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
114 | 
115 | 
116 | </COMMENT>


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | 
 4 | my $loopSize  = 512;
 5 | my $blocks    = 32;
 6 | my $loops     = 10240000;
 7 | my $fileName  = 'throughput2.sass';
 8 | 
 9 | writeSassFile($fileName, $loops);
10 | 
11 | #print `maxas.pl -p $fileName`;
12 | #exit;
13 | 
14 | print `maxas.pl -i $fileName microbench.cubin`;
15 | exit if $?;
16 | 
17 | foreach my $thread128 (2)
18 | {
19 |     my $threads   = $thread128 * 128;
20 |     my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
21 | 
22 |     my $data = `Release\\microbench.exe e $blocks $threads $fops`;
23 | 
24 |     my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
25 | 
26 |     printf "%d %d %d\n", $thread128, $threads, $gflops;
27 | }
28 | 
29 | exit;
30 | 
31 | sub writeSassFile
32 | {
33 |     my ($filename, $loops) = @_;
34 | 
35 |     open my $fh, ">$filename" or die "$filename: $!";
36 | 
37 |     printf $fh <<'EOF', $loops;
38 | # Kernel: microbench
39 | 
40 | <REGISTER_MAPPING>
41 | 
42 |     0-10 : result, r1, r2, r3
43 |     20-27 ~ count, stop
44 | 
45 | </REGISTER_MAPPING>
46 | 
47 | --:-:-:-:1      MOV count, RZ;
48 | --:-:-:-:1      MOV32I stop, %d;
49 | --:-:-:-:1      MOV32I r1, 1.0;
50 | --:-:-:-:1      MOV32I r2, 1.0;
51 | --:-:-:-:4      MOV32I r3, 1.0;
52 | 
53 | LOOP:
54 | 
55 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
56 | --:-:-:-:1      IADD count, count, 1;
57 | 
58 | <CODE>
59 |     my $out;
60 | 
61 |     foreach my $i (0 .. 511)
62 |     {
63 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
64 | 
65 |         my $stall = $i == 511 ? 0 : 1;
66 | 
67 |         $out .= "--:-:-:$yield:$stall      FFMA result, r1, r2, r3;\n";
68 |     }
69 |     return $out;
70 | </CODE>
71 | 
72 | --:-:-:Y:5  @P0 BRA LOOP;
73 | --:-:-:-:5      EXIT;
74 | EOF
75 | 
76 |     close $fh;
77 | }
78 | 
79 | __END__
80 | 
81 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | # InsCnt: 18
 3 | # RegCnt: 5
 4 | # SharedSize: 4096
 5 | # BarCnt: 1
 6 | # Params(3):
 7 | #   ord:addr:size:align
 8 | #   0:0x140:4:0
 9 | #   1:0x144:4:0
10 | #   2:0x148:4:0
11 | 
12 | <REGISTER_MAPPING>
13 | 
14 |     8-20 : count
15 | 
16 | </REGISTER_MAPPING>
17 | 
18 | --:-:-:-:1      MOV R0, RZ;
19 | --:-:-:-:1      MOV R1, RZ;
20 | --:-:-:-:1      MOV R2, RZ;
21 | --:-:-:-:1      MOV R3, RZ;
22 | --:-:-:-:1      MOV R4, RZ;
23 | --:-:-:-:1      MOV R5, RZ;
24 | --:-:-:-:1      MOV R6, RZ;
25 | --:-:-:-:1      MOV R7, RZ;
26 | --:-:-:-:1      MOV R8, RZ;
27 | --:-:-:Y:6      MOV count, RZ;
28 | 
29 | // This loop is capable of running at 1700 GFlops on GM107.
30 | // You can tweak it to see how register bank conflicts or different control codes
31 | // affect performance.
32 | // With throughput.pl you can pass params to this code and do some autotuning.
33 | LOOP:
34 | 
35 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, 0x19000, PT;
36 | --:-:-:-:1      IADD count, count, 0x1;
37 | 
38 | <CODE>
39 |     my $out;
40 | 
41 |     foreach my $i (0..511) #511
42 |     {
43 |         my $y = ($i + 32) & 63 ? '-' : 'Y';
44 | 
45 |         $out .= qq|
46 | --:-:-:$y:1      FFMA R0, R1, R2, R3;|; #c[0x0][$c]
47 |     }
48 |     return $out;
49 | </CODE>
50 | 
51 | --:-:-:Y:5  @P0 BRA LOOP;
52 | 
53 | --:-:-:-:5      EXIT;
54 | 
55 | <COMMENT>
56 | 
57 | 
58 |     open my $fh, 'params.txt';
59 |     my $line = <$fh>;
60 |     close $fh;
61 |     my ($r1, $r2, $r3) = split "\t", $line;
62 | 
63 |     80-95 : out, clocks, in, tid, clock1, clock2, result
64 | 
65 | 
66 | --:-:1:-:1      S2R tid,   SR_TID.X;
67 | --:-:-:-:1      MOV out,    c[0x0][0x140];
68 | --:-:-:-:1      MOV clocks, c[0x0][0x144];
69 | 01:-:-:-:1      MOV in,     c[0x0][0x148];
70 | 
71 | 
72 | 
73 | --:-:-:-:1      MOV32I f0, 0x3f800000;
74 | --:-:-:-:1      MOV32I f1, 0x3f800000;
75 | --:-:-:-:1      MOV32I f2, 0x3f800000;
76 | --:-:-:-:5      MOV32I f3, 0x3f800000;
77 | 
78 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
79 | 
80 | 
81 | --:-:-:-:1      CS2R clock2, SR_CLOCKLO;
82 | 
83 | --:-:-:-:6      MOV32I result, 0x457;
84 | --:-:-:-:1      IADD clock1, clock2, -clock1;
85 | 
86 | 
87 | --:-:-:-:6      SHL  tid, tid, 0x2;
88 | --:-:-:-:1      IADD clocks, clocks, tid;
89 | --:-:-:-:1      IADD out,  out,  tid;
90 | 
91 | --:-:-:-:1      STG [clocks], clock1;
92 | --:-:-:-:1      STG [out],    R24;
93 | 
94 | 
95 | </COMMENT>


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput2.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | my %p;
  4 | 
  5 | $p{N}         = 8192;
  6 | $p{blocking}  = 8;
  7 | $p{unroll}    = 8;
  8 | $p{threads}   = 64;   #256
  9 | 
 10 | $p{csize}     = $p{blocking} * $p{blocking};
 11 | $p{loopSize}  = $p{unroll} * $p{csize};
 12 | $p{width}     = sqrt($p{csize} * $p{threads});
 13 | $p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
 14 | $p{loops}     = $p{N} / $p{unroll};
 15 | $p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
 16 | 
 17 | my $fileName  = 'throughput2.sass';
 18 | 
 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
 20 | 
 21 | #print join("\t", @params), "\n";
 22 | #print join("\t", @p{@params}), "\n";
 23 | 
 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
 25 | 
 26 | writeSassFile($fileName, $p{loopSize}, $p{loops});
 27 | 
 28 | #print `maxas.pl -p $fileName`;
 29 | #exit;
 30 | 
 31 | print `maxas.pl -i $fileName microbench.cubin`;
 32 | 
 33 | exit if $?;
 34 | 
 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
 36 | 
 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 38 | 
 39 | print $data;
 40 | 
 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
 42 | 
 43 | 
 44 | 
 45 | 
 46 | sub writeSassFile
 47 | {
 48 |     my ($filename, $loopSize, $loops) = @_;
 49 | 
 50 |     open my $fh, ">$filename" or die "$filename: $!";
 51 | 
 52 |     printf $fh <<'END_SASS', $loops;
 53 | # Kernel: microbench
 54 | 
 55 | <REGISTER_MAPPING>
 56 | 
 57 |      3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
 58 |      7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
 59 |      1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
 60 |      5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
 61 |     35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
 62 |     39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
 63 |     33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
 64 |     37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
 65 | 
 66 |     64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67>
 67 |     80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67>
 68 | 
 69 |     0-127 : r<0-127>
 70 | 
 71 |     100-101 : count, stop
 72 | 
 73 |     //102-112 ~ readAs, readBs, writeS
 74 | 
 75 | </REGISTER_MAPPING>
 76 | 
 77 | --:-:-:-:1      MOV count, RZ;
 78 | --:-:-:-:1      MOV32I stop, %d;
 79 | //--:-:-:-:1      MOV writeS, RZ;
 80 | //--:-:-:-:1      MOV readAs, RZ;
 81 | //--:-:-:-:1      MOV readBs, RZ;
 82 | 
 83 | <CODE>
 84 |     return join '', map "--:-:-:-:1      MOV32I r$_, 1.0;\n", 0..95;
 85 | </CODE>
 86 | 
 87 | LOOP:
 88 | 
 89 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 90 | --:-:-:-:1      IADD count, count, 1;
 91 | 
 92 | <CODE>
 93 |     my $out;
 94 | 
 95 | 
 96 |     my @cOrder;
 97 |     #my @swirl = ([0,1],[0,0],[2,0],[2,1]);
 98 |     my @swirl = ([2,0],[2,1],[0,1],[0,0]);
 99 |     #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
100 |     my @xVals = (0,1,64,65);
101 |     #my @xVals = (0,2,64,66);
102 | 
103 |     my @yVals = (0,2,64,66);
104 | 
105 |     foreach my $y (@yVals)
106 |     {
107 |         foreach my $x (@xVals)
108 |         {
109 |             push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl;
110 |         }
111 |         @xVals = reverse @xVals;
112 |     }
113 | 
114 |     foreach my $j (0..7)
115 |     {
116 |         my $odd  = $j & 1;
117 |         my $nOdd = !$odd + 0;
118 | 
119 | 		my %%insert;
120 | 
121 |         #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
122 | 
123 |         $insert{c62} =
124 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
125 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
126 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
127 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
128 |                 "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
129 | 
130 |         foreach my $c (0 .. 63)
131 |         {
132 |             my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/;
133 |             my $ins    = $insert{"c$c"} || '';
134 |             my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
135 |             my $yield  = $c == 32 ? 'Y' : '-';
136 |             my $wait   = '--'; #$c ? '--' : '01';
137 | 
138 |             $out .= "$wait:-:-:$yield:$stall      FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins";
139 |         }
140 |     }
141 |     return $out;
142 | </CODE>
143 | 
144 | --:-:-:Y:5  @P0 BRA LOOP;
145 | --:-:-:-:5      EXIT;
146 | END_SASS
147 | 
148 |     close $fh;
149 | }
150 | 
151 | __END__
152 | 
153 |         my %%insert = (
154 |             c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
155 |             c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
156 |             c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
157 |             c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
158 |         );


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput2.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | 
 3 | <REGISTER_MAPPING>
 4 | 
 5 |     0-10 : result, r1, r2, r3
 6 |     20-27 ~ count, stop
 7 | 
 8 | </REGISTER_MAPPING>
 9 | 
10 | --:-:-:-:1      MOV count, RZ;
11 | --:-:-:-:1      MOV32I stop, 102400;
12 | --:-:-:-:1      MOV32I r1, 1.0;
13 | --:-:-:-:1      MOV32I r2, 1.0;
14 | --:-:-:-:4      MOV32I r3, 1.0;
15 | 
16 | LOOP:
17 | 
18 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
19 | --:-:-:-:1      IADD count, count, 1;
20 | 
21 | <CODE>
22 |     my $out;
23 | 
24 |     foreach my $i (0 .. 511)
25 |     {
26 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
27 | 
28 |         my $stall = $i == 511 ? 0 : 1;
29 | 
30 |         #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
31 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
32 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
33 |         #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
34 |         #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
35 | 
36 |         #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
37 |         #$out .= "--:-:-:-:1      MOV result, RZ;\n";
38 | 
39 |         $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
40 |         #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
41 |         #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
42 |     }
43 |     return $out;
44 | </CODE>
45 | 
46 | --:-:-:Y:5  @P0 BRA LOOP;
47 | --:-:-:-:5      EXIT;
48 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput3.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | 
 4 | my %data;
 5 | 
 6 | foreach my $thread128 (1 .. 8)
 7 | {
 8 |     foreach my $size64 (8 .. 16)
 9 |     {
10 |         my $loopSize  = $size64 * 64;
11 |         my $loops     = int(2 * 1638400 / ($size64 * $thread128));
12 | 
13 |         my $blocks    = 16;
14 |         my $threads   = $thread128 * 128;
15 |         my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
16 |         my $fileName  = 'throughput2.sass';
17 | 
18 |         #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops;
19 |         #next;
20 | 
21 |         writeSassFile($fileName, $loopSize, $loops);
22 | 
23 |         `maxas.pl -i $fileName microbench.cubin`;
24 | 
25 |         exit if $?;
26 | 
27 |         my $data = `Release\\microbench.exe e $blocks $threads $fops`;
28 | 
29 |         my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
30 | 
31 |         printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
32 | 
33 |         push @{$data{$loopSize}}, $gflops;
34 |     }
35 | }
36 | print join("\t", 'size', 1 .. 8), "\n";
37 | foreach my $loopSize (sort {$a <=> $b} keys %data)
38 | {
39 |     print join("\t", $loopSize, @{$data{$loopSize}}), "\n";
40 | }
41 | 
42 | exit;
43 | 
44 | sub writeSassFile    # Write the microbench SASS template to $filename: $loops becomes the 'stop' loop bound, $loopSize the number of unrolled FFMAs per loop pass.
45 | {
46 |     my ($filename, $loopSize, $loops) = @_;
47 | 
48 |     open my $fh, '>', $filename or die "$filename: $!";    # three-argument open: the mode can never be taken from the file name
49 | 
50 |     printf $fh <<'EOF', $loops, $loopSize - 1, $loopSize;    # range is 0 .. $loopSize - 1 so exactly $loopSize FFMAs are emitted, matching the caller's $fops estimate
51 | # Kernel: microbench
52 | 
53 | <REGISTER_MAPPING>
54 | 
55 |     0-10 : result, r1, r2, r3, count, stop
56 | 
57 | </REGISTER_MAPPING>
58 | 
59 | --:-:-:-:1      MOV count, RZ;
60 | --:-:-:-:1      MOV32I stop, %d;
61 | --:-:-:-:1      MOV32I r1, 1.0;
62 | --:-:-:-:1      MOV32I r2, 1.0;
63 | --:-:-:-:4      MOV32I r3, 1.0;
64 | 
65 | LOOP:
66 | 
67 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
68 | --:-:-:-:1      IADD count, count, 1;
69 | 
70 | <CODE>
71 |     my $out;
72 | 
73 |     foreach my $i (0 .. %d)
74 |     {
75 |         my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y';
76 | 
77 |         $out .= "--:-:-:$y:1      FFMA result, r1, r2, r3;\n";
78 |     }
79 |     return $out;
80 | </CODE>
81 | 
82 | --:-:-:Y:5  @P0 BRA LOOP;
83 | --:-:-:-:5      EXIT;
84 | EOF
85 | 
86 |     close $fh;
87 | }
88 | 
89 | __END__
90 | 
91 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput4.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | 
  4 | my $loopSize  = 512;                 # must match the 0 .. 511 unroll written by writeSassFile
  5 | my $blocks    = 64;
  6 | my $loops     = 102400;
  7 | my $fileName  = 'throughput2.sass';
  8 | 
  9 | writeSassFile($fileName, $loops);
 10 | 
 11 | #print `maxas.pl -p $fileName`;
 12 | #exit;
 13 | 
 14 | print `maxas.pl -i $fileName microbench.cubin`;
 15 | exit(($? >> 8) || 1) if $?;          # assembler failed: exit non-zero ($? is a wait status; the exit code is its high byte)
 16 | 
 17 | foreach my $thread128 (4)
 18 | {
 19 |     my $threads   = $thread128 * 128;
 20 |     my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
 21 | 
 22 |     print "./microbench e $blocks $threads $fops\n\n";
 23 |     my $data = `./microbench e $blocks $threads $fops`;
 24 |     exit(($? >> 8) || 1) if $?;      # exit($?) would truncate e.g. 256 to status 0; "|| 1" covers death by signal
 25 | 
 26 |     my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 27 | 
 28 |     printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0;    # NOTE(review): 3050.0 is presumably the reference peak GFLOPS - confirm
 29 | }
 30 | 
 31 | exit;
 32 | 
 33 | sub writeSassFile
 34 | {
 35 |     my ($filename, $loops) = @_;
 36 | 
 37 |     open my $fh, ">$filename" or die "$filename: $!";
 38 | 
 39 |     printf $fh <<'EOF', $loops;
 40 | # Kernel: microbench
 41 | 
 42 | <REGISTER_MAPPING>
 43 | 
 44 |     0-10 : result, r1, r2, r3
 45 |     20-27 ~ count, stop
 46 | 
 47 | </REGISTER_MAPPING>
 48 | 
 49 | --:-:-:-:1      MOV count, RZ;
 50 | --:-:-:-:1      MOV32I stop, %d;
 51 | --:-:-:-:1      MOV32I r1, 1.0;
 52 | --:-:-:-:1      MOV32I r2, 1.0;
 53 | --:-:-:-:4      MOV32I r3, 1.0;
 54 | 
 55 | LOOP:
 56 | 
 57 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 58 | --:-:-:-:1      IADD count, count, 1;
 59 | 
 60 | <CODE>
 61 |     my $out;
 62 | 
 63 |     foreach my $i (0 .. 511)
 64 |     {
 65 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
 66 | 
 67 |         my $stall = $i == 511 ? 0 : 1;
 68 | 
 69 |         #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
 70 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
 71 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
 72 |         #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
 73 |         #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
 74 | 
 75 |         #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
 76 |         #$out .= "--:-:-:-:1      MOV result, RZ;\n";
 77 | 
 78 |         $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
 79 |         #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
 80 |         #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
 81 |     }
 82 |     return $out;
 83 | </CODE>
 84 | 
 85 | --:-:-:Y:5  @P0 BRA LOOP;
 86 | --:-:-:-:5      EXIT;
 87 | EOF
 88 | 
 89 |     close $fh;
 90 | }
 91 | 
 92 | __END__
 93 | 
 94 | VMAD.U8.U8
 95 | 
 96 | dddd 2655 / 4968 = 53.4%
 97 | 1d1d 4594 / 4968 = 92.4%
 98 | 11d  4746 / 4968 = 95.5%
 99 | 111d 4841 / 4968 = 97.4%
100 | 
101 | block context switches are a little more expensive than thread context switches
102 | 
103 | stall codes:
104 | 
105 | f : 13 clocks
106 | e :  8 clocks
107 | d :  6 clocks
108 | c :  8 clocks, no yield
109 | b : 11 clocks
110 | a : 10 clocks
111 | 9 :  9 clocks
112 | 8 :  8 clocks
113 | 7 :  7 clocks
114 | 6 :  6 clocks
115 | 5 :  5 clocks
116 | 4 :  4 clocks
117 | 3 :  3 clocks
118 | 2 :  2 clocks
119 | 1 :  1 clocks,  no yield
120 | 0 :  0 clocks,  no yield, dual issue


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/throughput5.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | my %p;
  4 | 
  5 | $p{N}         = 8192;
  6 | $p{blocking}  = 8;
  7 | $p{unroll}    = 8;
  8 | $p{threads}   = 64;   #256
  9 | 
 10 | $p{csize}     = $p{blocking} * $p{blocking};
 11 | $p{loopSize}  = $p{unroll} * $p{csize};
 12 | $p{width}     = sqrt($p{csize} * $p{threads});
 13 | $p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
 14 | $p{loops}     = $p{N} / $p{unroll};
 15 | $p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
 16 | 
 17 | my $fileName  = 'throughput2.sass';
 18 | 
 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
 20 | 
 21 | #print join("\t", @params), "\n";
 22 | #print join("\t", @p{@params}), "\n";
 23 | 
 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
 25 | 
 26 | writeSassFile($fileName, $p{loopSize}, $p{loops});
 27 | 
 28 | #print `maxas.pl -p $fileName`;
 29 | #exit;
 30 | 
 31 | print `maxas.pl -i $fileName microbench.cubin`;
 32 | 
 33 | exit if $?;
 34 | 
 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
 36 | 
 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 38 | 
 39 | print $data;
 40 | 
 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
 42 | 
 43 | 
 44 | 
 45 | 
 46 | sub writeSassFile
 47 | {
 48 |     my ($filename, $loopSize, $loops) = @_;
 49 | 
 50 |     open my $fh, ">$filename" or die "$filename: $!";
 51 | 
 52 |     printf $fh <<'END_SASS', $loops;
 53 | # Kernel: microbench
 54 | 
 55 | <REGISTER_MAPPING>
 56 | 
 57 |      1, 9, 2,10,17,25,18,26 : cy0x<0-7>
 58 |      5,13, 6,14,21,29,22,30 : cy1x<0-7>
 59 |      3,11, 0, 8,19,27,16,24 : cy2x<0-7>
 60 |      7,15, 4,12,23,31,20,28 : cy3x<0-7>
 61 |     35,43,32,40,51,59,48,56 : cy4x<0-7>
 62 |     39,47,36,44,55,63,52,60 : cy5x<0-7>
 63 |     33,41,34,42,49,57,50,58 : cy6x<0-7>
 64 |     37,45,38,46,53,61,54,62 : cy7x<0-7>
 65 | 
 66 |     64-71   : j0Ax<0-3>, j0By<0-3>
 67 |     72-79   : j1Ax<0-3>, j1By<0-3>
 68 | 
 69 |     0-79 : r<0-79>
 70 | 
 71 |     100-101 : count, stop
 72 | 
 73 |     //102-112 ~ readAs, readBs, writeS
 74 | 
 75 | </REGISTER_MAPPING>
 76 | 
 77 | --:-:-:-:1      MOV count, RZ;
 78 | --:-:-:-:1      MOV32I stop, %d;
 79 | //--:-:-:-:1      MOV writeS, RZ;
 80 | //--:-:-:-:1      MOV readAs, RZ;
 81 | //--:-:-:-:1      MOV readBs, RZ;
 82 | 
 83 | <CODE>
 84 |     return join '', map "--:-:-:-:1      MOV r$_, RZ;\n", 0..63;
 85 | </CODE>
 86 | 
 87 | <CODE>
 88 |     return join '', map "--:-:-:-:1      MOV32I r$_, 0x00010001;\n", 64..79;
 89 | </CODE>
 90 | 
 91 | LOOP:
 92 | 
 93 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 94 | --:-:-:-:1      IADD count, count, 1;
 95 | 
 96 | <CODE>
 97 |     my $out;
 98 | 
 99 |     my @swirl1 = ([0,0],[0,4],[4,4],[4,0]);
100 |     my @swirl2 = ([0,0],[1,0],[1,1],[0,1]);
101 |     my @swirl3 = ([0,2],[2,2],[2,0],[0,0]);
102 | 
103 |     my @cOrder;
104 |     foreach my $s1 (@swirl1)
105 |     {
106 |         foreach my $s2 (@swirl2)
107 |         {
108 |             foreach my $s3 (@swirl3)
109 |             {
110 |                 push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]];
111 |             }
112 |         }
113 |     }
114 | 
115 |     foreach my $j (0..7)
116 |     {
117 |         my $odd  = $j & 1;
118 |         my $nOdd = !$odd + 0;
119 | 
120 |         my %%insert;
121 | 
122 |         #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
123 | 
124 |         $insert{c62} =
125 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
126 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
127 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
128 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
129 |                 "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
130 | 
131 |         foreach my $c (0 .. 63)
132 |         {
133 |             my ($x,$y) = @{$cOrder[$c]};
134 |             my $ins    = $insert{"c$c"} || '';
135 |             my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
136 |             my $yield  = $c == 32 ? 'Y' : '-';
137 |             my $wait   = '--'; #$c ? '--' : '01';
138 | 
139 |             my $xReg  = $x >> 1;
140 |             my $yReg  = $y >> 1;
141 |             my $xPart = $x & 1 ? '.H1' : '';
142 |             my $yPart = $y & 1 ? '.H1' : '';
143 | 
144 |             $out .= sprintf "$wait:-:-:$yield:$stall      XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x,  $odd,$xReg,$xPart,  $odd,$yReg,$yPart,  $y,$x,  $ins;
145 |         }
146 |     }
147 |     return $out;
148 | </CODE>
149 | 
150 | --:-:-:Y:5  @P0 BRA LOOP;
151 | --:-:-:-:5      EXIT;
152 | END_SASS
153 | 
154 |     close $fh;
155 | }
156 | 
157 | __END__
158 | 
159 |         my %%insert = (
160 |             c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
161 |             c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
162 |             c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
163 |             c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
164 |         );


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/xmad.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | 
 4 | print `maxas.pl -i xmad2.sass microbench.cubin`;    # run maxas.pl -i on xmad2.sass / microbench.cubin and echo its output
 5 | 
 6 | exit(($? >> 8) || 1) if $?;                          # stop with a non-zero status if maxas.pl failed (a bare exit would report success)
 7 | 
 8 | print `./microbench i 1 128`;                        # run the benchmark binary and echo its output
 9 | 
10 | 
11 | __END__
12 | 
13 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/microbench/xmad2.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #	ord:addr:size:align
  8 | #	0:0x140:8:0
  9 | #	1:0x148:8:0
 10 | #	2:0x150:8:0
 11 | #
 12 | # Instructions:
 13 | 
 14 | <CONSTANT_MAPPING>
 15 |     blockDimX : c[0x0][0x8]
 16 |     blockDimY : c[0x0][0xc]
 17 |     blockDimZ : c[0x0][0x10]
 18 |     gridDimX : c[0x0][0x14]
 19 |     gridDimY : c[0x0][0x18]
 20 |     gridDimZ : c[0x0][0x1c]
 21 | 
 22 |     param_out[0] : c[0x0][0x140]
 23 |     param_out[1] : c[0x0][0x144]
 24 |     param_clocks[0] : c[0x0][0x148]
 25 |     param_clocks[1] : c[0x0][0x14c]
 26 |     param_in[0] : c[0x0][0x150]
 27 |     param_in[1] : c[0x0][0x154]
 28 | </CONSTANT_MAPPING>
 29 | 
 30 | <REGISTER_MAPPING>
 31 | 
 32 | 	0-1 : out<0-1>
 33 | 	2-3 : clocks<0-1>
 34 |     4-15  : result, result2, tid, bid, blockDim, clock1, clock2, scale, s
 35 |     16-24 : a, b, c, x
 36 | 
 37 | </REGISTER_MAPPING>
 38 | 
 39 | // Load in our params
 40 | --:-:-:-:1      MOV out0,      param_out[0];
 41 | --:-:-:-:1      MOV out1,      param_out[1];
 42 | --:-:-:-:1      MOV clocks0,   param_clocks[0];
 43 | --:-:-:-:1      MOV clocks1,   param_clocks[1];
 44 | //--:-:-:-:1      MOV in,       c[0x0][0x148];
 45 | --:-:-:-:1      MOV blockDim, blockDimX;
 46 | 
 47 | --:-:-:-:1      PSETP.AND.AND P0, PT, !PT, PT, PT;
 48 | 
 49 | --:-:-:-:6      MOV32I result,  0xffffffff;
 50 | --:-:-:-:6      MOV32I result2, 0x0;
 51 | --:-:-:-:1      MOV32I a, 1;
 52 | --:-:-:-:1      MOV32I b, 1;
 53 | --:-:-:-:6      MOV32I c, 0x0;
 54 | 
 55 | // (127 - scale) << 23
 56 | //--:-:-:-:6      MOV32I scale, 28;
 57 | //--:-:-:-:6      IADD scale, -scale, 127;
 58 | //--:-:-:-:6      SHL  scale, scale, 23;
 59 | 
 60 | 
 61 | //--:-:-:-:6      MOV32I c, 0x4f765432;
 62 | 
 63 | //--:-:1:-:2      LDG.CI.128 a, [in];
 64 | 
 65 | //01:-:-:-:6      VMAD.S16.S16 result, a, b, c;
 66 | 
 67 | //--:-:-:-:6      MOV result, a;
 68 | 
 69 | // a >> 16 | (b & 0xffff0000)
 70 | 
 71 | //--:-:-:-:6      SHR.U32 result, a, 16;
 72 | //--:-:-:-:6      LOP3.LUT result, result, b, c, 0xf8;
 73 | 
 74 | //--:-:-:-:6      I2I.S32.S16 result, a.H1;
 75 | 
 76 | //--:-:-:Y:d      IADD result.CC, a, -c;
 77 | //--:-:-:Y:2      IADD.X result2, b, -RZ;
 78 | 
 79 | //--:-:-:-:6      SHR result, a, 1;
 80 | 
 81 | //--:-:-:-:6      BFI result, b, 0x1010, a;
 82 | 
 83 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
 84 | 
 85 | //--:-:-:-:6      XMAD.S16.S16 c, a, b, RZ;
 86 | //--:-:-:-:6      ISET.LT.AND s, c, RZ, PT;
 87 | //--:-:-:-:6      IADD result.CC, c, result;
 88 | //--:-:-:-:6      IADD.X result2, s, result2;
 89 | 
 90 | //--:-:-:-:6      XMAD.S16.S16 result.CC, a, b, result;
 91 | //--:-:-:-:6      IADD.X result2, result2, RZ;
 92 | 
 93 | //--:-:-:-:6      SHF.R.S64 result, result, 1, result2;
 94 | //--:-:-:-:6      MOV32I result2, 0;
 95 | 
 96 | --:-:-:-:f      LOP.AND.NZ P0, RZ, result, 1;
 97 | 
 98 | --:-:-:-:6  @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result;
 99 | 
100 | //--:-:1:-:d      I2F.F32.S32 result2, a;
101 | //01:-:-:-:6      FMUL result2, result2, scale;
102 | //01:-:2:-:d      F2I.S32.F32 result, result2;
103 | 
104 | 02:-:-:-:6      CS2R clock2, SR_CLOCKLO;
105 | 
106 | //F2I   = "^$pred?F2I$ftz$x2x$round $r0, $cr20;"
107 | //I2F   = "^$pred?I2F$x2x$rnd $r0, $cr20;"
108 | //x2x   = "\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"
109 | //rnd   = "(?:\.(?<rnd>RN|RM|RP|RZ))?"
110 | //round = "(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?"
111 | //r8    = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B1|B2|B3))?(?<reuse1>\.reuse)?"
112 | //r20   = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B1|B2|B3))?(?<reuse2>\.reuse)?"
113 | 
114 | 
115 | //--:-:-:-:1      XMAD.MRG x, a, b.H1, RZ;
116 | //--:-:-:-:6      XMAD result, a.H1, b.H1, c;
117 | //--:-:-:-:1      XMAD.PSL.CBCC result, a.H1, x.H1, result;
118 | 
119 | // Get the first clock value
120 | 
121 | --:-:1:-:1      S2R tid, SR_TID.X;
122 | --:-:2:-:2      S2R bid, SR_CTAID.X;
123 | 
124 | 
125 | 
126 | // Take the difference of clocks
127 | --:-:-:-:1      IADD clock1, clock2, -clock1;
128 | 
129 | // Setup our output addresses
130 | // Stall your pipeline dependencies properly
131 | 03:-:-:-:1      XMAD tid, blockDim, bid, tid;
132 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
133 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
134 | --:-:-:Y:6      SHL  tid, tid, 0x2;
135 | 
136 | --:-:-:-:1      IADD clocks, clocks, tid;
137 | --:-:-:-:1      IADD out,  out,  tid;
138 | 
139 | // Output the results.
140 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
141 | --:-:-:-:1      STG.E [clocks], result2;
142 | --:-:-:-:1      STG.E [out],    result;
143 | --:-:-:-:5      EXIT;
144 | 
145 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/pm_to_blib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/pm_to_blib


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/batched_gemm.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/sgemm/batched_gemm.xlsx


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/cublas_sgemm.ptx:
--------------------------------------------------------------------------------
 1 | .version 4.1
 2 | .target sm_50
 3 | .address_size 64
 4 | 
 5 | // ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx
 6 | 
 7 | // You can use maxas to insert cublas_device.lib code into a cubin built from this ptx:
 8 | 
 9 | // From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib
10 | 
11 | // cuobjdump -lelf cublas_device.lib | find "sm_50"
12 | 
13 | // cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib
14 | 
15 | // maxas -l maxwell_sgemm.asm.sm_50.cubin
16 | 
17 | // maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass
18 | // maxas -e -k maxwell_sgemm_128x64_nt  maxwell_sgemm_128x64_nt.sass
19 | 
20 | // maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin
21 | // maxas -i maxwell_sgemm_128x64_nt.sass  cublas_sgemm.cubin
22 | 
23 | // The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas.
24 | 
25 | .visible .entry maxwell_sgemm_128x128_nt( // stub entry: only the name, parameter list, thread count and shared size are defined here
26 | 	.param .u64 .ptr.global.align 8 param_A,
27 | 	.param .u64 .ptr.global.align 8 param_B,
28 | 	.param .u64 .ptr.global.align 8 param_C,
29 | 	.param .s32 param_lda,
30 | 	.param .s32 param_ldb,
31 | 	.param .s32 param_ldc,
32 | 	.param .s32 param_k,
33 | 	.param .u64 .ptr.global.align 8 param_Alpha,
34 | 	.param .u64 .ptr.global.align 8 param_Beta,
35 | 	.param .s32 param_alpha,
36 | 	.param .s32 param_beta,
37 | 	.param .s32 param_flag
38 | )
39 | .reqntid 256 // the kernel must be launched with 256 threads per CTA
40 | {
41 | 	.shared .align 16 .b8 share[16384]; // 16384 bytes of statically declared shared memory
42 | 	// The body is intentionally empty: per the notes at the top of this file, the real code is inserted into the compiled cubin with `maxas -i`
43 | 	ret;
44 | }
45 | 
46 | .visible .entry maxwell_sgemm_128x64_nt( // stub entry: only the name, parameter list, thread count and shared size are defined here
47 | 	.param .u64 .ptr.global.align 8 param_A,
48 | 	.param .u64 .ptr.global.align 8 param_B,
49 | 	.param .u64 .ptr.global.align 8 param_C,
50 | 	.param .s32 param_lda,
51 | 	.param .s32 param_ldb,
52 | 	.param .s32 param_ldc,
53 | 	.param .s32 param_k,
54 | 	.param .u64 .ptr.global.align 8 param_Alpha,
55 | 	.param .u64 .ptr.global.align 8 param_Beta,
56 | 	.param .s32 param_alpha,
57 | 	.param .s32 param_beta,
58 | 	.param .s32 param_flag
59 | )
60 | .reqntid 128 // the kernel must be launched with 128 threads per CTA
61 | {
62 | 	.shared .align 16 .b8 share[12288]; // 12288 bytes of statically declared shared memory
63 | 	// The body is intentionally empty: per the notes at the top of this file, the real code is inserted into the compiled cubin with `maxas -i`
64 | 	ret;
65 | }
66 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/new.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/sgemm/new.cubin


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/sgemm.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | // Note this file isn't configured to automatically compile.
  3 | // Here's how:
  4 | 
  5 | // If you want to look at the ptx first:
  6 | // nvcc -arch sm_50 -m 32 -ptx sgemm.cu
  7 | 
  8 | // Manually compile your kernel to a cubin.
  9 | // You should only have to do this once, unless you change params or shared size or globals:
 10 | // nvcc -arch sm_50 -m 32 -cubin sgemm.cu
 11 | 
 12 | // If tweaking a kernel or writing a new one based on this shell code you would then do this:
 13 | // maxas.pl -e kernel.cubin kernel.sass
 14 | 
 15 | // I've already included a modified kernel (sgemm.sass) so the next step is..
 16 | 
 17 | // Splice the manually assembled code back into the cubin:
 18 | // maxas.pl -i sgemm.sass sgemm.cubin
 19 | 
 20 | #include <device_functions.h>
 21 | #include <device_launch_parameters.h>
 22 | #include <cuda_texture_types.h>
 23 | #include <texture_fetch_functions.h>
 24 | 
 25 | typedef texture<float4, cudaTextureType1D, cudaReadModeElementType> floatTex;
 26 | 
 27 | floatTex  texA(0, cudaFilterModePoint, cudaAddressModeBorder);
 28 | floatTex  texB(0, cudaFilterModePoint, cudaAddressModeBorder);
 29 | 
 30 | // Use extern C so C++ doesn't mangle our kernel name
 31 | extern "C"
 32 | // Placeholder ("shell") kernel: per the notes at the top of this file it is compiled once to a cubin and its code is then replaced with hand-written sass via maxas.pl -i. This kernel requires 256x1x1 threads per block.
 33 | __global__ void __launch_bounds__(256) sgemm_kernel_128(
 34 | 	float *C,
 35 | 	const int m,   const int n,   const int k,
 36 | 	const int lda, const int ldb, const int ldc,
 37 | 	float alpha, int *D)
 38 | {
 39 | 	// Declare any shared memory your kernel requires (here 1024 float4 elements)
 40 | 	// Or you could just pass the amount in as a param to cuLaunchKernel
 41 | 	__shared__ float4 share[1024];
 42 | 	// Only threadIdx.x is used: the block is one-dimensional
 43 | 	int tid = threadIdx.x;
 44 | 
 45 | 	// If you use indirect texture references, they will be passed as params at the end of the param list
 46 | 	// So set that up here to make sure they're available in your kernel: threads 0-127 pick texA, the rest pick texB
 47 | 	floatTex tex = tid > 127 ? texB : texA;
 48 | 
 49 | 	// Make use of shared and your textures so it doesn't get optimized away
 50 | 	share[tid] = tex1Dfetch(tex, tid);
 51 | 	// Barrier: the read below takes an element stored by a different thread (index 255-tid)
 52 | 	__syncthreads();
 53 | 
 54 | 	// Output something so your setup isn't optimized away.
 55 | 	C[tid] = share[255-tid].x;
 56 | }
 57 | 
 58 | extern "C" // unmangled name; placeholder ("shell") kernel whose code is replaced with hand-written sass (see the notes at the top of this file)
 59 | __global__ void __launch_bounds__(64) sgemm_kernel_64(
 60 | 	float *C,
 61 | 	const int m,   const int n,   const int k,
 62 | 	const int lda, const int ldb, const int ldc,
 63 | 	float alpha, int *D)
 64 | {
 65 | 	__shared__ float4 share[512];
 66 | 	// Only threadIdx.x is used: the block is one-dimensional
 67 | 	int tid = threadIdx.x;
 68 | 	// NOTE(review): __launch_bounds__(64) caps the block at 64 threads, so `tid > 127` never holds and only texA is selected at run time; confirm texB is still emitted as a kernel param
 69 | 	floatTex tex = tid > 127 ? texB : texA;
 70 | 	// Touch shared memory and the texture so neither is optimized away
 71 | 	share[tid] = tex1Dfetch(tex, tid);
 72 | 
 73 | 	__syncthreads();
 74 | 	// NOTE(review): with at most 64 threads, share[255-tid] (elements 192..255) is never written by the store above; presumably harmless because this body is replaced, but confirm
 75 | 	C[tid] = share[255-tid].x;
 76 | }
 77 | 
 78 | // A note about using the Cuda Runtime.
 79 | // If that's your preference over the driver API then here's what you'd do:
 80 | 
 81 | // In your project properties in the Cuda C/C++ panel:
 82 | //    -Set the "Keep Processed Files" (-keep) option
 83 | //    -Add a -v manually to the command line
 84 | // If compiling on command line just add -keep -v options to nvcc.
 85 | // Rebuild your solution and look in the log for these lines that follow the ptxas step:
 86 | 
 87 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
 88 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
 89 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
 90 | 
 91 | // You just need to manually run these 3 commands (or add them to a build script)
 92 | // after you've modified the cubin generated from the preceding ptxas command.
 93 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you
 94 | // build your project (or you could manually run the linker step as well).
 95 | 
 96 | // Having done that you can call your kernel normally using the <<< >>> syntax.
 97 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
 98 | // With fatbin you can also keep non-maxwell optimized versions of your code.
 99 | 
100 | 
101 | // I just discovered this also works as a shortcut to the above:
102 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu
103 | 
104 | // The cu kernel definitions above need to have empty bodies.
105 | // And, the cu file must be compiled to a lib separately before linking.


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/sgemm.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | # Texel format codes passed to sgemm.exe as its third argument (values as in the CUDA driver API CUarray_format enum)
  4 | my $CU_AD_FORMAT_UNSIGNED_INT8  = 0x01;
  5 | my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02;
  6 | my $CU_AD_FORMAT_FLOAT          = 0x20;
  7 | # Rebuild the 128 kernel when sgemm_pre_128.sass is missing or older than sgemm128.sass ((stat)[9] is the modification time)
  8 | if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9])
  9 | {
 10 |     print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`; # presumably writes the preprocessed source; verify against maxas.pl
 11 |     exit if $?; # stop if the previous maxas.pl call returned a non-zero status
 12 |     print `maxas.pl -i sgemm128.sass sgemm.cubin`; # insert the assembled kernel into sgemm.cubin
 13 |     exit if $?; # stop if the previous maxas.pl call returned a non-zero status
 14 |     print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`; # dump the named kernel back out of the cubin
 15 | }
 16 | if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9]) # same rebuild steps for the 64 kernel
 17 | {
 18 |     print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`;
 19 |     exit if $?;
 20 |     print `maxas.pl -i sgemm64.sass sgemm.cubin`;
 21 |     exit if $?;
 22 |     print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`;
 23 | }
 24 | 
 25 | #print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2);
 26 | # First run: output is discarded (backticks in void context); presumably a warm-up
 27 | `Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`;
 28 | # Second run: output is printed
 29 | print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`;
 30 | exit; # the script stops here, so the size sweep further down is not reached while this line is present
 31 | 
 32 | my %data; # maps matrix size N to the list of GFLOPS values reported for it, in output order
 33 | foreach my $thread128 (4 .. 64)
 34 | {
 35 |     my $N = $thread128 * 128;
 36 |     # Scale the iteration count by 1/N^3 relative to 20 iterations at N = 64*128, capped at 10000
 37 |     my $iterations = int(20 * (64 * 128)**3 / $N**3);
 38 |     $iterations = 10000 if $iterations > 10000;
 39 | 
 40 |     print "$N $iterations\n";
 41 | 
 42 |     my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`;
 43 |     # Keep the value from every "<name> GFLOPS: <value> ..." line of the benchmark output
 44 |     foreach my $bench (split "\n", $data)
 45 |     {
 46 |         if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /)
 47 |         {
 48 |             push @{$data{$N}}, $2;
 49 |             print "$1 $2\n";
 50 |         }
 51 |     }
 52 | }
 53 | print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n";
 54 | # One tab-separated row per size; the row starts with the size so it lines up with the header's first column
 55 | foreach my $N (sort { $a <=> $b } keys %data)
 56 | {
 57 |     print join("\t", $N, @{$data{$N}}), "\n";
 58 | }
 59 | 
 60 | 
 61 | #print $data;
 62 | 
 63 | __END__
 64 | 
 65 | 
 66 | 64 * 128 * 16 * 1.620 * .931 / 520
 67 | 
 68 | Max64  GFLOPS: 1377.38 (size: 256, iterations: 2000)
 69 | Max128 GFLOPS: 973.70 (size: 256, iterations: 2000)
 70 | Cub64  GFLOPS: 1272.42 (size: 256, iterations: 2000)
 71 | Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000)
 72 | 
 73 | my @data = grep /\S/, split "\n", $data;
 74 | 
 75 | my $min;
 76 | my %smData;
 77 | my @sdata;
 78 | foreach (@data)
 79 | {
 80 |     next if /GFLOPS/;
 81 | 
 82 |     my ($sm, $clock, $by, $bx) = split /\s+/;
 83 | 
 84 |     $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm};
 85 | 
 86 |     $min = $clock if !$min || $clock < $min;
 87 | 
 88 |     push @sdata, [$sm, $clock, $by, $bx];
 89 | }
 90 | 
 91 | foreach (@sdata)
 92 | {
 93 |     $_->[1] -= $smData{$_->[0]};
 94 | }
 95 | 
 96 | foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata)
 97 | {
 98 |     printf "%02d %8u  by: %2d bx: %2d\n", @$_;
 99 | 
100 | }
101 | 
102 | 
103 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/sgemm.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 11.00
 3 | # Visual Studio 2010
 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}"
 5 | EndProject
 6 | Global
 7 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 8 | 		Debug|Win32 = Debug|Win32
 9 | 		Release|Win32 = Release|Win32
10 | 	EndGlobalSection
11 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
12 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32
13 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32
14 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32
15 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32
16 | 	EndGlobalSection
17 | 	GlobalSection(SolutionProperties) = preSolution
18 | 		HideSolutionNode = FALSE
19 | 	EndGlobalSection
20 | EndGlobal
21 | 


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/sgemm.vcxproj:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup Label="ProjectConfigurations">
 4 |     <ProjectConfiguration Include="Debug|Win32">
 5 |       <Configuration>Debug</Configuration>
 6 |       <Platform>Win32</Platform>
 7 |     </ProjectConfiguration>
 8 |     <ProjectConfiguration Include="Release|Win32">
 9 |       <Configuration>Release</Configuration>
10 |       <Platform>Win32</Platform>
11 |     </ProjectConfiguration>
12 |   </ItemGroup>
13 |   <PropertyGroup Label="Globals">
14 |     <ProjectGuid>{D571379D-3653-43CB-BE83-A6C68D392A05}</ProjectGuid>
15 |     <Keyword>Win32Proj</Keyword>
16 |     <RootNamespace>sgemm</RootNamespace>
17 |   </PropertyGroup>
18 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
19 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
20 |     <ConfigurationType>Application</ConfigurationType>
21 |     <UseDebugLibraries>true</UseDebugLibraries>
22 |     <CharacterSet>Unicode</CharacterSet>
23 |   </PropertyGroup>
24 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
25 |     <ConfigurationType>Application</ConfigurationType>
26 |     <UseDebugLibraries>false</UseDebugLibraries>
27 |     <WholeProgramOptimization>true</WholeProgramOptimization>
28 |     <CharacterSet>Unicode</CharacterSet>
29 |   </PropertyGroup>
30 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
31 |   <ImportGroup Label="ExtensionSettings">
32 |   </ImportGroup>
33 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
34 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
35 |   </ImportGroup>
36 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
37 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
38 |   </ImportGroup>
39 |   <PropertyGroup Label="UserMacros" />
40 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
41 |     <LinkIncremental>true</LinkIncremental>
42 |   </PropertyGroup>
43 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
44 |     <LinkIncremental>false</LinkIncremental>
45 |   </PropertyGroup>
46 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
47 |     <ClCompile>
48 |       <PrecompiledHeader>
49 |       </PrecompiledHeader>
50 |       <WarningLevel>Level3</WarningLevel>
51 |       <Optimization>Disabled</Optimization>
52 |       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
53 |       <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
54 |     </ClCompile>
55 |     <Link>
56 |       <SubSystem>Console</SubSystem>
57 |       <GenerateDebugInformation>true</GenerateDebugInformation>
58 |       <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
59 |       <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
60 |     </Link>
61 |   </ItemDefinitionGroup>
62 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
63 |     <ClCompile>
64 |       <WarningLevel>Level3</WarningLevel>
65 |       <PrecompiledHeader>
66 |       </PrecompiledHeader>
67 |       <Optimization>MaxSpeed</Optimization>
68 |       <FunctionLevelLinking>true</FunctionLevelLinking>
69 |       <IntrinsicFunctions>true</IntrinsicFunctions>
70 |       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
71 |       <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
72 |     </ClCompile>
73 |     <Link>
74 |       <SubSystem>Console</SubSystem>
75 |       <GenerateDebugInformation>true</GenerateDebugInformation>
76 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
77 |       <OptimizeReferences>true</OptimizeReferences>
78 |       <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
79 |       <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
80 |     </Link>
81 |   </ItemDefinitionGroup>
82 |   <ItemGroup>
83 |     <ClCompile Include="sgemm.cpp" />
84 |   </ItemGroup>
85 |   <ItemGroup>
86 |     <None Include="sgemm128.sass" />
87 |     <None Include="sgemm64.sass" />
88 |   </ItemGroup>
89 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
90 |   <ImportGroup Label="ExtensionTargets">
91 |   </ImportGroup>
92 | </Project>


--------------------------------------------------------------------------------
/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin


--------------------------------------------------------------------------------
/Assembler/MaxAs/t/MaxAs-MaxAs.t:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | # Smoke test: the single planned test checks that the MaxAs::MaxAs module can be loaded
4 | use Test::More tests => 1;
5 | BEGIN { use_ok('MaxAs::MaxAs') };
6 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/Changes:
--------------------------------------------------------------------------------
1 | Revision history for Perl extension MaxAs::MaxAs.
2 | 
3 | 1.01  Thu Mar 26 17:09:57 2015
4 | 	- original Perl packaged version
5 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/Install.sh:
--------------------------------------------------------------------------------
1 | perl Makefile.PL    # generate the Makefile from Makefile.PL
2 | make                # build the distribution
3 | sudo make install   # install as root
4 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Scott Gray
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/MANIFEST:
--------------------------------------------------------------------------------
 1 | bin/maxas.pl
 2 | Changes
 3 | lib/MaxAs/Cubin.pm
 4 | lib/MaxAs/MaxAs.pm
 5 | lib/MaxAs/MaxAsGrammar.pm
 6 | LICENSE
 7 | Makefile.PL
 8 | MANIFEST
 9 | microbench/microbench.cpp
10 | microbench/microbench.cu
11 | microbench/microbench.sass
12 | microbench/shared.pl
13 | microbench/shared_lds.sass
14 | microbench/shared_sts16.sass
15 | microbench/throughput.pl
16 | microbench/throughput.sass
17 | microbench/throughput2.pl
18 | microbench/throughput2.sass
19 | microbench/throughput3.pl
20 | microbench/throughput4.pl
21 | microbench/throughput5.pl
22 | microbench/xmad.pl
23 | microbench/xmad2.sass
24 | README.md
25 | sgemm/batched_gemm.xlsx
26 | sgemm/cublas_sgemm.ptx
27 | sgemm/sgemm.cpp
28 | sgemm/sgemm.cu
29 | sgemm/sgemm.pl
30 | sgemm/sgemm.sln
31 | sgemm/sgemm.vcxproj
32 | sgemm/sgemm128.sass
33 | sgemm/sgemm64.sass
34 | sgemm/sgemm_final_128.sass
35 | sgemm/sgemm_final_64.sass
36 | sgemm/sgemm_pre_128.sass
37 | sgemm/sgemm_pre_64.sass
38 | t/MaxAs-MaxAs.t
39 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/MYMETA.json:
--------------------------------------------------------------------------------
 1 | {
 2 |    "abstract" : "Assembler for NVIDIA Maxwell architecture",
 3 |    "author" : [
 4 |       "Scott Gray <sgray@nervanasys.com>"
 5 |    ],
 6 |    "dynamic_config" : 0,
 7 |    "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001",
 8 |    "license" : [
 9 |       "mit"
10 |    ],
11 |    "meta-spec" : {
12 |       "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
13 |       "version" : "2"
14 |    },
15 |    "name" : "PascalAs-PascalAs",
16 |    "no_index" : {
17 |       "directory" : [
18 |          "t",
19 |          "inc"
20 |       ]
21 |    },
22 |    "prereqs" : {
23 |       "build" : {
24 |          "requires" : {
25 |             "ExtUtils::MakeMaker" : "0"
26 |          }
27 |       },
28 |       "configure" : {
29 |          "requires" : {
30 |             "ExtUtils::MakeMaker" : "0"
31 |          }
32 |       },
33 |       "runtime" : {
34 |          "requires" : {
35 |             "Carp" : "1.29",
36 |             "Data::Dumper" : "2.145"
37 |          }
38 |       }
39 |    },
40 |    "release_status" : "stable",
41 |    "version" : "1.06"
42 | }
43 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/MYMETA.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | abstract: 'Assembler for NVIDIA Maxwell architecture'
 3 | author:
 4 |   - 'Scott Gray <sgray@nervanasys.com>'
 5 | build_requires:
 6 |   ExtUtils::MakeMaker: '0'
 7 | configure_requires:
 8 |   ExtUtils::MakeMaker: '0'
 9 | dynamic_config: 0
10 | generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001'
11 | license: mit
12 | meta-spec:
13 |   url: http://module-build.sourceforge.net/META-spec-v1.4.html
14 |   version: '1.4'
15 | name: PascalAs-PascalAs
16 | no_index:
17 |   directory:
18 |     - t
19 |     - inc
20 | requires:
21 |   Carp: '1.29'
22 |   Data::Dumper: '2.145'
23 | version: '1.06'
24 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/Makefile.PL:
--------------------------------------------------------------------------------
 1 | require 5.10.0; # refuse to run on a Perl older than 5.10.0
 2 | use ExtUtils::MakeMaker;
 3 | # Generates the Makefile for the PascalAs::PascalAs distribution.
 4 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence the contents of the Makefile that is written.
 5 | WriteMakefile(
 6 |     NAME              => 'PascalAs::PascalAs',
 7 |     VERSION_FROM      => 'lib/PascalAs/PascalAs.pm', # finds $VERSION
 8 |     EXE_FILES         => ['bin/pascalas.pl'], # script installed along with the module
 9 |     PREREQ_PM         => {Carp => 1.29, Data::Dumper => 2.145}, # required modules and their minimum versions
10 |     LICENSE           => 'MIT',
11 |     ($] >= 5.005 ?     ## Add these new keywords supported since 5.005 (always true here, given the 5.10.0 requirement above)
12 |       (ABSTRACT_FROM  => 'lib/PascalAs/PascalAs.pm', # retrieve abstract from module
13 |        AUTHOR         => 'Scott Gray <sgray@nervanasys.com>') : ()),
14 | );
15 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/README.md:
--------------------------------------------------------------------------------
 1 | # MaxAs
 2 | Assembler for NVIDIA Maxwell architecture
 3 | 
 4 | To install (system-wide):
 5 | 
 6 |     sudo cpanm git://github.com/NervanaSystems/maxas.git
 7 | 
 8 | or
 9 | 
10 |     perl Makefile.PL
11 |     make
12 |     sudo make install
13 | 
14 | 
15 | See wiki pages for more information:
16 | 
17 | - [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction)
18 | - [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started)
19 | - [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes)
20 | - [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM)
21 | 
22 | Related work with lots of additional shader assembly (sass) examples:
23 | 
24 | - [NervanaGPU](https://github.com/NervanaSystems/nervanagpu)
25 | 
26 | This project is released under the [MIT License](http://opensource.org/licenses/MIT).
27 | 
28 | -- Scott Gray
29 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/arch/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/arch/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/arch/auto/MaxAs/MaxAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/arch/auto/MaxAs/MaxAs/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/arch/auto/PascalAs/PascalAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/arch/auto/PascalAs/PascalAs/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/bin/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/bin/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/lib/MaxAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/MaxAs/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/lib/PascalAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/PascalAs/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/man1/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/man1/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/man3/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/man3/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm:
--------------------------------------------------------------------------------
  1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13)
  2 | .\"
  3 | .\" Standard preamble:
  4 | .\" ========================================================================
  5 | .de Sp \" Vertical space (when we can't use .PP)
  6 | .if t .sp .5v
  7 | .if n .sp
  8 | ..
  9 | .de Vb \" Begin verbatim text
 10 | .ft CW
 11 | .nf
 12 | .ne \\$1
 13 | ..
 14 | .de Ve \" End verbatim text
 15 | .ft R
 16 | .fi
 17 | ..
 18 | .\" Set up some character translations and predefined strings.  \*(-- will
 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
 20 | .\" double quote, and \*(R" will give a right double quote.  \*(C+ will
 21 | .\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
 22 | .\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
 23 | .\" nothing in troff, for use with C<>.
 24 | .tr \(*W-
 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
 26 | .ie n \{\
 27 | .    ds -- \(*W-
 28 | .    ds PI pi
 29 | .    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
 30 | .    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
 31 | .    ds L" ""
 32 | .    ds R" ""
 33 | .    ds C` ""
 34 | .    ds C' ""
 35 | 'br\}
 36 | .el\{\
 37 | .    ds -- \|\(em\|
 38 | .    ds PI \(*p
 39 | .    ds L" ``
 40 | .    ds R" ''
 41 | 'br\}
 42 | .\"
 43 | .\" Escape single quotes in literal strings from groff's Unicode transform.
 44 | .ie \n(.g .ds Aq \(aq
 45 | .el       .ds Aq '
 46 | .\"
 47 | .\" If the F register is turned on, we'll generate index entries on stderr for
 48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
 49 | .\" entries marked with X<> in POD.  Of course, you'll have to process the
 50 | .\" output yourself in some meaningful fashion.
 51 | .ie \nF \{\
 52 | .    de IX
 53 | .    tm Index:\\$1\t\\n%\t"\\$2"
 54 | ..
 55 | .    nr % 0
 56 | .    rr F
 57 | .\}
 58 | .el \{\
 59 | .    de IX
 60 | ..
 61 | .\}
 62 | .\"
 63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
 64 | .\" Fear.  Run.  Save yourself.  No user-serviceable parts.
 65 | .    \" fudge factors for nroff and troff
 66 | .if n \{\
 67 | .    ds #H 0
 68 | .    ds #V .8m
 69 | .    ds #F .3m
 70 | .    ds #[ \f1
 71 | .    ds #] \fP
 72 | .\}
 73 | .if t \{\
 74 | .    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
 75 | .    ds #V .6m
 76 | .    ds #F 0
 77 | .    ds #[ \&
 78 | .    ds #] \&
 79 | .\}
 80 | .    \" simple accents for nroff and troff
 81 | .if n \{\
 82 | .    ds ' \&
 83 | .    ds ` \&
 84 | .    ds ^ \&
 85 | .    ds , \&
 86 | .    ds ~ ~
 87 | .    ds /
 88 | .\}
 89 | .if t \{\
 90 | .    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
 91 | .    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
 92 | .    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
 93 | .    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
 94 | .    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
 95 | .    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
 96 | .\}
 97 | .    \" troff and (daisy-wheel) nroff accents
 98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
 99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105 | .ds ae a\h'-(\w'a'u*4/10)'e
106 | .ds Ae A\h'-(\w'A'u*4/10)'E
107 | .    \" corrections for vroff
108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110 | .    \" for low resolution devices (crt and lpr)
111 | .if \n(.H>23 .if \n(.V>19 \
112 | \{\
113 | .    ds : e
114 | .    ds 8 ss
115 | .    ds o a
116 | .    ds d- d\h'-1'\(ga
117 | .    ds D- D\h'-1'\(hy
118 | .    ds th \o'bp'
119 | .    ds Th \o'LP'
120 | .    ds ae ae
121 | .    ds Ae AE
122 | .\}
123 | .rm #[ #] #H #V #F C
124 | .\" ========================================================================
125 | .\"
126 | .IX Title "MaxAs::MaxAs 3"
127 | .TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation"
128 | .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
129 | .\" way too many mistakes in technical documents.
130 | .if n .ad l
131 | .nh
132 | .SH "NAME"
133 | MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture
134 | .SH "SYNOPSIS"
135 | .IX Header "SYNOPSIS"
136 | .Vb 1
137 | \&    maxas.pl [opts]
138 | .Ve
139 | .SH "DESCRIPTION"
140 | .IX Header "DESCRIPTION"
141 | See the documentation at: https://github.com/NervanaSystems/maxas
142 | .SH "SEE ALSO"
143 | .IX Header "SEE ALSO"
144 | See the documentation at: https://github.com/NervanaSystems/maxas
145 | .SH "AUTHOR"
146 | .IX Header "AUTHOR"
147 | Scott Gray, <sgray@nervanasys.com>
148 | .SH "COPYRIGHT AND LICENSE"
149 | .IX Header "COPYRIGHT AND LICENSE"
150 | The \s-1MIT\s0 License (\s-1MIT\s0)
151 | .PP
152 | Copyright (c) 2014 Scott Gray
153 | .PP
154 | Permission is hereby granted, free of charge, to any person obtaining a copy
155 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal
156 | in the Software without restriction, including without limitation the rights
157 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
158 | copies of the Software, and to permit persons to whom the Software is
159 | furnished to do so, subject to the following conditions:
160 | .PP
161 | The above copyright notice and this permission notice shall be included in
162 | all copies or substantial portions of the Software.
163 | .PP
164 | \&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0
165 | \&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0,
166 | \&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. \s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0
167 | \&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0
168 | \&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0,
169 | \&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0
170 | \&\s-1THE\s0 \s-1SOFTWARE\s0.
171 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm:
--------------------------------------------------------------------------------
  1 | .\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
  2 | .\"
  3 | .\" Standard preamble:
  4 | .\" ========================================================================
  5 | .de Sp \" Vertical space (when we can't use .PP)
  6 | .if t .sp .5v
  7 | .if n .sp
  8 | ..
  9 | .de Vb \" Begin verbatim text
 10 | .ft CW
 11 | .nf
 12 | .ne \\$1
 13 | ..
 14 | .de Ve \" End verbatim text
 15 | .ft R
 16 | .fi
 17 | ..
 18 | .\" Set up some character translations and predefined strings.  \*(-- will
 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
 20 | .\" double quote, and \*(R" will give a right double quote.  \*(C+ will
 21 | .\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
 22 | .\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
 23 | .\" nothing in troff, for use with C<>.
 24 | .tr \(*W-
 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
 26 | .ie n \{\
 27 | .    ds -- \(*W-
 28 | .    ds PI pi
 29 | .    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
 30 | .    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
 31 | .    ds L" ""
 32 | .    ds R" ""
 33 | .    ds C` ""
 34 | .    ds C' ""
 35 | 'br\}
 36 | .el\{\
 37 | .    ds -- \|\(em\|
 38 | .    ds PI \(*p
 39 | .    ds L" ``
 40 | .    ds R" ''
 41 | .    ds C`
 42 | .    ds C'
 43 | 'br\}
 44 | .\"
 45 | .\" Escape single quotes in literal strings from groff's Unicode transform.
 46 | .ie \n(.g .ds Aq \(aq
 47 | .el       .ds Aq '
 48 | .\"
 49 | .\" If the F register is turned on, we'll generate index entries on stderr for
 50 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
 51 | .\" entries marked with X<> in POD.  Of course, you'll have to process the
 52 | .\" output yourself in some meaningful fashion.
 53 | .\"
 54 | .\" Avoid warning from groff about undefined register 'F'.
 55 | .de IX
 56 | ..
 57 | .nr rF 0
 58 | .if \n(.g .if rF .nr rF 1
 59 | .if (\n(rF:(\n(.g==0)) \{
 60 | .    if \nF \{
 61 | .        de IX
 62 | .        tm Index:\\$1\t\\n%\t"\\$2"
 63 | ..
 64 | .        if !\nF==2 \{
 65 | .            nr % 0
 66 | .            nr F 2
 67 | .        \}
 68 | .    \}
 69 | .\}
 70 | .rr rF
 71 | .\" ========================================================================
 72 | .\"
 73 | .IX Title "PascalAs::PascalAs 3pm"
 74 | .TH PascalAs::PascalAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation"
 75 | .\" For nroff, turn off justification.  Always turn off hyphenation; it makes
 76 | .\" way too many mistakes in technical documents.
 77 | .if n .ad l
 78 | .nh
 79 | .SH "NAME"
 80 | PascalAs::PascalAs \- Assembler for NVIDIA Maxwell architecture
 81 | .SH "SYNOPSIS"
 82 | .IX Header "SYNOPSIS"
 83 | .Vb 1
 84 | \&    Pascalas.pl [opts]
 85 | .Ve
 86 | .SH "DESCRIPTION"
 87 | .IX Header "DESCRIPTION"
 88 | See the documentation at: https://github.com/NervanaSystems/pascalas
 89 | .SH "SEE ALSO"
 90 | .IX Header "SEE ALSO"
 91 | See the documentation at: https://github.com/NervanaSystems/pascalas
 92 | .SH "AUTHOR"
 93 | .IX Header "AUTHOR"
 94 | Scott Gray, <sgray@nervanasys.com>
 95 | .SH "COPYRIGHT AND LICENSE"
 96 | .IX Header "COPYRIGHT AND LICENSE"
 97 | The \s-1MIT\s0 License (\s-1MIT\s0)
 98 | .PP
 99 | Copyright (c) 2014 Scott Gray
100 | .PP
101 | Permission is hereby granted, free of charge, to any person obtaining a copy
102 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal
103 | in the Software without restriction, including without limitation the rights
104 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
105 | copies of the Software, and to permit persons to whom the Software is
106 | furnished to do so, subject to the following conditions:
107 | .PP
108 | The above copyright notice and this permission notice shall be included in
109 | all copies or substantial portions of the Software.
110 | .PP
111 | \&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
112 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
113 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
114 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
115 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
116 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
117 | THE SOFTWARE.\s0
118 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/blib/script/.exists:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/script/.exists


--------------------------------------------------------------------------------
/Assembler/PascalAs/cpanfile:
--------------------------------------------------------------------------------
1 | requires 'perl', '5.10.0';
2 | 
3 | requires 'Carp', '1.29';
4 | requires 'Data::Dumper', '2.145';
5 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/microbench.cpp:
--------------------------------------------------------------------------------
  1 | // microbench.cpp : Defines the entry point for the console application.
  2 | //
  3 | 
  4 | // nvcc -l cuda -o microbench microbench.cpp
  5 | 
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <string.h>
  9 | #include <cuda.h>
 10 | #include <cudaProfiler.h>
 11 | 
 12 | CUcontext hContext = 0;
 13 | 
 14 | #define CUDA_CHECK( fn ) do { \
 15 | 		CUresult status = (fn); \
 16 | 		if ( CUDA_SUCCESS != status ) { \
 17 | 			const char* errstr; \
 18 | 			cuGetErrorString(status, &errstr); \
 19 | 			printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \
 20 | 			if (hContext) cuCtxDestroy(hContext); \
 21 | 			exit(EXIT_FAILURE); \
 22 | 		} \
 23 | 	} while (0)
 24 | 
 25 | 
 26 | int main(int argc, char* argv[])
 27 | {
 28 | 	//int iTest = 2896;
 29 | 	//while (iTest < 0x7fff)
 30 | 	//{
 31 | 	//	int iResult = iTest * iTest;
 32 | 	//	float fTest = (float)iTest;
 33 | 	//	int fResult = (int)(fTest * fTest);
 34 | 
 35 | 	//	printf("i*i:%08x f*f:%08x\n", iResult, fResult);
 36 | 
 37 | 	//	iTest += 0x0800;
 38 | 	//}
 39 | 	//exit(0);
 40 | 
 41 | 	char deviceName[32];
 42 | 	int devCount, ordinal, major, minor;
 43 | 	CUdevice  hDevice;
 44 | 
 45 | 	// Initialize the Driver API and find a device
 46 | 	CUDA_CHECK( cuInit(0) );
 47 | 	CUDA_CHECK( cuDeviceGetCount(&devCount) );
 48 | 	for (ordinal = 0; ordinal < devCount; ordinal++)
 49 | 	{
 50 | 		CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
 51 | 		CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
 52 | 		CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
 53 | 		CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
 54 | 		if (major >= 5 && minor >= 2)
 55 | 		{
 56 | 			printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
 57 | 			break;
 58 | 		}
 59 | 	}
 60 | 	if (ordinal == devCount)
 61 | 	{
 62 | 		printf("No compute 5.0 device found, exiting.\n");
 63 | 		exit(EXIT_FAILURE);
 64 | 	}
 65 | 
 66 | 	// First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
 67 | 	int internalTiming = 1;
 68 | 	if (argc > 1)
 69 | 		internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;
 70 | 
 71 | 	// Second command line arg is the number of blocks
 72 | 	int blocks = 1;
 73 | 	if (argc > 2)
 74 | 		blocks = atoi(argv[2]);
 75 | 	if (blocks < 1)
 76 | 		blocks = 1;
 77 | 
 78 | 	// Third command line arg is the number of threads
 79 | 	int threads = 128;
 80 | 	if (argc > 3)
 81 | 		threads = atoi(argv[3]);
 82 | 	if (threads > 1024 || threads < 32)
 83 | 		threads = 128;
 84 | 	threads &= -32;
 85 | 
 86 | 	// Forth command line arg:
 87 | 	double fops = 1.0;
 88 | 	int lanes = 1;
 89 | 	if (argc > 4)
 90 | 	{
 91 | 		if (internalTiming)
 92 | 		{
 93 | 			// The number of lanes to print for each warp
 94 | 			lanes = atoi(argv[4]);
 95 | 			if (lanes > 32 || lanes < 1)
 96 | 				lanes = 1;
 97 | 		}
 98 | 		else
 99 | 			// The number of floating point operations in a full kernel launch
100 | 			fops = atof(argv[4]);
101 | 	}
102 | 
103 | 	// Fifth command line arg is the repeat count for benchmarking
104 | 	int repeat = 1;
105 | 	if (argc > 5)
106 | 		repeat = atoi(argv[5]);
107 | 	if (repeat > 1000 || repeat < 1)
108 | 		repeat = 1;
109 | 
110 | 	// threads = total number of threads
111 | 	size_t size = sizeof(int) * threads * blocks;
112 | 
113 | 	// Setup our input and output buffers
114 | 	int* dataIn  = (int*)malloc(size);
115 | 	int* dataOut = (int*)malloc(size);
116 | 	int* clocks  = (int*)malloc(size);
117 | 	memset(dataIn, 0, size);
118 | 
119 | 	CUmodule hModule;
120 | 	CUfunction hKernel;
121 | 	CUevent hStart, hStop;
122 | 	CUdeviceptr devIn, devOut, devClocks;
123 | 
124 | 	// Init our context and device memory buffers
125 | 	CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
126 | 	CUDA_CHECK( cuMemAlloc(&devIn, size) );
127 | 	CUDA_CHECK( cuMemAlloc(&devOut, size) );
128 | 	CUDA_CHECK( cuMemAlloc(&devClocks, size) );
129 | 	CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
130 | 	CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
131 | 	CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );
132 | 
133 | 	CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
134 | 	CUDA_CHECK( cuEventCreate(&hStop,  CU_EVENT_BLOCKING_SYNC) );
135 | 
136 | 	// Load our kernel
137 | 	CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
138 | 	CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );
139 | 
140 | 	// Setup the params
141 | 	void* params[] = { &devOut, &devClocks, &devIn };
142 | 	float ms = 0;
143 | 
144 | 	// Warm up the clock (unless under nsight)
145 | 	if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
146 | 		for (int i = 0; i < repeat; i++)
147 | 			CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
148 | 
149 | 	// Launch the kernel
150 | 	CUDA_CHECK( cuEventRecord(hStart, NULL) );
151 | 	//CUDA_CHECK( cuProfilerStart() );
152 | 	CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
153 | 	//CUDA_CHECK( cuProfilerStop() );
154 | 	CUDA_CHECK( cuEventRecord(hStop, NULL) );
155 | 	CUDA_CHECK( cuEventSynchronize(hStop) );
156 | 	CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
157 | 
158 | 	//CUDA_CHECK( cuCtxSynchronize() );
159 | 
160 | 	// Get back our results from each kernel
161 | 	CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
162 | 	CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );
163 | 
164 | 	// Cleanup and shutdown of cuda
165 | 	CUDA_CHECK( cuEventDestroy(hStart) );
166 | 	CUDA_CHECK( cuEventDestroy(hStop) );
167 | 	CUDA_CHECK( cuModuleUnload(hModule) );
168 | 	CUDA_CHECK( cuMemFree(devIn) );
169 | 	CUDA_CHECK( cuMemFree(devOut) );
170 | 	CUDA_CHECK( cuMemFree(devClocks) );
171 | 	CUDA_CHECK( cuCtxDestroy(hContext) );
172 | 	hContext = 0;
173 | 
174 | 	// When using just one block, print out the internal timing data
175 | 	if (internalTiming)
176 | 	{
177 | 		int count = 0, total = 0, min = 999999, max = 0;
178 | 
179 | 		int* clocks_p  = clocks;
180 | 		int* dataOut_p = dataOut;
181 | 
182 | 		// Loop over and print results
183 | 		for (int blk = 0; blk < blocks; blk++)
184 | 		{
185 | 			float *fDataOut = reinterpret_cast<float*>(dataOut_p);
186 | 
187 | 			for(int tid = 0; tid < threads; tid += 32)
188 | 			{
189 | 				// Sometimes we want data on each thread, sometimes just one sample per warp is fine
190 | 				for (int lane = 0; lane < lanes; lane++)
191 | 					printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u
192 | 
193 | 				count++;
194 | 				total += clocks_p[tid];
195 | 				if (clocks_p[tid] < min) min = clocks_p[tid];
196 | 				if (clocks_p[tid] > max) max = clocks_p[tid];
197 | 			}
198 | 			clocks_p  += threads;
199 | 			dataOut_p += threads;
200 | 		}
201 | 		printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
202 | 	}
203 | 	else
204 | 	{
205 | 		// For more than one block we're testing throughput and want external timing data
206 | 		printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
207 | 	}
208 | 	// And free up host memory
209 | 	free(dataIn); free(dataOut); free(clocks);
210 | 
211 | 	return 0;
212 | }
213 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/microbench.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | // Note this file isn't configured to automatically compile
 3 | 
 4 | #include <device_functions.h>
 5 | #include <device_launch_parameters.h>
 6 | 
 7 | // Build:
 8 | // nvcc -l cuda -o microbench microbench.cpp
 9 | // nvcc -arch sm_50 -cubin microbench.cu
10 | 
11 | // Inspect a cubin (use nvdisasm from cuda 6.5 for best results):
12 | // maxas.pl -e microbench.cubin
13 | 
14 | // Insert new sass into cubin
15 | // maxas.pl -i microbench.sass microbench.cubin
16 | 
17 | // run it:
18 | // ./microbench
19 | 
// Use extern C so C++ doesn't mangle our kernel name
//
// Placeholder kernel: it is compiled to a cubin whose code is then replaced
// with hand-written SASS (see the maxas.pl commands at the top of this file).
//
//   out    - receives one int per thread (indexed by threadIdx.x)
//   clocks - receives one int per thread built from the two clock() samples
//   in     - read at an index derived from the block coordinates
//
// NOTE(review): both outputs are indexed by threadIdx.x only, so every block
// writes the same elements - confirm this is acceptable for a stub whose body
// is overwritten anyway.
extern "C" __global__ void  microbench(int *out, int *clocks, int *in)
{
    // One shared slot per thread for the largest block size (1024 threads)
    __shared__ int share[1024];

    int tid = threadIdx.x;
    int bx  = blockIdx.x;
    int by  = blockIdx.y;

    // First clock sample, taken before the global load / shared store
    int start = clock();

    // Every thread of a block loads the same block-indexed element into its own slot
    share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ

    // Make all shared stores visible before neighbouring slots are read below
    __syncthreads();

    // Second clock sample, taken after the barrier
    int end = clock();

    // Packs bits of both samples into one word; this is not an elapsed time
    // (the plain difference is the commented-out alternative).
    clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start;

    // Read the slot written by the neighbouring thread (tid with its low bit flipped)
    out[tid] = share[tid ^ 1];
}
41 | 
42 | // A note about using the Cuda Runtime.
43 | // If that's your preference over the driver API then here's what you'd do:
44 | 
45 | // In your project properties in the Cuda C/C++ panel:
46 | //    -Set the "Keep Processed Files" (-keep) option
47 | //    -Add a -v manually to the command line
48 | // If compiling on command line just add -keep -v options to nvcc.
49 | // Rebuild your solution and look in the log for these lines that follow the ptxas step:
50 | 
51 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
52 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
53 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
54 | 
55 | // You just need to manually run these 3 commands (or add them to a build script)
56 | // after you've modified the cubin generated from the preceding ptxas command.
57 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you
58 | // build your project (or you could manually run the linker step as well).
59 | 
60 | // Having done that you can call your kernel normally using the <<< >>> syntax.
61 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
62 | // With fatbin you can also keep non-maxwell optimized versions of your code.
63 | 
64 | 
65 | // I just discovered this also works as a shortcut to the above:
66 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu
67 | 
68 | // The cu kernel definitions above need to have empty bodies.
69 | // And, the cu file must be compiled to a lib separately before linking.


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/microbench.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | 
 3 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X
 4 | 
 5 | <CONSTANT_MAPPING>
 6 |     blockDimX : c[0x0][0x08]
 7 |     blockDimY : c[0x0][0x0c]
 8 |     blockDimZ : c[0x0][0x10]
 9 |     gridDimX  : c[0x0][0x14]
10 |     gridDimY  : c[0x0][0x18]
11 |     gridDimZ  : c[0x0][0x1c]
12 | 
13 |     param_out[0]    : c[0x0][0x140]
14 |     param_out[1]    : c[0x0][0x144]
15 |     param_clocks[0] : c[0x0][0x148]
16 |     param_clocks[1] : c[0x0][0x14c]
17 |     param_in[0]     : c[0x0][0x150]
18 |     param_in[1]     : c[0x0][0x154]
19 | </CONSTANT_MAPPING>
20 | 
21 | <REGISTER_MAPPING>
22 | 
23 |      0-1 : out<0-1>
24 |      2-3 : clocks<0-1>
25 |      4-5 : in<0-1>
26 |     6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x
27 | 
28 | </REGISTER_MAPPING>
29 | 
30 | // Load in our params (not currently used below)
31 | --:-:-:-:1      MOV in0, param_in[0];
32 | --:-:-:-:1      MOV in1, param_in[1];
33 | 
34 | // Get the first clock value
35 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
36 | 
37 | // Get the threadId and blockId
38 | // Set the Read-After-Write dependency barrier 1 and 2
39 | --:-:1:-:1      S2R tid, SR_TID.X;
40 | // Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it
41 | --:-:2:-:2      S2R bid, SR_CTAID.X;
42 | 
43 | 
44 | // Get the second clock value
45 | // Wait on the dependency barriers (1 and 2) that were set by the two S2R instructions above
46 | // Stall 6 to allow CS2R time to complete before next instruction
47 | // CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 clocks
48 | // This stall count does not factor into the time calculation at all
49 | 03:-:-:-:6      CS2R clock2, SR_CLOCKLO;
50 | 
51 | // Take the difference of clocks
52 | --:-:-:-:1      IADD clock1, clock2, -clock1;
53 | 
54 | // Setup our output addresses
55 | // Stall your pipeline dependencies properly
56 | // Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code
57 | --:-:-:-:6      XMAD offset, bid, blockDimX, tid;
58 | 
59 | // LEA is "load effective address"
60 | // The offset param is shifted left 2 and added to the pointers with 64bit math
61 | --:-:-:-:6      LEA      clocks0.CC, offset, param_clocks[0],     2;
62 | --:-:-:-:1      LEA.HI.X clocks1,    offset, param_clocks[1], RZ, 2;
63 | 
64 | --:-:-:-:6      LEA      out0.CC, offset, param_out[0],     2;
65 | --:-:-:-:1      LEA.HI.X out1,    offset, param_out[1], RZ, 2;
66 | 
67 | // Output the results.
68 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
69 | --:-:-:-:1      STG.E [clocks], clock1;
70 | --:-:-:-:1      STG.E [out],    offset; # use this to return whatever you like to inspect the results
71 | --:-:-:-:5      EXIT;
72 | 
73 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/shared.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | 
 4 | print `maxas.pl -i shared_sts16.sass microbench.cubin`;
 5 | 
 6 | exit if $?;
 7 | 
 8 | print `Release\\microbench.exe i 1 64`;
 9 | 
10 | 
11 | __END__
12 | 
13 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/shared_lds.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #   ord:addr:size:align
  8 | #   0:0x140:4:0
  9 | #   1:0x144:4:0
 10 | #   2:0x148:4:0
 11 | 
 12 | // Micro bench scaffold for shared memory read-address experiments: it times the MOV between the two CS2R clock reads and returns the computed readBs offset
 13 | 
 14 | <REGISTER_MAPPING>
 15 | 
 16 |     0-3 : result, a, b, c
 17 | 
 18 |     4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20>
 19 | 
 20 | </REGISTER_MAPPING>
 21 | 
 22 | // Load in our params
 23 | --:-:1:-:1      S2R tid,      SR_TID.X;
 24 | --:-:2:-:1      S2R bid,      SR_CTAID.X;
 25 | 
 26 | --:-:-:-:1      MOV result,  c[0x0][0x0];
 27 | --:-:-:-:1      MOV in,      c[0x0][0x100];
 28 | 
 29 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
 30 | --:-:-:-:1      MOV result,  c[0x0][0x13c];
 31 | --:-:-:-:1      CS2R clock2, SR_CLOCKLO;
 32 | 
 33 | --:-:-:-:1      MOV blockDim, c[0x0][0x8];
 34 | --:-:-:-:1      MOV out,      c[0x0][0x140];
 35 | --:-:-:-:1      MOV clocks,   c[0x0][0x144];
 36 | 
 37 | 
 38 | 
 39 | 
 40 | <SCHEDULE_BLOCK>
 41 | 
 42 | 03:-:-:-:1      LOP.AND tid3,   tid, 3;
 43 | --:-:-:-:1      LOP.AND tid7,   tid, 7;
 44 | --:-:-:-:1      LOP.AND tid96,  tid, 96;
 45 | --:-:-:-:1      LOP.AND tid128, tid, 128;
 46 | 
 47 | // readAs = ((tid128 >> 4) | tid7) << 4
 48 | --:-:-:-:1      SHR.U32 readAs, tid128, 4;
 49 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
 50 | --:-:-:-:1      SHL     readAs, readAs, 4;
 51 | 
 52 | // readBs  = ((tid96 >> 3) | tid3) << 4
 53 | --:-:-:-:1      SHR.U32 readBs, tid96, 3;
 54 | --:-:-:-:1      LOP.OR  readBs, readBs, tid3;
 55 | #--:-:-:-:1      SHL     readBs, readBs, 4;
 56 | #--:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
 57 | 
 58 | 
 59 | </SCHEDULE_BLOCK>
 60 | 
 61 | 
 62 | 
 63 | #--:-:-:-:1      LDS.U.128 result, [readBs];
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 01:-:-:-:1      IADD clock1, clock2, -clock1;
 69 | 
 70 | 
 71 | --:-:-:-:1      XMAD tid, blockDim, bid, tid;
 72 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
 73 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
 74 | --:-:-:Y:6      SHL  tid, tid, 0x2;
 75 | 
 76 | --:-:-:-:1      IADD clocks, clocks, tid;
 77 | --:-:-:-:2      IADD out,  out,  tid;
 78 | 
 79 | --:-:-:-:1      STG [clocks], clock1;
 80 | --:-:-:-:1      STG [out],    readBs;
 81 | --:-:-:-:5      EXIT;
 82 | 
 83 | <COMMENT>
 84 | 
 85 | --:-:-:-:4      LOP.AND tid32, tid, -32;
 86 | 
 87 | --:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
 88 | 
 89 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 90 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 91 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 92 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 93 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 94 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 95 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 96 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 97 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 98 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 99 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
100 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
101 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
102 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
103 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
104 | --:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
105 | 
106 | 
107 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
108 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
109 | --:-:-:-:1      LOP.AND readAs, tid,    0x80;
110 | --:-:-:-:1      SHR.U32 readAs, readAs, 4;
111 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
112 | --:-:-:-:1      SHL     readAs, readAs, 4;
113 | 
114 | // readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
115 | --:-:-:-:1      LOP.AND tid1,   tid,    0x1;
116 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
117 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
118 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
119 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
120 | 
121 | 
122 | </COMMENT>


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/shared_sts16.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #   ord:addr:size:align
  8 | #   0:0x140:4:0
  9 | #   1:0x144:4:0
 10 | #   2:0x148:4:0
 11 | 
 12 | // This is a simple micro bench that times a shared memory load (LDS.U.64) between two CS2R clock reads
 13 | 
 14 | <REGISTER_MAPPING>
 15 | 
 16 |     0-3 : result, a, b, c
 17 | 
 18 |     4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20>
 19 | 
 20 | </REGISTER_MAPPING>
 21 | 
 22 | // Load in our params
 23 | --:-:1:-:1      S2R tid,      SR_TID.X;
 24 | --:-:2:-:1      S2R bid,      SR_CTAID.X;
 25 | 
 26 | //--:-:-:-:1      MOV result,  c[0x0][0x0];
 27 | //--:-:-:-:1      MOV in,      c[0x0][0x100];
 28 | --:-:-:-:1      MOV result, 1;
 29 | 
 30 | --:-:-:-:1      MOV blockDim, c[0x0][0x8];
 31 | --:-:-:-:1      MOV out,      c[0x0][0x140];
 32 | --:-:-:-:1      MOV clocks,   c[0x0][0x144];
 33 | 
 34 | 
 35 | // readAs = ((tid >> 1) & 7) << 3;
 36 | 03:-:-:-:6      BFE.U32 readAs, tid,    0x301; // 3 bits at position 1
 37 | --:-:-:-:6      SHL     readAs, readAs, 3;
 38 | 
 39 | // readBs  = (((tid & 0x30) >> 3) | (tid & 1)) << 3;
 40 | --:-:-:-:6      LOP.AND tid1,   tid,    1;
 41 | --:-:-:-:6      LOP.AND readBs, tid,    0x30;
 42 | --:-:-:-:6      SHR.U32 readBs, readBs, 3;
 43 | --:-:-:-:6      LOP.OR  readBs, readBs, tid1;
 44 | --:-:-:-:6      ISCADD  readBs, readBs, 0, 3;
 45 | 
 46 | 
 47 | 
 48 | ///--:-:-:-:1      STS [tid32], result;
 49 | //--:-:-:-:1      STS.S16 [tid32 + 2x<32>], result;
 50 | //--:-:1:-:2      LDS.U.64 result, [readBs];
 51 | 
 52 | --:-:-:-:0      CS2R clock1, SR_CLOCKLO;
 53 | --:-:1:-:6      LDS.U.64 result, [readAs];
 54 | --:-:-:-:6      CS2R clock2, SR_CLOCKLO;
 55 | 
 56 | 
 57 | 01:-:-:-:1      IADD clock1, clock2, -clock1;
 58 | 
 59 | 
 60 | --:-:-:-:1      XMAD tid, blockDim, bid, tid;
 61 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
 62 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
 63 | --:-:-:Y:6      SHL  tid, tid, 0x2;
 64 | 
 65 | --:-:-:-:1      IADD clocks, clocks, tid;
 66 | --:-:-:-:2      IADD out,  out,  tid;
 67 | 
 68 | --:-:-:-:1      STG [clocks], clock1;
 69 | --:-:-:-:1      STG [out],    result;
 70 | --:-:-:-:5      EXIT;
 71 | 
 72 | <COMMENT>
 73 | 
 74 | --:-:-:-:4      LOP.AND tid32, tid, -32;
 75 | 
 76 | --:-:-:-:1      STS.128 [tid32 + 4x<2048>], RZ;
 77 | 
 78 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 79 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 80 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 81 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 82 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 83 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 84 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 85 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 86 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 87 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 88 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 89 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 90 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 91 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 92 | --:-:-:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 93 | --:-:1:-:1      LDS.U.128 result, [tid32 + 4x<2048>];
 94 | 
 95 | 03:-:-:-:6      LOP.AND  tid31, tid, 31;
 96 | --:-:-:-:6      LOP.AND  tid32, tid, 32;
 97 | --:-:-:-:6      SHL  tid32, tid32, 0x2;
 98 | --:-:-:-:6      LOP.OR  tid32, tid32, tid31;
 99 | --:-:-:-:6      SHL  tid32, tid32, 0x2;
100 | 
101 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4;
102 | --:-:-:-:1      BFE.U32 tid7,   tid,    0x301;
103 | --:-:-:-:1      LOP.AND readAs, tid,    0x80;
104 | --:-:-:-:1      SHR.U32 readAs, readAs, 4;
105 | --:-:-:-:1      LOP.OR  readAs, readAs, tid7;
106 | --:-:-:-:1      SHL     readAs, readAs, 4;
107 | 
108 | // readBs  = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096;
109 | --:-:-:-:1      LOP.AND tid1,   tid,    0x1;
110 | --:-:-:-:1      LOP.AND readBs, tid,    0x70;
111 | --:-:-:-:1      SHR.U32 readBs, readBs, 3;
112 | --:-:-:-:1      LOP.OR  readBs, readBs, tid1;
113 | --:-:-:-:1      ISCADD  readBs, readBs, 4x<1024>, 4;
114 | 
115 | 
116 | </COMMENT>


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Driver: generate throughput2.sass, assemble it into microbench.cubin with maxas.pl, run the benchmark and print the GFLOPS figure.
 4 | my $loopSize  = 512;        # instructions per loop iteration; must match the 0 .. 511 range in the generator below
 5 | my $blocks    = 32;
 6 | my $loops     = 10240000;
 7 | my $fileName  = 'throughput2.sass';
 8 | 
 9 | writeSassFile($fileName, $loops);
10 | 
11 | #print `maxas.pl -p $fileName`;
12 | #exit;
13 | 
14 | print `maxas.pl -i $fileName microbench.cubin`;
15 | exit(($? >> 8) || 1) if $?;   # assembler failed: stop with a non-zero status (a bare "exit" would report success)
16 | 
17 | foreach my $thread128 (2)
18 | {
19 |     my $threads   = $thread128 * 128;
20 |     my $fops      = 2 * $loops * $loopSize * $blocks * $threads;   # operation count handed to the benchmark binary
21 | 
22 |     my $data = `Release\\microbench.exe e $blocks $threads $fops`;
23 |     exit(($? >> 8) || 1) if $?;   # benchmark failed: there is no GFLOPS line worth parsing
24 |     my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
25 | 
26 |     printf "%d %d %d\n", $thread128, $threads, $gflops;
27 | }
28 | 
29 | exit;
30 | # Write the microbench SASS source to $filename; $loops fills the single %d conversion (the value loaded into 'stop'). Dies on I/O errors.
31 | sub writeSassFile
32 | {
33 |     my ($filename, $loops) = @_;
34 |     # Three-argument open: the file name is never interpreted as a mode or a pipe.
35 |     open my $fh, '>', $filename or die "$filename: $!";
36 |     # The heredoc below is used as a printf format string.
37 |     printf $fh <<'EOF', $loops;
38 | # Kernel: microbench
39 | 
40 | <REGISTER_MAPPING>
41 | 
42 |     0-10 : result, r1, r2, r3
43 |     20-27 ~ count, stop
44 | 
45 | </REGISTER_MAPPING>
46 | 
47 | --:-:-:-:1      MOV count, RZ;
48 | --:-:-:-:1      MOV32I stop, %d;
49 | --:-:-:-:1      MOV32I r1, 1.0;
50 | --:-:-:-:1      MOV32I r2, 1.0;
51 | --:-:-:-:4      MOV32I r3, 1.0;
52 | 
53 | LOOP:
54 | 
55 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
56 | --:-:-:-:1      IADD count, count, 1;
57 | 
58 | <CODE>
59 |     my $out;
60 | 
61 |     foreach my $i (0 .. 511)
62 |     {
63 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
64 | 
65 |         my $stall = $i == 511 ? 0 : 1;
66 | 
67 |         $out .= "--:-:-:$yield:$stall      FFMA result, r1, r2, r3;\n";
68 |     }
69 |     return $out;
70 | </CODE>
71 | 
72 | --:-:-:Y:5  @P0 BRA LOOP;
73 | --:-:-:-:5      EXIT;
74 | EOF
75 |     # Buffered write errors only surface at close, so check it.
76 |     close $fh or die "$filename: $!";
77 | }
78 | 
79 | __END__
80 | 
81 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | # InsCnt: 18
 3 | # RegCnt: 5
 4 | # SharedSize: 4096
 5 | # BarCnt: 1
 6 | # Params(3):
 7 | #   ord:addr:size:align
 8 | #   0:0x140:4:0
 9 | #   1:0x144:4:0
10 | #   2:0x148:4:0
11 | 
12 | <REGISTER_MAPPING>
13 | 
14 |     8-20 : count
15 | 
16 | </REGISTER_MAPPING>
17 | 
18 | --:-:-:-:1      MOV R0, RZ;
19 | --:-:-:-:1      MOV R1, RZ;
20 | --:-:-:-:1      MOV R2, RZ;
21 | --:-:-:-:1      MOV R3, RZ;
22 | --:-:-:-:1      MOV R4, RZ;
23 | --:-:-:-:1      MOV R5, RZ;
24 | --:-:-:-:1      MOV R6, RZ;
25 | --:-:-:-:1      MOV R7, RZ;
26 | --:-:-:-:1      MOV R8, RZ;
27 | --:-:-:Y:6      MOV count, RZ;
28 | 
29 | // This loop is capable of running at 1700 GFlops on GM107.
30 | // You can tweak it to see how register bank conflicts or different control codes
31 | // affect performance.
32 | // With throughput.pl you can pass params to this code and do some autotuning.
33 | LOOP:
34 | 
35 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, 0x19000, PT;
36 | --:-:-:-:1      IADD count, count, 0x1;
37 | 
38 | <CODE>
39 |     my $out;
40 | 
41 |     foreach my $i (0..511) #511
42 |     {
43 |         my $y = ($i + 32) & 63 ? '-' : 'Y';
44 | 
45 |         $out .= qq|
46 | --:-:-:$y:1      FFMA R0, R1, R2, R3;|; #c[0x0][$c]
47 |     }
48 |     return $out;
49 | </CODE>
50 | 
51 | --:-:-:Y:5  @P0 BRA LOOP;
52 | 
53 | --:-:-:-:5      EXIT;
54 | 
55 | <COMMENT>
56 | 
57 | 
58 |     open my $fh, 'params.txt';
59 |     my $line = <$fh>;
60 |     close $fh;
61 |     my ($r1, $r2, $r3) = split "\t", $line;
62 | 
63 |     80-95 : out, clocks, in, tid, clock1, clock2, result
64 | 
65 | 
66 | --:-:1:-:1      S2R tid,   SR_TID.X;
67 | --:-:-:-:1      MOV out,    c[0x0][0x140];
68 | --:-:-:-:1      MOV clocks, c[0x0][0x144];
69 | 01:-:-:-:1      MOV in,     c[0x0][0x148];
70 | 
71 | 
72 | 
73 | --:-:-:-:1      MOV32I f0, 0x3f800000;
74 | --:-:-:-:1      MOV32I f1, 0x3f800000;
75 | --:-:-:-:1      MOV32I f2, 0x3f800000;
76 | --:-:-:-:5      MOV32I f3, 0x3f800000;
77 | 
78 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
79 | 
80 | 
81 | --:-:-:-:1      CS2R clock2, SR_CLOCKLO;
82 | 
83 | --:-:-:-:6      MOV32I result, 0x457;
84 | --:-:-:-:1      IADD clock1, clock2, -clock1;
85 | 
86 | 
87 | --:-:-:-:6      SHL  tid, tid, 0x2;
88 | --:-:-:-:1      IADD clocks, clocks, tid;
89 | --:-:-:-:1      IADD out,  out,  tid;
90 | 
91 | --:-:-:-:1      STG [clocks], clock1;
92 | --:-:-:-:1      STG [out],    R24;
93 | 
94 | 
95 | </COMMENT>


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput2.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | my %p;
  4 | # Driver: derive the problem/launch parameters, generate throughput2.sass, assemble it with maxas.pl and run the benchmark.
  5 | $p{N}         = 8192;
  6 | $p{blocking}  = 8;
  7 | $p{unroll}    = 8;
  8 | $p{threads}   = 64;   #256
  9 | 
 10 | $p{csize}     = $p{blocking} * $p{blocking};
 11 | $p{loopSize}  = $p{unroll} * $p{csize};
 12 | $p{width}     = sqrt($p{csize} * $p{threads});
 13 | $p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
 14 | $p{loops}     = $p{N} / $p{unroll};
 15 | $p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
 16 | 
 17 | my $fileName  = 'throughput2.sass';
 18 | 
 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
 20 | 
 21 | #print join("\t", @params), "\n";
 22 | #print join("\t", @p{@params}), "\n";
 23 | 
 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
 25 | 
 26 | writeSassFile($fileName, $p{loopSize}, $p{loops});
 27 | 
 28 | #print `maxas.pl -p $fileName`;
 29 | #exit;
 30 | 
 31 | print `maxas.pl -i $fileName microbench.cubin`;
 32 | 
 33 | exit(($? >> 8) || 1) if $?;   # assembler failed: stop with a non-zero status (a bare "exit" would report success)
 34 | 
 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
 36 | exit(($? >> 8) || 1) if $?;   # benchmark failed: do not present its output as a result
 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 38 | 
 39 | print $data;
 40 | 
 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
 42 | 
 43 | 
 44 | 
 45 | # Write the SASS benchmark source to $filename; $loops fills the single %d conversion ('stop'). $loopSize is accepted but unused. Dies on I/O errors.
 46 | sub writeSassFile
 47 | {
 48 |     my ($filename, $loopSize, $loops) = @_;
 49 |     # Three-argument open: the file name is never interpreted as a mode or a pipe.
 50 |     open my $fh, '>', $filename or die "$filename: $!";
 51 |     # The heredoc is a printf format, which is why literal percent signs inside it are written as %%.
 52 |     printf $fh <<'END_SASS', $loops;
 53 | # Kernel: microbench
 54 | 
 55 | <REGISTER_MAPPING>
 56 | 
 57 |      3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
 58 |      7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
 59 |      1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
 60 |      5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
 61 |     35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
 62 |     39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
 63 |     33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
 64 |     37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>
 65 | 
 66 |     64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67>
 67 |     80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67>
 68 | 
 69 |     0-127 : r<0-127>
 70 | 
 71 |     100-101 : count, stop
 72 | 
 73 |     //102-112 ~ readAs, readBs, writeS
 74 | 
 75 | </REGISTER_MAPPING>
 76 | 
 77 | --:-:-:-:1      MOV count, RZ;
 78 | --:-:-:-:1      MOV32I stop, %d;
 79 | //--:-:-:-:1      MOV writeS, RZ;
 80 | //--:-:-:-:1      MOV readAs, RZ;
 81 | //--:-:-:-:1      MOV readBs, RZ;
 82 | 
 83 | <CODE>
 84 |     return join '', map "--:-:-:-:1      MOV32I r$_, 1.0;\n", 0..95;
 85 | </CODE>
 86 | 
 87 | LOOP:
 88 | 
 89 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 90 | --:-:-:-:1      IADD count, count, 1;
 91 | 
 92 | <CODE>
 93 |     my $out;
 94 | 
 95 | 
 96 |     my @cOrder;
 97 |     #my @swirl = ([0,1],[0,0],[2,0],[2,1]);
 98 |     my @swirl = ([2,0],[2,1],[0,1],[0,0]);
 99 |     #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
100 |     my @xVals = (0,1,64,65);
101 |     #my @xVals = (0,2,64,66);
102 | 
103 |     my @yVals = (0,2,64,66);
104 | 
105 |     foreach my $y (@yVals)
106 |     {
107 |         foreach my $x (@xVals)
108 |         {
109 |             push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl;
110 |         }
111 |         @xVals = reverse @xVals;
112 |     }
113 | 
114 |     foreach my $j (0..7)
115 |     {
116 |         my $odd  = $j & 1;
117 |         my $nOdd = !$odd + 0;
118 | 
119 | 		my %%insert;
120 | 
121 |         #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
122 | 
123 |         $insert{c62} =
124 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
125 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
126 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
127 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
128 |                 "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
129 | 
130 |         foreach my $c (0 .. 63)
131 |         {
132 |             my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/;
133 |             my $ins    = $insert{"c$c"} || '';
134 |             my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
135 |             my $yield  = $c == 32 ? 'Y' : '-';
136 |             my $wait   = '--'; #$c ? '--' : '01';
137 | 
138 |             $out .= "$wait:-:-:$yield:$stall      FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins";
139 |         }
140 |     }
141 |     return $out;
142 | </CODE>
143 | 
144 | --:-:-:Y:5  @P0 BRA LOOP;
145 | --:-:-:-:5      EXIT;
146 | END_SASS
147 |     # Buffered write errors only surface at close, so check it.
148 |     close $fh or die "$filename: $!";
149 | }
150 | 
151 | __END__
152 | 
153 |         my %%insert = (
154 |             c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
155 |             c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
156 |             c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
157 |             c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
158 |         );


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput2.sass:
--------------------------------------------------------------------------------
 1 | # Kernel: microbench
 2 | 
 3 | <REGISTER_MAPPING>
 4 | 
 5 |     0-10 : result, r1, r2, r3
 6 |     20-27 ~ count, stop
 7 | 
 8 | </REGISTER_MAPPING>
 9 | 
10 | --:-:-:-:1      MOV count, RZ;
11 | --:-:-:-:1      MOV32I stop, 102400;
12 | --:-:-:-:1      MOV32I r1, 1.0;
13 | --:-:-:-:1      MOV32I r2, 1.0;
14 | --:-:-:-:4      MOV32I r3, 1.0;
15 | 
16 | LOOP:
17 | 
18 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
19 | --:-:-:-:1      IADD count, count, 1;
20 | 
21 | <CODE>
22 |     my $out;
23 | 
24 |     foreach my $i (0 .. 511)
25 |     {
26 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
27 | 
28 |         my $stall = $i == 511 ? 0 : 1;
29 | 
30 |         #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
31 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
32 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
33 |         #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
34 |         #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
35 | 
36 |         #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
37 |         #$out .= "--:-:-:-:1      MOV result, RZ;\n";
38 | 
39 |         $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
40 |         #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
41 |         #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
42 |     }
43 |     return $out;
44 | </CODE>
45 | 
46 | --:-:-:Y:5  @P0 BRA LOOP;
47 | --:-:-:-:5      EXIT;
48 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput3.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Sweep threads-per-block (1..8 x 128) against loop size (8..16 x 64), regenerating and re-assembling the kernel for every point.
 4 | my %data;   # loop size => list of GFLOPS readings, appended in increasing $thread128 order
 5 | 
 6 | foreach my $thread128 (1 .. 8)
 7 | {
 8 |     foreach my $size64 (8 .. 16)
 9 |     {
10 |         my $loopSize  = $size64 * 64;
11 |         my $loops     = int(2 * 1638400 / ($size64 * $thread128));
12 | 
13 |         my $blocks    = 16;
14 |         my $threads   = $thread128 * 128;
15 |         my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
16 |         my $fileName  = 'throughput2.sass';
17 | 
18 |         #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops;
19 |         #next;
20 | 
21 |         writeSassFile($fileName, $loopSize, $loops);
22 | 
23 |         `maxas.pl -i $fileName microbench.cubin`;
24 | 
25 |         exit(($? >> 8) || 1) if $?;   # assembler failed: stop with a non-zero status (a bare "exit" would report success)
26 | 
27 |         my $data = `Release\\microbench.exe e $blocks $threads $fops`;
28 |         exit(($? >> 8) || 1) if $?;   # benchmark failed: do not record a bogus reading
29 |         my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
30 | 
31 |         printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
32 | 
33 |         push @{$data{$loopSize}}, $gflops;
34 |     }
35 | }
36 | print join("\t", 'size', 1 .. 8), "\n";   # column headers are the $thread128 values
37 | foreach my $loopSize (sort {$a <=> $b} keys %data)
38 | {
39 |     print join("\t", $loopSize, @{$data{$loopSize}}), "\n";
40 | }
41 | 
42 | exit;
43 | # Write a SASS benchmark whose loop body holds exactly $loopSize FFMA instructions to $filename; $loops is loaded into 'stop'. Dies on I/O errors.
44 | sub writeSassFile
45 | {
46 |     my ($filename, $loopSize, $loops) = @_;
47 |     # Three-argument open: the file name is never interpreted as a mode or a pipe.
48 |     open my $fh, '>', $filename or die "$filename: $!";
49 |     # Format arguments: 'stop' value, last index of the inclusive 0 .. N range (hence $loopSize - 1), loop size for the yield test.
50 |     printf $fh <<'EOF', $loops, $loopSize - 1, $loopSize;
51 | # Kernel: microbench
52 | 
53 | <REGISTER_MAPPING>
54 | 
55 |     0-10 : result, r1, r2, r3, count, stop
56 | 
57 | </REGISTER_MAPPING>
58 | 
59 | --:-:-:-:1      MOV count, RZ;
60 | --:-:-:-:1      MOV32I stop, %d;
61 | --:-:-:-:1      MOV32I r1, 1.0;
62 | --:-:-:-:1      MOV32I r2, 1.0;
63 | --:-:-:-:4      MOV32I r3, 1.0;
64 | 
65 | LOOP:
66 | 
67 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
68 | --:-:-:-:1      IADD count, count, 1;
69 | 
70 | <CODE>
71 |     my $out;
72 | 
73 |     foreach my $i (0 .. %d)
74 |     {
75 |         my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y';
76 | 
77 |         $out .= "--:-:-:$y:1      FFMA result, r1, r2, r3;\n";
78 |     }
79 |     return $out;
80 | </CODE>
81 | 
82 | --:-:-:Y:5  @P0 BRA LOOP;
83 | --:-:-:-:5      EXIT;
84 | EOF
85 |     # Buffered write errors only surface at close, so check it.
86 |     close $fh or die "$filename: $!";
87 | }
88 | 
89 | __END__
90 | 
91 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput4.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | # Driver: generate throughput2.sass, assemble it into microbench.cubin with maxas.pl, run ./microbench and print the GFLOPS figure.
  4 | my $loopSize  = 512;        # instructions per loop iteration; must match the 0 .. 511 range in the generator below
  5 | my $blocks    = 64;
  6 | my $loops     = 102400;
  7 | my $fileName  = 'throughput2.sass';
  8 | 
  9 | writeSassFile($fileName, $loops);
 10 | 
 11 | #print `maxas.pl -p $fileName`;
 12 | #exit;
 13 | 
 14 | print `maxas.pl -i $fileName microbench.cubin`;
 15 | exit(($? >> 8) || 1) if $?;   # assembler failed: stop with a non-zero status (a bare "exit" would report success)
 16 | 
 17 | foreach my $thread128 (4)
 18 | {
 19 |     my $threads   = $thread128 * 128;
 20 |     my $fops      = 2 * $loops * $loopSize * $blocks * $threads;
 21 | 
 22 |     print "./microbench e $blocks $threads $fops\n\n";
 23 |     my $data = `./microbench e $blocks $threads $fops`;
 24 |     exit(($? >> 8) || 1) if $?;   # $? is a wait status: the child's exit code is in the high byte, so exit($?) could truncate to 0
 25 | 
 26 |     my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 27 | 
 28 |     printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0;   # last column: percent of 3050 (presumably the peak GFLOPS - confirm)
 29 | }
 30 | 
 31 | exit;
 32 | # Write the microbench SASS source to $filename; $loops fills the single %d conversion (the value loaded into 'stop'). Dies on I/O errors.
 33 | sub writeSassFile
 34 | {
 35 |     my ($filename, $loops) = @_;
 36 |     # Three-argument open: the file name is never interpreted as a mode or a pipe.
 37 |     open my $fh, '>', $filename or die "$filename: $!";
 38 |     # The heredoc below is used as a printf format string.
 39 |     printf $fh <<'EOF', $loops;
 40 | # Kernel: microbench
 41 | 
 42 | <REGISTER_MAPPING>
 43 | 
 44 |     0-10 : result, r1, r2, r3
 45 |     20-27 ~ count, stop
 46 | 
 47 | </REGISTER_MAPPING>
 48 | 
 49 | --:-:-:-:1      MOV count, RZ;
 50 | --:-:-:-:1      MOV32I stop, %d;
 51 | --:-:-:-:1      MOV32I r1, 1.0;
 52 | --:-:-:-:1      MOV32I r2, 1.0;
 53 | --:-:-:-:4      MOV32I r3, 1.0;
 54 | 
 55 | LOOP:
 56 | 
 57 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 58 | --:-:-:-:1      IADD count, count, 1;
 59 | 
 60 | <CODE>
 61 |     my $out;
 62 | 
 63 |     foreach my $i (0 .. 511)
 64 |     {
 65 |         my $yield = ($i + 32) & 63 ? '-' : 'Y';
 66 | 
 67 |         my $stall = $i == 511 ? 0 : 1;
 68 | 
 69 |         #$out .= "--:-:-:$yield:1      FFMA r3, r1, r2, r3;\n";
 70 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
 71 |         #$out .= "--:-:-:-:1      FFMA r3, r1, r2, r3;\n";
 72 |         #$out .= "--:-:-:-:0      FFMA r3, r1, r2, r3;\n";
 73 |         #$out .= "--:-:-:-:1      I2F.F32.S16 result, r1;\n";
 74 | 
 75 |         #$out .= "--:-:-:$yield:$stall      VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n";
 76 |         #$out .= "--:-:-:-:1      MOV result, RZ;\n";
 77 | 
 78 |         $out .= "--:-:-:$yield:$stall      IADD.SAT result, r1, r2;\n";
 79 |         #$out .= "--:-:-:$yield:$stall      VMAD.S8.S8.SAT result, r1, r2, r3;\n";
 80 |         #$out .= "--:-:-:$yield:$stall      XMAD result, r1, r2, r3;\n";
 81 |     }
 82 |     return $out;
 83 | </CODE>
 84 | 
 85 | --:-:-:Y:5  @P0 BRA LOOP;
 86 | --:-:-:-:5      EXIT;
 87 | EOF
 88 |     # Buffered write errors only surface at close, so check it.
 89 |     close $fh or die "$filename: $!";
 90 | }
 91 | 
 92 | __END__
 93 | 
 94 | VMAD.U8.U8
 95 | 
 96 | dddd 2655 / 4968 = 53.4%
 97 | 1d1d 4594 / 4968 = 92.4%
 98 | 11d  4746 / 4968 = 95.5%
 99 | 111d 4841 / 4968 = 97.4%
100 | 
101 | block context switches are a little more expensive than thread context switches
102 | 
103 | stall codes:
104 | 
105 | f : 13 clocks
106 | e :  8 clocks
107 | d :  6 clocks
108 | c :  8 clocks, no yield
109 | b : 11 clocks
110 | a : 10 clocks
111 | 9 :  9 clocks
112 | 8 :  8 clocks
113 | 7 :  7 clocks
114 | 6 :  6 clocks
115 | 5 :  5 clocks
116 | 4 :  4 clocks
117 | 3 :  3 clocks
118 | 2 :  2 clocks
119 | 1 :  1 clocks,  no yield
120 | 0 :  0 clocks,  no yield, dual issue


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/throughput5.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | my %p;
  4 | # Driver: derive the problem/launch parameters, generate throughput2.sass, assemble it with maxas.pl and run the benchmark.
  5 | $p{N}         = 8192;
  6 | $p{blocking}  = 8;
  7 | $p{unroll}    = 8;
  8 | $p{threads}   = 64;   #256
  9 | 
 10 | $p{csize}     = $p{blocking} * $p{blocking};
 11 | $p{loopSize}  = $p{unroll} * $p{csize};
 12 | $p{width}     = sqrt($p{csize} * $p{threads});
 13 | $p{blocks}    = ($p{N} / $p{width}) * ($p{N} / $p{width});
 14 | $p{loops}     = $p{N} / $p{unroll};
 15 | $p{fops}      = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads};
 16 | 
 17 | my $fileName  = 'throughput2.sass';
 18 | 
 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops);
 20 | 
 21 | #print join("\t", @params), "\n";
 22 | #print join("\t", @p{@params}), "\n";
 23 | 
 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params;
 25 | 
 26 | writeSassFile($fileName, $p{loopSize}, $p{loops});
 27 | 
 28 | #print `maxas.pl -p $fileName`;
 29 | #exit;
 30 | 
 31 | print `maxas.pl -i $fileName microbench.cubin`;
 32 | 
 33 | exit(($? >> 8) || 1) if $?;   # assembler failed: stop with a non-zero status (a bare "exit" would report success)
 34 | 
 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`;
 36 | exit(($? >> 8) || 1) if $?;   # benchmark failed: do not present its output as a result
 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms;
 38 | 
 39 | print $data;
 40 | 
 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops;
 42 | 
 43 | 
 44 | 
 45 | # Write the XMAD benchmark SASS source to $filename; $loops fills the single %d conversion ('stop'). $loopSize is accepted but unused. Dies on I/O errors.
 46 | sub writeSassFile
 47 | {
 48 |     my ($filename, $loopSize, $loops) = @_;
 49 |     # Three-argument open: the file name is never interpreted as a mode or a pipe.
 50 |     open my $fh, '>', $filename or die "$filename: $!";
 51 |     # The heredoc is a printf format, which is why literal percent signs inside it are written as %%.
 52 |     printf $fh <<'END_SASS', $loops;
 53 | # Kernel: microbench
 54 | 
 55 | <REGISTER_MAPPING>
 56 | 
 57 |      1, 9, 2,10,17,25,18,26 : cy0x<0-7>
 58 |      5,13, 6,14,21,29,22,30 : cy1x<0-7>
 59 |      3,11, 0, 8,19,27,16,24 : cy2x<0-7>
 60 |      7,15, 4,12,23,31,20,28 : cy3x<0-7>
 61 |     35,43,32,40,51,59,48,56 : cy4x<0-7>
 62 |     39,47,36,44,55,63,52,60 : cy5x<0-7>
 63 |     33,41,34,42,49,57,50,58 : cy6x<0-7>
 64 |     37,45,38,46,53,61,54,62 : cy7x<0-7>
 65 | 
 66 |     64-71   : j0Ax<0-3>, j0By<0-3>
 67 |     72-79   : j1Ax<0-3>, j1By<0-3>
 68 | 
 69 |     0-79 : r<0-79>
 70 | 
 71 |     100-101 : count, stop
 72 | 
 73 |     //102-112 ~ readAs, readBs, writeS
 74 | 
 75 | </REGISTER_MAPPING>
 76 | 
 77 | --:-:-:-:1      MOV count, RZ;
 78 | --:-:-:-:1      MOV32I stop, %d;
 79 | //--:-:-:-:1      MOV writeS, RZ;
 80 | //--:-:-:-:1      MOV readAs, RZ;
 81 | //--:-:-:-:1      MOV readBs, RZ;
 82 | 
 83 | <CODE>
 84 |     return join '', map "--:-:-:-:1      MOV r$_, RZ;\n", 0..63;
 85 | </CODE>
 86 | 
 87 | <CODE>
 88 |     return join '', map "--:-:-:-:1      MOV32I r$_, 0x00010001;\n", 64..79;
 89 | </CODE>
 90 | 
 91 | LOOP:
 92 | 
 93 | --:-:-:-:1      ISETP.LE.AND P0, PT, count, stop, PT;
 94 | --:-:-:-:1      IADD count, count, 1;
 95 | 
 96 | <CODE>
 97 |     my $out;
 98 | 
 99 |     my @swirl1 = ([0,0],[0,4],[4,4],[4,0]);
100 |     my @swirl2 = ([0,0],[1,0],[1,1],[0,1]);
101 |     my @swirl3 = ([0,2],[2,2],[2,0],[0,0]);
102 | 
103 |     my @cOrder;
104 |     foreach my $s1 (@swirl1)
105 |     {
106 |         foreach my $s2 (@swirl2)
107 |         {
108 |             foreach my $s3 (@swirl3)
109 |             {
110 |                 push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]];
111 |             }
112 |         }
113 |     }
114 | 
115 |     foreach my $j (0..7)
116 |     {
117 |         my $odd  = $j & 1;
118 |         my $nOdd = !$odd + 0;
119 | 
120 |         my %%insert;
121 | 
122 |         #$insert{c62} = "01:-:-:-:5      BAR.SYNC 0;\n" if $j == 6;
123 | 
124 |         $insert{c62} =
125 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
126 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
127 |                 "--:-:-:-:1      LOP.XOR readAs, readAs, 0;\n" .
128 |                 "--:-:-:-:1      LOP.XOR readBs, readBs, 0;\n" .
129 |                 "--:-:-:-:1      LOP.XOR writeS, writeS, 0;\n" if $j == 8;
130 | 
131 |         foreach my $c (0 .. 63)
132 |         {
133 |             my ($x,$y) = @{$cOrder[$c]};
134 |             my $ins    = $insert{"c$c"} || '';
135 |             my $stall  = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
136 |             my $yield  = $c == 32 ? 'Y' : '-';
137 |             my $wait   = '--'; #$c ? '--' : '01';
138 | 
139 |             my $xReg  = $x >> 1;
140 |             my $yReg  = $y >> 1;
141 |             my $xPart = $x & 1 ? '.H1' : '';
142 |             my $yPart = $y & 1 ? '.H1' : '';
143 | 
144 |             $out .= sprintf "$wait:-:-:$yield:$stall      XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x,  $odd,$xReg,$xPart,  $odd,$yReg,$yPart,  $y,$x,  $ins;
145 |         }
146 |     }
147 |     return $out;
148 | </CODE>
149 | 
150 | --:-:-:Y:5  @P0 BRA LOOP;
151 | --:-:-:-:5      EXIT;
152 | END_SASS
153 |     # Buffered write errors only surface at close, so check it.
154 |     close $fh or die "$filename: $!";
155 | }
156 | 
157 | __END__
158 | 
159 |         my %%insert = (
160 |             c0 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n",
161 |             c2 => "--:-:-:-:1      LDS.U.128 j${nOdd}By00, [readBs+0x10];\n",
162 |             c4 => "--:-:-:-:1      LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n",
163 |             c6 => "--:-:1:-:1      LDS.U.128 j${nOdd}By64, [readBs+0x10];\n",
164 |         );


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/xmad.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | use strict;
 3 | # Assemble xmad2.sass into microbench.cubin with maxas.pl, then run ./microbench and echo its output.
 4 | print `maxas.pl -i xmad2.sass microbench.cubin`;
 5 | 
 6 | exit(($? >> 8) || 1) if $?;   # assembler failed: stop with a non-zero status (a bare "exit" would report success)
 7 | 
 8 | print `./microbench i 1 128`;
 9 | 
10 | 
11 | __END__
12 | 
13 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/microbench/xmad2.sass:
--------------------------------------------------------------------------------
  1 | # Kernel: microbench
  2 | # InsCnt: 18
  3 | # RegCnt: 5
  4 | # SharedSize: 4096
  5 | # BarCnt: 1
  6 | # Params(3):
  7 | #	ord:addr:size:align
  8 | #	0:0x140:8:0
  9 | #	1:0x148:8:0
 10 | #	2:0x150:8:0
 11 | #
 12 | # Instructions:
 13 | 
 14 | <CONSTANT_MAPPING>
 15 |     blockDimX : c[0x0][0x8]
 16 |     blockDimY : c[0x0][0xc]
 17 |     blockDimZ : c[0x0][0x10]
 18 |     gridDimX : c[0x0][0x14]
 19 |     gridDimY : c[0x0][0x18]
 20 |     gridDimZ : c[0x0][0x1c]
 21 | 
 22 |     param_out[0] : c[0x0][0x140]
 23 |     param_out[1] : c[0x0][0x144]
 24 |     param_clocks[0] : c[0x0][0x148]
 25 |     param_clocks[1] : c[0x0][0x14c]
 26 |     param_in[0] : c[0x0][0x150]
 27 |     param_in[1] : c[0x0][0x154]
 28 | </CONSTANT_MAPPING>
 29 | 
 30 | <REGISTER_MAPPING>
 31 | 
 32 | 	0-1 : out<0-1>
 33 | 	2-3 : clocks<0-1>
 34 |     4-15  : result, result2, tid, bid, blockDim, clock1, clock2, scale, s
 35 |     16-24 : a, b, c, x
 36 | 
 37 | </REGISTER_MAPPING>
 38 | 
 39 | // Load in our params
 40 | --:-:-:-:1      MOV out0,      param_out[0];
 41 | --:-:-:-:1      MOV out1,      param_out[1];
 42 | --:-:-:-:1      MOV clocks0,   param_clocks[0];
 43 | --:-:-:-:1      MOV clocks1,   param_clocks[1];
 44 | //--:-:-:-:1      MOV in,       c[0x0][0x148];
 45 | --:-:-:-:1      MOV blockDim, blockDimX;
 46 | 
 47 | --:-:-:-:1      PSETP.AND.AND P0, PT, !PT, PT, PT;
 48 | 
 49 | --:-:-:-:6      MOV32I result,  0xffffffff;
 50 | --:-:-:-:6      MOV32I result2, 0x0;
 51 | --:-:-:-:1      MOV32I a, 1;
 52 | --:-:-:-:1      MOV32I b, 1;
 53 | --:-:-:-:6      MOV32I c, 0x0;
 54 | 
 55 | // (127 - scale) << 23
 56 | //--:-:-:-:6      MOV32I scale, 28;
 57 | //--:-:-:-:6      IADD scale, -scale, 127;
 58 | //--:-:-:-:6      SHL  scale, scale, 23;
 59 | 
 60 | 
 61 | //--:-:-:-:6      MOV32I c, 0x4f765432;
 62 | 
 63 | //--:-:1:-:2      LDG.CI.128 a, [in];
 64 | 
 65 | //01:-:-:-:6      VMAD.S16.S16 result, a, b, c;
 66 | 
 67 | //--:-:-:-:6      MOV result, a;
 68 | 
 69 | // a >> 16 | (b & 0xffff0000)
 70 | 
 71 | //--:-:-:-:6      SHR.U32 result, a, 16;
 72 | //--:-:-:-:6      LOP3.LUT result, result, b, c, 0xf8;
 73 | 
 74 | //--:-:-:-:6      I2I.S32.S16 result, a.H1;
 75 | 
 76 | //--:-:-:Y:d      IADD result.CC, a, -c;
 77 | //--:-:-:Y:2      IADD.X result2, b, -RZ;
 78 | 
 79 | //--:-:-:-:6      SHR result, a, 1;
 80 | 
 81 | //--:-:-:-:6      BFI result, b, 0x1010, a;
 82 | 
 83 | --:-:-:-:1      CS2R clock1, SR_CLOCKLO;
 84 | 
 85 | //--:-:-:-:6      XMAD.S16.S16 c, a, b, RZ;
 86 | //--:-:-:-:6      ISET.LT.AND s, c, RZ, PT;
 87 | //--:-:-:-:6      IADD result.CC, c, result;
 88 | //--:-:-:-:6      IADD.X result2, s, result2;
 89 | 
 90 | //--:-:-:-:6      XMAD.S16.S16 result.CC, a, b, result;
 91 | //--:-:-:-:6      IADD.X result2, result2, RZ;
 92 | 
 93 | //--:-:-:-:6      SHF.R.S64 result, result, 1, result2;
 94 | //--:-:-:-:6      MOV32I result2, 0;
 95 | 
 96 | --:-:-:-:f      LOP.AND.NZ P0, RZ, result, 1;
 97 | 
 98 | --:-:-:-:6  @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result;
 99 | 
100 | //--:-:1:-:d      I2F.F32.S32 result2, a;
101 | //01:-:-:-:6      FMUL result2, result2, scale;
102 | //01:-:2:-:d      F2I.S32.F32 result, result2;
103 | 
104 | 02:-:-:-:6      CS2R clock2, SR_CLOCKLO;
105 | 
106 | //F2I   = "^$pred?F2I$ftz$x2x$round $r0, $cr20;"
107 | //I2F   = "^$pred?I2F$x2x$rnd $r0, $cr20;"
108 | //x2x   = "\.(?<destSign>F|U|S)(?<destWidth>8|16|32|64)\.(?<srcSign>F|U|S)(?<srcWidth>8|16|32|64)"
109 | //rnd   = "(?:\.(?<rnd>RN|RM|RP|RZ))?"
110 | //round = "(?:\.(?<round>ROUND|FLOOR|CEIL|TRUNC))?"
111 | //r8    = qr"(?<r8neg>\-)?(?<r8abs>\|)?(?<r8>$reg)\|?(?:\.(?<r8part>H0|H1|B1|B2|B3))?(?<reuse1>\.reuse)?"
112 | //r20   = qr"(?<r20neg>\-)?(?<r20abs>\|)?(?<r20>$reg)\|?(?:\.(?<r20part>H0|H1|B1|B2|B3))?(?<reuse2>\.reuse)?"
113 | 
114 | 
115 | //--:-:-:-:1      XMAD.MRG x, a, b.H1, RZ;
116 | //--:-:-:-:6      XMAD result, a.H1, b.H1, c;
117 | //--:-:-:-:1      XMAD.PSL.CBCC result, a.H1, x.H1, result;
118 | 
119 | // Read the thread and block indices (used below to build the output offsets)
120 | 
121 | --:-:1:-:1      S2R tid, SR_TID.X;
122 | --:-:2:-:2      S2R bid, SR_CTAID.X;
123 | 
124 | 
125 | 
126 | // Take the difference of clocks
127 | --:-:-:-:1      IADD clock1, clock2, -clock1;
128 | 
129 | // Setup our output addresses
130 | // Stall your pipeline dependencies properly
131 | 03:-:-:-:1      XMAD tid, blockDim, bid, tid;
132 | --:-:-:Y:6      XMAD.MRG x, blockDim, bid.H1, RZ;
133 | --:-:-:Y:6      XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid;
134 | --:-:-:Y:6      SHL  tid, tid, 0x2;
135 | 
136 | --:-:-:-:1      IADD clocks, clocks, tid;
137 | --:-:-:-:1      IADD out,  out,  tid;
138 | 
139 | // Output the results.
140 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values
141 | --:-:-:-:1      STG.E [clocks], result2;
142 | --:-:-:-:1      STG.E [out],    result;
143 | --:-:-:-:5      EXIT;
144 | 
145 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/pm_to_blib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/pm_to_blib


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/batched_gemm.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/batched_gemm.xlsx


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/cublas_sgemm.ptx:
--------------------------------------------------------------------------------
 1 | .version 4.1
 2 | .target sm_50
 3 | .address_size 64
 4 | 
 5 | // ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx
 6 | 
 7 | // You can use maxas to insert cublas_device.lib code into a cubin built from this ptx:
 8 | 
 9 | // From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib
10 | 
11 | // cuobjdump -lelf cublas_device.lib | find "sm_50"
12 | 
13 | // cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib
14 | 
15 | // maxas -l maxwell_sgemm.asm.sm_50.cubin
16 | 
17 | // maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass
18 | // maxas -e -k maxwell_sgemm_128x64_nt  maxwell_sgemm_128x64_nt.sass
19 | 
20 | // maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin
21 | // maxas -i maxwell_sgemm_128x64_nt.sass  cublas_sgemm.cubin
22 | 
23 | // The sgemm.cpp code makes use of this cubin to benchmark the kernels outside of cublas.
24 | 
25 | .visible .entry maxwell_sgemm_128x128_nt(  // same name as the kernel the header comments extract with "maxas -e -k"
26 | 	.param .u64 .ptr.global.align 8 param_A,
27 | 	.param .u64 .ptr.global.align 8 param_B,
28 | 	.param .u64 .ptr.global.align 8 param_C,
29 | 	.param .s32 param_lda,
30 | 	.param .s32 param_ldb,
31 | 	.param .s32 param_ldc,
32 | 	.param .s32 param_k,
33 | 	.param .u64 .ptr.global.align 8 param_Alpha,
34 | 	.param .u64 .ptr.global.align 8 param_Beta,
35 | 	.param .s32 param_alpha,
36 | 	.param .s32 param_beta,
37 | 	.param .s32 param_flag
38 | )
39 | .reqntid 256  // required block size: 256 threads
40 | {
41 | 	.shared .align 16 .b8 share[16384];  // 16384 bytes of shared memory, 16-byte aligned
42 | 
43 | 	ret;  // the body only returns; the header comments describe inserting the real SASS with "maxas -i"
44 | }
45 | 
46 | .visible .entry maxwell_sgemm_128x64_nt(  // same parameter list as maxwell_sgemm_128x128_nt
47 | 	.param .u64 .ptr.global.align 8 param_A,
48 | 	.param .u64 .ptr.global.align 8 param_B,
49 | 	.param .u64 .ptr.global.align 8 param_C,
50 | 	.param .s32 param_lda,
51 | 	.param .s32 param_ldb,
52 | 	.param .s32 param_ldc,
53 | 	.param .s32 param_k,
54 | 	.param .u64 .ptr.global.align 8 param_Alpha,
55 | 	.param .u64 .ptr.global.align 8 param_Beta,
56 | 	.param .s32 param_alpha,
57 | 	.param .s32 param_beta,
58 | 	.param .s32 param_flag
59 | )
60 | .reqntid 128  // required block size: 128 threads
61 | {
62 | 	.shared .align 16 .b8 share[12288];  // 12288 bytes of shared memory, 16-byte aligned
63 | 
64 | 	ret;  // the body only returns; the header comments describe inserting the real SASS with "maxas -i"
65 | }
66 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/new.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/new.cubin


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/sgemm.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | // Note this file isn't configured to automatically compile.
  3 | // Here's how:
  4 | 
  5 | // If you want to look at the ptx first:
  6 | // nvcc -arch sm_50 -m 32 -ptx sgemm.cu
  7 | 
  8 | // Manually compile your kernel to a cubin.
  9 | // You should only have to do this once, unless you change params or shared size or globals:
 10 | // nvcc -arch sm_50 -m 32 -cubin sgemm.cu
 11 | 
 12 | // If tweaking a kernel or writing a new one based on this shell code you would then do this:
 13 | // maxas.pl -e kernel.cubin kernel.sass
 14 | 
 15 | // I've already included a modified kernel (sgemm.sass) so the next step is..
 16 | 
 17 | // Splice the manually assembled code back into the cubin:
 18 | // maxas.pl -i sgemm.sass sgemm.cubin
 19 | 
 20 | #include <device_functions.h>
 21 | #include <device_launch_parameters.h>
 22 | #include <cuda_texture_types.h>
 23 | #include <texture_fetch_functions.h>
 24 | 
 25 | typedef texture<float4, cudaTextureType1D, cudaReadModeElementType> floatTex;
 26 | 
 27 | floatTex  texA(0, cudaFilterModePoint, cudaAddressModeBorder);
 28 | floatTex  texB(0, cudaFilterModePoint, cudaAddressModeBorder);
 29 | 
 30 | // Use extern C so C++ doesn't mangle our kernel name
 31 | extern "C"
 32 | // This kernel requires 256x1x1 threads per block
 33 | __global__ void __launch_bounds__(256) sgemm_kernel_128(
 34 | 	float *C,
 35 | 	const int m,   const int n,   const int k,
 36 | 	const int lda, const int ldb, const int ldc,
 37 | 	float alpha, int *D)  // m, n, k, lda, ldb, ldc, alpha and D are not read by this shell body
 38 | {
 39 | 	// Declare any shared memory your kernel requires
 40 | 	// Or you could just pass the amount in as a param to cuLaunchKernel
 41 | 	__shared__ float4 share[1024];
 42 | 
 43 | 	int tid = threadIdx.x;
 44 | 
 45 | 	// If you use indirect texture references, they will be passed as params at the end of the param list
 46 | 	// So set that up here to make sure they're available in your kernel
 47 | 	floatTex tex = tid > 127 ? texB : texA;  // threads 0-127 pick texA, threads above 127 pick texB
 48 | 
 49 | 	// Make use of shared and your textures so it doesn't get optimized away
 50 | 	share[tid] = tex1Dfetch(tex, tid);
 51 | 
 52 | 	__syncthreads();
 53 | 
 54 | 	// output something so your setup isn't optimized away.
 55 | 	C[tid] = share[255-tid].x;  // thread t reads the entry written by thread 255-t, hence the barrier above
 56 | }
 57 | 
 58 | extern "C"
 59 | __global__ void __launch_bounds__(64) sgemm_kernel_64(  // same parameter list as sgemm_kernel_128, launch bound of 64 threads
 60 | 	float *C,
 61 | 	const int m,   const int n,   const int k,
 62 | 	const int lda, const int ldb, const int ldc,
 63 | 	float alpha, int *D)
 64 | {
 65 | 	__shared__ float4 share[512];
 66 | 
 67 | 	int tid = threadIdx.x;
 68 | 
 69 | 	floatTex tex = tid > 127 ? texB : texA;  // NOTE(review): never true for tid < 64, so texB is never selected within the launch bound
 70 | 
 71 | 	share[tid] = tex1Dfetch(tex, tid);
 72 | 
 73 | 	__syncthreads();
 74 | 
 75 | 	C[tid] = share[255-tid].x;  // NOTE(review): a 64-thread block reads entries 192-255, which no thread wrote; presumably harmless because sgemm64.sass is spliced over this body - confirm
 76 | }
 77 | 
 78 | // A note about using the Cuda Runtime.
 79 | // If that's your preference over the driver API then here's what you'd do:
 80 | 
 81 | // In your project properties in the Cuda C/C++ panel:
 82 | //    -Set the "Keep Processed Files" (-keep) option
 83 | //    -Add a -v manually to the command line
 84 | // If compiling on command line just add -keep -v options to nvcc.
 85 | // Rebuild your solution and look in the log for these lines that follow the ptxas step:
 86 | 
 87 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda
 88 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii"
 89 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj"
 90 | 
 91 | // You just need to manually run these 3 commands (or add them to a build script)
 92 | // after you've modified the cubin generated from the preceding ptxas command.
 93 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you
 94 | // build your project (or you could manually run the linker step as well).
 95 | 
 96 | // Having done that you can call your kernel normally using the <<< >>> syntax.
 97 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway.
 98 | // With fatbin you can also keep non-maxwell optimized versions of your code.
 99 | 
100 | 
101 | // I just discovered this also works as a shortcut to the above:
102 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu
103 | 
104 | // The cu kernel definitions above need to have empty bodies.
105 | // And, the cu file must be compiled to a lib separately before linking.


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/sgemm.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/sgemm.cubin


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/sgemm.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | use strict;
  3 | 
  4 | my $CU_AD_FORMAT_UNSIGNED_INT8  = 0x01;  # format codes handed to sgemm.exe as its third argument
  5 | my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02;
  6 | my $CU_AD_FORMAT_FLOAT          = 0x20;
  7 | 
  8 | if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9])  # stat field 9 is mtime: rebuild when the preprocessed file is missing or older than its source
  9 | {
 10 |     print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`;
 11 |     exit if $?;  # stop as soon as a maxas.pl step reports a non-zero status
 12 |     print `maxas.pl -i sgemm128.sass sgemm.cubin`;
 13 |     exit if $?;
 14 |     print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`;
 15 | }
 16 | if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9])  # same staleness check for the 64 variant
 17 | {
 18 |     print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`;
 19 |     exit if $?;
 20 |     print `maxas.pl -i sgemm64.sass sgemm.cubin`;
 21 |     exit if $?;
 22 |     print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`;
 23 | }
 24 | 
 25 | #print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2);
 26 | 
 27 | `Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`;  # output is discarded (presumably a warm-up run - confirm)
 28 | 
 29 | print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`;
 30 | exit;  # the script stops here, so the size sweep below never runs as written
 31 | 
 32 | my %data;  # maps matrix size N to the list of GFLOPS figures parsed for that size
 33 | foreach my $thread128 (4 .. 64)
 34 | {
 35 |     my $N = $thread128 * 128;
 36 | 
 37 |     my $iterations = int(20 * (64 * 128)**3 / $N**3);  # scales with 1/N**3; equals 20 at the largest size (N = 64 * 128)
 38 |     $iterations = 10000 if $iterations > 10000;  # upper bound for the small sizes
 39 | 
 40 |     print "$N $iterations\n";
 41 | 
 42 |     my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`;
 43 | 
 44 |     foreach my $bench (split "\n", $data)
 45 |     {
 46 |         if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /)  # matches lines such as "Max64  GFLOPS: 1377.38 (size: ..."
 47 |         {
 48 |             push @{$data{$N}}, $2;  # figures are kept in the order sgemm.exe printed them
 49 |             print "$1 $2\n";
 50 |         }
 51 |     }
 52 | }
 53 | print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n";  # NOTE(review): the rows below print only the GFLOPS values, not the size column
 54 | 
 55 | foreach my $N (sort { $a <=> $b } keys %data)
 56 | {
 57 |     print join("\t", @{$data{$N}}), "\n";
 58 | }
 59 | 
 60 | 
 61 | #print $data;
 62 | 
 63 | __END__
 64 | 
 65 | 
 66 | 64 * 128 * 16 * 1.620 * .931 / 520
 67 | 
 68 | Max64  GFLOPS: 1377.38 (size: 256, iterations: 2000)
 69 | Max128 GFLOPS: 973.70 (size: 256, iterations: 2000)
 70 | Cub64  GFLOPS: 1272.42 (size: 256, iterations: 2000)
 71 | Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000)
 72 | 
 73 | my @data = grep /\S/, split "\n", $data;
 74 | 
 75 | my $min;
 76 | my %smData;
 77 | my @sdata;
 78 | foreach (@data)
 79 | {
 80 |     next if /GFLOPS/;
 81 | 
 82 |     my ($sm, $clock, $by, $bx) = split /\s+/;
 83 | 
 84 |     $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm};
 85 | 
 86 |     $min = $clock if !$min || $clock < $min;
 87 | 
 88 |     push @sdata, [$sm, $clock, $by, $bx];
 89 | }
 90 | 
 91 | foreach (@sdata)
 92 | {
 93 |     $_->[1] -= $smData{$_->[0]};
 94 | }
 95 | 
 96 | foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata)
 97 | {
 98 |     printf "%02d %8u  by: %2d bx: %2d\n", @$_;
 99 | 
100 | }
101 | 
102 | 
103 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/sgemm.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 11.00
 3 | # Visual Studio 2010
 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}"
 5 | EndProject
 6 | Global
 7 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 8 | 		Debug|Win32 = Debug|Win32
 9 | 		Release|Win32 = Release|Win32
10 | 	EndGlobalSection
11 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
12 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32
13 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32
14 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32
15 | 		{D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32
16 | 	EndGlobalSection
17 | 	GlobalSection(SolutionProperties) = preSolution
18 | 		HideSolutionNode = FALSE
19 | 	EndGlobalSection
20 | EndGlobal
21 | 


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/sgemm.vcxproj:
--------------------------------------------------------------------------------
 1 | ﻿<?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
 3 |   <ItemGroup Label="ProjectConfigurations">
 4 |     <ProjectConfiguration Include="Debug|Win32">
 5 |       <Configuration>Debug</Configuration>
 6 |       <Platform>Win32</Platform>
 7 |     </ProjectConfiguration>
 8 |     <ProjectConfiguration Include="Release|Win32">
 9 |       <Configuration>Release</Configuration>
10 |       <Platform>Win32</Platform>
11 |     </ProjectConfiguration>
12 |   </ItemGroup>
13 |   <PropertyGroup Label="Globals">
14 |     <ProjectGuid>{D571379D-3653-43CB-BE83-A6C68D392A05}</ProjectGuid>
15 |     <Keyword>Win32Proj</Keyword>
16 |     <RootNamespace>sgemm</RootNamespace>
17 |   </PropertyGroup>
18 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
19 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
20 |     <ConfigurationType>Application</ConfigurationType>
21 |     <UseDebugLibraries>true</UseDebugLibraries>
22 |     <CharacterSet>Unicode</CharacterSet>
23 |   </PropertyGroup>
24 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
25 |     <ConfigurationType>Application</ConfigurationType>
26 |     <UseDebugLibraries>false</UseDebugLibraries>
27 |     <WholeProgramOptimization>true</WholeProgramOptimization>
28 |     <CharacterSet>Unicode</CharacterSet>
29 |   </PropertyGroup>
30 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
31 |   <ImportGroup Label="ExtensionSettings">
32 |   </ImportGroup>
33 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
34 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
35 |   </ImportGroup>
36 |   <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
37 |     <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
38 |   </ImportGroup>
39 |   <PropertyGroup Label="UserMacros" />
40 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
41 |     <LinkIncremental>true</LinkIncremental>
42 |   </PropertyGroup>
43 |   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
44 |     <LinkIncremental>false</LinkIncremental>
45 |   </PropertyGroup>
46 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
47 |     <ClCompile>
48 |       <PrecompiledHeader>
49 |       </PrecompiledHeader>
50 |       <WarningLevel>Level3</WarningLevel>
51 |       <Optimization>Disabled</Optimization>
52 |       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
53 |       <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
54 |     </ClCompile>
55 |     <Link>
56 |       <SubSystem>Console</SubSystem>
57 |       <GenerateDebugInformation>true</GenerateDebugInformation>
58 |       <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
59 |       <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
60 |     </Link>
61 |   </ItemDefinitionGroup>
62 |   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
63 |     <ClCompile>
64 |       <WarningLevel>Level3</WarningLevel>
65 |       <PrecompiledHeader>
66 |       </PrecompiledHeader>
67 |       <Optimization>MaxSpeed</Optimization>
68 |       <FunctionLevelLinking>true</FunctionLevelLinking>
69 |       <IntrinsicFunctions>true</IntrinsicFunctions>
70 |       <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
71 |       <AdditionalIncludeDirectories>$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
72 |     </ClCompile>
73 |     <Link>
74 |       <SubSystem>Console</SubSystem>
75 |       <GenerateDebugInformation>true</GenerateDebugInformation>
76 |       <EnableCOMDATFolding>true</EnableCOMDATFolding>
77 |       <OptimizeReferences>true</OptimizeReferences>
78 |       <AdditionalLibraryDirectories>$(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
79 |       <AdditionalDependencies>cuda.lib;cublas.lib;%(AdditionalDependencies)</AdditionalDependencies>
80 |     </Link>
81 |   </ItemDefinitionGroup>
82 |   <ItemGroup>
83 |     <ClCompile Include="sgemm.cpp" />
84 |   </ItemGroup>
85 |   <ItemGroup>
86 |     <None Include="sgemm128.sass" />
87 |     <None Include="sgemm64.sass" />
88 |   </ItemGroup>
89 |   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
90 |   <ImportGroup Label="ExtensionTargets">
91 |   </ImportGroup>
92 | </Project>


--------------------------------------------------------------------------------
/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin


--------------------------------------------------------------------------------
/Assembler/PascalAs/t/MaxAs-MaxAs.t:
--------------------------------------------------------------------------------
1 | use strict;
2 | use warnings;
3 | 
4 | use Test::More tests => 1;  # the plan declares exactly one test
5 | BEGIN { use_ok('MaxAs::MaxAs') };  # smoke test: passes when the module loads
6 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/Makefile:
--------------------------------------------------------------------------------
 1 | BINS := sconv_fprop_K64_N64 sconv_bprop_C64_N64 sconv_update_C128_K128 \
 2 |   sconv_bprop_C1_N64 sconv_fprop_K128_N128 sconv_bprop_C128_N128
 3 | TARGETS := $(addsuffix .cubin, $(BINS))
 4 | TEMPLATES := $(addsuffix _template.cubin, $(BINS))
 5 | 
 6 | all: $(BINS) sconv_fprop sconv_bprop sconv_update
 7 | # Each kernel: compile the .cu shell to a template cubin, then run KeplerAs.pl -i with the matching .sass.
 8 | $(BINS):
 9 | 	nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin
10 | 	KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin
11 | 
12 | sconv_fprop: sconv_fprop.cu
13 | 	nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart
14 | 
15 | sconv_bprop: sconv_bprop.cu
16 | 	nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart
17 | 
18 | sconv_update: sconv_update.cu
19 | 	nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart
20 | 
21 | clean:
22 | 	rm -f $(TARGETS) $(TEMPLATES) sconv_fprop sconv_bprop sconv_update
23 | 
24 | # all, clean and the per-kernel goals never create a file of their own name, so mark them phony.
25 | .PHONY: all clean $(BINS)
26 | 
27 | # utils: "make print-VAR" reports the flavor and value of VAR
28 | print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true
29 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv.h:
--------------------------------------------------------------------------------
 1 | #include <vector>
 2 | #include <string>
 3 | #include <map>
 4 | #include <cuda.h>
 5 | #include <iostream>
 6 | #include <sstream>
 7 | #include <stdlib.h>
 8 | #include <math.h>
 9 | 
10 | std::map<std::string, CUfunction> nervana_kernels;
11 | std::vector<CUmodule> nervana_modules;
12 | 
13 | // Bit length of n (0 for n == 0); n is treated as unsigned so 0xffffffff gives 32, not 1.
14 | int len_d2b(int n) {
15 |   int bits = 0;
16 |   for (unsigned int v = static_cast<unsigned int>(n); v != 0; v >>= 1)
17 |     ++bits;
18 |   return bits;
19 | }
20 | 
21 | // With nc = ((nmax + 1) / d) * d - 1, finds the first p in [0, 2 * bits(nmax)] where
22 | // 2^p > nc * (d - 1 - (2^p - 1) % d), plus its m; m stays unset if there is none.
23 | void magic32(unsigned int nmax, unsigned int d, unsigned int& m, unsigned int& p) {
24 |   unsigned long long nc1 = ((nmax + 1ULL) / d) * d;  // nc + 1, in 64 bits so nmax + 1 cannot wrap
25 |   unsigned int nbits = len_d2b(nmax);
26 |   std::cout << "nbits " << nbits << std::endl;
27 |   for(p = 0; p < 2 * nbits + 1 && p < 64; p++) {  // p < 64 keeps the shift below defined
28 |     unsigned long long pow2 = 1ULL << p, rem = d - 1 - (pow2 - 1) % d;  // exact integers instead of pow()
29 |     if(pow2 + rem > nc1 * rem) {  // same test as 2^p > nc * rem, without needing a negative nc
30 |       m = (unsigned int)((pow2 + rem) / d);
31 |       std::cout << "m " << m << std::endl;
32 |       std::cout << "p " << p << std::endl;
33 |       return;
34 |     }
35 |   }
36 | }
37 | 
38 | void magic64(unsigned int d, unsigned int& magic, unsigned int& shift) {
39 |   // Picks the value range for d, runs magic32() on it, then rebases the shift.
40 |   // Divisor 3 is special: its magic number only ends up in the high bits when
41 |   // the range is 0xffffffff. That range cannot be used for every divisor
42 |   // because some of them would then return a 33 bit magic number.
43 |   const unsigned long range_max = (d == 3) ? 0xffffffff : 0x7fffffff;
44 |   magic32(range_max, d, magic, shift);
45 | 
46 |   // A magic number of 1 leaves the shift exactly as magic32() produced it.
47 |   if(magic == 1)
48 |     return;
49 | 
50 |   shift -= 32;
51 | }
52 | 
53 | bool load_kernels(const char* const base_path_cstr) {
54 |     // Kept as a plain array: the original author notes that an nvcc bug
55 |     // (bug report filed) prevents using a vector<string> here.
56 |     const int NUM_KERNELS = 6;
57 |     std::string names[NUM_KERNELS] = {
58 |         "sconv_fprop_K64_N64",
59 |         "sconv_fprop_K128_N128",
60 |         "sconv_bprop_C128_N128",
61 |         "sconv_bprop_C64_N64",
62 |         "sconv_bprop_C1_N64",
63 |         "sconv_update_C128_K128"
64 |     };
65 | 
66 |     const std::string prefix(base_path_cstr);
67 | 
68 |     for (int idx = 0; idx != NUM_KERNELS; ++idx) {
69 |         const std::string& name = names[idx];
70 |         // Kernels already present in nervana_kernels are not loaded again.
71 |         if (nervana_kernels.find(name) != nervana_kernels.end())
72 |             continue;
73 | 
74 |         // The module file is <base path><kernel name>.cubin.
75 |         const std::string cubin_path = prefix + name + ".cubin";
76 |         CUmodule mod;
77 |         CUresult status = cuModuleLoad(&mod, cubin_path.c_str());
78 |         if (status != CUDA_SUCCESS) {
79 |             std::cerr << "Failed to load: " << name << " " << status << std::endl;
80 |             return false;
81 |         }
82 |         nervana_modules.push_back(mod);
83 | 
84 |         // The function is looked up under the same name as the cubin file.
85 |         CUfunction fn;
86 |         status = cuModuleGetFunction(&fn, mod, name.c_str());
87 |         if (status != CUDA_SUCCESS) {
88 |             std::cerr << "Failed to extract: " << name << " " << status << std::endl;
89 |             return false;
90 |         }
91 | 
92 |         nervana_kernels[name] = fn;
93 |     }
94 | 
95 |     return true;
96 | }
97 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu:
--------------------------------------------------------------------------------
 1 | extern "C"
 2 | __global__ void sconv_bprop_C128_N128 (  // shell kernel: the Makefile compiles it to <name>_template.cubin and runs KeplerAs.pl -i with the .sass
 3 |   float* param_test,
 4 |   float* param_O,
 5 |   const float* param_I,
 6 |   const float* param_F,
 7 |   float param_alpha,
 8 |   int param_N,
 9 |   int param_K,
10 |   int param_D,
11 |   int param_H,
12 |   int param_W,
13 |   int param_WN,
14 |   int param_HWN,
15 |   int param_DHWN,
16 |   int param_C,
17 |   int param_CRST,
18 |   int param_RST,
19 |   int param_RS,
20 |   int param_magic_RS,
21 |   int param_shift_RS,
22 |   int param_S,
23 |   int param_magic_S,
24 |   int param_shift_S,
25 |   int param_pad_d,
26 |   int param_pad_h,
27 |   int param_pad_w,
28 |   int param_str_d,
29 |   int param_str_h,
30 |   int param_str_w,
31 |   int param_Q,
32 |   int param_PQ,
33 |   int param_QN,
34 |   int param_PQN,
35 |   int param_MPQN,
36 |   int param_magic_Q,
37 |   int param_shift_Q,
38 |   int param_magic_PQ,
39 |   int param_shift_PQ,
40 |   int param_R,
41 |   int param_T,
42 |   int param_magic_str_w,
43 |   int param_shift_str_w,
44 |   int param_magic_str_h,
45 |   int param_shift_str_h,
46 |   int param_magic_str_d,
47 |   int param_shift_str_d) {
48 |   __shared__ float share[128 * 8 * 4 + 8];  // 4104 floats of shared memory
49 | 
50 |   int tid = threadIdx.x;
51 | 
52 |   share[tid] = 1;
53 | 
54 |   *param_O = share[127-tid];  // NOTE(review): index is negative for tid > 127; presumably moot once the .sass replaces this body - confirm
55 |   *param_test = share[127-tid];  // only param_O and param_test are touched; every other parameter is unused here
56 | }
57 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu:
--------------------------------------------------------------------------------
 1 | extern "C"
 2 | __global__ void sconv_bprop_C1_N64 (  // shell kernel: the Makefile compiles it to <name>_template.cubin and runs KeplerAs.pl -i with the .sass
 3 |     float* param_test,
 4 |     float* param_I,
 5 |     const float*  param_F,
 6 |     const float*  param_E,
 7 |     float param_alpha,
 8 |     int param_N,
 9 |     int param_K,
10 |     int param_D,
11 |     int param_H,
12 |     int param_W,
13 |     int param_WN,
14 |     int param_HWN,
15 |     int param_DHWN,
16 |     int param_C,
17 |     int param_CRST,
18 |     int param_RST,
19 |     int param_magic_RST,
20 |     int param_shift_RST,
21 |     int param_RS,
22 |     int param_magic_RS,
23 |     int param_shift_RS,
24 |     int param_S,
25 |     int param_magic_S,
26 |     int param_shift_S,
27 |     int param_pad_d,
28 |     int param_pad_h,
29 |     int param_pad_w,
30 |     int param_str_d,
31 |     int param_str_h,
32 |     int param_str_w,
33 |     int param_Q,
34 |     int param_PQ,
35 |     int param_QN,
36 |     int param_PQN,
37 |     int param_MPQN,
38 |     int param_magic_Q,
39 |     int param_shift_Q,
40 |     int param_magic_PQ,
41 |     int param_shift_PQ,
42 |     int param_CRST8,
43 |     int param_MPQN8) {
44 |       __shared__ float shared[64 * 8 * 4 * 2];  // 4096 floats of shared memory
45 | 
46 |       int tid = threadIdx.x;
47 | 
48 |       shared[tid] = 1;
49 | 
50 |       *param_I = shared[31 - tid];  // NOTE(review): index is negative for tid > 31; presumably moot once the .sass replaces this body - confirm
51 |       *param_test = shared[31 - tid];  // only param_I and param_test are touched; every other parameter is unused here
52 |     }
53 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu:
--------------------------------------------------------------------------------
 1 | extern "C"
 2 | __global__ void sconv_bprop_C64_N64 (  // shell kernel: the Makefile compiles it to <name>_template.cubin and runs KeplerAs.pl -i with the .sass
 3 |     float* param_test,
 4 |     float* param_O,
 5 |     const float* param_I,
 6 |     const float* param_F,
 7 |     float param_alpha,
 8 |     int param_N,
 9 |     int param_K,
10 |     int param_D,
11 |     int param_H,
12 |     int param_W,
13 |     int param_WN,
14 |     int param_HWN,
15 |     int param_DHWN,
16 |     int param_C,
17 |     int param_CRST,
18 |     int param_RST,
19 |     int param_RS,
20 |     int param_magic_RS,
21 |     int param_shift_RS,
22 |     int param_S,
23 |     int param_magic_S,
24 |     int param_shift_S,
25 |     int param_pad_d,
26 |     int param_pad_h,
27 |     int param_pad_w,
28 |     int param_str_d,
29 |     int param_str_h,
30 |     int param_str_w,
31 |     int param_Q,
32 |     int param_PQ,
33 |     int param_QN,
34 |     int param_PQN,
35 |     int param_MPQN,
36 |     int param_magic_Q,
37 |     int param_shift_Q,
38 |     int param_magic_PQ,
39 |     int param_shift_PQ,
40 |     int param_R,
41 |     int param_T,
42 |     int param_magic_str_w,
43 |     int param_shift_str_w,
44 |     int param_magic_str_h,
45 |     int param_shift_str_h,
46 |     int param_magic_str_d,
47 |     int param_shift_str_d) {
48 |       __shared__ float share[64 * 8 * 4 + 8];  // 2056 floats of shared memory
49 | 
50 |       int tid = threadIdx.x;
51 | 
52 |       share[tid] = 1;
53 | 
54 |       *param_O = share[63-tid];  // NOTE(review): index is negative for tid > 63; presumably moot once the .sass replaces this body - confirm
55 |       *param_test = share[63-tid];  // only param_O and param_test are touched; every other parameter is unused here
56 |     }
57 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu:
--------------------------------------------------------------------------------
 1 | extern "C"
 2 | __global__ void sconv_fprop_K128_N128 (  // shell kernel: the Makefile compiles it to <name>_template.cubin and runs KeplerAs.pl -i with the .sass
 3 | 	float* param_test,
 4 | 	float *param_O,
 5 | 	const float *param_I,
 6 | 	const float *param_F,
 7 | 	float param_alpha,
 8 | 	int param_N,
 9 | 	int param_K,
10 | 	int param_D,
11 | 	int param_H,
12 | 	int param_W,
13 | 	int param_WN,
14 | 	int param_HWN,
15 | 	int param_DHWN,
16 | 	int param_C,
17 | 	int param_KRST,
18 | 	int param_RST,
19 | 	int param_RS,
20 | 	int param_magic_RS,
21 | 	int param_shift_RS,
22 | 	int param_S,
23 | 	int param_magic_S,
24 | 	int param_shift_S,
25 | 	int param_pad_d,
26 | 	int param_pad_h,
27 | 	int param_pad_w,
28 | 	int param_str_d,
29 | 	int param_str_h,
30 | 	int param_str_w,
31 | 	int param_Q,
32 | 	int param_PQ,
33 | 	int param_QN,
34 | 	int param_PQN,
35 | 	int param_MPQN,
36 | 	int param_magic_Q,
37 | 	int param_shift_Q,
38 | 	int param_magic_PQ,
39 | 	int param_shift_PQ) {
40 | 	__shared__ float share[128 * 8 * 4 + 8];  // 4104 floats of shared memory
41 | 
42 | 	int tid = threadIdx.x;
43 | 
44 | 	share[tid] = 1;
45 | 
46 | 	*param_O = share[127-tid];  // NOTE(review): index is negative for tid > 127; presumably moot once the .sass replaces this body - confirm
47 | 	*param_test = share[127-tid];  // only param_O and param_test are touched; every other parameter is unused here
48 | }
49 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu:
--------------------------------------------------------------------------------
 1 | extern "C"
 2 | __global__ void sconv_fprop_K64_N64 (  // shell kernel: the Makefile compiles it to <name>_template.cubin and runs KeplerAs.pl -i with the .sass
 3 |     float* param_test,
 4 |     float *param_O,
 5 |     const float *param_I,
 6 |     const float *param_F,
 7 |     float param_alpha,
 8 |     int param_N,
 9 |     int param_K,
10 |     int param_D,
11 |     int param_H,
12 |     int param_W,
13 |     int param_WN,
14 |     int param_HWN,
15 |     int param_DHWN,
16 |     int param_C,
17 |     int param_KRST,
18 |     int param_RST,
19 |     int param_RS,
20 |     int param_magic_RS,
21 |     int param_shift_RS,
22 |     int param_S,
23 |     int param_magic_S,
24 |     int param_shift_S,
25 |     int param_pad_d,
26 |     int param_pad_h,
27 |     int param_pad_w,
28 |     int param_str_d,
29 |     int param_str_h,
30 |     int param_str_w,
31 |     int param_Q,
32 |     int param_PQ,
33 |     int param_QN,
34 |     int param_PQN,
35 |     int param_MPQN,
36 |     int param_magic_Q,
37 |     int param_shift_Q,
38 |     int param_magic_PQ,
39 |     int param_shift_PQ) {
40 |       __shared__ float share[64 * 8 * 4 + 8];  // 2056 floats of shared memory
41 | 
42 |       int tid = threadIdx.x;
43 | 
44 |       share[tid] = 1;
45 | 
46 |       *param_O = share[63-tid];  // NOTE(review): index is negative for tid > 63; presumably moot once the .sass replaces this body - confirm
47 |       *param_test = share[63-tid];  // only param_O and param_test are touched; every other parameter is unused here
48 |     }
49 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_update.cu:
--------------------------------------------------------------------------------
  1 | #include "sconv.h"
  2 | 
  3 | bool update(const float *I, float *F, const float *O,
  4 |   unsigned int N, unsigned int C, unsigned int K,
  5 |   unsigned int D, unsigned int H, unsigned int W,
  6 |   unsigned int R, unsigned int S, unsigned int T,
  7 |   unsigned int M, unsigned int P, unsigned int Q,
  8 |   unsigned int str_d, unsigned int str_h, unsigned int str_w,
  9 |   unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) {
 10 |   // Launches sconv_update_C128_K128 once and prints 256 floats of its debug buffer; false on alloc/launch failure.
 11 |   float alpha = 1.0f;
 12 |   unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS, PQ, QN, PQN, MPQN;
 13 |   unsigned int magic_HW, magic_W, magic_RST, magic_RS, magic_S, magic_PQu, magic_Qu;
 14 |   unsigned int shift_HW, shift_W, shift_RST, shift_RS, shift_S, shift_PQu, shift_Qu;
 15 |   unsigned int grid_P = 1, grid_Q = 1;
 16 |   unsigned int grid_PQ = grid_P * grid_Q;
 17 |   unsigned int grid_PQM = grid_PQ * M;
 18 |   // input
 19 |   WN = W * N;
 20 |   HW = H * W;
 21 |   HWN = H * WN;
 22 |   DHW = D * HW;
 23 |   DHWN = D * HWN;
 24 |   // filter
 25 |   RS = R * S;
 26 |   RST = T * RS;
 27 |   CRST = C * RST;  // was C * RS, which dropped the T factor (identical when T == 1)
 28 |   // output
 29 |   PQ = P * Q;  // was never assigned, so the kernel received an indeterminate value
 30 |   QN = Q * N;
 31 |   PQN = P * QN;
 32 |   MPQN = M * PQN;
 33 |   // magic numbers
 34 |   magic32(CRST, RST, magic_RST, shift_RST);
 35 |   magic32(RST + 32, RS, magic_RS, shift_RS);
 36 |   magic32(RS + 32, S, magic_S, shift_S);
 37 |   magic32(DHW, HW, magic_HW, shift_HW);
 38 |   magic32(HW, W, magic_W, shift_W);
 39 |   magic32(grid_PQM, grid_PQ, magic_PQu, shift_PQu);
 40 |   magic32(grid_PQ, grid_Q, magic_Qu, shift_Qu);
 41 |   std::cout << "CRST: " << CRST << std::endl;
 42 |   // test param set up: a zeroed 1024-float device buffer passed as the first kernel argument
 43 |   float *test_param = NULL;
 44 |   cudaError_t cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024);
 45 |   if (cuda_error != cudaSuccess) {
 46 |     std::cerr << "Line " << __LINE__ << " cudaMalloc error: " << cuda_error << std::endl;
 47 |     return false;
 48 |   }
 49 |   cudaMemset(test_param, 0, sizeof(float) * 1024);
 50 |   // cuLaunchKernel takes one pointer per kernel parameter, in declaration order
 51 |   void *args[43] = {&test_param, &F, &I, &O, &alpha,
 52 |     &N, &K, &D, &H, &W, &WN, &HWN, &DHWN,
 53 |     &C, &CRST,
 54 |     &RST, &magic_RST, &shift_RST,
 55 |     &RS, &magic_RS, &shift_RS,
 56 |     &S, &magic_S, &shift_S,
 57 |     &pad_d, &pad_h, &pad_w,
 58 |     &str_d, &str_h, &str_w,
 59 |     &P, &Q, &PQ, &QN, &PQN, &MPQN,
 60 |     &magic_Qu, &shift_Qu,
 61 |     &magic_PQu, &shift_PQu,
 62 |     &grid_P, &grid_Q, &grid_PQ};
 63 |   int gridX = grid_PQM;
 64 |   int gridY = CRST / 128 + (CRST % 128 != 0);  // ceil(CRST / 128)
 65 |   int gridZ = K / 128 + (K % 128 != 0);        // ceil(K / 128)
 66 |   std::string kernel_name = "sconv_update_C128_K128";
 67 |   CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 256, 1, 1,
 68 |     0, 0, args, NULL);
 69 |   if (res != CUDA_SUCCESS) {
 70 |     std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl;
 71 |     cudaFree(test_param);  // do not leak the debug buffer on the failure path
 72 |     return false;
 73 |   }
 74 |   cuCtxSynchronize();
 75 |   float* h_test = (float *)malloc(sizeof(float) * 256);
 76 |   cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 256, cudaMemcpyDeviceToHost);
 77 |   if (cuda_error != cudaSuccess) {
 78 |     std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
 79 |     exit(1);
 80 |   }
 81 |   for (int i = 0; i < 256; ++i) {
 82 |     std::cout << h_test[i] << " ";
 83 |   }
 84 |   std::cout << std::endl;
 85 |   free(h_test);          // host copy
 86 |   cudaFree(test_param);  // device debug buffer; previously never released
 87 |   return true;
 88 | }
 88 | 
 89 | int main() {
 90 |   cudaFree(0);
 91 |   float *d_I, *d_F, *d_O;
 92 |   unsigned int N = 128, C = 3, K = 128, D = 1, H = 224, W = 224, T = 1, R = 11, S = 11;
 93 |   unsigned int str_d = 1, str_h = 4, str_w = 4;
 94 |   unsigned int pad_d = 0, pad_h = 3, pad_w = 3;
 95 |   unsigned int M, P, Q;
 96 |   cudaError_t cuda_error;
 97 |   M = (D - T + 2 * pad_d) / str_d + 1;
 98 |   P = (H - R + 2 * pad_h) / str_h + 1;
 99 |   Q = (W - S + 2 * pad_w) / str_w + 1;
100 |   float *h_O = (float *)malloc(K * M * P * Q * N * sizeof(float));
101 |   for (int i = 0; i < K * M * P * Q * N; ++i) {
102 |     h_O[i] = 1;
103 |   }
104 |   float *h_I = (float *)malloc(C * D * H * W * N * sizeof(float));
105 |   for (int i = 0; i < C * D * H * W * N; ++i) {
106 |     h_I[i] = 1;
107 |   }
108 |   float* h_F = (float *)malloc(sizeof(float) * C * R * S * T * K);
109 |   // device memory
110 |   cudaMalloc((void**)&d_I, sizeof(float) * C * D * H * W * N);
111 |   cudaMalloc((void**)&d_F, sizeof(float) * C * R * S * T * K);
112 |   cudaMalloc((void**)&d_O, sizeof(float) * K * M * P * Q * N);
113 |   // memcpy h_I, h_O
114 |   cudaMemcpy(d_I, h_I, sizeof(float) * C * D * H * W * N,
115 |     cudaMemcpyHostToDevice);
116 |   cudaMemcpy(d_O, h_O, sizeof(float) * K * M * P * Q * N,
117 |     cudaMemcpyHostToDevice);
118 |   // load kernels 
119 |   if (!load_kernels("./")) {
120 |     std::cerr << "Couldn't load all kernels" << std::endl;
121 |     exit(1);
122 |   }
123 |   // launch kernel
124 |   if (!update(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) {
125 |     std::cerr << "Launch error" << std::endl;
126 |     exit(1);
127 |   }
128 |   // output
129 |   std::cout << "result" << std::endl;
130 |   cuda_error = cudaMemcpy(h_F, d_F, sizeof(float) * C * R * S * T * K, cudaMemcpyDeviceToHost);
131 |   if (cuda_error != cudaSuccess) {
132 |     std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl;
133 |     exit(1);
134 |   }
135 |   for (int i = 0; i < 128; ++i) {
136 |     std::cout << h_F[i] << " ";
137 |   }
138 |   std::cout << std::endl;
139 |   // free memory
140 |   free(h_O);
141 |   free(h_I);
142 |   free(h_F);
143 |   cudaFree(d_I);
144 |   cudaFree(d_F);
145 |   cudaFree(d_O);
146 |   // run successfully
147 |   std::cout << "finish" << std::endl;
148 |   return 0;
149 | }
150 | 


--------------------------------------------------------------------------------
/Kernel/Convolution/Kepler/sconv_update_C128_K128.cu:
--------------------------------------------------------------------------------
// Stub body for the sconv_update_C128_K128 kernel: it declares the parameter
// list and a static shared-memory array, stages a 1 through shared memory and
// stores it to the first element of param_F and param_test. None of the scalar
// parameters, param_I or param_E are read.
// NOTE(review): presumably this file only serves as a template cubin that the
// assembler fills with hand-written SASS (as the SGEMM Makefile does for the
// sgemm kernels) — confirm before changing the signature or the shared size.
extern "C"
__global__ void sconv_update_C128_K128 (
    float* param_test,
    float* param_F,
    const float* param_I,
    const float* param_E,
    float param_alpha,
    int param_N,
    int param_K,
    int param_D,
    int param_H,
    int param_W,
    int param_WN,
    int param_HWN,
    int param_DHWN,
    int param_C,
    int param_CRST,
    int param_RST,
    int param_magic_RST,
    int param_shift_RST,
    int param_RS,
    int param_magic_RS,
    int param_shift_RS,
    int param_S,
    int param_magic_S,
    int param_shift_S,
    int param_pad_d,
    int param_pad_h,
    int param_pad_w,
    int param_str_d,
    int param_str_h,
    int param_str_w,
    int param_P,
    int param_Q,
    int param_PQ,
    int param_QN,
    int param_PQN,
    int param_MPQN,
    int param_magic_Q,
    int param_shift_Q,
    int param_magic_PQ,
    int param_shift_PQ,
    int param_part_P,
    int param_part_Q,
    int param_part_PQ) {
      // Static shared-memory array of (128 * 16 + 32) * 4 + 6 floats.
      __shared__ float share[(128 * 16 + 32) * 4 + 6];

      int tid = threadIdx.x;

      share[tid] = 1;

      // NOTE(review): there is no __syncthreads() between the write above and
      // the reads below, unlike the sgemm stubs; share[255 - tid] is written by
      // a different thread. The index also assumes at most 256 threads per
      // block (the host launches 256). Every thread stores to the same address.
      *param_F = share[255 - tid];
      *param_test = share[255 - tid];
    }
55 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/Makefile:
--------------------------------------------------------------------------------
 1 | BINS := sgemm_nn_128x128 sgemm_nt_128x128 sgemm_tn_128x128 \
 2 |   sgemm_nn_128x128_vec sgemm_tn_128x128_vec sgemm_nt_128x128_vec
 3 | TARGETS := $(addsuffix .cubin, $(BINS))
 4 | TEMPLATES := $(addsuffix _template.cubin, $(BINS))
 5 | 
 6 | all: $(BINS)
 7 | 
 8 | $(BINS):
 9 | 	nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin
10 | 	KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin
11 | 
12 | clean:
13 | 	rm $(TARGETS) $(TEMPLATES)
14 | 
15 | .PHONY:
16 | 	all clean
17 | 
18 | #utils
19 | print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true           
20 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/README.md:
--------------------------------------------------------------------------------
1 | # KeplerGEMM
2 | 
3 | Faster GEMM
4 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu:
--------------------------------------------------------------------------------
// Stub kernel for sgemm_nn_128x128. The Makefile in this directory compiles
// this file to sgemm_nn_128x128_template.cubin and then passes that template,
// together with sgemm_nn_128x128.sass, to KeplerAs.pl to produce
// sgemm_nn_128x128.cubin. The C body below reads none of A, B or the scalar
// parameters; it only stages a value through shared memory into param_C.
// NOTE(review): presumably only the signature, launch bounds and shared-memory
// size matter for the template — confirm against KeplerAs.pl.
extern "C"
__global__ void __launch_bounds__(256) sgemm_nn_128x128
(
 const float* param_A,
 const float* param_B,
 float*       param_C,
 float        param_alpha,
 float        param_beta,
 int          param_lda,
 int          param_ldb8,  
 int          param_ldc,
 int          param_m,
 int          param_n,
 int          param_k
 ) {
  // Static shared-memory array of 128 * 8 * 4 + 32 floats.
  __shared__ float share[128 * 8 * 4 + 32];

  int tid = threadIdx.x;

  share[tid] = 1;

  // Barrier so the mirrored read below sees the other thread's write.
  __syncthreads();

  // Index 255 - tid assumes a block of at most 256 threads (see launch bounds).
  param_C[tid] = share[255 - tid];
}
26 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu:
--------------------------------------------------------------------------------
// Stub kernel for sgemm_nn_128x128_vec. The Makefile in this directory
// compiles this file to sgemm_nn_128x128_vec_template.cubin and then passes
// that template, together with sgemm_nn_128x128_vec.sass, to KeplerAs.pl to
// produce sgemm_nn_128x128_vec.cubin. The C body below reads none of A, B or
// the scalar parameters; it only stages a value through shared memory.
// NOTE(review): presumably only the signature, launch bounds and shared-memory
// size matter for the template — confirm against KeplerAs.pl.
extern "C"
__global__ void __launch_bounds__(256) sgemm_nn_128x128_vec
(
 const float* param_A,
 const float* param_B,
 float*       param_C,
 float        param_alpha,
 float        param_beta,
 int          param_lda,
 int          param_ldb8,  
 int          param_ldc,
 int          param_m,
 int          param_n,
 int          param_k
 ) {
  // Static shared-memory array of 128 * 8 * 4 + 32 floats.
  __shared__ float share[128 * 8 * 4 + 32];

  int tid = threadIdx.x;

  share[tid] = 1;

  // Barrier so the mirrored read below sees the other thread's write.
  __syncthreads();

  // Index 255 - tid assumes a block of at most 256 threads (see launch bounds).
  param_C[tid] = share[255 - tid];
}
26 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu:
--------------------------------------------------------------------------------
// Stub kernel for sgemm_nt_128x128. The Makefile in this directory compiles
// this file to sgemm_nt_128x128_template.cubin and then passes that template,
// together with sgemm_nt_128x128.sass, to KeplerAs.pl to produce
// sgemm_nt_128x128.cubin. The C body below reads none of A, B or the scalar
// parameters; it only stages a value through shared memory into param_C.
// NOTE(review): presumably only the signature, launch bounds and shared-memory
// size matter for the template — confirm against KeplerAs.pl.
extern "C"
__global__ void __launch_bounds__(256) sgemm_nt_128x128
(
 const float* param_A,
 const float* param_B,
 float*       param_C,
 float        param_alpha,
 float        param_beta,
 int          param_lda,
 int          param_ldb,
 int          param_ldc,
 int          param_m,
 int          param_n,
 int          param_k
 ) {
  // Static shared-memory array of 128 * 8 * 4 + 32 floats.
  __shared__ float share[128 * 8 * 4 + 32];

  int tid = threadIdx.x;

  share[tid] = 1;

  // Barrier so the mirrored read below sees the other thread's write.
  __syncthreads();

  // Index 255 - tid assumes a block of at most 256 threads (see launch bounds).
  param_C[tid] = share[255 - tid];
}
26 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu:
--------------------------------------------------------------------------------
// Stub kernel for sgemm_nt_128x128_vec. The Makefile in this directory
// compiles this file to sgemm_nt_128x128_vec_template.cubin and then passes
// that template, together with sgemm_nt_128x128_vec.sass, to KeplerAs.pl to
// produce sgemm_nt_128x128_vec.cubin. The C body below reads none of A, B or
// the scalar parameters; it only stages a value through shared memory.
// NOTE(review): presumably only the signature, launch bounds and shared-memory
// size matter for the template — confirm against KeplerAs.pl.
extern "C"
__global__ void __launch_bounds__(256) sgemm_nt_128x128_vec
(
 const float* param_A,
 const float* param_B,
 float*       param_C,
 float        param_alpha,
 float        param_beta,
 int          param_lda,
 int          param_ldb,
 int          param_ldc,
 int          param_m,
 int          param_n,
 int          param_k
 ) {
  // Static shared-memory array of 128 * 8 * 4 + 32 floats.
  __shared__ float share[128 * 8 * 4 + 32];

  int tid = threadIdx.x;

  share[tid] = 1;

  // Barrier so the mirrored read below sees the other thread's write.
  __syncthreads();

  // Index 255 - tid assumes a block of at most 256 threads (see launch bounds).
  param_C[tid] = share[255 - tid];
}
26 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu:
--------------------------------------------------------------------------------
// Stub kernel for sgemm_tn_128x128. The Makefile in this directory compiles
// this file to sgemm_tn_128x128_template.cubin and then passes that template,
// together with sgemm_tn_128x128.sass, to KeplerAs.pl to produce
// sgemm_tn_128x128.cubin. The C body below reads none of A, B or the scalar
// parameters; it only stages a value through shared memory into param_C.
// NOTE(review): presumably only the signature, launch bounds and shared-memory
// size matter for the template — confirm against KeplerAs.pl.
extern "C"
__global__ void __launch_bounds__(256) sgemm_tn_128x128
(
 const float* param_A,
 const float* param_B,
 float*       param_C,
 float        param_alpha,
 float        param_beta,
 int          param_lda8,
 int          param_ldb8,  
 int          param_ldc,
 int          param_m,
 int          param_n,
 int          param_k
 ) {
  // Static shared-memory array of 128 * 8 * 4 + 32 floats.
  __shared__ float share[128 * 8 * 4 + 32];

  int tid = threadIdx.x;

  share[tid] = 1;

  // Barrier so the mirrored read below sees the other thread's write.
  __syncthreads();

  // Index 255 - tid assumes a block of at most 256 threads (see launch bounds).
  param_C[tid] = share[255 - tid];
}
26 | 


--------------------------------------------------------------------------------
/Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu:
--------------------------------------------------------------------------------
// Stub kernel for sgemm_tn_128x128_vec. The Makefile in this directory
// compiles this file to sgemm_tn_128x128_vec_template.cubin and then passes
// that template, together with sgemm_tn_128x128_vec.sass, to KeplerAs.pl to
// produce sgemm_tn_128x128_vec.cubin. The C body below reads none of A, B or
// the scalar parameters; it only stages a value through shared memory.
// NOTE(review): presumably only the signature, launch bounds and shared-memory
// size matter for the template — confirm against KeplerAs.pl.
extern "C"
__global__ void __launch_bounds__(256) sgemm_tn_128x128_vec
(
 const float* param_A,
 const float* param_B,
 float*       param_C,
 float        param_alpha,
 float        param_beta,
 int          param_lda8,
 int          param_ldb8,  
 int          param_ldc,
 int          param_m,
 int          param_n,
 int          param_k
 ) {
  // Static shared-memory array of 128 * 8 * 4 + 32 floats.
  __shared__ float share[128 * 8 * 4 + 32];

  int tid = threadIdx.x;

  share[tid] = 1;

  // Barrier so the mirrored read below sees the other thread's write.
  __syncthreads();

  // Index 255 - tid assumes a block of at most 256 threads (see launch bounds).
  param_C[tid] = share[255 - tid];
}
26 | 
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DeepPerf
 2 | 
 3 | DeepPerf is developed to understand GPU microarchitectural features and improve performance for compute-intensive kernels. The methodology relies on a reverse engineering approach to crack the GPU ISA encodings in order to build a GPU assembler. An assembly microbenchmark suite correlates microarchitectural features with their performance factors to uncover instruction-level and memory hierarchy preferences.
 4 | We use SGEMM and Convolution as examples to show how to achieve bare-metal performance tuning. In your deep learning framework, you can use this SASS code directly to improve performance.
 5 | 
 6 | The toolchain is an attempt to automatically crack different GPU ISA encodings and build an assembler adaptively for the purpose of performance enhancements to applications on GPUs.
 7 | There are three directories in this folder, corresponding to the three major steps of optimizing CUDA code at the assembly level. All the tools cover three recent NVIDIA GPU architectures: Kepler, Maxwell and Pascal.
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/Solver/.gitignore:
--------------------------------------------------------------------------------
1 | data/*
2 | output/*
3 | 


--------------------------------------------------------------------------------
/Solver/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Cracking GPU ISA Encodings
 3 | 
 4 | ## Output
 5 | 
 6 | * Bit positions of opcodes
 7 | * Bit positions of operands for different operand type
 8 | * Bit positions of modifiers for each instruction
 9 | 
10 | ## How to run the workflow?
11 | 
12 | The workflow is composed of four stages:
13 | 
14 | 1. Generate PTX code->`./bin/generate_disassemble [arch]`
15 |     * Generate PTX code (.ptx) in ptxgen directory and compile PTX to cubin; 
16 |     * Disassemble cubins to sass files, which feed into the following three solvers;
17 |     * Each line of sass files looks like this:
18 |     
19 |     `/∗0048∗/ IADD R0, R2, R0; /∗0x4800000000201c03∗/`
20 |     
21 | 2. Opcode solver->`./bin/opcode [arch]`
22 |     * Probe 64-bit binary code of sass files by flipping each bit and observe whether opcodes change;
23 |     
24 | 3. Modifier solver->`./bin/modifier [arch]`
25 |     * Probe 64-bit binary code of sass files by flipping each bit and observe whether modifiers change;
26 |     * Enumerate bits on all modifier positions to generate all the modifiers;
27 |     
28 | 4. Operand solver->`./bin/operand [arch]`
29 |     * Probe 64-bit binary code of sass files by flipping each bit and observe whether operands change;
30 |     * Operand type: R: Register, S: Special Register, I: Immediate, C: constant[][], M: Memory, P: Predicate;
31 | 
32 | 5. Allowed values for `[arch]` options:  'sm_30','sm_32','sm_35','sm_37','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62'.
33 | 


--------------------------------------------------------------------------------
/Solver/bin/generate_disassemble:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Please input architecture parameter argv[1]
 4 | if [ "$#" -lt 1 ]
 5 | then
 6 |     echo "Please input architecture parameter argv[1]"
 7 |     exit -1
 8 | fi
 9 | 
10 | arch=$1
11 | prefix="data/"$arch"/"
12 | src_directory="src/"
13 | 
14 | echo "Arch: "$arch
15 | echo "Data directory: "$prefix
16 | 
17 | # 1. Generate ptx
18 | ptx_directory=$prefix"ptx/"
19 | mkdir -p $ptx_directory
20 | echo ".................................................................."
21 | echo "1. Generate .ptx files to "$ptx_directory" directory"
22 | echo "It may take serveral miniutes"
23 | echo ".................................................................."
24 | perl $src_directory"ptxgen.pl" $arch $ptx_directory
25 | 
26 | # 2. Compile to cubins
27 | cubin_directory=$prefix"cubin/"
28 | mkdir -p $cubin_directory
29 | echo ".................................................................."
30 | echo "2. Compile .ptx file to cubin files in "$cubin_directory" directory"
31 | echo "It may take serveral miniutes"
32 | echo ".................................................................."
33 | ptx=$ptx_directory"*.ptx"
34 | for p in $ptx
35 | do
36 |      f=`echo $p | cut -d / -f 4 |cut -d . -f 1` 
37 |      fout=$cubin_directory""$f".cubin"
38 |      echo $fout
39 |      ptxas -arch $arch -m 64 $p -o $fout > /dev/null 2>&1
40 | done
41 | 
42 | # 3. Disassembly to sass
43 | asm_directory=$prefix"asm/"
44 | mkdir -p $asm_directory
45 | echo ".................................................................."
46 | echo "3. Disassemble .cubin file to sass files in "$asm_directory" directory"
47 | echo "It may take serveral miniutes"
48 | echo ".................................................................."
49 | cubin=$cubin_directory"*.cubin"
50 | for p in $cubin
51 | do
52 |      f=`echo $p | cut -d / -f 4 | cut -d . -f 1`
53 |      fout=$asm_directory""$f".sass"
54 |      echo $fout
55 |      cuobjdump --gpu-architecture $arch --dump-sass $p > $fout
56 | done
57 | 
58 | # 4.Put all sass results in one file
59 | echo ".................................................................."
60 | echo "4. Gathering results from ptxgen"
61 | echo ".................................................................."
62 | asm=$asm_directory"*.sass"
63 | if [ -f /tmp/all.sass ]
64 | then
65 |     rm /tmp/all.sass
66 | else
67 |     touch /tmp/all.sass
68 | fi
69 | 
70 | for f in $asm
71 | do
72 |     cat $f >> /tmp/all.sass
73 | done
74 | 
75 | # Ignore non-instruction lines
76 | awk '{if (NF >= 5) {$1 = ""; print $0} }' /tmp/all.sass > /tmp/all_inst.sass
77 | # Make instruction uniq
78 | python $src_directory"unique.py" /tmp/all_inst.sass > $prefix""$arch".sass"
79 | # Generate test cubin
80 | nvcc -cubin -arch $arch $src_directory"test.cu" -o $prefix""$arch".cubin"
81 | 
82 | rm /tmp/all.sass /tmp/all_inst.sass
83 | 


--------------------------------------------------------------------------------
/Solver/bin/modifier:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Please input architecture parameter argv[1]
 4 | if [ "$#" -lt 1 ]
 5 | then
 6 |     echo "Please input architecture parameter argv[1]"
 7 |     exit -1
 8 | fi
 9 | 
10 | arch=$1
11 | prefix="data/"$arch"/"
12 | src_directory="src/"
13 | asm_directory=$prefix
14 | output_directory="output/"$arch"/"
15 | output_file=$output_directory""$arch".modifier"
16 | mkdir -p $output_directory
17 | rm -rf $output_file || true
18 | echo "Output file: "$output_file
19 | python $src_directory"modifier.py" $asm_directory""$arch".sass" $arch $output_file
20 | 


--------------------------------------------------------------------------------
/Solver/bin/opcode:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Please input architecture parameter argv[1]
 4 | if [ "$#" -lt 1 ]
 5 | then
 6 |     echo "Please input architecture parameter argv[1]"
 7 |     exit -1
 8 | fi
 9 | 
10 | arch=$1
11 | prefix="data/"$arch"/"
12 | src_directory="src/"
13 | asm_directory=$prefix
14 | output_directory="output/"$arch"/"
15 | output_file=$output_directory""$arch".opcode"
16 | mkdir -p $output_directory
17 | rm -rf $output_file || true
18 | echo "Output file: "$output_file
19 | python $src_directory"opcode.py" $asm_directory""$arch".sass" $arch $output_file
20 | 


--------------------------------------------------------------------------------
/Solver/bin/operand:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Please input architecture parameter argv[1]
 4 | if [ "$#" -lt 1 ]
 5 | then
 6 |     echo "Please input architecture parameter argv[1]"
 7 |     exit -1
 8 | fi
 9 | 
10 | arch=$1
11 | prefix="data/"$arch"/"
12 | src_directory="src/"
13 | asm_directory=$prefix
14 | output_directory="output/"$arch"/"
15 | output_file=$output_directory""$arch".operand"
16 | mkdir -p $output_directory
17 | rm -rf $output_file || true
18 | echo "Output file: "$output_file
19 | python $src_directory"operand.py" $asm_directory""$arch".sass" $arch $output_file
20 | 


--------------------------------------------------------------------------------
/Solver/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Solver/src/__init__.py


--------------------------------------------------------------------------------
/Solver/src/dumper.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import struct
 3 | 
 4 | def arch2mode(arch):
 5 |     return arch.replace("_", "").upper()
 6 | 
 7 | def dump(newcode, arch):
 8 |     version = arch.split("_")[1]
 9 |     if version < 40:
10 |         tmp_bin = "/tmp/tmp_dumper.bin"
11 |         fout = open(tmp_bin, "wb")
12 |         fout.write(struct.pack("<Q", int(newcode, 16)))
13 |         fout.close()
14 |         cmd = "nvdisasm -b {0} {1} 2>&1".format(arch2mode(arch), tmp_bin)
15 |         tmp_read = os.popen(cmd).read()
16 |         rmfile = "rm {0}".format(tmp_bin)
17 |         os.system(rmfile)
18 |         return tmp_read
19 |     else:
20 |         tmp_cubin = "data/" + arch + "/" + arch + ".cubin"
21 |         f = open(tmp_cubin,'rb+')  
22 |         f.seek(904)
23 |         f.write(struct.pack('Q', int(newcode, 16)))
24 |         f.close()
25 |         cmd = "cuobjdump -arch {0} -sass {1} 2>&1".format(arch, tmp_cubin)
26 |         tmp_read = os.popen(cmd).read()
27 |         return tmp_read
28 | 


--------------------------------------------------------------------------------
/Solver/src/enumerator.py:
--------------------------------------------------------------------------------
 1 | from dumper import dump
 2 | import logging
 3 | 
 4 | def enumerate(base, pos, arch):
 5 |     version = int(arch.split("_")[1])
 6 |     for i in range(1 << len(pos)):
 7 |         bits = 0x0
 8 |         enc = base
 9 |         # Expresss i in binary
10 |         for j in range(len(pos)):
11 |             bits = (((i >> j) & 0x1) << pos[j]) | bits
12 |             enc = enc & (~(1 << pos[j]))
13 |         dump_file = dump("0x{:016x}".format(enc | bits), arch)
14 |         if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1 and dump_file.find("INVALID") == -1:
15 |             line = dump_file.split("\n")
16 |             if version < 40:
17 |                 line_inst = line[1].split();
18 |             else:
19 |                 line_inst = line[5].split();
20 |             line_inst.pop(0)
21 |             logging.info("0b{:064b}".format(bits) + ": " + " ".join(line_inst))
22 | 


--------------------------------------------------------------------------------
/Solver/src/inst.py:
--------------------------------------------------------------------------------
 1 | from sets import Set
 2 | 
 3 | class Inst:
 4 |     def __init__(self, inst, raw = True):
 5 |         # Fetech binary encoding
 6 |         if raw == True: # From cuobjdump
 7 |             self.__enc = inst[-2]
 8 |             inst.pop(-1)
 9 |             inst.pop(-1)
10 |             inst.pop(-1)
11 |         else: # From nvdisasm
12 |             self.__enc = ""
13 | 
14 |         if inst[0] == '{':  # Check dual issue
15 |             inst.pop(0)
16 |             self.__pred = ""
17 |         if inst[0].find('@') != -1:  # Check predicate, such as @P0
18 |             self.__pred = inst.pop(0)
19 | 
20 |         # Remove semicolon of zero operand field instruction such as "RRO;" 
21 |         ops = inst.pop(0).replace(";", "")
22 |         # Fetech opcode
23 |         self.__op = ops.split(".")[0]
24 |         # Split opcode
25 |         self.__modifier = ops.split(".")[1:]
26 |         # Fetech operands and remove ; and ,
27 |         self.__operands = ' '.join(inst).replace(";", "").replace(",", "").replace("-","").replace("|","")
28 | 
29 |     def op(self):
30 |         return str(self.__op)
31 | 
32 |     def modifier(self):
33 |         return str(self.__modifier)
34 | 
35 |     def enc(self):
36 |         return str(self.__enc)
37 | 
38 |     def operands(self):
39 |         return str(self.__operands)
40 | 
41 |     def pred(self):
42 |         return str(self.__pred)
43 | 


--------------------------------------------------------------------------------
/Solver/src/modifier.py:
--------------------------------------------------------------------------------
 1 | from inst import Inst
 2 | from dumper import dump
 3 | import enumerator
 4 | import sys
 5 | import logging
 6 | 
 7 | if __name__ == "__main__":
 8 |     logging.basicConfig(filename = sys.argv[3], level = logging.INFO)
 9 |     logging.debug("argv[1]: Disassemble file")
10 |     logging.debug("argv[2]: Arch")
11 |     logging.debug("argv[3]: Output file")
12 |     logging.debug("argv[4]: Instruction limit (default 100)")
13 |     sass = sys.argv[1]
14 |     arch = sys.argv[2]
15 |     if len(sys.argv) >= 5:
16 |         limit = sys.argv[4]
17 |     else:
18 |         limit = 100
19 |     count = 0
20 |     version = int(arch.split("_")[1])
21 |     with open(sass) as f:
22 |         for line in f:
23 |             pos = []
24 |             count += 1
25 |             if count == limit:
26 |                 break
27 |             line_split = line.split()
28 |             # Construct instruction structure
29 |             origin = Inst(line_split)
30 |             # Find the 64-bit encodings
31 |             base = int(origin.enc(), 16)
32 |             # Bit by bit xor, observe whether opcode changes and guess what this bit represent
33 |             for i in range(0, 64):
34 |                 mask = 2**i
35 |                 newcode = base ^ mask
36 |                 # Disassemble the new code
37 |                 dump_file = dump("0x{:016x}".format(newcode), arch)
38 |                 # Compare the disassemble to check which field changes: opcode, operand or modifer
39 |                 if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1:
40 |                     line = dump_file.split("\n")
41 |                     if version < 40:
42 |                         line_inst = line[1].split();
43 |                     else:
44 |                         line_inst = line[5].split();
45 |                     # [0]: header info, [1] instruction part
46 |                     line_inst.pop(0)
47 |                     # Parse the new generated disassembly
48 |                     inst = Inst(line_inst, raw = version > 40)
49 |                     if inst.modifier() != origin.modifier() and inst.op() == origin.op():
50 |                         if i not in pos:
51 |                             pos.append(i)
52 |             # Enumerate all modifiers
53 |             if len(pos) > 0:
54 |                 logging.info("%s modifier bits %s: ", origin.op(), pos);
55 |                 enumerator.enumerate(base, pos, arch)
56 | 


--------------------------------------------------------------------------------
/Solver/src/opcode.py:
--------------------------------------------------------------------------------
 1 | from inst import Inst
 2 | from dumper import dump
 3 | import sys
 4 | import logging
 5 | 
 6 | if __name__ == "__main__":
 7 |     logging.basicConfig(filename = sys.argv[3], level = logging.INFO)
 8 |     logging.debug("argv[1]: Disassemble file")
 9 |     logging.debug("argv[2]: Arch")
10 |     logging.debug("argv[3]: Output file")
11 |     logging.debug("argv[4]: Instruction limit (default 100)")
12 |     sass = sys.argv[1]
13 |     arch = sys.argv[2]
14 |     if len(sys.argv) >= 5:
15 |         limit = sys.argv[4]
16 |     else:
17 |         limit = 100
18 |     count = 0;
19 |     version = int(arch.split("_")[1])
20 |     with open(sass) as f:
21 |         for line in f:
22 |             pos = []
23 |             bits = 0x0
24 |             count += 1
25 |             if count == limit:
26 |                 break
27 |             line_split = line.split()
28 |             # Construct instruction structure
29 |             origin = Inst(line_split)
30 |             # Find the 64-bit encodings
31 |             base = int(origin.enc(), 16)
32 |             # Bit by bit xor, observe whether opcode changes and guess what this bit represent
33 |             for i in range(0, 64):
34 |                 mask = 2**i
35 |                 newcode = base ^ mask
36 |                 # Disassemble the new code
37 |                 dump_file = dump("0x{:016x}".format(newcode), arch)
38 |                 # Compare the disassemble to check which field changes: opcode, operand or modifer
39 |                 if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1:
40 |                     line = dump_file.split("\n")
41 |                     if version < 40:
42 |                         line_inst = line[1].split();
43 |                     else:
44 |                         line_inst = line[5].split();
45 |                     # [0]: header info, [1] instruction part
46 |                     line_inst.pop(0)
47 |                     # Parse the new generated disassembly
48 |                     inst = Inst(line_inst, raw = version > 40)
49 |                     # If opcode is changed, then this bit represent opcode, we find it!
50 |                     # LDG and TEX are the same instructions in fact 
51 |                     # RED and ATOM are the same instruction
52 |                     if inst.op() != origin.op() and not i in pos and not \
53 |                     (inst.op() == "LDG" and origin.op() == "TEX") and not \
54 |                     (inst.op() == "TEX" and origin.op() =="LDG") and not \
55 |                     (inst.op() == "RED" and origin.op() == "ATOM") and not \
56 |                     (inst.op() == "ATOM" and origin.op() == "RED"):
57 |                         logging.info("Opcode changes: %s => %s when bit [%d] is flipped from [%d]", \
58 |                             origin.op(), inst.op(), i, (base >> i) & 0x1)
59 |                         bits = bits | (((base >> i) & 0x1) << i)
60 |                         pos.append(i)
61 |             if len(pos) > 0:
62 |                 logging.info("0b{:064b}".format(bits) + ": %s opcode bits %s: ", origin.op(), pos);
63 | 


--------------------------------------------------------------------------------
/Solver/src/operand.py:
--------------------------------------------------------------------------------
  1 | from sets import Set
  2 | from inst import Inst
  3 | from dumper import dump
  4 | import sys
  5 | import logging
  6 | 
  7 | ops = dict()
  8 | 
  9 | def check_operand_types(inst):
 10 |     operand_types = ""
 11 |     operands = inst.operands().split();
 12 |     for operand in operands:
 13 |         key = operand[0]
 14 |         if key == 'R': # Register
 15 |             value = operand[1:]
 16 |             if value == 'Z' or value == 'N' or value == 'M' or \
 17 |                 value == 'P' or float(value).is_integer():
 18 |                 operand_types += 'R'
 19 |             else:
 20 |                 return 'X'
 21 |         elif key == 'P': # Predicate
 22 |             value = operand[1:]
 23 |             if float(value).is_integer():
 24 |                 operand_types += 'P'
 25 |             else:
 26 |                 return 'X'
 27 |         elif key == 'c': # Constant memory
 28 |             operand_types += 'C'
 29 |         elif key == '[': # Memory
 30 |             operand_types += 'M'
 31 |         elif key == 'S': # Special register
 32 |             operand_types += 'S'
 33 |         else:
 34 |             if len(operand) >= 2 and (operand[0:2] == "0x" or operand[0:3] == "-0x"): # Hex immediate
 35 |                 operand_types += 'I'
 36 |             elif float(operand).is_integer(): # Immediate value
 37 |                 operand_types += 'I'
 38 |             else:
 39 |                 return 'X'
 40 |     if inst.op() not in ops:
 41 |         ops[inst.op()] = set()
 42 |         ops[inst.op()].add(operand_types)
 43 |         return operand_types
 44 |     elif inst.op() in ops and operand_types not in ops[inst.op()]:
 45 |         ops[inst.op()].add(operand_types)
 46 |         return operand_types
 47 |     else:
 48 |         return 'X'
 49 | 
 50 | def change(inst, origin):
 51 |     if inst.op() != origin.op():
 52 |         return -1
 53 |     elif inst.modifier() != origin.modifier():
 54 |         return -2
 55 |     else:
 56 |         inst_operands = inst.operands().split()
 57 |         origin_operands = origin.operands().split()
 58 |         for i in range(len(origin_operands)):
 59 |             if (inst_operands[i] != origin_operands[i]):
 60 |                 return i
 61 |         return -3
 62 |             
 63 | if __name__ == "__main__":
 64 |     logging.basicConfig(filename = sys.argv[3], level = logging.INFO)
 65 |     logging.debug("argv[1]: Disassemble file")
 66 |     logging.debug("argv[2]: Arch")
 67 |     logging.debug("argv[3]: Output file")
 68 |     logging.debug("argv[4]: Instruction limit (default 100)")
 69 |     sass = sys.argv[1]
 70 |     arch = sys.argv[2]
 71 |     if len(sys.argv) >= 5:
 72 |         limit = sys.argv[4]
 73 |     else:
 74 |         limit = 100
 75 |     count = 0;
 76 |     version = int(arch.split("_")[1])
 77 |     with open(sys.argv[1]) as f:
 78 |         for line in f:
 79 |             pos = []
 80 |             count += 1
 81 |             if count == limit:
 82 |                 break
 83 |             line_split = line.split()
 84 |             # Construct instruction structure
 85 |             origin = Inst(line_split)
 86 |             # Find the 64-bit encodings
 87 |             base = int(origin.enc(), 16)
 88 |             origin_operand_types = check_operand_types(origin)
 89 |             if len(origin.operands()) and origin_operand_types.find('X') == -1:
 90 |                 pp = [[] for i in range(len(origin_operand_types))]
 91 |                 logging.info(origin.op() + " " + origin.modifier())
 92 |                 logging.info("0b{:064b}".format(base) + ": " + origin.operands())
 93 |                 for i in range(0, 64):
 94 |                     mask = 2**i
 95 |                     newcode = base ^ mask
 96 |                     # Disassemble the new code
 97 |                     dump_file = dump("0x{:016x}".format(newcode), arch)
 98 |                     if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1:
 99 |                         line = dump_file.split("\n")
100 |                         if version < 40:
101 |                             line_inst = line[1].split();
102 |                         else:
103 |                             line_inst = line[5].split();
104 |                         # [0]: header info, [1] instruction part
105 |                         line_inst.pop(0)
106 |                         inst = Inst(line_inst, raw = version > 40)
107 |                         pos = change(inst, origin) 
108 |                         if pos >= 0:
109 |                             pp[pos].append(i)
110 |                             logging.info("0b{:064b}".format(newcode) + ": " + inst.operands())
111 |                 logging.info("Operand combination types: %s", origin_operand_types)
112 |                 for i in range(0, len(pp)):
113 |                     logging.info("Operand type: %s", origin_operand_types[i])
114 |                     logging.info("Encoding: %s", pp[i])
115 | 


--------------------------------------------------------------------------------
/Solver/src/test.cu:
--------------------------------------------------------------------------------
// Kernel that never terminates: it takes two float references that it never
// reads or writes, and spins in an empty do-while loop.
// NOTE(review): presumably a minimal kernel to compile and disassemble for
// the solver -- confirm against how this file is used.
__global__ void test(float& a, float& b) {
  do {
  } while(1);  // the condition is the constant 1, so the loop never exits
}
5 | 


--------------------------------------------------------------------------------
/Solver/src/unique.py:
--------------------------------------------------------------------------------
 1 | from sets import Set
 2 | from inst import Inst
 3 | import subprocess 
 4 | import sys
 5 | 
 6 | if __name__ == "__main__":
 7 |     opset = Set([])
 8 |     with open(sys.argv[1]) as f:
 9 |         for line in f:
10 |             field = line.split()
11 |             inst = Inst(field, False)
12 |             if not inst.op() in opset:
13 |                 opset.add(inst.op())
14 |                 sys.stdout.write(line)
15 | 


--------------------------------------------------------------------------------