├── Assembler ├── KeplerAs │ ├── Install.sh │ ├── Install_locally.sh │ ├── LICENSE │ ├── MYMETA.json │ ├── MYMETA.yml │ ├── Makefile │ ├── Makefile.PL │ ├── README.md │ ├── bin │ │ └── KeplerAs.pl │ ├── blib │ │ ├── arch │ │ │ ├── .exists │ │ │ └── auto │ │ │ │ └── KeplerAs │ │ │ │ └── KeplerAs │ │ │ │ └── .exists │ │ ├── bin │ │ │ └── .exists │ │ ├── lib │ │ │ ├── KeplerAs │ │ │ │ ├── .exists │ │ │ │ ├── Cubin.pm │ │ │ │ ├── KeplerAs.pm │ │ │ │ └── KeplerAsGrammar.pm │ │ │ └── auto │ │ │ │ └── KeplerAs │ │ │ │ └── KeplerAs │ │ │ │ └── .exists │ │ ├── man1 │ │ │ └── .exists │ │ ├── man3 │ │ │ ├── .exists │ │ │ └── KeplerAs::KeplerAs.3pm │ │ └── script │ │ │ ├── .exists │ │ │ └── KeplerAs.pl │ ├── lib │ │ └── KeplerAs │ │ │ ├── Cubin.pm │ │ │ ├── KeplerAs.pm │ │ │ └── KeplerAsGrammar.pm │ └── pm_to_blib ├── MaxAs │ ├── Changes │ ├── Install.sh │ ├── LICENSE │ ├── MANIFEST │ ├── Makefile │ ├── Makefile.PL │ ├── README.md │ ├── bin │ │ └── maxas.pl │ ├── blib │ │ ├── arch │ │ │ ├── .exists │ │ │ └── auto │ │ │ │ └── MaxAs │ │ │ │ └── MaxAs │ │ │ │ └── .exists │ │ ├── bin │ │ │ └── .exists │ │ ├── lib │ │ │ ├── MaxAs │ │ │ │ ├── .exists │ │ │ │ ├── Cubin.pm │ │ │ │ ├── MaxAs.pm │ │ │ │ └── MaxAsGrammar.pm │ │ │ └── auto │ │ │ │ └── MaxAs │ │ │ │ └── MaxAs │ │ │ │ └── .exists │ │ ├── man1 │ │ │ └── .exists │ │ ├── man3 │ │ │ ├── .exists │ │ │ └── MaxAs::MaxAs.3pm │ │ └── script │ │ │ ├── .exists │ │ │ └── maxas.pl │ ├── cpanfile │ ├── lib │ │ └── MaxAs │ │ │ ├── Cubin.pm │ │ │ ├── MaxAs.pm │ │ │ └── MaxAsGrammar.pm │ ├── microbench │ │ ├── microbench.cpp │ │ ├── microbench.cu │ │ ├── microbench.sass │ │ ├── shared.pl │ │ ├── shared_lds.sass │ │ ├── shared_sts16.sass │ │ ├── throughput.pl │ │ ├── throughput.sass │ │ ├── throughput2.pl │ │ ├── throughput2.sass │ │ ├── throughput3.pl │ │ ├── throughput4.pl │ │ ├── throughput5.pl │ │ ├── xmad.pl │ │ └── xmad2.sass │ ├── pm_to_blib │ ├── sgemm │ │ ├── batched_gemm.xlsx │ │ ├── cublas_sgemm.ptx │ │ ├── new.cubin │ │ ├── 
sgemm.cpp │ │ ├── sgemm.cu │ │ ├── sgemm.pl │ │ ├── sgemm.sln │ │ ├── sgemm.vcxproj │ │ ├── sgemm128.sass │ │ ├── sgemm64.sass │ │ ├── sgemm_final_128.sass │ │ ├── sgemm_final_64.sass │ │ ├── sgemm_pre_128.sass │ │ ├── sgemm_pre_64.sass │ │ ├── sgemm_sm52_64.cubin │ │ └── sgemm_sm52_64_dump.sass │ └── t │ │ └── MaxAs-MaxAs.t └── PascalAs │ ├── Changes │ ├── Install.sh │ ├── LICENSE │ ├── MANIFEST │ ├── MYMETA.json │ ├── MYMETA.yml │ ├── Makefile │ ├── Makefile.PL │ ├── README.md │ ├── bin │ └── pascalas.pl │ ├── blib │ ├── arch │ │ ├── .exists │ │ └── auto │ │ │ ├── MaxAs │ │ │ └── MaxAs │ │ │ │ └── .exists │ │ │ └── PascalAs │ │ │ └── PascalAs │ │ │ └── .exists │ ├── bin │ │ └── .exists │ ├── lib │ │ ├── MaxAs │ │ │ ├── .exists │ │ │ ├── Cubin.pm │ │ │ ├── MaxAs.pm │ │ │ └── MaxAsGrammar.pm │ │ ├── PascalAs │ │ │ ├── .exists │ │ │ ├── Cubin.pm │ │ │ ├── PascalAs.pm │ │ │ └── PascalAsGrammar.pm │ │ └── auto │ │ │ ├── MaxAs │ │ │ └── MaxAs │ │ │ │ └── .exists │ │ │ └── PascalAs │ │ │ └── PascalAs │ │ │ └── .exists │ ├── man1 │ │ └── .exists │ ├── man3 │ │ ├── .exists │ │ ├── MaxAs::MaxAs.3pm │ │ └── PascalAs::PascalAs.3pm │ └── script │ │ ├── .exists │ │ ├── maxas.pl │ │ └── pascalas.pl │ ├── cpanfile │ ├── lib │ └── PascalAs │ │ ├── Cubin.pm │ │ ├── PascalAs.pm │ │ └── PascalAsGrammar.pm │ ├── microbench │ ├── microbench.cpp │ ├── microbench.cu │ ├── microbench.sass │ ├── shared.pl │ ├── shared_lds.sass │ ├── shared_sts16.sass │ ├── throughput.pl │ ├── throughput.sass │ ├── throughput2.pl │ ├── throughput2.sass │ ├── throughput3.pl │ ├── throughput4.pl │ ├── throughput5.pl │ ├── xmad.pl │ └── xmad2.sass │ ├── pm_to_blib │ ├── sgemm │ ├── batched_gemm.xlsx │ ├── cublas_sgemm.ptx │ ├── new.cubin │ ├── sgemm.cpp │ ├── sgemm.cu │ ├── sgemm.cubin │ ├── sgemm.pl │ ├── sgemm.sln │ ├── sgemm.vcxproj │ ├── sgemm128.sass │ ├── sgemm64.sass │ ├── sgemm_final_128.sass │ ├── sgemm_final_64.sass │ ├── sgemm_pre_128.sass │ ├── sgemm_pre_64.sass │ ├── sgemm_sm52_64.cubin │ └── 
sgemm_sm52_64_dump.sass │ └── t │ └── MaxAs-MaxAs.t ├── Kernel ├── Convolution │ ├── Kepler │ │ ├── Makefile │ │ ├── sconv.h │ │ ├── sconv_bprop.cu │ │ ├── sconv_bprop_C128_N128.cu │ │ ├── sconv_bprop_C128_N128.sass │ │ ├── sconv_bprop_C1_N64.cu │ │ ├── sconv_bprop_C1_N64.sass │ │ ├── sconv_bprop_C64_N64.cu │ │ ├── sconv_bprop_C64_N64.sass │ │ ├── sconv_fprop.cu │ │ ├── sconv_fprop_K128_N128.cu │ │ ├── sconv_fprop_K128_N128.sass │ │ ├── sconv_fprop_K64_N64.cu │ │ ├── sconv_fprop_K64_N64.sass │ │ ├── sconv_fprop_K64_N64_template.cubin │ │ ├── sconv_update.cu │ │ ├── sconv_update_C128_K128.cu │ │ └── sconv_update_C128_K128.sass │ ├── Maxwell │ │ ├── hconv_bprop_C1_N64.sass │ │ ├── hconv_updat_C128_K128.sass │ │ ├── hconv_updat_C128_K64.sass │ │ ├── hconv_xprop_X128_N128.sass │ │ ├── hconv_xprop_X128_N64.sass │ │ ├── hconv_xprop_X32_N128.sass │ │ ├── hconv_xprop_X64_N128.sass │ │ ├── hconv_xprop_X64_N64.sass │ │ ├── persistent_rnn_bprop.sass │ │ ├── persistent_rnn_fprop.sass │ │ ├── sconv_bprop_C1_N64.sass │ │ ├── sconv_updat_C128_K128.sass │ │ ├── sconv_updat_C128_K64.sass │ │ ├── sconv_xprop_X128_N128.sass │ │ ├── sconv_xprop_X128_N64.sass │ │ ├── sconv_xprop_X32_N128.sass │ │ ├── sconv_xprop_X64_N128.sass │ │ ├── sconv_xprop_X64_N64.sass │ │ ├── xconv_direct_updat_64x32.sass │ │ ├── xconv_direct_xprop_64x32.sass │ │ ├── xconv_winograd_2x2_3x3_32x32.sass │ │ ├── xconv_winograd_2x2_5x5_32x32.sass │ │ ├── xconv_winograd_3x3_2x2_32x32.sass │ │ ├── xconv_winograd_3x3_4x4_32x32.sass │ │ ├── xconv_winograd_4x4_3x3_32x32.sass │ │ ├── xconv_winograd_4x4_3x3_32x32_X.sass │ │ ├── xconv_winograd_4x4_3x3_32x32_common.sass │ │ └── xconv_xprop_common.sass │ └── Pascal │ │ ├── hconv_bprop_C1_N64.sass │ │ ├── hconv_updat_C128_K128.sass │ │ ├── hconv_updat_C128_K64.sass │ │ ├── hconv_xprop_X128_N128.sass │ │ ├── hconv_xprop_X128_N64.sass │ │ ├── hconv_xprop_X32_N128.sass │ │ ├── hconv_xprop_X64_N128.sass │ │ ├── hconv_xprop_X64_N64.sass │ │ ├── persistent_rnn_bprop.sass │ │ ├── 
persistent_rnn_fprop.sass │ │ ├── sconv_bprop_C1_N64.sass │ │ ├── sconv_updat_C128_K128.sass │ │ ├── sconv_updat_C128_K64.sass │ │ ├── sconv_xprop_X128_N128.sass │ │ ├── sconv_xprop_X128_N64.sass │ │ ├── sconv_xprop_X32_N128.sass │ │ ├── sconv_xprop_X64_N128.sass │ │ ├── sconv_xprop_X64_N64.sass │ │ ├── xconv_direct_updat_64x32.sass │ │ ├── xconv_direct_xprop_64x32.sass │ │ ├── xconv_winograd_2x2_3x3_32x32.sass │ │ ├── xconv_winograd_2x2_5x5_32x32.sass │ │ ├── xconv_winograd_3x3_2x2_32x32.sass │ │ ├── xconv_winograd_3x3_4x4_32x32.sass │ │ ├── xconv_winograd_4x4_3x3_32x32.sass │ │ ├── xconv_winograd_4x4_3x3_32x32_X.sass │ │ ├── xconv_winograd_4x4_3x3_32x32_common.sass │ │ └── xconv_xprop_common.sass └── SGEMM │ ├── Kepler │ ├── Makefile │ ├── README.md │ ├── sgemm_common_128x128.sass │ ├── sgemm_common_128x32.sass │ ├── sgemm_nn_128x128.cu │ ├── sgemm_nn_128x128.sass │ ├── sgemm_nn_128x128_vec.cu │ ├── sgemm_nn_128x128_vec.sass │ ├── sgemm_nt_128x128.cu │ ├── sgemm_nt_128x128.sass │ ├── sgemm_nt_128x128_vec.cu │ ├── sgemm_nt_128x128_vec.sass │ ├── sgemm_tn_128x128.cu │ ├── sgemm_tn_128x128.sass │ ├── sgemm_tn_128x128_vec.cu │ ├── sgemm_tn_128x128_vec.sass │ └── sgemm_tn_128x32.sass │ ├── Maxwell │ ├── hgemm_common_128x128.sass │ ├── hgemm_common_128x32.sass │ ├── hgemm_common_128x64.sass │ ├── hgemm_common_32x128.sass │ ├── hgemm_nn_128x128.sass │ ├── hgemm_nn_128x32.sass │ ├── hgemm_nn_128x64.sass │ ├── hgemm_nn_16x64.sass │ ├── hgemm_nn_32x128.sass │ ├── hgemm_nn_32x64.sass │ ├── hgemm_nt_128x128.sass │ ├── hgemm_nt_16x64.sass │ ├── hgemm_nt_32x128.sass │ ├── hgemm_nt_32x32.sass │ ├── hgemm_tn_128x128.sass │ ├── hgemm_tn_128x16.sass │ ├── hgemm_tn_128x32.sass │ ├── hgemm_tn_128x64.sass │ ├── sgemm_common_128x128.sass │ ├── sgemm_common_128x32.sass │ ├── sgemm_common_128x64.sass │ ├── sgemm_common_32x128.sass │ ├── sgemm_nn_128x128.sass │ ├── sgemm_nn_128x32.sass │ ├── sgemm_nn_128x64.sass │ ├── sgemm_nn_32x128.sass │ ├── sgemm_nn_rnn_128x32.sass │ ├── 
sgemm_nt_128x128.sass │ ├── sgemm_nt_32x128.sass │ ├── sgemm_rnn_bprop_common_128x32.sass │ ├── sgemm_rnn_common_128x32.sass │ ├── sgemm_tn_128x128.sass │ ├── sgemm_tn_128x32.sass │ ├── sgemm_tn_128x64.sass │ └── sgemm_tn_rnn_bprop_128x32.sass │ └── Pascal │ ├── hgemm_common_128x128.sass │ ├── hgemm_common_128x32.sass │ ├── hgemm_common_128x64.sass │ ├── hgemm_common_32x128.sass │ ├── hgemm_nn_128x128.sass │ ├── hgemm_nn_128x32.sass │ ├── hgemm_nn_128x64.sass │ ├── hgemm_nn_16x64.sass │ ├── hgemm_nn_32x128.sass │ ├── hgemm_nn_32x64.sass │ ├── hgemm_nt_128x128.sass │ ├── hgemm_nt_16x64.sass │ ├── hgemm_nt_32x128.sass │ ├── hgemm_nt_32x32.sass │ ├── hgemm_tn_128x128.sass │ ├── hgemm_tn_128x16.sass │ ├── hgemm_tn_128x32.sass │ ├── hgemm_tn_128x64.sass │ ├── sgemm_common_128x128.sass │ ├── sgemm_common_128x32.sass │ ├── sgemm_common_128x64.sass │ ├── sgemm_common_32x128.sass │ ├── sgemm_nn_128x128.sass │ ├── sgemm_nn_128x32.sass │ ├── sgemm_nn_128x64.sass │ ├── sgemm_nn_32x128.sass │ ├── sgemm_nn_rnn_128x32.sass │ ├── sgemm_nt_128x128.sass │ ├── sgemm_nt_32x128.sass │ ├── sgemm_rnn_bprop_common_128x32.sass │ ├── sgemm_rnn_common_128x32.sass │ ├── sgemm_tn_128x128.sass │ ├── sgemm_tn_128x32.sass │ ├── sgemm_tn_128x64.sass │ └── sgemm_tn_rnn_bprop_128x32.sass ├── README.md └── Solver ├── .gitignore ├── README.md ├── bin ├── generate_disassemble ├── modifier ├── opcode └── operand └── src ├── __init__.py ├── dumper.py ├── enumerator.py ├── inst.py ├── modifier.py ├── opcode.py ├── operand.py ├── ptxgen.pl ├── test.cu └── unique.py /Assembler/KeplerAs/Install.sh: -------------------------------------------------------------------------------- 1 | perl Makefile.PL 2 | make 3 | sudo make install 4 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/Install_locally.sh: -------------------------------------------------------------------------------- 1 | perl Makefile.PL 2 | make 3 | 4 | #configure the following variables in 
.bashrc; then source ~/.bashrc 5 | #export PERL5LIB=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/lib/:$PERL5LIB 6 | #export PATH=/home/xiuxia/PPoPP2017_artifact/KeplerAs/blib/script:$PATH 7 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Scott Gray 4 | Copyright (c) 2015~2016 Xiuxia Zhang 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 
23 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/MYMETA.json: -------------------------------------------------------------------------------- 1 | { 2 | "abstract" : "Assembler for NVIDIA Maxwell architecture", 3 | "author" : [ 4 | "Xiuxia Zhang " 5 | ], 6 | "dynamic_config" : 0, 7 | "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001", 8 | "license" : [ 9 | "mit" 10 | ], 11 | "meta-spec" : { 12 | "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", 13 | "version" : "2" 14 | }, 15 | "name" : "KeplerAs-KeplerAs", 16 | "no_index" : { 17 | "directory" : [ 18 | "t", 19 | "inc" 20 | ] 21 | }, 22 | "prereqs" : { 23 | "build" : { 24 | "requires" : { 25 | "ExtUtils::MakeMaker" : "0" 26 | } 27 | }, 28 | "configure" : { 29 | "requires" : { 30 | "ExtUtils::MakeMaker" : "0" 31 | } 32 | }, 33 | "runtime" : { 34 | "requires" : { 35 | "Carp" : "1.29", 36 | "Data::Dumper" : "2.145" 37 | } 38 | } 39 | }, 40 | "release_status" : "stable", 41 | "version" : "1.06" 42 | } 43 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/MYMETA.yml: -------------------------------------------------------------------------------- 1 | --- 2 | abstract: 'Assembler for NVIDIA Maxwell architecture' 3 | author: 4 | - 'Xiuxia Zhang ' 5 | build_requires: 6 | ExtUtils::MakeMaker: '0' 7 | configure_requires: 8 | ExtUtils::MakeMaker: '0' 9 | dynamic_config: 0 10 | generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001' 11 | license: mit 12 | meta-spec: 13 | url: http://module-build.sourceforge.net/META-spec-v1.4.html 14 | version: '1.4' 15 | name: KeplerAs-KeplerAs 16 | no_index: 17 | directory: 18 | - t 19 | - inc 20 | requires: 21 | Carp: '1.29' 22 | Data::Dumper: '2.145' 23 | version: '1.06' 24 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/Makefile.PL: 
-------------------------------------------------------------------------------- 1 | require 5.10.0; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'KeplerAs::KeplerAs', 7 | VERSION_FROM => 'lib/KeplerAs/KeplerAs.pm', # finds $VERSION 8 | EXE_FILES => ['bin/KeplerAs.pl'], 9 | PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, 10 | LICENSE => 'MIT', 11 | ($] >= 5.005 ? ## Add these new keywords supported since 5.005 12 | (ABSTRACT_FROM => 'lib/KeplerAs/KeplerAs.pm', # retrieve abstract from module 13 | AUTHOR => 'Xiuxia Zhang ') : ()), 14 | ); 15 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/README.md: -------------------------------------------------------------------------------- 1 | ## Kepler GPU assembler: KeplerAs 2 | 3 | Our KeplerAs is based on MaxAs (for Maxwell and Pascal GPUs). 4 | Kepler uses a completely different ISA encoding compared with Maxwell GPUs. 5 | We use the ISA encoding information cracked by our solver. 6 | 7 | Install.sh is a script to install the software. 
8 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/arch/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/arch/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/arch/auto/KeplerAs/KeplerAs/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/bin/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/bin/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/lib/KeplerAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/lib/KeplerAs/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/lib/auto/KeplerAs/KeplerAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/lib/auto/KeplerAs/KeplerAs/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/man1/.exists: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/man1/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/man3/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/man3/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/man3/KeplerAs::KeplerAs.3pm: -------------------------------------------------------------------------------- 1 | .\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29) 2 | .\" 3 | .\" Standard preamble: 4 | .\" ======================================================================== 5 | .de Sp \" Vertical space (when we can't use .PP) 6 | .if t .sp .5v 7 | .if n .sp 8 | .. 9 | .de Vb \" Begin verbatim text 10 | .ft CW 11 | .nf 12 | .ne \\$1 13 | .. 14 | .de Ve \" End verbatim text 15 | .ft R 16 | .fi 17 | .. 18 | .\" Set up some character translations and predefined strings. \*(-- will 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left 20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will 21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and 22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, 23 | .\" nothing in troff, for use with C<>. 24 | .tr \(*W- 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' 26 | .ie n \{\ 27 | . ds -- \(*W- 28 | . ds PI pi 29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch 30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch 31 | . ds L" "" 32 | . ds R" "" 33 | . ds C` "" 34 | . ds C' "" 35 | 'br\} 36 | .el\{\ 37 | . ds -- \|\(em\| 38 | . ds PI \(*p 39 | . ds L" `` 40 | . ds R" '' 41 | . ds C` 42 | . 
ds C' 43 | 'br\} 44 | .\" 45 | .\" Escape single quotes in literal strings from groff's Unicode transform. 46 | .ie \n(.g .ds Aq \(aq 47 | .el .ds Aq ' 48 | .\" 49 | .\" If the F register is turned on, we'll generate index entries on stderr for 50 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index 51 | .\" entries marked with X<> in POD. Of course, you'll have to process the 52 | .\" output yourself in some meaningful fashion. 53 | .\" 54 | .\" Avoid warning from groff about undefined register 'F'. 55 | .de IX 56 | .. 57 | .nr rF 0 58 | .if \n(.g .if rF .nr rF 1 59 | .if (\n(rF:(\n(.g==0)) \{ 60 | . if \nF \{ 61 | . de IX 62 | . tm Index:\\$1\t\\n%\t"\\$2" 63 | .. 64 | . if !\nF==2 \{ 65 | . nr % 0 66 | . nr F 2 67 | . \} 68 | . \} 69 | .\} 70 | .rr rF 71 | .\" ======================================================================== 72 | .\" 73 | .IX Title "KeplerAs::KeplerAs 3pm" 74 | .TH KeplerAs::KeplerAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation" 75 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes 76 | .\" way too many mistakes in technical documents. 
77 | .if n .ad l 78 | .nh 79 | .SH "NAME" 80 | KeplerAs::KeplerAs \- Assembler for NVIDIA Maxwell architecture 81 | .SH "SYNOPSIS" 82 | .IX Header "SYNOPSIS" 83 | .Vb 1 84 | \& KeplerAs.pl [opts] 85 | .Ve 86 | .SH "DESCRIPTION" 87 | .IX Header "DESCRIPTION" 88 | See the documentation at: https://github.com/NervanaSystems/KeplerAs 89 | .SH "SEE ALSO" 90 | .IX Header "SEE ALSO" 91 | See the documentation at: https://github.com/NervanaSystems/KeplerAs 92 | .SH "AUTHOR" 93 | .IX Header "AUTHOR" 94 | Scott Gray, 95 | .SH "COPYRIGHT AND LICENSE" 96 | .IX Header "COPYRIGHT AND LICENSE" 97 | The \s-1MIT\s0 License (\s-1MIT\s0) 98 | .PP 99 | Copyright (c) 2014 Scott Gray 100 | .PP 101 | Permission is hereby granted, free of charge, to any person obtaining a copy 102 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal 103 | in the Software without restriction, including without limitation the rights 104 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 105 | copies of the Software, and to permit persons to whom the Software is 106 | furnished to do so, subject to the following conditions: 107 | .PP 108 | The above copyright notice and this permission notice shall be included in 109 | all copies or substantial portions of the Software. 110 | .PP 111 | \&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 112 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 113 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 114 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 115 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 116 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 117 | THE SOFTWARE.\s0 118 | -------------------------------------------------------------------------------- /Assembler/KeplerAs/blib/script/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/blib/script/.exists -------------------------------------------------------------------------------- /Assembler/KeplerAs/pm_to_blib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/KeplerAs/pm_to_blib -------------------------------------------------------------------------------- /Assembler/MaxAs/Changes: -------------------------------------------------------------------------------- 1 | Revision history for Perl extension MaxAs::MaxAs. 
2 | 3 | 1.01 Thu Mar 26 17:09:57 2015 4 | - original Perl packaged version 5 | -------------------------------------------------------------------------------- /Assembler/MaxAs/Install.sh: -------------------------------------------------------------------------------- 1 | perl Makefile.PL 2 | make 3 | sudo make install 4 | -------------------------------------------------------------------------------- /Assembler/MaxAs/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Scott Gray 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Assembler/MaxAs/MANIFEST: -------------------------------------------------------------------------------- 1 | bin/maxas.pl 2 | Changes 3 | lib/MaxAs/Cubin.pm 4 | lib/MaxAs/MaxAs.pm 5 | lib/MaxAs/MaxAsGrammar.pm 6 | LICENSE 7 | Makefile.PL 8 | MANIFEST 9 | microbench/microbench.cpp 10 | microbench/microbench.cu 11 | microbench/microbench.sass 12 | microbench/shared.pl 13 | microbench/shared_lds.sass 14 | microbench/shared_sts16.sass 15 | microbench/throughput.pl 16 | microbench/throughput.sass 17 | microbench/throughput2.pl 18 | microbench/throughput2.sass 19 | microbench/throughput3.pl 20 | microbench/throughput4.pl 21 | microbench/throughput5.pl 22 | microbench/xmad.pl 23 | microbench/xmad2.sass 24 | README.md 25 | sgemm/batched_gemm.xlsx 26 | sgemm/cublas_sgemm.ptx 27 | sgemm/sgemm.cpp 28 | sgemm/sgemm.cu 29 | sgemm/sgemm.pl 30 | sgemm/sgemm.sln 31 | sgemm/sgemm.vcxproj 32 | sgemm/sgemm128.sass 33 | sgemm/sgemm64.sass 34 | sgemm/sgemm_final_128.sass 35 | sgemm/sgemm_final_64.sass 36 | sgemm/sgemm_pre_128.sass 37 | sgemm/sgemm_pre_64.sass 38 | t/MaxAs-MaxAs.t 39 | -------------------------------------------------------------------------------- /Assembler/MaxAs/Makefile.PL: -------------------------------------------------------------------------------- 1 | require 5.10.0; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'MaxAs::MaxAs', 7 | VERSION_FROM => 'lib/MaxAs/MaxAs.pm', # finds $VERSION 8 | EXE_FILES => ['bin/maxas.pl'], 9 | PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, 10 | LICENSE => 'MIT', 11 | ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 12 | (ABSTRACT_FROM => 'lib/MaxAs/MaxAs.pm', # retrieve abstract from module 13 | AUTHOR => 'Scott Gray ') : ()), 14 | ); 15 | -------------------------------------------------------------------------------- /Assembler/MaxAs/README.md: -------------------------------------------------------------------------------- 1 | # MaxAs 2 | Assembler for NVIDIA Maxwell architecture 3 | 4 | To install (system-wide): 5 | 6 | sudo cpanm git://github.com/NervanaSystems/maxas.git 7 | 8 | or 9 | 10 | perl Makefile.PL 11 | make 12 | sudo make install 13 | 14 | 15 | See wiki pages for more information: 16 | 17 | - [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction) 18 | - [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started) 19 | - [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes) 20 | - [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM) 21 | 22 | Related work with lots of additional shader assembly (sass) examples: 23 | 24 | - [NervanaGPU](https://github.com/NervanaSystems/nervanagpu) 25 | 26 | This project is released under the [MIT License](http://opensource.org/licenses/MIT). 
27 | 28 | -- Scott Gray 29 | -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/arch/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/arch/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/arch/auto/MaxAs/MaxAs/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/bin/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/bin/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/lib/MaxAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/lib/MaxAs/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/lib/auto/MaxAs/MaxAs/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/man1/.exists: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/man1/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/man3/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/man3/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/man3/MaxAs::MaxAs.3pm: -------------------------------------------------------------------------------- 1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13) 2 | .\" 3 | .\" Standard preamble: 4 | .\" ======================================================================== 5 | .de Sp \" Vertical space (when we can't use .PP) 6 | .if t .sp .5v 7 | .if n .sp 8 | .. 9 | .de Vb \" Begin verbatim text 10 | .ft CW 11 | .nf 12 | .ne \\$1 13 | .. 14 | .de Ve \" End verbatim text 15 | .ft R 16 | .fi 17 | .. 18 | .\" Set up some character translations and predefined strings. \*(-- will 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left 20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will 21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and 22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, 23 | .\" nothing in troff, for use with C<>. 24 | .tr \(*W- 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' 26 | .ie n \{\ 27 | . ds -- \(*W- 28 | . ds PI pi 29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch 30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch 31 | . ds L" "" 32 | . ds R" "" 33 | . ds C` "" 34 | . ds C' "" 35 | 'br\} 36 | .el\{\ 37 | . ds -- \|\(em\| 38 | . ds PI \(*p 39 | . ds L" `` 40 | . 
ds R" '' 41 | 'br\} 42 | .\" 43 | .\" Escape single quotes in literal strings from groff's Unicode transform. 44 | .ie \n(.g .ds Aq \(aq 45 | .el .ds Aq ' 46 | .\" 47 | .\" If the F register is turned on, we'll generate index entries on stderr for 48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index 49 | .\" entries marked with X<> in POD. Of course, you'll have to process the 50 | .\" output yourself in some meaningful fashion. 51 | .ie \nF \{\ 52 | . de IX 53 | . tm Index:\\$1\t\\n%\t"\\$2" 54 | .. 55 | . nr % 0 56 | . rr F 57 | .\} 58 | .el \{\ 59 | . de IX 60 | .. 61 | .\} 62 | .\" 63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). 64 | .\" Fear. Run. Save yourself. No user-serviceable parts. 65 | . \" fudge factors for nroff and troff 66 | .if n \{\ 67 | . ds #H 0 68 | . ds #V .8m 69 | . ds #F .3m 70 | . ds #[ \f1 71 | . ds #] \fP 72 | .\} 73 | .if t \{\ 74 | . ds #H ((1u-(\\\\n(.fu%2u))*.13m) 75 | . ds #V .6m 76 | . ds #F 0 77 | . ds #[ \& 78 | . ds #] \& 79 | .\} 80 | . \" simple accents for nroff and troff 81 | .if n \{\ 82 | . ds ' \& 83 | . ds ` \& 84 | . ds ^ \& 85 | . ds , \& 86 | . ds ~ ~ 87 | . ds / 88 | .\} 89 | .if t \{\ 90 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" 91 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' 92 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' 93 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' 94 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' 95 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' 96 | .\} 97 | . 
\" troff and (daisy-wheel) nroff accents 98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' 99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' 100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] 101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' 102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' 103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] 104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] 105 | .ds ae a\h'-(\w'a'u*4/10)'e 106 | .ds Ae A\h'-(\w'A'u*4/10)'E 107 | . \" corrections for vroff 108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' 109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' 110 | . \" for low resolution devices (crt and lpr) 111 | .if \n(.H>23 .if \n(.V>19 \ 112 | \{\ 113 | . ds : e 114 | . ds 8 ss 115 | . ds o a 116 | . ds d- d\h'-1'\(ga 117 | . ds D- D\h'-1'\(hy 118 | . ds th \o'bp' 119 | . ds Th \o'LP' 120 | . ds ae ae 121 | . ds Ae AE 122 | .\} 123 | .rm #[ #] #H #V #F C 124 | .\" ======================================================================== 125 | .\" 126 | .IX Title "MaxAs::MaxAs 3" 127 | .TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation" 128 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes 129 | .\" way too many mistakes in technical documents. 
130 | .if n .ad l 131 | .nh 132 | .SH "NAME" 133 | MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture 134 | .SH "SYNOPSIS" 135 | .IX Header "SYNOPSIS" 136 | .Vb 1 137 | \& maxas.pl [opts] 138 | .Ve 139 | .SH "DESCRIPTION" 140 | .IX Header "DESCRIPTION" 141 | See the documentation at: https://github.com/NervanaSystems/maxas 142 | .SH "SEE ALSO" 143 | .IX Header "SEE ALSO" 144 | See the documentation at: https://github.com/NervanaSystems/maxas 145 | .SH "AUTHOR" 146 | .IX Header "AUTHOR" 147 | Scott Gray, 148 | .SH "COPYRIGHT AND LICENSE" 149 | .IX Header "COPYRIGHT AND LICENSE" 150 | The \s-1MIT\s0 License (\s-1MIT\s0) 151 | .PP 152 | Copyright (c) 2014 Scott Gray 153 | .PP 154 | Permission is hereby granted, free of charge, to any person obtaining a copy 155 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal 156 | in the Software without restriction, including without limitation the rights 157 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 158 | copies of the Software, and to permit persons to whom the Software is 159 | furnished to do so, subject to the following conditions: 160 | .PP 161 | The above copyright notice and this permission notice shall be included in 162 | all copies or substantial portions of the Software. 163 | .PP 164 | \&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0 165 | \&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0, 166 | \&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. 
\s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0 167 | \&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0 168 | \&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0, 169 | \&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0 170 | \&\s-1THE\s0 \s-1SOFTWARE\s0. 171 | -------------------------------------------------------------------------------- /Assembler/MaxAs/blib/script/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/blib/script/.exists -------------------------------------------------------------------------------- /Assembler/MaxAs/cpanfile: -------------------------------------------------------------------------------- 1 | requires 'perl', '5.10.0'; 2 | 3 | requires 'Carp', '1.29'; 4 | requires 'Data::Dumper', '2.145'; 5 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/microbench.cpp: -------------------------------------------------------------------------------- 1 | // microbench.cpp : Defines the entry point for the console application. 
2 | // 3 | 4 | // nvcc -l cuda -o microbench microbench.cpp 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | CUcontext hContext = 0; 13 | 14 | #define CUDA_CHECK( fn ) do { \ 15 | CUresult status = (fn); \ 16 | if ( CUDA_SUCCESS != status ) { \ 17 | const char* errstr; \ 18 | cuGetErrorString(status, &errstr); \ 19 | printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ 20 | if (hContext) cuCtxDestroy(hContext); \ 21 | exit(EXIT_FAILURE); \ 22 | } \ 23 | } while (0) 24 | 25 | 26 | int main(int argc, char* argv[]) 27 | { 28 | //int iTest = 2896; 29 | //while (iTest < 0x7fff) 30 | //{ 31 | // int iResult = iTest * iTest; 32 | // float fTest = (float)iTest; 33 | // int fResult = (int)(fTest * fTest); 34 | 35 | // printf("i*i:%08x f*f:%08x\n", iResult, fResult); 36 | 37 | // iTest += 0x0800; 38 | //} 39 | //exit(0); 40 | 41 | char deviceName[32]; 42 | int devCount, ordinal, major, minor; 43 | CUdevice hDevice; 44 | 45 | // Initialize the Driver API and find a device 46 | CUDA_CHECK( cuInit(0) ); 47 | CUDA_CHECK( cuDeviceGetCount(&devCount) ); 48 | for (ordinal = 0; ordinal < devCount; ordinal++) 49 | { 50 | CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); 51 | CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); 52 | CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); 53 | CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); 54 | if (major >= 5 && minor >= 2) 55 | { 56 | printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); 57 | break; 58 | } 59 | } 60 | if (ordinal == devCount) 61 | { 62 | printf("No compute 5.0 device found, exiting.\n"); 63 | exit(EXIT_FAILURE); 64 | } 65 | 66 | // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing 67 | int internalTiming = 1; 68 | if (argc > 1) 69 | internalTiming 
= strcmp(argv[1], "i") == 0 ? 1 : 0; 70 | 71 | // Second command line arg is the number of blocks 72 | int blocks = 1; 73 | if (argc > 2) 74 | blocks = atoi(argv[2]); 75 | if (blocks < 1) 76 | blocks = 1; 77 | 78 | // Third command line arg is the number of threads 79 | int threads = 128; 80 | if (argc > 3) 81 | threads = atoi(argv[3]); 82 | if (threads > 1024 || threads < 32) 83 | threads = 128; 84 | threads &= -32; 85 | 86 | // Forth command line arg: 87 | double fops = 1.0; 88 | int lanes = 1; 89 | if (argc > 4) 90 | { 91 | if (internalTiming) 92 | { 93 | // The number of lanes to print for each warp 94 | lanes = atoi(argv[4]); 95 | if (lanes > 32 || lanes < 1) 96 | lanes = 1; 97 | } 98 | else 99 | // The number of floating point operations in a full kernel launch 100 | fops = atof(argv[4]); 101 | } 102 | 103 | // Fifth command line arg is the repeat count for benchmarking 104 | int repeat = 1; 105 | if (argc > 5) 106 | repeat = atoi(argv[5]); 107 | if (repeat > 1000 || repeat < 1) 108 | repeat = 1; 109 | 110 | // threads = total number of threads 111 | size_t size = sizeof(int) * threads * blocks; 112 | 113 | // Setup our input and output buffers 114 | int* dataIn = (int*)malloc(size); 115 | int* dataOut = (int*)malloc(size); 116 | int* clocks = (int*)malloc(size); 117 | memset(dataIn, 0, size); 118 | 119 | CUmodule hModule; 120 | CUfunction hKernel; 121 | CUevent hStart, hStop; 122 | CUdeviceptr devIn, devOut, devClocks; 123 | 124 | // Init our context and device memory buffers 125 | CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); 126 | CUDA_CHECK( cuMemAlloc(&devIn, size) ); 127 | CUDA_CHECK( cuMemAlloc(&devOut, size) ); 128 | CUDA_CHECK( cuMemAlloc(&devClocks, size) ); 129 | CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); 130 | CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); 131 | CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); 132 | 133 | CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); 134 | CUDA_CHECK( cuEventCreate(&hStop, 
CU_EVENT_BLOCKING_SYNC) ); 135 | 136 | // Load our kernel 137 | CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); 138 | CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); 139 | 140 | // Setup the params 141 | void* params[] = { &devOut, &devClocks, &devIn }; 142 | float ms = 0; 143 | 144 | // Warm up the clock (unless under nsight) 145 | if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER 146 | for (int i = 0; i < repeat; i++) 147 | CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); 148 | 149 | // Launch the kernel 150 | CUDA_CHECK( cuEventRecord(hStart, NULL) ); 151 | //CUDA_CHECK( cuProfilerStart() ); 152 | CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); 153 | //CUDA_CHECK( cuProfilerStop() ); 154 | CUDA_CHECK( cuEventRecord(hStop, NULL) ); 155 | CUDA_CHECK( cuEventSynchronize(hStop) ); 156 | CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); 157 | 158 | //CUDA_CHECK( cuCtxSynchronize() ); 159 | 160 | // Get back our results from each kernel 161 | CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); 162 | CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); 163 | 164 | // Cleanup and shutdown of cuda 165 | CUDA_CHECK( cuEventDestroy(hStart) ); 166 | CUDA_CHECK( cuEventDestroy(hStop) ); 167 | CUDA_CHECK( cuModuleUnload(hModule) ); 168 | CUDA_CHECK( cuMemFree(devIn) ); 169 | CUDA_CHECK( cuMemFree(devOut) ); 170 | CUDA_CHECK( cuMemFree(devClocks) ); 171 | CUDA_CHECK( cuCtxDestroy(hContext) ); 172 | hContext = 0; 173 | 174 | // When using just one block, print out the internal timing data 175 | if (internalTiming) 176 | { 177 | int count = 0, total = 0, min = 999999, max = 0; 178 | 179 | int* clocks_p = clocks; 180 | int* dataOut_p = dataOut; 181 | 182 | // Loop over and print results 183 | for (int blk = 0; blk < blocks; blk++) 184 | { 185 | float *fDataOut = reinterpret_cast(dataOut_p); 186 | 187 | for(int tid = 0; tid < threads; tid += 32) 188 
| { 189 | // Sometimes we want data on each thread, sometimes just one sample per warp is fine 190 | for (int lane = 0; lane < lanes; lane++) 191 | printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u 192 | 193 | count++; 194 | total += clocks_p[tid]; 195 | if (clocks_p[tid] < min) min = clocks_p[tid]; 196 | if (clocks_p[tid] > max) max = clocks_p[tid]; 197 | } 198 | clocks_p += threads; 199 | dataOut_p += threads; 200 | } 201 | printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); 202 | } 203 | else 204 | { 205 | // For more than one block we're testing throughput and want external timing data 206 | printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); 207 | } 208 | // And free up host memory 209 | free(dataIn); free(dataOut); free(clocks); 210 | 211 | return 0; 212 | } 213 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/microbench.cu: -------------------------------------------------------------------------------- 1 | 2 | // Note this file isn't configured to automatically compile 3 | 4 | #include 5 | #include 6 | 7 | // Build: 8 | // nvcc -l cuda -o microbench microbench.cpp 9 | // nvcc -arch sm_50 -cubin microbench.cu 10 | 11 | // Inspect a cubin (use nvdisasm from cuda 6.5 for best results): 12 | // maxas.pl -e microbench.cubin 13 | 14 | // Insert new sass into cubin 15 | // maxas.pl -i microbench.sass microbench.cubin 16 | 17 | // run it: 18 | // ./microbench 19 | 20 | // Use extern C so C++ doesn't mangle our kernel name 21 | extern "C" __global__ void microbench(int *out, int *clocks, int *in) 22 | { 23 | __shared__ int share[1024]; 24 | 25 | int tid = threadIdx.x; 26 | int bx = blockIdx.x; 27 | int by = blockIdx.y; 28 | 29 | int start = clock(); 30 | 31 | share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ 32 | 33 | 
__syncthreads(); 34 | 35 | int end = clock(); 36 | 37 | clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start; 38 | 39 | out[tid] = share[tid ^ 1]; 40 | } 41 | 42 | // A note about using the Cuda Runtime. 43 | // If that's your preference over the driver API then here's what you'd do: 44 | 45 | // In your project properties in the Cuda C/C++ panel: 46 | // -Set the "Keep Processed Files" (-keep) option 47 | // -Add a -v manually to the command line 48 | // If compiling on command line just add -keep -v options to nvcc. 49 | // Rebuild your solution and look in the log for these lines that follow the ptxas step: 50 | 51 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda 52 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" 53 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" 54 | 55 | // You just need to manually run these 3 commands (or add them to a build script) 56 | // after you've modified the cubin generated from the preceding ptxas command. 57 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you 58 | // build your project (or you could manually run the linker step as well). 59 | 60 | // Having done that you can call your kernel normally using the <<< >>> syntax. 61 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway. 62 | // With fatbin you can also keep non-maxwell optimized versions of your code. 
63 | 64 | 65 | // I just discovered this also works as a shortcut to the above: 66 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu 67 | 68 | // The cu kernel definitions above need to have empty bodies. 69 | // And, the cu file must be compiled to a lib separately before linking. -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/microbench.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | 3 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X 4 | 5 | 6 | blockDimX : c[0x0][0x08] 7 | blockDimY : c[0x0][0x0c] 8 | blockDimZ : c[0x0][0x10] 9 | gridDimX : c[0x0][0x14] 10 | gridDimY : c[0x0][0x18] 11 | gridDimZ : c[0x0][0x1c] 12 | 13 | param_out[0] : c[0x0][0x140] 14 | param_out[1] : c[0x0][0x144] 15 | param_clocks[0] : c[0x0][0x148] 16 | param_clocks[1] : c[0x0][0x14c] 17 | param_in[0] : c[0x0][0x150] 18 | param_in[1] : c[0x0][0x154] 19 | 20 | 21 | 22 | 23 | 0-1 : out<0-1> 24 | 2-3 : clocks<0-1> 25 | 4-5 : in<0-1> 26 | 6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x 27 | 28 | 29 | 30 | // Load in our params (not currently used below) 31 | --:-:-:-:1 MOV in0, param_in[0]; 32 | --:-:-:-:1 MOV in1, param_in[1]; 33 | 34 | // Get the first clock value 35 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 36 | 37 | // Get the threadId and blockId 38 | // Set the Read-After-Write dependency barrier 1 and 2 39 | --:-:1:-:1 S2R tid, SR_TID.X; 40 | // Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it 41 | --:-:2:-:2 S2R bid, SR_CTAID.X; 42 | 43 | 44 | // Get the second clock value 45 | // Wait on the dependency barriers that were set in the prior instruction 46 | // Stall 6 to allow CS2R time to complete before next instruction 47 | // CS2R takes a constant 6 clocks to complete unlike S2R which is a variable 22-44 
clocks 48 | // This stall count does not factor into the time calculation at all 49 | 03:-:-:-:6 CS2R clock2, SR_CLOCKLO; 50 | 51 | // Take the difference of clocks 52 | --:-:-:-:1 IADD clock1, clock2, -clock1; 53 | 54 | // Setup our output addresses 55 | // Stall your pipeline dependencies properly 56 | // Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code 57 | --:-:-:-:6 XMAD offset, bid, blockDimX, tid; 58 | 59 | // LEA is "load effective address" 60 | // The offset param is shifted left 2 and added to the pointers with 64bit math 61 | --:-:-:-:6 LEA clocks0.CC, offset, param_clocks[0], 2; 62 | --:-:-:-:1 LEA.HI.X clocks1, offset, param_clocks[1], RZ, 2; 63 | 64 | --:-:-:-:6 LEA out0.CC, offset, param_out[0], 2; 65 | --:-:-:-:1 LEA.HI.X out1, offset, param_out[1], RZ, 2; 66 | 67 | // Output the results. 68 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values 69 | --:-:-:-:1 STG.E [clocks], clock1; 70 | --:-:-:-:1 STG.E [out], offset; # use this to return whatever you like to inspect the results 71 | --:-:-:-:5 EXIT; 72 | 73 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/shared.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | print `maxas.pl -i shared_sts16.sass microbench.cubin`; 5 | 6 | exit if $?; 7 | 8 | print `Release\\microbench.exe i 1 64`; 9 | 10 | 11 | __END__ 12 | 13 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/shared_lds.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | // This is a 
simple micro bench to demonstrate the latency in loading SR_TID.X 13 | 14 | 15 | 16 | 0-3 : result, a, b, c 17 | 18 | 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20> 19 | 20 | 21 | 22 | // Load in our params 23 | --:-:1:-:1 S2R tid, SR_TID.X; 24 | --:-:2:-:1 S2R bid, SR_CTAID.X; 25 | 26 | --:-:-:-:1 MOV result, c[0x0][0x0]; 27 | --:-:-:-:1 MOV in, c[0x0][0x100]; 28 | 29 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 30 | --:-:-:-:1 MOV result, c[0x0][0x13c]; 31 | --:-:-:-:1 CS2R clock2, SR_CLOCKLO; 32 | 33 | --:-:-:-:1 MOV blockDim, c[0x0][0x8]; 34 | --:-:-:-:1 MOV out, c[0x0][0x140]; 35 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 36 | 37 | 38 | 39 | 40 | 41 | 42 | 03:-:-:-:1 LOP.AND tid3, tid, 3; 43 | --:-:-:-:1 LOP.AND tid7, tid, 7; 44 | --:-:-:-:1 LOP.AND tid96, tid, 96; 45 | --:-:-:-:1 LOP.AND tid128, tid, 128; 46 | 47 | // readAs = ((tid128 >> 4) | tid7) << 4 48 | --:-:-:-:1 SHR.U32 readAs, tid128, 4; 49 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 50 | --:-:-:-:1 SHL readAs, readAs, 4; 51 | 52 | // readBs = ((tid96 >> 3) | tid3) << 4 53 | --:-:-:-:1 SHR.U32 readBs, tid96, 3; 54 | --:-:-:-:1 LOP.OR readBs, readBs, tid3; 55 | #--:-:-:-:1 SHL readBs, readBs, 4; 56 | #--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 57 | 58 | 59 | 60 | 61 | 62 | 63 | #--:-:-:-:1 LDS.U.128 result, [readBs]; 64 | 65 | 66 | 67 | 68 | 01:-:-:-:1 IADD clock1, clock2, -clock1; 69 | 70 | 71 | --:-:-:-:1 XMAD tid, blockDim, bid, tid; 72 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 73 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 74 | --:-:-:Y:6 SHL tid, tid, 0x2; 75 | 76 | --:-:-:-:1 IADD clocks, clocks, tid; 77 | --:-:-:-:2 IADD out, out, tid; 78 | 79 | --:-:-:-:1 STG [clocks], clock1; 80 | --:-:-:-:1 STG [out], readBs; 81 | --:-:-:-:5 EXIT; 82 | 83 | 84 | 85 | --:-:-:-:4 LOP.AND tid32, tid, -32; 86 | 87 | --:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; 88 | 89 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 90 | --:-:-:-:1 
LDS.U.128 result, [tid32 + 4x<2048>]; 91 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 92 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 93 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 94 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 95 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 96 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 97 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 98 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 99 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 100 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 101 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 102 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 103 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 104 | --:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 105 | 106 | 107 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; 108 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; 109 | --:-:-:-:1 LOP.AND readAs, tid, 0x80; 110 | --:-:-:-:1 SHR.U32 readAs, readAs, 4; 111 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 112 | --:-:-:-:1 SHL readAs, readAs, 4; 113 | 114 | // readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; 115 | --:-:-:-:1 LOP.AND tid1, tid, 0x1; 116 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 117 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 118 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 119 | --:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 120 | 121 | 122 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/shared_sts16.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X 13 | 14 | 15 | 16 | 0-3 : result, a, b, c 17 | 18 | 4-40 : out, clocks, in, tid, bid, 
blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20> 19 | 20 | 21 | 22 | // Load in our params 23 | --:-:1:-:1 S2R tid, SR_TID.X; 24 | --:-:2:-:1 S2R bid, SR_CTAID.X; 25 | 26 | //--:-:-:-:1 MOV result, c[0x0][0x0]; 27 | //--:-:-:-:1 MOV in, c[0x0][0x100]; 28 | --:-:-:-:1 MOV result, 1; 29 | 30 | --:-:-:-:1 MOV blockDim, c[0x0][0x8]; 31 | --:-:-:-:1 MOV out, c[0x0][0x140]; 32 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 33 | 34 | 35 | // readAs = ((tid >> 1) & 7) << 4; 36 | 03:-:-:-:6 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 37 | --:-:-:-:6 SHL readAs, readAs, 3; 38 | 39 | // readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024; 40 | --:-:-:-:6 LOP.AND tid1, tid, 1; 41 | --:-:-:-:6 LOP.AND readBs, tid, 0x30; 42 | --:-:-:-:6 SHR.U32 readBs, readBs, 3; 43 | --:-:-:-:6 LOP.OR readBs, readBs, tid1; 44 | --:-:-:-:6 ISCADD readBs, readBs, 0, 3; 45 | 46 | 47 | 48 | ///--:-:-:-:1 STS [tid32], result; 49 | //--:-:-:-:1 STS.S16 [tid32 + 2x<32>], result; 50 | //--:-:1:-:2 LDS.U.64 result, [readBs]; 51 | 52 | --:-:-:-:0 CS2R clock1, SR_CLOCKLO; 53 | --:-:1:-:6 LDS.U.64 result, [readAs]; 54 | --:-:-:-:6 CS2R clock2, SR_CLOCKLO; 55 | 56 | 57 | 01:-:-:-:1 IADD clock1, clock2, -clock1; 58 | 59 | 60 | --:-:-:-:1 XMAD tid, blockDim, bid, tid; 61 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 62 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 63 | --:-:-:Y:6 SHL tid, tid, 0x2; 64 | 65 | --:-:-:-:1 IADD clocks, clocks, tid; 66 | --:-:-:-:2 IADD out, out, tid; 67 | 68 | --:-:-:-:1 STG [clocks], clock1; 69 | --:-:-:-:1 STG [out], result; 70 | --:-:-:-:5 EXIT; 71 | 72 | 73 | 74 | --:-:-:-:4 LOP.AND tid32, tid, -32; 75 | 76 | --:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; 77 | 78 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 79 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 80 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 81 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 82 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 83 | 
--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 84 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 85 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 86 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 87 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 88 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 89 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 90 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 91 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 92 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 93 | --:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 94 | 95 | 03:-:-:-:6 LOP.AND tid31, tid, 31; 96 | --:-:-:-:6 LOP.AND tid32, tid, 32; 97 | --:-:-:-:6 SHL tid32, tid32, 0x2; 98 | --:-:-:-:6 LOP.OR tid32, tid32, tid31; 99 | --:-:-:-:6 SHL tid32, tid32, 0x2; 100 | 101 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; 102 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; 103 | --:-:-:-:1 LOP.AND readAs, tid, 0x80; 104 | --:-:-:-:1 SHR.U32 readAs, readAs, 4; 105 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 106 | --:-:-:-:1 SHL readAs, readAs, 4; 107 | 108 | // readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; 109 | --:-:-:-:1 LOP.AND tid1, tid, 0x1; 110 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 111 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 112 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 113 | --:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 114 | 115 | 116 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/throughput.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $loopSize = 512; 5 | my $blocks = 32; 6 | my $loops = 10240000; 7 | my $fileName = 'throughput2.sass'; 8 | 9 | writeSassFile($fileName, $loops); 10 | 11 | #print `maxas.pl -p $fileName`; 12 | #exit; 13 | 14 | print `maxas.pl -i $fileName microbench.cubin`; 15 | exit if $?; 16 | 17 | foreach my $thread128 (2) 18 | { 19 | my $threads = 
# Write the maxas SASS template for the FFMA throughput kernel.
#
#   $filename - path of the .sass file to create (truncated if it exists)
#   $loops    - loop bound substituted for the %d in "MOV32I stop, %d"
#
# The here-doc is used as a printf format, so $loops is the only value
# substituted and any literal '%' added to the template must be written '%%'.
# The here-doc is single-quoted: the Perl inside the <CODE> section is written
# to the file verbatim (it emits the 512 unrolled FFMA lines when the template
# is processed), it is not run by this script.
#
# Dies with the file name and OS error if the file cannot be opened, written
# or closed.
sub writeSassFile
{
    my ($filename, $loops) = @_;

    # Three-argument open keeps the mode separate from the file name, so a
    # name starting with '>' or ending in '|' cannot change what open() does.
    open(my $fh, '>', $filename) or die "$filename: $!";

    printf {$fh} <<'EOF', $loops or die "$filename: $!";
# Kernel: microbench

<REGISTER_MAPPING>

    0-10 : result, r1, r2, r3
    20-27 ~ count, stop

</REGISTER_MAPPING>

--:-:-:-:1 MOV count, RZ;
--:-:-:-:1 MOV32I stop, %d;
--:-:-:-:1 MOV32I r1, 1.0;
--:-:-:-:1 MOV32I r2, 1.0;
--:-:-:-:4 MOV32I r3, 1.0;

LOOP:

--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT;
--:-:-:-:1 IADD count, count, 1;

<CODE>
    my $out;

    foreach my $i (0 .. 511)
    {
        my $yield = ($i + 32) & 63 ? '-' : 'Y';

        my $stall = $i == 511 ? 0 : 1;

        $out .= "--:-:-:$yield:$stall FFMA result, r1, r2, r3;\n";
    }
    return $out;
</CODE>

--:-:-:Y:5 @P0 BRA LOOP;
--:-:-:-:5 EXIT;
EOF

    # Buffered write errors (e.g. disk full) only surface at close time.
    close($fh) or die "$filename: $!";
}
1700 GFlops on GM107. 30 | // You can tweak it to see how register bank conflicts or different control codes 31 | // affect performance. 32 | // With throughput.pl you can pass params to this code and do some autotuning. 33 | LOOP: 34 | 35 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, 0x19000, PT; 36 | --:-:-:-:1 IADD count, count, 0x1; 37 | 38 | 39 | my $out; 40 | 41 | foreach my $i (0..511) #511 42 | { 43 | my $y = ($i + 32) & 63 ? '-' : 'Y'; 44 | 45 | $out .= qq| 46 | --:-:-:$y:1 FFMA R0, R1, R2, R3;|; #c[0x0][$c] 47 | } 48 | return $out; 49 | 50 | 51 | --:-:-:Y:5 @P0 BRA LOOP; 52 | 53 | --:-:-:-:5 EXIT; 54 | 55 | 56 | 57 | 58 | open my $fh, 'params.txt'; 59 | my $line = <$fh>; 60 | close $fh; 61 | my ($r1, $r2, $r3) = split "\t", $line; 62 | 63 | 80-95 : out, clocks, in, tid, clock1, clock2, result 64 | 65 | 66 | --:-:1:-:1 S2R tid, SR_TID.X; 67 | --:-:-:-:1 MOV out, c[0x0][0x140]; 68 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 69 | 01:-:-:-:1 MOV in, c[0x0][0x148]; 70 | 71 | 72 | 73 | --:-:-:-:1 MOV32I f0, 0x3f800000; 74 | --:-:-:-:1 MOV32I f1, 0x3f800000; 75 | --:-:-:-:1 MOV32I f2, 0x3f800000; 76 | --:-:-:-:5 MOV32I f3, 0x3f800000; 77 | 78 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 79 | 80 | 81 | --:-:-:-:1 CS2R clock2, SR_CLOCKLO; 82 | 83 | --:-:-:-:6 MOV32I result, 0x457; 84 | --:-:-:-:1 IADD clock1, clock2, -clock1; 85 | 86 | 87 | --:-:-:-:6 SHL tid, tid, 0x2; 88 | --:-:-:-:1 IADD clocks, clocks, tid; 89 | --:-:-:-:1 IADD out, out, tid; 90 | 91 | --:-:-:-:1 STG [clocks], clock1; 92 | --:-:-:-:1 STG [out], R24; 93 | 94 | 95 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/throughput2.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | my %p; 4 | 5 | $p{N} = 8192; 6 | $p{blocking} = 8; 7 | $p{unroll} = 8; 8 | $p{threads} = 64; #256 9 | 10 | $p{csize} = $p{blocking} * $p{blocking}; 11 | $p{loopSize} = $p{unroll} * $p{csize}; 12 | $p{width} = 
# Write the maxas SASS template for the 8x8 register-blocked FFMA kernel.
#
#   $filename - path of the .sass file to create (truncated if it exists)
#   $loopSize - accepted for the caller's convenience but not used: the
#               template hard-codes 8 unrolled iterations of 64 FFMAs
#   $loops    - loop bound substituted for the %d in "MOV32I stop, %d"
#
# The here-doc is used as a printf format, so $loops is the only value
# substituted and every literal '%' in the template is written '%%'
# (e.g. "my %%insert" is emitted as "my %insert").  The here-doc is
# single-quoted: the Perl inside the <CODE> sections is written to the file
# verbatim, it is not run by this script.
#
# Dies with the file name and OS error if the file cannot be opened, written
# or closed.
sub writeSassFile
{
    my ($filename, $loopSize, $loops) = @_;

    # Three-argument open keeps the mode separate from the file name, so a
    # name starting with '>' or ending in '|' cannot change what open() does.
    open(my $fh, '>', $filename) or die "$filename: $!";

    printf {$fh} <<'END_SASS', $loops or die "$filename: $!";
# Kernel: microbench

<REGISTER_MAPPING>

     3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67>
     7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67>
     1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67>
     5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67>
    35,34,43,42,51,50,59,58 : cx64y<00-03|64-67>
    39,38,47,46,55,54,63,62 : cx65y<00-03|64-67>
    33,32,41,40,49,48,57,56 : cx66y<00-03|64-67>
    37,36,45,44,53,52,61,60 : cx67y<00-03|64-67>

    64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67>
    80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67>

    0-127 : r<0-127>

    100-101 : count, stop

    //102-112 ~ readAs, readBs, writeS

</REGISTER_MAPPING>

--:-:-:-:1 MOV count, RZ;
--:-:-:-:1 MOV32I stop, %d;
//--:-:-:-:1 MOV writeS, RZ;
//--:-:-:-:1 MOV readAs, RZ;
//--:-:-:-:1 MOV readBs, RZ;

<CODE>
    return join '', map "--:-:-:-:1 MOV32I r$_, 1.0;\n", 0..95;
</CODE>

LOOP:

--:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT;
--:-:-:-:1 IADD count, count, 1;

<CODE>
    my $out;


    my @cOrder;
    #my @swirl = ([0,1],[0,0],[2,0],[2,1]);
    my @swirl = ([2,0],[2,1],[0,1],[0,0]);
    #my @swirl = ([0,1],[0,0],[1,0],[1,1]);
    my @xVals = (0,1,64,65);
    #my @xVals = (0,2,64,66);

    my @yVals = (0,2,64,66);

    foreach my $y (@yVals)
    {
        foreach my $x (@xVals)
        {
            push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl;
        }
        @xVals = reverse @xVals;
    }

    foreach my $j (0..7)
    {
        my $odd = $j & 1;
        my $nOdd = !$odd + 0;

        my %%insert;

        #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6;

        $insert{c62} =
            "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" .
            "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" .
            "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" .
            "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" .
            "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/;
            my $ins = $insert{"c$c"} || '';
            my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins ||
            my $yield = $c == 32 ? 'Y' : '-';
            my $wait = '--'; #$c ? '--' : '01';

            $out .= "$wait:-:-:$yield:$stall FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins";
        }
    }
    return $out;
</CODE>

--:-:-:Y:5 @P0 BRA LOOP;
--:-:-:-:5 EXIT;
END_SASS

    # Buffered write errors (e.g. disk full) only surface at close time.
    close($fh) or die "$filename: $!";
}
0 : 1; 29 | 30 | #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; 31 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 32 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 33 | #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; 34 | #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; 35 | 36 | #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; 37 | #$out .= "--:-:-:-:1 MOV result, RZ;\n"; 38 | 39 | $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; 40 | #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; 41 | #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; 42 | } 43 | return $out; 44 | 45 | 46 | --:-:-:Y:5 @P0 BRA LOOP; 47 | --:-:-:-:5 EXIT; 48 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/throughput3.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my %data; 5 | 6 | foreach my $thread128 (1 .. 8) 7 | { 8 | foreach my $size64 (8 .. 16) 9 | { 10 | my $loopSize = $size64 * 64; 11 | my $loops = int(2 * 1638400 / ($size64 * $thread128)); 12 | 13 | my $blocks = 16; 14 | my $threads = $thread128 * 128; 15 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 16 | my $fileName = 'throughput2.sass'; 17 | 18 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops; 19 | #next; 20 | 21 | writeSassFile($fileName, $loopSize, $loops); 22 | 23 | `maxas.pl -i $fileName microbench.cubin`; 24 | 25 | exit if $?; 26 | 27 | my $data = `Release\\microbench.exe e $blocks $threads $fops`; 28 | 29 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 30 | 31 | printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 32 | 33 | push @{$data{$loopSize}}, $gflops; 34 | } 35 | } 36 | print join("\t", 'size', 1 .. 
8), "\n"; 37 | foreach my $loopSize (sort {$a <=> $b} keys %data) 38 | { 39 | print join("\t", $loopSize, @{$data{$loopSize}}), "\n"; 40 | } 41 | 42 | exit; 43 | 44 | sub writeSassFile 45 | { 46 | my ($filename, $loopSize, $loops) = @_; 47 | 48 | open my $fh, ">$filename" or die "$filename: $!"; 49 | 50 | printf $fh <<'EOF', $loops, $loopSize, $loopSize; 51 | # Kernel: microbench 52 | 53 | 54 | 55 | 0-10 : result, r1, r2, r3, count, stop 56 | 57 | 58 | 59 | --:-:-:-:1 MOV count, RZ; 60 | --:-:-:-:1 MOV32I stop, %d; 61 | --:-:-:-:1 MOV32I r1, 1.0; 62 | --:-:-:-:1 MOV32I r2, 1.0; 63 | --:-:-:-:4 MOV32I r3, 1.0; 64 | 65 | LOOP: 66 | 67 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 68 | --:-:-:-:1 IADD count, count, 1; 69 | 70 | 71 | my $out; 72 | 73 | foreach my $i (0 .. %d) 74 | { 75 | my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y'; 76 | 77 | $out .= "--:-:-:$y:1 FFMA result, r1, r2, r3;\n"; 78 | } 79 | return $out; 80 | 81 | 82 | --:-:-:Y:5 @P0 BRA LOOP; 83 | --:-:-:-:5 EXIT; 84 | EOF 85 | 86 | close $fh; 87 | } 88 | 89 | __END__ 90 | 91 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/throughput4.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $loopSize = 512; 5 | my $blocks = 64; 6 | my $loops = 102400; 7 | my $fileName = 'throughput2.sass'; 8 | 9 | writeSassFile($fileName, $loops); 10 | 11 | #print `maxas.pl -p $fileName`; 12 | #exit; 13 | 14 | print `maxas.pl -i $fileName microbench.cubin`; 15 | exit if $?; 16 | 17 | foreach my $thread128 (4) 18 | { 19 | my $threads = $thread128 * 128; 20 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 21 | 22 | print "./microbench e $blocks $threads $fops\n\n"; 23 | my $data = `./microbench e $blocks $threads $fops`; 24 | exit($?) 
if $?; 25 | 26 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 27 | 28 | printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0; 29 | } 30 | 31 | exit; 32 | 33 | sub writeSassFile 34 | { 35 | my ($filename, $loops) = @_; 36 | 37 | open my $fh, ">$filename" or die "$filename: $!"; 38 | 39 | printf $fh <<'EOF', $loops; 40 | # Kernel: microbench 41 | 42 | 43 | 44 | 0-10 : result, r1, r2, r3 45 | 20-27 ~ count, stop 46 | 47 | 48 | 49 | --:-:-:-:1 MOV count, RZ; 50 | --:-:-:-:1 MOV32I stop, %d; 51 | --:-:-:-:1 MOV32I r1, 1.0; 52 | --:-:-:-:1 MOV32I r2, 1.0; 53 | --:-:-:-:4 MOV32I r3, 1.0; 54 | 55 | LOOP: 56 | 57 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 58 | --:-:-:-:1 IADD count, count, 1; 59 | 60 | 61 | my $out; 62 | 63 | foreach my $i (0 .. 511) 64 | { 65 | my $yield = ($i + 32) & 63 ? '-' : 'Y'; 66 | 67 | my $stall = $i == 511 ? 0 : 1; 68 | 69 | #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; 70 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 71 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 72 | #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; 73 | #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; 74 | 75 | #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; 76 | #$out .= "--:-:-:-:1 MOV result, RZ;\n"; 77 | 78 | $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; 79 | #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; 80 | #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; 81 | } 82 | return $out; 83 | 84 | 85 | --:-:-:Y:5 @P0 BRA LOOP; 86 | --:-:-:-:5 EXIT; 87 | EOF 88 | 89 | close $fh; 90 | } 91 | 92 | __END__ 93 | 94 | VMAD.U8.U8 95 | 96 | dddd 2655 / 4968 = 53.4% 97 | 1d1d 4594 / 4968 = 92.4% 98 | 11d 4746 / 4968 = 95.5% 99 | 111d 4841 / 4968 = 97.4% 100 | 101 | block context switches are a little more expensive than thread context switches 102 | 103 | stall codes: 104 | 105 | f : 13 clocks 106 | e : 8 clocks 107 | d : 6 clocks 108 | c : 8 clocks, no yield 109 | b 
: 11 clocks 110 | a : 10 clocks 111 | 9 : 9 clocks 112 | 8 : 8 clocks 113 | 7 : 7 clocks 114 | 6 : 6 clocks 115 | 5 : 5 clocks 116 | 4 : 4 clocks 117 | 3 : 3 clocks 118 | 2 : 2 clocks 119 | 1 : 1 clocks, no yield 120 | 0 : 0 clocks, no yield, dual issue -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/throughput5.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | my %p; 4 | 5 | $p{N} = 8192; 6 | $p{blocking} = 8; 7 | $p{unroll} = 8; 8 | $p{threads} = 64; #256 9 | 10 | $p{csize} = $p{blocking} * $p{blocking}; 11 | $p{loopSize} = $p{unroll} * $p{csize}; 12 | $p{width} = sqrt($p{csize} * $p{threads}); 13 | $p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); 14 | $p{loops} = $p{N} / $p{unroll}; 15 | $p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; 16 | 17 | my $fileName = 'throughput2.sass'; 18 | 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); 20 | 21 | #print join("\t", @params), "\n"; 22 | #print join("\t", @p{@params}), "\n"; 23 | 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; 25 | 26 | writeSassFile($fileName, $p{loopSize}, $p{loops}); 27 | 28 | #print `maxas.pl -p $fileName`; 29 | #exit; 30 | 31 | print `maxas.pl -i $fileName microbench.cubin`; 32 | 33 | exit if $?; 34 | 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; 36 | 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 38 | 39 | print $data; 40 | 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 42 | 43 | 44 | 45 | 46 | sub writeSassFile 47 | { 48 | my ($filename, $loopSize, $loops) = @_; 49 | 50 | open my $fh, ">$filename" or die "$filename: $!"; 51 | 52 | printf $fh <<'END_SASS', $loops; 53 | # Kernel: microbench 54 | 55 | 56 | 57 | 1, 9, 2,10,17,25,18,26 : cy0x<0-7> 58 | 5,13, 6,14,21,29,22,30 : cy1x<0-7> 59 | 3,11, 0, 8,19,27,16,24 : cy2x<0-7> 60 | 
7,15, 4,12,23,31,20,28 : cy3x<0-7> 61 | 35,43,32,40,51,59,48,56 : cy4x<0-7> 62 | 39,47,36,44,55,63,52,60 : cy5x<0-7> 63 | 33,41,34,42,49,57,50,58 : cy6x<0-7> 64 | 37,45,38,46,53,61,54,62 : cy7x<0-7> 65 | 66 | 64-71 : j0Ax<0-3>, j0By<0-3> 67 | 72-79 : j1Ax<0-3>, j1By<0-3> 68 | 69 | 0-79 : r<0-79> 70 | 71 | 100-101 : count, stop 72 | 73 | //102-112 ~ readAs, readBs, writeS 74 | 75 | 76 | 77 | --:-:-:-:1 MOV count, RZ; 78 | --:-:-:-:1 MOV32I stop, %d; 79 | //--:-:-:-:1 MOV writeS, RZ; 80 | //--:-:-:-:1 MOV readAs, RZ; 81 | //--:-:-:-:1 MOV readBs, RZ; 82 | 83 | 84 | return join '', map "--:-:-:-:1 MOV r$_, RZ;\n", 0..63; 85 | 86 | 87 | 88 | return join '', map "--:-:-:-:1 MOV32I r$_, 0x00010001;\n", 64..79; 89 | 90 | 91 | LOOP: 92 | 93 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 94 | --:-:-:-:1 IADD count, count, 1; 95 | 96 | 97 | my $out; 98 | 99 | my @swirl1 = ([0,0],[0,4],[4,4],[4,0]); 100 | my @swirl2 = ([0,0],[1,0],[1,1],[0,1]); 101 | my @swirl3 = ([0,2],[2,2],[2,0],[0,0]); 102 | 103 | my @cOrder; 104 | foreach my $s1 (@swirl1) 105 | { 106 | foreach my $s2 (@swirl2) 107 | { 108 | foreach my $s3 (@swirl3) 109 | { 110 | push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]]; 111 | } 112 | } 113 | } 114 | 115 | foreach my $j (0..7) 116 | { 117 | my $odd = $j & 1; 118 | my $nOdd = !$odd + 0; 119 | 120 | my %%insert; 121 | 122 | #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; 123 | 124 | $insert{c62} = 125 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 126 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 127 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 128 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 129 | "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; 130 | 131 | foreach my $c (0 .. 63) 132 | { 133 | my ($x,$y) = @{$cOrder[$c]}; 134 | my $ins = $insert{"c$c"} || ''; 135 | my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || 136 | my $yield = $c == 32 ? 'Y' : '-'; 137 | my $wait = '--'; #$c ? 
'--' : '01'; 138 | 139 | my $xReg = $x >> 1; 140 | my $yReg = $y >> 1; 141 | my $xPart = $x & 1 ? '.H1' : ''; 142 | my $yPart = $y & 1 ? '.H1' : ''; 143 | 144 | $out .= sprintf "$wait:-:-:$yield:$stall XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x, $odd,$xReg,$xPart, $odd,$yReg,$yPart, $y,$x, $ins; 145 | } 146 | } 147 | return $out; 148 | 149 | 150 | --:-:-:Y:5 @P0 BRA LOOP; 151 | --:-:-:-:5 EXIT; 152 | END_SASS 153 | 154 | close $fh; 155 | } 156 | 157 | __END__ 158 | 159 | my %%insert = ( 160 | c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", 161 | c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", 162 | c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", 163 | c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", 164 | ); -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/xmad.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | print `maxas.pl -i xmad2.sass microbench.cubin`; 5 | 6 | exit if $?; 7 | 8 | print `./microbench i 1 128`; 9 | 10 | 11 | __END__ 12 | 13 | -------------------------------------------------------------------------------- /Assembler/MaxAs/microbench/xmad2.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:8:0 9 | # 1:0x148:8:0 10 | # 2:0x150:8:0 11 | # 12 | # Instructions: 13 | 14 | 15 | blockDimX : c[0x0][0x8] 16 | blockDimY : c[0x0][0xc] 17 | blockDimZ : c[0x0][0x10] 18 | gridDimX : c[0x0][0x14] 19 | gridDimY : c[0x0][0x18] 20 | gridDimZ : c[0x0][0x1c] 21 | 22 | param_out[0] : c[0x0][0x140] 23 | param_out[1] : c[0x0][0x144] 24 | param_clocks[0] : c[0x0][0x148] 25 | param_clocks[1] : c[0x0][0x14c] 26 | param_in[0] : c[0x0][0x150] 27 | param_in[1] : c[0x0][0x154] 
28 | 29 | 30 | 31 | 32 | 0-1 : out<0-1> 33 | 2-3 : clocks<0-1> 34 | 4-15 : result, result2, tid, bid, blockDim, clock1, clock2, scale, s 35 | 16-24 : a, b, c, x 36 | 37 | 38 | 39 | // Load in our params 40 | --:-:-:-:1 MOV out0, param_out[0]; 41 | --:-:-:-:1 MOV out1, param_out[1]; 42 | --:-:-:-:1 MOV clocks0, param_clocks[0]; 43 | --:-:-:-:1 MOV clocks1, param_clocks[1]; 44 | //--:-:-:-:1 MOV in, c[0x0][0x148]; 45 | --:-:-:-:1 MOV blockDim, blockDimX; 46 | 47 | --:-:-:-:1 PSETP.AND.AND P0, PT, !PT, PT, PT; 48 | 49 | --:-:-:-:6 MOV32I result, 0xffffffff; 50 | --:-:-:-:6 MOV32I result2, 0x0; 51 | --:-:-:-:1 MOV32I a, 1; 52 | --:-:-:-:1 MOV32I b, 1; 53 | --:-:-:-:6 MOV32I c, 0x0; 54 | 55 | // (127 - scale) << 23 56 | //--:-:-:-:6 MOV32I scale, 28; 57 | //--:-:-:-:6 IADD scale, -scale, 127; 58 | //--:-:-:-:6 SHL scale, scale, 23; 59 | 60 | 61 | //--:-:-:-:6 MOV32I c, 0x4f765432; 62 | 63 | //--:-:1:-:2 LDG.CI.128 a, [in]; 64 | 65 | //01:-:-:-:6 VMAD.S16.S16 result, a, b, c; 66 | 67 | //--:-:-:-:6 MOV result, a; 68 | 69 | // a >> 16 | (b & 0xffff0000) 70 | 71 | //--:-:-:-:6 SHR.U32 result, a, 16; 72 | //--:-:-:-:6 LOP3.LUT result, result, b, c, 0xf8; 73 | 74 | //--:-:-:-:6 I2I.S32.S16 result, a.H1; 75 | 76 | //--:-:-:Y:d IADD result.CC, a, -c; 77 | //--:-:-:Y:2 IADD.X result2, b, -RZ; 78 | 79 | //--:-:-:-:6 SHR result, a, 1; 80 | 81 | //--:-:-:-:6 BFI result, b, 0x1010, a; 82 | 83 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 84 | 85 | //--:-:-:-:6 XMAD.S16.S16 c, a, b, RZ; 86 | //--:-:-:-:6 ISET.LT.AND s, c, RZ, PT; 87 | //--:-:-:-:6 IADD result.CC, c, result; 88 | //--:-:-:-:6 IADD.X result2, s, result2; 89 | 90 | //--:-:-:-:6 XMAD.S16.S16 result.CC, a, b, result; 91 | //--:-:-:-:6 IADD.X result2, result2, RZ; 92 | 93 | //--:-:-:-:6 SHF.R.S64 result, result, 1, result2; 94 | //--:-:-:-:6 MOV32I result2, 0; 95 | 96 | --:-:-:-:f LOP.AND.NZ P0, RZ, result, 1; 97 | 98 | --:-:-:-:6 @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result; 99 | 100 | //--:-:1:-:d I2F.F32.S32 result2, a; 
101 | //01:-:-:-:6 FMUL result2, result2, scale; 102 | //01:-:2:-:d F2I.S32.F32 result, result2; 103 | 104 | 02:-:-:-:6 CS2R clock2, SR_CLOCKLO; 105 | 106 | //F2I = "^$pred?F2I$ftz$x2x$round $r0, $cr20;" 107 | //I2F = "^$pred?I2F$x2x$rnd $r0, $cr20;" 108 | //x2x = "\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)" 109 | //rnd = "(?:\.(?RN|RM|RP|RZ))?" 110 | //round = "(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?" 111 | //r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 112 | //r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 113 | 114 | 115 | //--:-:-:-:1 XMAD.MRG x, a, b.H1, RZ; 116 | //--:-:-:-:6 XMAD result, a.H1, b.H1, c; 117 | //--:-:-:-:1 XMAD.PSL.CBCC result, a.H1, x.H1, result; 118 | 119 | // Get the first clock value 120 | 121 | --:-:1:-:1 S2R tid, SR_TID.X; 122 | --:-:2:-:2 S2R bid, SR_CTAID.X; 123 | 124 | 125 | 126 | // Take the difference of clocks 127 | --:-:-:-:1 IADD clock1, clock2, -clock1; 128 | 129 | // Setup our output addresses 130 | // Stall your pipeline dependencies properly 131 | 03:-:-:-:1 XMAD tid, blockDim, bid, tid; 132 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 133 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 134 | --:-:-:Y:6 SHL tid, tid, 0x2; 135 | 136 | --:-:-:-:1 IADD clocks, clocks, tid; 137 | --:-:-:-:1 IADD out, out, tid; 138 | 139 | // Output the results. 
140 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values 141 | --:-:-:-:1 STG.E [clocks], result2; 142 | --:-:-:-:1 STG.E [out], result; 143 | --:-:-:-:5 EXIT; 144 | 145 | -------------------------------------------------------------------------------- /Assembler/MaxAs/pm_to_blib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/pm_to_blib -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/batched_gemm.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/sgemm/batched_gemm.xlsx -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/cublas_sgemm.ptx: -------------------------------------------------------------------------------- 1 | .version 4.1 2 | .target sm_50 3 | .address_size 64 4 | 5 | // ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx 6 | 7 | // You can use maxas to insert cublas_device.lib code into a cubin built from this ptx: 8 | 9 | // From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib 10 | 11 | // cuobjdump -lelf cublas_device.lib | find "sm_50" 12 | 13 | // cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib 14 | 15 | // maxas -l maxwell_sgemm.asm.sm_50.cubin 16 | 17 | // maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass 18 | // maxas -e -k maxwell_sgemm_128x64_nt maxwell_sgemm_128x64_nt.sass 19 | 20 | // maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin 21 | // maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin 22 | 23 | // The sgemm.cpp code makes use of this cubin to benchmark the kernels 
outside of cublas. 24 | 25 | .visible .entry maxwell_sgemm_128x128_nt( 26 | .param .u64 .ptr.global.align 8 param_A, 27 | .param .u64 .ptr.global.align 8 param_B, 28 | .param .u64 .ptr.global.align 8 param_C, 29 | .param .s32 param_lda, 30 | .param .s32 param_ldb, 31 | .param .s32 param_ldc, 32 | .param .s32 param_k, 33 | .param .u64 .ptr.global.align 8 param_Alpha, 34 | .param .u64 .ptr.global.align 8 param_Beta, 35 | .param .s32 param_alpha, 36 | .param .s32 param_beta, 37 | .param .s32 param_flag 38 | ) 39 | .reqntid 256 40 | { 41 | .shared .align 16 .b8 share[16384]; 42 | 43 | ret; 44 | } 45 | 46 | .visible .entry maxwell_sgemm_128x64_nt( 47 | .param .u64 .ptr.global.align 8 param_A, 48 | .param .u64 .ptr.global.align 8 param_B, 49 | .param .u64 .ptr.global.align 8 param_C, 50 | .param .s32 param_lda, 51 | .param .s32 param_ldb, 52 | .param .s32 param_ldc, 53 | .param .s32 param_k, 54 | .param .u64 .ptr.global.align 8 param_Alpha, 55 | .param .u64 .ptr.global.align 8 param_Beta, 56 | .param .s32 param_alpha, 57 | .param .s32 param_beta, 58 | .param .s32 param_flag 59 | ) 60 | .reqntid 128 61 | { 62 | .shared .align 16 .b8 share[12288]; 63 | 64 | ret; 65 | } 66 | -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/new.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/sgemm/new.cubin -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/sgemm.cu: -------------------------------------------------------------------------------- 1 | 2 | // Note this file isn't configured to automatically compile. 3 | // Here's how: 4 | 5 | // If you want to look at the ptx first: 6 | // nvcc -arch sm_50 -m 32 -ptx sgemm.cu 7 | 8 | // Manually compile your kernel to a cubin. 
9 | // You should only have to do this once, unless you change params or shared size or globals: 10 | // nvcc -arch sm_50 -m 32 -cubin sgemm.cu 11 | 12 | // If tweaking a kernel or writing a new one based on this shell code you would then do this: 13 | // maxas.pl -e kernel.cubin kernel.sass 14 | 15 | // I've already included a modified kernel (sgemm.sass) so the next step is.. 16 | 17 | // Splice the manually assembled code back into the cubin: 18 | // maxas.pl -i sgemm.sass sgemm.cubin 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | typedef texture floatTex; 26 | 27 | floatTex texA(0, cudaFilterModePoint, cudaAddressModeBorder); 28 | floatTex texB(0, cudaFilterModePoint, cudaAddressModeBorder); 29 | 30 | // Use extern C so C++ doesn't mangle our kernel name 31 | extern "C" 32 | // This kernel requires 256x1x1 threads per block 33 | __global__ void __launch_bounds__(256) sgemm_kernel_128( 34 | float *C, 35 | const int m, const int n, const int k, 36 | const int lda, const int ldb, const int ldc, 37 | float alpha, int *D) 38 | { 39 | // Declare any shared memory your kernel requires 40 | // Or you could just pass the amount in as a param to cuLaunchKernel 41 | __shared__ float4 share[1024]; 42 | 43 | int tid = threadIdx.x; 44 | 45 | // If you use indirect texture references, they will be passed as params at the end of the param list 46 | // So set that up here to make sure they're available in your kernel 47 | floatTex tex = tid > 127 ? texB : texA; 48 | 49 | // Make use of shared and your textures so it doesn't get optimized away 50 | share[tid] = tex1Dfetch(tex, tid); 51 | 52 | __syncthreads(); 53 | 54 | // output something so your setup isn't optimized away. 
55 | C[tid] = share[255-tid].x; 56 | } 57 | 58 | extern "C" 59 | __global__ void __launch_bounds__(64) sgemm_kernel_64( 60 | float *C, 61 | const int m, const int n, const int k, 62 | const int lda, const int ldb, const int ldc, 63 | float alpha, int *D) 64 | { 65 | __shared__ float4 share[512]; 66 | 67 | int tid = threadIdx.x; 68 | 69 | floatTex tex = tid > 127 ? texB : texA; 70 | 71 | share[tid] = tex1Dfetch(tex, tid); 72 | 73 | __syncthreads(); 74 | 75 | C[tid] = share[255-tid].x; 76 | } 77 | 78 | // A note about using the Cuda Runtime. 79 | // If that's your preference over the driver API then here's what you'd do: 80 | 81 | // In your project properties in the Cuda C/C++ panel: 82 | // -Set the "Keep Processed Files" (-keep) option 83 | // -Add a -v manually to the command line 84 | // If compiling on command line just add -keep -v options to nvcc. 85 | // Rebuild your solution and look in the log for these lines that follow the ptxas step: 86 | 87 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda 88 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" 89 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" 90 | 91 | // You just need to manually run these 3 commands (or add them to a build script) 92 | // after you've modified the cubin generated from the preceding ptxas command. 93 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you 94 | // build your project (or you could manually run the linker step as well). 95 | 96 | // Having done that you can call your kernel normally using the <<< >>> syntax. 
97 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway. 98 | // With fatbin you can also keep non-maxwell optimized versions of your code. 99 | 100 | 101 | // I just discovered this also works as a shortcut to the above: 102 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu 103 | 104 | // The cu kernel definitions above need to have empty bodies. 105 | // And, the cu file must be compiled to a lib separately before linking. -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/sgemm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $CU_AD_FORMAT_UNSIGNED_INT8 = 0x01; 5 | my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02; 6 | my $CU_AD_FORMAT_FLOAT = 0x20; 7 | 8 | if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9]) 9 | { 10 | print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`; 11 | exit if $?; 12 | print `maxas.pl -i sgemm128.sass sgemm.cubin`; 13 | exit if $?; 14 | print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`; 15 | } 16 | if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9]) 17 | { 18 | print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`; 19 | exit if $?; 20 | print `maxas.pl -i sgemm64.sass sgemm.cubin`; 21 | exit if $?; 22 | print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`; 23 | } 24 | 25 | #print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2); 26 | 27 | `Release\\sgemm.exe 64 5 $CU_AD_FORMAT_FLOAT`; 28 | 29 | print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`; 30 | exit; 31 | 32 | my %data; 33 | foreach my $thread128 (4 .. 
64) 34 | { 35 | my $N = $thread128 * 128; 36 | 37 | my $iterations = int(20 * (64 * 128)**3 / $N**3); 38 | $iterations = 10000 if $iterations > 10000; 39 | 40 | print "$N $iterations\n"; 41 | 42 | my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`; 43 | 44 | foreach my $bench (split "\n", $data) 45 | { 46 | if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /) 47 | { 48 | push @{$data{$N}}, $2; 49 | print "$1 $2\n"; 50 | } 51 | } 52 | } 53 | print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n"; 54 | 55 | foreach my $N (sort { $a <=> $b } keys %data) 56 | { 57 | print join("\t", @{$data{$N}}), "\n"; 58 | } 59 | 60 | 61 | #print $data; 62 | 63 | __END__ 64 | 65 | 66 | 64 * 128 * 16 * 1.620 * .931 / 520 67 | 68 | Max64 GFLOPS: 1377.38 (size: 256, iterations: 2000) 69 | Max128 GFLOPS: 973.70 (size: 256, iterations: 2000) 70 | Cub64 GFLOPS: 1272.42 (size: 256, iterations: 2000) 71 | Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000) 72 | 73 | my @data = grep /\S/, split "\n", $data; 74 | 75 | my $min; 76 | my %smData; 77 | my @sdata; 78 | foreach (@data) 79 | { 80 | next if /GFLOPS/; 81 | 82 | my ($sm, $clock, $by, $bx) = split /\s+/; 83 | 84 | $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm}; 85 | 86 | $min = $clock if !$min || $clock < $min; 87 | 88 | push @sdata, [$sm, $clock, $by, $bx]; 89 | } 90 | 91 | foreach (@sdata) 92 | { 93 | $_->[1] -= $smData{$_->[0]}; 94 | } 95 | 96 | foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata) 97 | { 98 | printf "%02d %8u by: %2d bx: %2d\n", @$_; 99 | 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/sgemm.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", 
"{D571379D-3653-43CB-BE83-A6C68D392A05}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Release|Win32 = Release|Win32 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32 13 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32 14 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32 15 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/sgemm.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {D571379D-3653-43CB-BE83-A6C68D392A05} 15 | Win32Proj 16 | sgemm 17 | 18 | 19 | 20 | Application 21 | true 22 | Unicode 23 | 24 | 25 | Application 26 | false 27 | true 28 | Unicode 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | true 42 | 43 | 44 | false 45 | 46 | 47 | 48 | 49 | 50 | Level3 51 | Disabled 52 | _CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 53 | $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) 54 | 55 | 56 | Console 57 | true 58 | $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) 59 | cuda.lib;cublas.lib;%(AdditionalDependencies) 60 | 61 | 62 | 63 | 64 | Level3 65 | 66 | 67 | MaxSpeed 68 | true 69 | true 70 | _CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 71 | $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) 72 | 73 | 74 | Console 75 | true 76 | true 77 | true 78 | $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) 79 | 
cuda.lib;cublas.lib;%(AdditionalDependencies) 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/MaxAs/sgemm/sgemm_sm52_64.cubin -------------------------------------------------------------------------------- /Assembler/MaxAs/t/MaxAs-MaxAs.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Test::More tests => 1; 5 | BEGIN { use_ok('MaxAs::MaxAs') }; 6 | -------------------------------------------------------------------------------- /Assembler/PascalAs/Changes: -------------------------------------------------------------------------------- 1 | Revision history for Perl extension MaxAs::MaxAs. 
2 | 3 | 1.01 Thu Mar 26 17:09:57 2015 4 | - original Perl packaged version 5 | -------------------------------------------------------------------------------- /Assembler/PascalAs/Install.sh: -------------------------------------------------------------------------------- 1 | perl Makefile.PL 2 | make 3 | sudo make install 4 | -------------------------------------------------------------------------------- /Assembler/PascalAs/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Scott Gray 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Assembler/PascalAs/MANIFEST: -------------------------------------------------------------------------------- 1 | bin/maxas.pl 2 | Changes 3 | lib/MaxAs/Cubin.pm 4 | lib/MaxAs/MaxAs.pm 5 | lib/MaxAs/MaxAsGrammar.pm 6 | LICENSE 7 | Makefile.PL 8 | MANIFEST 9 | microbench/microbench.cpp 10 | microbench/microbench.cu 11 | microbench/microbench.sass 12 | microbench/shared.pl 13 | microbench/shared_lds.sass 14 | microbench/shared_sts16.sass 15 | microbench/throughput.pl 16 | microbench/throughput.sass 17 | microbench/throughput2.pl 18 | microbench/throughput2.sass 19 | microbench/throughput3.pl 20 | microbench/throughput4.pl 21 | microbench/throughput5.pl 22 | microbench/xmad.pl 23 | microbench/xmad2.sass 24 | README.md 25 | sgemm/batched_gemm.xlsx 26 | sgemm/cublas_sgemm.ptx 27 | sgemm/sgemm.cpp 28 | sgemm/sgemm.cu 29 | sgemm/sgemm.pl 30 | sgemm/sgemm.sln 31 | sgemm/sgemm.vcxproj 32 | sgemm/sgemm128.sass 33 | sgemm/sgemm64.sass 34 | sgemm/sgemm_final_128.sass 35 | sgemm/sgemm_final_64.sass 36 | sgemm/sgemm_pre_128.sass 37 | sgemm/sgemm_pre_64.sass 38 | t/MaxAs-MaxAs.t 39 | -------------------------------------------------------------------------------- /Assembler/PascalAs/MYMETA.json: -------------------------------------------------------------------------------- 1 | { 2 | "abstract" : "Assembler for NVIDIA Maxwell architecture", 3 | "author" : [ 4 | "Scott Gray " 5 | ], 6 | "dynamic_config" : 0, 7 | "generated_by" : "ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001", 8 | "license" : [ 9 | "mit" 10 | ], 11 | "meta-spec" : { 12 | "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", 13 | "version" : "2" 14 | }, 15 | "name" : "PascalAs-PascalAs", 16 | "no_index" : { 17 | "directory" : [ 18 | "t", 19 | "inc" 20 | ] 21 | }, 22 | "prereqs" : { 23 | "build" : { 24 | "requires" : { 25 | "ExtUtils::MakeMaker" : "0" 26 | } 27 | }, 28 | "configure" : { 29 | 
"requires" : { 30 | "ExtUtils::MakeMaker" : "0" 31 | } 32 | }, 33 | "runtime" : { 34 | "requires" : { 35 | "Carp" : "1.29", 36 | "Data::Dumper" : "2.145" 37 | } 38 | } 39 | }, 40 | "release_status" : "stable", 41 | "version" : "1.06" 42 | } 43 | -------------------------------------------------------------------------------- /Assembler/PascalAs/MYMETA.yml: -------------------------------------------------------------------------------- 1 | --- 2 | abstract: 'Assembler for NVIDIA Maxwell architecture' 3 | author: 4 | - 'Scott Gray ' 5 | build_requires: 6 | ExtUtils::MakeMaker: '0' 7 | configure_requires: 8 | ExtUtils::MakeMaker: '0' 9 | dynamic_config: 0 10 | generated_by: 'ExtUtils::MakeMaker version 7.0401, CPAN::Meta::Converter version 2.150001' 11 | license: mit 12 | meta-spec: 13 | url: http://module-build.sourceforge.net/META-spec-v1.4.html 14 | version: '1.4' 15 | name: PascalAs-PascalAs 16 | no_index: 17 | directory: 18 | - t 19 | - inc 20 | requires: 21 | Carp: '1.29' 22 | Data::Dumper: '2.145' 23 | version: '1.06' 24 | -------------------------------------------------------------------------------- /Assembler/PascalAs/Makefile.PL: -------------------------------------------------------------------------------- 1 | require 5.10.0; 2 | use ExtUtils::MakeMaker; 3 | # See lib/ExtUtils/MakeMaker.pm for details of how to influence 4 | # the contents of the Makefile that is written. 5 | WriteMakefile( 6 | NAME => 'PascalAs::PascalAs', 7 | VERSION_FROM => 'lib/PascalAs/PascalAs.pm', # finds $VERSION 8 | EXE_FILES => ['bin/pascalas.pl'], 9 | PREREQ_PM => {Carp => 1.29, Data::Dumper => 2.145}, 10 | LICENSE => 'MIT', 11 | ($] >= 5.005 ? 
## Add these new keywords supported since 5.005 12 | (ABSTRACT_FROM => 'lib/PascalAs/PascalAs.pm', # retrieve abstract from module 13 | AUTHOR => 'Scott Gray ') : ()), 14 | ); 15 | -------------------------------------------------------------------------------- /Assembler/PascalAs/README.md: -------------------------------------------------------------------------------- 1 | # MaxAs 2 | Assembler for NVIDIA Maxwell architecture 3 | 4 | To install (system-wide): 5 | 6 | sudo cpanm git://github.com/NervanaSystems/maxas.git 7 | 8 | or 9 | 10 | perl Makefile.PL 11 | make 12 | sudo make install 13 | 14 | 15 | See wiki pages for more information: 16 | 17 | - [Introduction](https://github.com/NervanaSystems/maxas/wiki/Introduction) 18 | - [Getting Started](https://github.com/NervanaSystems/maxas/wiki/Getting-Started) 19 | - [Control Codes](https://github.com/NervanaSystems/maxas/wiki/Control-Codes) 20 | - [SGEMM walkthrough](https://github.com/NervanaSystems/maxas/wiki/SGEMM) 21 | 22 | Related work with lots of additional shader assembly (sass) examples: 23 | 24 | - [NervanaGPU](https://github.com/NervanaSystems/nervanagpu) 25 | 26 | This project is released under the [MIT License](http://opensource.org/licenses/MIT). 
27 | 28 | -- Scott Gray 29 | -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/arch/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/arch/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/arch/auto/MaxAs/MaxAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/arch/auto/MaxAs/MaxAs/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/arch/auto/PascalAs/PascalAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/arch/auto/PascalAs/PascalAs/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/bin/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/bin/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/lib/MaxAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/MaxAs/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/lib/PascalAs/.exists: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/PascalAs/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/auto/MaxAs/MaxAs/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/lib/auto/PascalAs/PascalAs/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/man1/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/man1/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/man3/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/man3/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/man3/MaxAs::MaxAs.3pm: -------------------------------------------------------------------------------- 1 | .\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.13) 2 | .\" 3 | .\" Standard preamble: 4 | .\" ======================================================================== 5 | .de Sp \" Vertical space (when we can't use .PP) 6 | 
.if t .sp .5v 7 | .if n .sp 8 | .. 9 | .de Vb \" Begin verbatim text 10 | .ft CW 11 | .nf 12 | .ne \\$1 13 | .. 14 | .de Ve \" End verbatim text 15 | .ft R 16 | .fi 17 | .. 18 | .\" Set up some character translations and predefined strings. \*(-- will 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left 20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will 21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and 22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, 23 | .\" nothing in troff, for use with C<>. 24 | .tr \(*W- 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' 26 | .ie n \{\ 27 | . ds -- \(*W- 28 | . ds PI pi 29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch 30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch 31 | . ds L" "" 32 | . ds R" "" 33 | . ds C` "" 34 | . ds C' "" 35 | 'br\} 36 | .el\{\ 37 | . ds -- \|\(em\| 38 | . ds PI \(*p 39 | . ds L" `` 40 | . ds R" '' 41 | 'br\} 42 | .\" 43 | .\" Escape single quotes in literal strings from groff's Unicode transform. 44 | .ie \n(.g .ds Aq \(aq 45 | .el .ds Aq ' 46 | .\" 47 | .\" If the F register is turned on, we'll generate index entries on stderr for 48 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index 49 | .\" entries marked with X<> in POD. Of course, you'll have to process the 50 | .\" output yourself in some meaningful fashion. 51 | .ie \nF \{\ 52 | . de IX 53 | . tm Index:\\$1\t\\n%\t"\\$2" 54 | .. 55 | . nr % 0 56 | . rr F 57 | .\} 58 | .el \{\ 59 | . de IX 60 | .. 61 | .\} 62 | .\" 63 | .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). 64 | .\" Fear. Run. Save yourself. No user-serviceable parts. 65 | . \" fudge factors for nroff and troff 66 | .if n \{\ 67 | . ds #H 0 68 | . ds #V .8m 69 | . ds #F .3m 70 | . ds #[ \f1 71 | . ds #] \fP 72 | .\} 73 | .if t \{\ 74 | . 
ds #H ((1u-(\\\\n(.fu%2u))*.13m) 75 | . ds #V .6m 76 | . ds #F 0 77 | . ds #[ \& 78 | . ds #] \& 79 | .\} 80 | . \" simple accents for nroff and troff 81 | .if n \{\ 82 | . ds ' \& 83 | . ds ` \& 84 | . ds ^ \& 85 | . ds , \& 86 | . ds ~ ~ 87 | . ds / 88 | .\} 89 | .if t \{\ 90 | . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" 91 | . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' 92 | . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' 93 | . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' 94 | . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' 95 | . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' 96 | .\} 97 | . \" troff and (daisy-wheel) nroff accents 98 | .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' 99 | .ds 8 \h'\*(#H'\(*b\h'-\*(#H' 100 | .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] 101 | .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' 102 | .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' 103 | .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] 104 | .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] 105 | .ds ae a\h'-(\w'a'u*4/10)'e 106 | .ds Ae A\h'-(\w'A'u*4/10)'E 107 | . \" corrections for vroff 108 | .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' 109 | .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' 110 | . \" for low resolution devices (crt and lpr) 111 | .if \n(.H>23 .if \n(.V>19 \ 112 | \{\ 113 | . ds : e 114 | . ds 8 ss 115 | . ds o a 116 | . ds d- d\h'-1'\(ga 117 | . ds D- D\h'-1'\(hy 118 | . ds th \o'bp' 119 | . ds Th \o'LP' 120 | . ds ae ae 121 | . ds Ae AE 122 | .\} 123 | .rm #[ #] #H #V #F C 124 | .\" ======================================================================== 125 | .\" 126 | .IX Title "MaxAs::MaxAs 3" 127 | .TH MaxAs::MaxAs 3 "2016-02-04" "perl v5.10.1" "User Contributed Perl Documentation" 128 | .\" For nroff, turn off justification. 
Always turn off hyphenation; it makes 129 | .\" way too many mistakes in technical documents. 130 | .if n .ad l 131 | .nh 132 | .SH "NAME" 133 | MaxAs::MaxAs \- Assembler for NVIDIA Maxwell architecture 134 | .SH "SYNOPSIS" 135 | .IX Header "SYNOPSIS" 136 | .Vb 1 137 | \& maxas.pl [opts] 138 | .Ve 139 | .SH "DESCRIPTION" 140 | .IX Header "DESCRIPTION" 141 | See the documentation at: https://github.com/NervanaSystems/maxas 142 | .SH "SEE ALSO" 143 | .IX Header "SEE ALSO" 144 | See the documentation at: https://github.com/NervanaSystems/maxas 145 | .SH "AUTHOR" 146 | .IX Header "AUTHOR" 147 | Scott Gray, 148 | .SH "COPYRIGHT AND LICENSE" 149 | .IX Header "COPYRIGHT AND LICENSE" 150 | The \s-1MIT\s0 License (\s-1MIT\s0) 151 | .PP 152 | Copyright (c) 2014 Scott Gray 153 | .PP 154 | Permission is hereby granted, free of charge, to any person obtaining a copy 155 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal 156 | in the Software without restriction, including without limitation the rights 157 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 158 | copies of the Software, and to permit persons to whom the Software is 159 | furnished to do so, subject to the following conditions: 160 | .PP 161 | The above copyright notice and this permission notice shall be included in 162 | all copies or substantial portions of the Software. 163 | .PP 164 | \&\s-1THE\s0 \s-1SOFTWARE\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R", \s-1WITHOUT\s0 \s-1WARRANTY\s0 \s-1OF\s0 \s-1ANY\s0 \s-1KIND\s0, \s-1EXPRESS\s0 \s-1OR\s0 165 | \&\s-1IMPLIED\s0, \s-1INCLUDING\s0 \s-1BUT\s0 \s-1NOT\s0 \s-1LIMITED\s0 \s-1TO\s0 \s-1THE\s0 \s-1WARRANTIES\s0 \s-1OF\s0 \s-1MERCHANTABILITY\s0, 166 | \&\s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0 \s-1AND\s0 \s-1NONINFRINGEMENT\s0. 
\s-1IN\s0 \s-1NO\s0 \s-1EVENT\s0 \s-1SHALL\s0 \s-1THE\s0 167 | \&\s-1AUTHORS\s0 \s-1OR\s0 \s-1COPYRIGHT\s0 \s-1HOLDERS\s0 \s-1BE\s0 \s-1LIABLE\s0 \s-1FOR\s0 \s-1ANY\s0 \s-1CLAIM\s0, \s-1DAMAGES\s0 \s-1OR\s0 \s-1OTHER\s0 168 | \&\s-1LIABILITY\s0, \s-1WHETHER\s0 \s-1IN\s0 \s-1AN\s0 \s-1ACTION\s0 \s-1OF\s0 \s-1CONTRACT\s0, \s-1TORT\s0 \s-1OR\s0 \s-1OTHERWISE\s0, \s-1ARISING\s0 \s-1FROM\s0, 169 | \&\s-1OUT\s0 \s-1OF\s0 \s-1OR\s0 \s-1IN\s0 \s-1CONNECTION\s0 \s-1WITH\s0 \s-1THE\s0 \s-1SOFTWARE\s0 \s-1OR\s0 \s-1THE\s0 \s-1USE\s0 \s-1OR\s0 \s-1OTHER\s0 \s-1DEALINGS\s0 \s-1IN\s0 170 | \&\s-1THE\s0 \s-1SOFTWARE\s0. 171 | -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/man3/PascalAs::PascalAs.3pm: -------------------------------------------------------------------------------- 1 | .\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29) 2 | .\" 3 | .\" Standard preamble: 4 | .\" ======================================================================== 5 | .de Sp \" Vertical space (when we can't use .PP) 6 | .if t .sp .5v 7 | .if n .sp 8 | .. 9 | .de Vb \" Begin verbatim text 10 | .ft CW 11 | .nf 12 | .ne \\$1 13 | .. 14 | .de Ve \" End verbatim text 15 | .ft R 16 | .fi 17 | .. 18 | .\" Set up some character translations and predefined strings. \*(-- will 19 | .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left 20 | .\" double quote, and \*(R" will give a right double quote. \*(C+ will 21 | .\" give a nicer C++. Capital omega is used to do unbreakable dashes and 22 | .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, 23 | .\" nothing in troff, for use with C<>. 24 | .tr \(*W- 25 | .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' 26 | .ie n \{\ 27 | . ds -- \(*W- 28 | . ds PI pi 29 | . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch 30 | . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch 31 | . ds L" "" 32 | . 
ds R" "" 33 | . ds C` "" 34 | . ds C' "" 35 | 'br\} 36 | .el\{\ 37 | . ds -- \|\(em\| 38 | . ds PI \(*p 39 | . ds L" `` 40 | . ds R" '' 41 | . ds C` 42 | . ds C' 43 | 'br\} 44 | .\" 45 | .\" Escape single quotes in literal strings from groff's Unicode transform. 46 | .ie \n(.g .ds Aq \(aq 47 | .el .ds Aq ' 48 | .\" 49 | .\" If the F register is turned on, we'll generate index entries on stderr for 50 | .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index 51 | .\" entries marked with X<> in POD. Of course, you'll have to process the 52 | .\" output yourself in some meaningful fashion. 53 | .\" 54 | .\" Avoid warning from groff about undefined register 'F'. 55 | .de IX 56 | .. 57 | .nr rF 0 58 | .if \n(.g .if rF .nr rF 1 59 | .if (\n(rF:(\n(.g==0)) \{ 60 | . if \nF \{ 61 | . de IX 62 | . tm Index:\\$1\t\\n%\t"\\$2" 63 | .. 64 | . if !\nF==2 \{ 65 | . nr % 0 66 | . nr F 2 67 | . \} 68 | . \} 69 | .\} 70 | .rr rF 71 | .\" ======================================================================== 72 | .\" 73 | .IX Title "PascalAs::PascalAs 3pm" 74 | .TH PascalAs::PascalAs 3pm "2018-11-05" "perl v5.22.1" "User Contributed Perl Documentation" 75 | .\" For nroff, turn off justification. Always turn off hyphenation; it makes 76 | .\" way too many mistakes in technical documents. 
77 | .if n .ad l 78 | .nh 79 | .SH "NAME" 80 | PascalAs::PascalAs \- Assembler for NVIDIA Maxwell architecture 81 | .SH "SYNOPSIS" 82 | .IX Header "SYNOPSIS" 83 | .Vb 1 84 | \& Pascalas.pl [opts] 85 | .Ve 86 | .SH "DESCRIPTION" 87 | .IX Header "DESCRIPTION" 88 | See the documentation at: https://github.com/NervanaSystems/pascalas 89 | .SH "SEE ALSO" 90 | .IX Header "SEE ALSO" 91 | See the documentation at: https://github.com/NervanaSystems/pascalas 92 | .SH "AUTHOR" 93 | .IX Header "AUTHOR" 94 | Scott Gray, 95 | .SH "COPYRIGHT AND LICENSE" 96 | .IX Header "COPYRIGHT AND LICENSE" 97 | The \s-1MIT\s0 License (\s-1MIT\s0) 98 | .PP 99 | Copyright (c) 2014 Scott Gray 100 | .PP 101 | Permission is hereby granted, free of charge, to any person obtaining a copy 102 | of this software and associated documentation files (the \*(L"Software\*(R"), to deal 103 | in the Software without restriction, including without limitation the rights 104 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 105 | copies of the Software, and to permit persons to whom the Software is 106 | furnished to do so, subject to the following conditions: 107 | .PP 108 | The above copyright notice and this permission notice shall be included in 109 | all copies or substantial portions of the Software. 110 | .PP 111 | \&\s-1THE SOFTWARE IS PROVIDED \*(L"AS IS\*(R", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 112 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 113 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 114 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 115 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 116 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 117 | THE SOFTWARE.\s0 118 | -------------------------------------------------------------------------------- /Assembler/PascalAs/blib/script/.exists: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/blib/script/.exists -------------------------------------------------------------------------------- /Assembler/PascalAs/cpanfile: -------------------------------------------------------------------------------- 1 | requires 'perl', '5.10.0'; 2 | 3 | requires 'Carp', '1.29'; 4 | requires 'Data::Dumper', '2.145'; 5 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/microbench.cpp: -------------------------------------------------------------------------------- 1 | // microbench.cpp : Defines the entry point for the console application. 
2 | // 3 | 4 | // nvcc -l cuda -o microbench microbench.cpp 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | CUcontext hContext = 0; 13 | 14 | #define CUDA_CHECK( fn ) do { \ 15 | CUresult status = (fn); \ 16 | if ( CUDA_SUCCESS != status ) { \ 17 | const char* errstr; \ 18 | cuGetErrorString(status, &errstr); \ 19 | printf("CUDA Driver Failure (line %d of file %s):\n\t%s returned 0x%x (%s)\n", __LINE__, __FILE__, #fn, status, errstr); \ 20 | if (hContext) cuCtxDestroy(hContext); \ 21 | exit(EXIT_FAILURE); \ 22 | } \ 23 | } while (0) 24 | 25 | 26 | int main(int argc, char* argv[]) 27 | { 28 | //int iTest = 2896; 29 | //while (iTest < 0x7fff) 30 | //{ 31 | // int iResult = iTest * iTest; 32 | // float fTest = (float)iTest; 33 | // int fResult = (int)(fTest * fTest); 34 | 35 | // printf("i*i:%08x f*f:%08x\n", iResult, fResult); 36 | 37 | // iTest += 0x0800; 38 | //} 39 | //exit(0); 40 | 41 | char deviceName[32]; 42 | int devCount, ordinal, major, minor; 43 | CUdevice hDevice; 44 | 45 | // Initialize the Driver API and find a device 46 | CUDA_CHECK( cuInit(0) ); 47 | CUDA_CHECK( cuDeviceGetCount(&devCount) ); 48 | for (ordinal = 0; ordinal < devCount; ordinal++) 49 | { 50 | CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); 51 | CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); 52 | CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); 53 | CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); 54 | if (major >= 5 && minor >= 2) 55 | { 56 | printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); 57 | break; 58 | } 59 | } 60 | if (ordinal == devCount) 61 | { 62 | printf("No compute 5.0 device found, exiting.\n"); 63 | exit(EXIT_FAILURE); 64 | } 65 | 66 | // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing 67 | int internalTiming = 1; 68 | if (argc > 1) 69 | internalTiming 
= strcmp(argv[1], "i") == 0 ? 1 : 0; 70 | 71 | // Second command line arg is the number of blocks 72 | int blocks = 1; 73 | if (argc > 2) 74 | blocks = atoi(argv[2]); 75 | if (blocks < 1) 76 | blocks = 1; 77 | 78 | // Third command line arg is the number of threads 79 | int threads = 128; 80 | if (argc > 3) 81 | threads = atoi(argv[3]); 82 | if (threads > 1024 || threads < 32) 83 | threads = 128; 84 | threads &= -32; 85 | 86 | // Fourth command line arg: 87 | double fops = 1.0; 88 | int lanes = 1; 89 | if (argc > 4) 90 | { 91 | if (internalTiming) 92 | { 93 | // The number of lanes to print for each warp 94 | lanes = atoi(argv[4]); 95 | if (lanes > 32 || lanes < 1) 96 | lanes = 1; 97 | } 98 | else 99 | // The number of floating point operations in a full kernel launch 100 | fops = atof(argv[4]); 101 | } 102 | 103 | // Fifth command line arg is the repeat count for benchmarking 104 | int repeat = 1; 105 | if (argc > 5) 106 | repeat = atoi(argv[5]); 107 | if (repeat > 1000 || repeat < 1) 108 | repeat = 1; 109 | 110 | // threads = total number of threads 111 | size_t size = sizeof(int) * threads * blocks; 112 | 113 | // Setup our input and output buffers 114 | int* dataIn = (int*)malloc(size); 115 | int* dataOut = (int*)malloc(size); 116 | int* clocks = (int*)malloc(size); 117 | memset(dataIn, 0, size); 118 | 119 | CUmodule hModule; 120 | CUfunction hKernel; 121 | CUevent hStart, hStop; 122 | CUdeviceptr devIn, devOut, devClocks; 123 | 124 | // Init our context and device memory buffers 125 | CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); 126 | CUDA_CHECK( cuMemAlloc(&devIn, size) ); 127 | CUDA_CHECK( cuMemAlloc(&devOut, size) ); 128 | CUDA_CHECK( cuMemAlloc(&devClocks, size) ); 129 | CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); 130 | CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); 131 | CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); 132 | 133 | CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); 134 | CUDA_CHECK( cuEventCreate(&hStop,
CU_EVENT_BLOCKING_SYNC) ); 135 | 136 | // Load our kernel 137 | CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); 138 | CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); 139 | 140 | // Setup the params 141 | void* params[] = { &devOut, &devClocks, &devIn }; 142 | float ms = 0; 143 | 144 | // Warm up the clock (unless under nsight) 145 | if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER 146 | for (int i = 0; i < repeat; i++) 147 | CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); 148 | 149 | // Launch the kernel 150 | CUDA_CHECK( cuEventRecord(hStart, NULL) ); 151 | //CUDA_CHECK( cuProfilerStart() ); 152 | CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); 153 | //CUDA_CHECK( cuProfilerStop() ); 154 | CUDA_CHECK( cuEventRecord(hStop, NULL) ); 155 | CUDA_CHECK( cuEventSynchronize(hStop) ); 156 | CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); 157 | 158 | //CUDA_CHECK( cuCtxSynchronize() ); 159 | 160 | // Get back our results from each kernel 161 | CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); 162 | CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); 163 | 164 | // Cleanup and shutdown of cuda 165 | CUDA_CHECK( cuEventDestroy(hStart) ); 166 | CUDA_CHECK( cuEventDestroy(hStop) ); 167 | CUDA_CHECK( cuModuleUnload(hModule) ); 168 | CUDA_CHECK( cuMemFree(devIn) ); 169 | CUDA_CHECK( cuMemFree(devOut) ); 170 | CUDA_CHECK( cuMemFree(devClocks) ); 171 | CUDA_CHECK( cuCtxDestroy(hContext) ); 172 | hContext = 0; 173 | 174 | // When using just one block, print out the internal timing data 175 | if (internalTiming) 176 | { 177 | int count = 0, total = 0, min = 999999, max = 0; 178 | 179 | int* clocks_p = clocks; 180 | int* dataOut_p = dataOut; 181 | 182 | // Loop over and print results 183 | for (int blk = 0; blk < blocks; blk++) 184 | { 185 | float *fDataOut = reinterpret_cast(dataOut_p); 186 | 187 | for(int tid = 0; tid < threads; tid += 32) 188 
| { 189 | // Sometimes we want data on each thread, sometimes just one sample per warp is fine 190 | for (int lane = 0; lane < lanes; lane++) 191 | printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u 192 | 193 | count++; 194 | total += clocks_p[tid]; 195 | if (clocks_p[tid] < min) min = clocks_p[tid]; 196 | if (clocks_p[tid] > max) max = clocks_p[tid]; 197 | } 198 | clocks_p += threads; 199 | dataOut_p += threads; 200 | } 201 | printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); 202 | } 203 | else 204 | { 205 | // For more than one block we're testing throughput and want external timing data 206 | printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); 207 | } 208 | // And free up host memory 209 | free(dataIn); free(dataOut); free(clocks); 210 | 211 | return 0; 212 | } 213 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/microbench.cu: -------------------------------------------------------------------------------- 1 | 2 | // Note this file isn't configured to automatically compile 3 | 4 | #include 5 | #include 6 | 7 | // Build: 8 | // nvcc -l cuda -o microbench microbench.cpp 9 | // nvcc -arch sm_50 -cubin microbench.cu 10 | 11 | // Inspect a cubin (use nvdisasm from cuda 6.5 for best results): 12 | // maxas.pl -e microbench.cubin 13 | 14 | // Insert new sass into cubin 15 | // maxas.pl -i microbench.sass microbench.cubin 16 | 17 | // run it: 18 | // ./microbench 19 | 20 | // Use extern C so C++ doesn't mangle our kernel name 21 | extern "C" __global__ void microbench(int *out, int *clocks, int *in) 22 | { 23 | __shared__ int share[1024]; 24 | 25 | int tid = threadIdx.x; 26 | int bx = blockIdx.x; 27 | int by = blockIdx.y; 28 | 29 | int start = clock(); 30 | 31 | share[tid] = in[by * 65535 + bx]; //tid + blkDimX + blkDimY + blkDimZ + grdDimX + grdDimY + grdDimZ 32 | 33 | 
__syncthreads(); 34 | 35 | int end = clock(); 36 | 37 | clocks[tid] = (start >> 16) | (end & 0xffff0000); //end - start; 38 | 39 | out[tid] = share[tid ^ 1]; 40 | } 41 | 42 | // A note about using the Cuda Runtime. 43 | // If that's your preference over the driver API then here's what you'd do: 44 | 45 | // In your project properties in the Cuda C/C++ panel: 46 | // -Set the "Keep Processed Files" (-keep) option 47 | // -Add a -v manually to the command line 48 | // If compiling on command line just add -keep -v options to nvcc. 49 | // Rebuild your solution and look in the log for these lines that follow the ptxas step: 50 | 51 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda 52 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" 53 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" 54 | 55 | // You just need to manually run these 3 commands (or add them to a build script) 56 | // after you've modified the cubin generated from the preceding ptxas command. 57 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you 58 | // build your project (or you could manually run the linker step as well). 59 | 60 | // Having done that you can call your kernel normally using the <<< >>> syntax. 61 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway. 62 | // With fatbin you can also keep non-maxwell optimized versions of your code.
63 | 64 | 65 | // I just discovered this also works as a shortcut to the above: 66 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=microbench.cubin -o microbench.lib microbench.cu 67 | 68 | // The cu kernel definitions above need to have empty bodies. 69 | // And, the cu file must be compiled to a lib separately before linking. -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/microbench.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | 3 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X 4 | 5 | 6 | blockDimX : c[0x0][0x08] 7 | blockDimY : c[0x0][0x0c] 8 | blockDimZ : c[0x0][0x10] 9 | gridDimX : c[0x0][0x14] 10 | gridDimY : c[0x0][0x18] 11 | gridDimZ : c[0x0][0x1c] 12 | 13 | param_out[0] : c[0x0][0x140] 14 | param_out[1] : c[0x0][0x144] 15 | param_clocks[0] : c[0x0][0x148] 16 | param_clocks[1] : c[0x0][0x14c] 17 | param_in[0] : c[0x0][0x150] 18 | param_in[1] : c[0x0][0x154] 19 | 20 | 21 | 22 | 23 | 0-1 : out<0-1> 24 | 2-3 : clocks<0-1> 25 | 4-5 : in<0-1> 26 | 6-20 : tid, bid, blockDim, clock1, clock2, result, offset, x 27 | 28 | 29 | 30 | // Load in our params (not currently used below) 31 | --:-:-:-:1 MOV in0, param_in[0]; 32 | --:-:-:-:1 MOV in1, param_in[1]; 33 | 34 | // Get the first clock value 35 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 36 | 37 | // Get the threadId and blockId 38 | // Set the Read-After-Write dependency barrier 1 and 2 39 | --:-:1:-:1 S2R tid, SR_TID.X; 40 | // Add one additional clock stall to allow the barrier time to set prior to next instruction that uses it 41 | --:-:2:-:2 S2R bid, SR_CTAID.X; 42 | 43 | 44 | // Get the second clock value 45 | // Wait on the dependency barriers that were set in the prior instructions 46 | // Stall 6 to allow CS2R time to complete before next instruction 47 | // CS2R takes a constant 6 clocks to complete unlike S2R which is a variable
22-44 clocks 48 | // This stall count does not factor into the time calculation at all 49 | 03:-:-:-:6 CS2R clock2, SR_CLOCKLO; 50 | 51 | // Take the difference of clocks 52 | --:-:-:-:1 IADD clock1, clock2, -clock1; 53 | 54 | // Setup our output addresses 55 | // Stall your pipeline dependencies properly 56 | // Note using a single XMAD assumes blockDimX and bid are 16 bit values, which is reasonable for this test code 57 | --:-:-:-:6 XMAD offset, bid, blockDimX, tid; 58 | 59 | // LEA is "load effective address" 60 | // The offset param is shifted left 2 and added to the pointers with 64bit math 61 | --:-:-:-:6 LEA clocks0.CC, offset, param_clocks[0], 2; 62 | --:-:-:-:1 LEA.HI.X clocks1, offset, param_clocks[1], RZ, 2; 63 | 64 | --:-:-:-:6 LEA out0.CC, offset, param_out[0], 2; 65 | --:-:-:-:1 LEA.HI.X out1, offset, param_out[1], RZ, 2; 66 | 67 | // Output the results. 68 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values 69 | --:-:-:-:1 STG.E [clocks], clock1; 70 | --:-:-:-:1 STG.E [out], offset; # use this to return whatever you like to inspect the results 71 | --:-:-:-:5 EXIT; 72 | 73 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/shared.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | print `maxas.pl -i shared_sts16.sass microbench.cubin`; 5 | 6 | exit if $?; 7 | 8 | print `Release\\microbench.exe i 1 64`; 9 | 10 | 11 | __END__ 12 | 13 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/shared_lds.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | // 
This is a simple micro bench to demonstrate the latency in loading SR_TID.X 13 | 14 | 15 | 16 | 0-3 : result, a, b, c 17 | 18 | 4-40 : out, clocks, in, tid, bid, blockDim, clock1, clock2, x, tid3, tid7, tid96, tid128, readAs, readBs, val<0-20> 19 | 20 | 21 | 22 | // Load in our params 23 | --:-:1:-:1 S2R tid, SR_TID.X; 24 | --:-:2:-:1 S2R bid, SR_CTAID.X; 25 | 26 | --:-:-:-:1 MOV result, c[0x0][0x0]; 27 | --:-:-:-:1 MOV in, c[0x0][0x100]; 28 | 29 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 30 | --:-:-:-:1 MOV result, c[0x0][0x13c]; 31 | --:-:-:-:1 CS2R clock2, SR_CLOCKLO; 32 | 33 | --:-:-:-:1 MOV blockDim, c[0x0][0x8]; 34 | --:-:-:-:1 MOV out, c[0x0][0x140]; 35 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 36 | 37 | 38 | 39 | 40 | 41 | 42 | 03:-:-:-:1 LOP.AND tid3, tid, 3; 43 | --:-:-:-:1 LOP.AND tid7, tid, 7; 44 | --:-:-:-:1 LOP.AND tid96, tid, 96; 45 | --:-:-:-:1 LOP.AND tid128, tid, 128; 46 | 47 | // readAs = ((tid128 >> 4) | tid7) << 4 48 | --:-:-:-:1 SHR.U32 readAs, tid128, 4; 49 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 50 | --:-:-:-:1 SHL readAs, readAs, 4; 51 | 52 | // readBs = ((tid96 >> 3) | tid3) << 4 53 | --:-:-:-:1 SHR.U32 readBs, tid96, 3; 54 | --:-:-:-:1 LOP.OR readBs, readBs, tid3; 55 | #--:-:-:-:1 SHL readBs, readBs, 4; 56 | #--:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 57 | 58 | 59 | 60 | 61 | 62 | 63 | #--:-:-:-:1 LDS.U.128 result, [readBs]; 64 | 65 | 66 | 67 | 68 | 01:-:-:-:1 IADD clock1, clock2, -clock1; 69 | 70 | 71 | --:-:-:-:1 XMAD tid, blockDim, bid, tid; 72 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 73 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 74 | --:-:-:Y:6 SHL tid, tid, 0x2; 75 | 76 | --:-:-:-:1 IADD clocks, clocks, tid; 77 | --:-:-:-:2 IADD out, out, tid; 78 | 79 | --:-:-:-:1 STG [clocks], clock1; 80 | --:-:-:-:1 STG [out], readBs; 81 | --:-:-:-:5 EXIT; 82 | 83 | 84 | 85 | --:-:-:-:4 LOP.AND tid32, tid, -32; 86 | 87 | --:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; 88 | 89 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 90 | 
--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 91 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 92 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 93 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 94 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 95 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 96 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 97 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 98 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 99 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 100 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 101 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 102 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 103 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 104 | --:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 105 | 106 | 107 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; 108 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; 109 | --:-:-:-:1 LOP.AND readAs, tid, 0x80; 110 | --:-:-:-:1 SHR.U32 readAs, readAs, 4; 111 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 112 | --:-:-:-:1 SHL readAs, readAs, 4; 113 | 114 | // readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; 115 | --:-:-:-:1 LOP.AND tid1, tid, 0x1; 116 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 117 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 118 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 119 | --:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 120 | 121 | 122 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/shared_sts16.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | // This is a simple micro bench to demonstrate the latency in loading SR_TID.X 13 | 14 | 15 | 16 | 0-3 : result, a, b, c 17 | 18 | 4-40 : out, clocks, in, tid, 
bid, blockDim, clock1, clock2, x, tid1, tid31, tid32, readAs, readBs, val<0-20> 19 | 20 | 21 | 22 | // Load in our params 23 | --:-:1:-:1 S2R tid, SR_TID.X; 24 | --:-:2:-:1 S2R bid, SR_CTAID.X; 25 | 26 | //--:-:-:-:1 MOV result, c[0x0][0x0]; 27 | //--:-:-:-:1 MOV in, c[0x0][0x100]; 28 | --:-:-:-:1 MOV result, 1; 29 | 30 | --:-:-:-:1 MOV blockDim, c[0x0][0x8]; 31 | --:-:-:-:1 MOV out, c[0x0][0x140]; 32 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 33 | 34 | 35 | // readAs = ((tid >> 1) & 7) << 4; 36 | 03:-:-:-:6 BFE.U32 readAs, tid, 0x301; // 3 bits at position 1 37 | --:-:-:-:6 SHL readAs, readAs, 3; 38 | 39 | // readBs = (((tid & 0x30) >> 3) | (tid & 1)) << 4 + 1024; 40 | --:-:-:-:6 LOP.AND tid1, tid, 1; 41 | --:-:-:-:6 LOP.AND readBs, tid, 0x30; 42 | --:-:-:-:6 SHR.U32 readBs, readBs, 3; 43 | --:-:-:-:6 LOP.OR readBs, readBs, tid1; 44 | --:-:-:-:6 ISCADD readBs, readBs, 0, 3; 45 | 46 | 47 | 48 | ///--:-:-:-:1 STS [tid32], result; 49 | //--:-:-:-:1 STS.S16 [tid32 + 2x<32>], result; 50 | //--:-:1:-:2 LDS.U.64 result, [readBs]; 51 | 52 | --:-:-:-:0 CS2R clock1, SR_CLOCKLO; 53 | --:-:1:-:6 LDS.U.64 result, [readAs]; 54 | --:-:-:-:6 CS2R clock2, SR_CLOCKLO; 55 | 56 | 57 | 01:-:-:-:1 IADD clock1, clock2, -clock1; 58 | 59 | 60 | --:-:-:-:1 XMAD tid, blockDim, bid, tid; 61 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 62 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 63 | --:-:-:Y:6 SHL tid, tid, 0x2; 64 | 65 | --:-:-:-:1 IADD clocks, clocks, tid; 66 | --:-:-:-:2 IADD out, out, tid; 67 | 68 | --:-:-:-:1 STG [clocks], clock1; 69 | --:-:-:-:1 STG [out], result; 70 | --:-:-:-:5 EXIT; 71 | 72 | 73 | 74 | --:-:-:-:4 LOP.AND tid32, tid, -32; 75 | 76 | --:-:-:-:1 STS.128 [tid32 + 4x<2048>], RZ; 77 | 78 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 79 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 80 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 81 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 82 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 83 | 
--:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 84 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 85 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 86 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 87 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 88 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 89 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 90 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 91 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 92 | --:-:-:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 93 | --:-:1:-:1 LDS.U.128 result, [tid32 + 4x<2048>]; 94 | 95 | 03:-:-:-:6 LOP.AND tid31, tid, 31; 96 | --:-:-:-:6 LOP.AND tid32, tid, 32; 97 | --:-:-:-:6 SHL tid32, tid32, 0x2; 98 | --:-:-:-:6 LOP.OR tid32, tid32, tid31; 99 | --:-:-:-:6 SHL tid32, tid32, 0x2; 100 | 101 | // readAs = (((tid & 0x80) >> 4) | ((tid >> 1) & 7)) << 4; 102 | --:-:-:-:1 BFE.U32 tid7, tid, 0x301; 103 | --:-:-:-:1 LOP.AND readAs, tid, 0x80; 104 | --:-:-:-:1 SHR.U32 readAs, readAs, 4; 105 | --:-:-:-:1 LOP.OR readAs, readAs, tid7; 106 | --:-:-:-:1 SHL readAs, readAs, 4; 107 | 108 | // readBs = ((($tid & 0x70) >> 3) | ($tid & 1)) << 4 + 4096; 109 | --:-:-:-:1 LOP.AND tid1, tid, 0x1; 110 | --:-:-:-:1 LOP.AND readBs, tid, 0x70; 111 | --:-:-:-:1 SHR.U32 readBs, readBs, 3; 112 | --:-:-:-:1 LOP.OR readBs, readBs, tid1; 113 | --:-:-:-:1 ISCADD readBs, readBs, 4x<1024>, 4; 114 | 115 | 116 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $loopSize = 512; 5 | my $blocks = 32; 6 | my $loops = 10240000; 7 | my $fileName = 'throughput2.sass'; 8 | 9 | writeSassFile($fileName, $loops); 10 | 11 | #print `maxas.pl -p $fileName`; 12 | #exit; 13 | 14 | print `maxas.pl -i $fileName microbench.cubin`; 15 | exit if $?; 16 | 17 | foreach my $thread128 (2) 18 | { 19 | my $threads = 
$thread128 * 128; 20 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 21 | 22 | my $data = `Release\\microbench.exe e $blocks $threads $fops`; 23 | 24 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 25 | 26 | printf "%d %d %d\n", $thread128, $threads, $gflops; 27 | } 28 | 29 | exit; 30 | 31 | sub writeSassFile 32 | { 33 | my ($filename, $loops) = @_; 34 | 35 | open my $fh, ">$filename" or die "$filename: $!"; 36 | 37 | printf $fh <<'EOF', $loops; 38 | # Kernel: microbench 39 | 40 | 41 | 42 | 0-10 : result, r1, r2, r3 43 | 20-27 ~ count, stop 44 | 45 | 46 | 47 | --:-:-:-:1 MOV count, RZ; 48 | --:-:-:-:1 MOV32I stop, %d; 49 | --:-:-:-:1 MOV32I r1, 1.0; 50 | --:-:-:-:1 MOV32I r2, 1.0; 51 | --:-:-:-:4 MOV32I r3, 1.0; 52 | 53 | LOOP: 54 | 55 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 56 | --:-:-:-:1 IADD count, count, 1; 57 | 58 | 59 | my $out; 60 | 61 | foreach my $i (0 .. 511) 62 | { 63 | my $yield = ($i + 32) & 63 ? '-' : 'Y'; 64 | 65 | my $stall = $i == 511 ? 0 : 1; 66 | 67 | $out .= "--:-:-:$yield:$stall FFMA result, r1, r2, r3;\n"; 68 | } 69 | return $out; 70 | 71 | 72 | --:-:-:Y:5 @P0 BRA LOOP; 73 | --:-:-:-:5 EXIT; 74 | EOF 75 | 76 | close $fh; 77 | } 78 | 79 | __END__ 80 | 81 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:4:0 9 | # 1:0x144:4:0 10 | # 2:0x148:4:0 11 | 12 | 13 | 14 | 8-20 : count 15 | 16 | 17 | 18 | --:-:-:-:1 MOV R0, RZ; 19 | --:-:-:-:1 MOV R1, RZ; 20 | --:-:-:-:1 MOV R2, RZ; 21 | --:-:-:-:1 MOV R3, RZ; 22 | --:-:-:-:1 MOV R4, RZ; 23 | --:-:-:-:1 MOV R5, RZ; 24 | --:-:-:-:1 MOV R6, RZ; 25 | --:-:-:-:1 MOV R7, RZ; 26 | --:-:-:-:1 MOV R8, RZ; 27 | --:-:-:Y:6 MOV count, RZ; 28 | 29 | // This loop is capable of running 
at 1700 GFlops on GM107. 30 | // You can tweak it to see how register bank conflicts or different control codes 31 | // affect performance. 32 | // With throughput.pl you can pass params to this code and do some autotuning. 33 | LOOP: 34 | 35 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, 0x19000, PT; 36 | --:-:-:-:1 IADD count, count, 0x1; 37 | 38 | 39 | my $out; 40 | 41 | foreach my $i (0..511) #511 42 | { 43 | my $y = ($i + 32) & 63 ? '-' : 'Y'; 44 | 45 | $out .= qq| 46 | --:-:-:$y:1 FFMA R0, R1, R2, R3;|; #c[0x0][$c] 47 | } 48 | return $out; 49 | 50 | 51 | --:-:-:Y:5 @P0 BRA LOOP; 52 | 53 | --:-:-:-:5 EXIT; 54 | 55 | 56 | 57 | 58 | open my $fh, 'params.txt'; 59 | my $line = <$fh>; 60 | close $fh; 61 | my ($r1, $r2, $r3) = split "\t", $line; 62 | 63 | 80-95 : out, clocks, in, tid, clock1, clock2, result 64 | 65 | 66 | --:-:1:-:1 S2R tid, SR_TID.X; 67 | --:-:-:-:1 MOV out, c[0x0][0x140]; 68 | --:-:-:-:1 MOV clocks, c[0x0][0x144]; 69 | 01:-:-:-:1 MOV in, c[0x0][0x148]; 70 | 71 | 72 | 73 | --:-:-:-:1 MOV32I f0, 0x3f800000; 74 | --:-:-:-:1 MOV32I f1, 0x3f800000; 75 | --:-:-:-:1 MOV32I f2, 0x3f800000; 76 | --:-:-:-:5 MOV32I f3, 0x3f800000; 77 | 78 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 79 | 80 | 81 | --:-:-:-:1 CS2R clock2, SR_CLOCKLO; 82 | 83 | --:-:-:-:6 MOV32I result, 0x457; 84 | --:-:-:-:1 IADD clock1, clock2, -clock1; 85 | 86 | 87 | --:-:-:-:6 SHL tid, tid, 0x2; 88 | --:-:-:-:1 IADD clocks, clocks, tid; 89 | --:-:-:-:1 IADD out, out, tid; 90 | 91 | --:-:-:-:1 STG [clocks], clock1; 92 | --:-:-:-:1 STG [out], R24; 93 | 94 | 95 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput2.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | my %p; 4 | 5 | $p{N} = 8192; 6 | $p{blocking} = 8; 7 | $p{unroll} = 8; 8 | $p{threads} = 64; #256 9 | 10 | $p{csize} = $p{blocking} * $p{blocking}; 11 | $p{loopSize} = $p{unroll} * $p{csize}; 12 |
$p{width} = sqrt($p{csize} * $p{threads}); 13 | $p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); 14 | $p{loops} = $p{N} / $p{unroll}; 15 | $p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; 16 | 17 | my $fileName = 'throughput2.sass'; 18 | 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); 20 | 21 | #print join("\t", @params), "\n"; 22 | #print join("\t", @p{@params}), "\n"; 23 | 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; 25 | 26 | writeSassFile($fileName, $p{loopSize}, $p{loops}); 27 | 28 | #print `maxas.pl -p $fileName`; 29 | #exit; 30 | 31 | print `maxas.pl -i $fileName microbench.cubin`; 32 | 33 | exit if $?; 34 | 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; 36 | 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 38 | 39 | print $data; 40 | 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 42 | 43 | 44 | 45 | 46 | sub writeSassFile 47 | { 48 | my ($filename, $loopSize, $loops) = @_; 49 | 50 | open my $fh, ">$filename" or die "$filename: $!"; 51 | 52 | printf $fh <<'END_SASS', $loops; 53 | # Kernel: microbench 54 | 55 | 56 | 57 | 3, 2,11,10,19,18,27,26 : cx00y<00-03|64-67> 58 | 7, 6,15,14,23,22,31,30 : cx01y<00-03|64-67> 59 | 1, 0, 9, 8,17,16,25,24 : cx02y<00-03|64-67> 60 | 5, 4,13,12,21,20,29,28 : cx03y<00-03|64-67> 61 | 35,34,43,42,51,50,59,58 : cx64y<00-03|64-67> 62 | 39,38,47,46,55,54,63,62 : cx65y<00-03|64-67> 63 | 33,32,41,40,49,48,57,56 : cx66y<00-03|64-67> 64 | 37,36,45,44,53,52,61,60 : cx67y<00-03|64-67> 65 | 66 | 64-79 : j0Ax<00-03|64-67>, j0By<00-03|64-67> 67 | 80-95 : j1Ax<00-03|64-67>, j1By<00-03|64-67> 68 | 69 | 0-127 : r<0-127> 70 | 71 | 100-101 : count, stop 72 | 73 | //102-112 ~ readAs, readBs, writeS 74 | 75 | 76 | 77 | --:-:-:-:1 MOV count, RZ; 78 | --:-:-:-:1 MOV32I stop, %d; 79 | //--:-:-:-:1 MOV writeS, RZ; 80 | //--:-:-:-:1 MOV readAs, RZ; 81 | //--:-:-:-:1 MOV readBs, RZ; 82 | 83 | 84 | return join '', map 
"--:-:-:-:1 MOV32I r$_, 1.0;\n", 0..95; 85 | 86 | 87 | LOOP: 88 | 89 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 90 | --:-:-:-:1 IADD count, count, 1; 91 | 92 | 93 | my $out; 94 | 95 | 96 | my @cOrder; 97 | #my @swirl = ([0,1],[0,0],[2,0],[2,1]); 98 | my @swirl = ([2,0],[2,1],[0,1],[0,0]); 99 | #my @swirl = ([0,1],[0,0],[1,0],[1,1]); 100 | my @xVals = (0,1,64,65); 101 | #my @xVals = (0,2,64,66); 102 | 103 | my @yVals = (0,2,64,66); 104 | 105 | foreach my $y (@yVals) 106 | { 107 | foreach my $x (@xVals) 108 | { 109 | push @cOrder, sprintf('x%%02dy%%02d', $x + $_->[0], $y + $_->[1]) foreach @swirl; 110 | } 111 | @xVals = reverse @xVals; 112 | } 113 | 114 | foreach my $j (0..7) 115 | { 116 | my $odd = $j & 1; 117 | my $nOdd = !$odd + 0; 118 | 119 | my %%insert; 120 | 121 | #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; 122 | 123 | $insert{c62} = 124 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 125 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 126 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 127 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 128 | "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; 129 | 130 | foreach my $c (0 .. 63) 131 | { 132 | my ($x,$y) = $cOrder[$c] =~ /^(x\d+)(y\d+)/; 133 | my $ins = $insert{"c$c"} || ''; 134 | my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || 135 | my $yield = $c == 32 ? 'Y' : '-'; 136 | my $wait = '--'; #$c ? 
'--' : '01'; 137 | 138 | $out .= "$wait:-:-:$yield:$stall FFMA c$cOrder[$c], j${odd}A$x, j${odd}B$y, c$cOrder[$c];\n$ins"; 139 | } 140 | } 141 | return $out; 142 | 143 | 144 | --:-:-:Y:5 @P0 BRA LOOP; 145 | --:-:-:-:5 EXIT; 146 | END_SASS 147 | 148 | close $fh; 149 | } 150 | 151 | __END__ 152 | 153 | my %%insert = ( 154 | c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", 155 | c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", 156 | c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", 157 | c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", 158 | ); -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput2.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | 3 | 4 | 5 | 0-10 : result, r1, r2, r3 6 | 20-27 ~ count, stop 7 | 8 | 9 | 10 | --:-:-:-:1 MOV count, RZ; 11 | --:-:-:-:1 MOV32I stop, 102400; 12 | --:-:-:-:1 MOV32I r1, 1.0; 13 | --:-:-:-:1 MOV32I r2, 1.0; 14 | --:-:-:-:4 MOV32I r3, 1.0; 15 | 16 | LOOP: 17 | 18 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 19 | --:-:-:-:1 IADD count, count, 1; 20 | 21 | 22 | my $out; 23 | 24 | foreach my $i (0 .. 511) 25 | { 26 | my $yield = ($i + 32) & 63 ? '-' : 'Y'; 27 | 28 | my $stall = $i == 511 ? 
0 : 1; 29 | 30 | #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; 31 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 32 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 33 | #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; 34 | #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; 35 | 36 | #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; 37 | #$out .= "--:-:-:-:1 MOV result, RZ;\n"; 38 | 39 | $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; 40 | #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; 41 | #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; 42 | } 43 | return $out; 44 | 45 | 46 | --:-:-:Y:5 @P0 BRA LOOP; 47 | --:-:-:-:5 EXIT; 48 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput3.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my %data; 5 | 6 | foreach my $thread128 (1 .. 8) 7 | { 8 | foreach my $size64 (8 .. 16) 9 | { 10 | my $loopSize = $size64 * 64; 11 | my $loops = int(2 * 1638400 / ($size64 * $thread128)); 12 | 13 | my $blocks = 16; 14 | my $threads = $thread128 * 128; 15 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 16 | my $fileName = 'throughput2.sass'; 17 | 18 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $fops; 19 | #next; 20 | 21 | writeSassFile($fileName, $loopSize, $loops); 22 | 23 | `maxas.pl -i $fileName microbench.cubin`; 24 | 25 | exit if $?; 26 | 27 | my $data = `Release\\microbench.exe e $blocks $threads $fops`; 28 | 29 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 30 | 31 | printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 32 | 33 | push @{$data{$loopSize}}, $gflops; 34 | } 35 | } 36 | print join("\t", 'size', 1 .. 
8), "\n"; 37 | foreach my $loopSize (sort {$a <=> $b} keys %data) 38 | { 39 | print join("\t", $loopSize, @{$data{$loopSize}}), "\n"; 40 | } 41 | 42 | exit; 43 | # writeSassFile($filename, $loopSize, $loops): writes a maxas source file to $filename whose loop runs an unrolled body of $loopSize FFMAs and uses $loops as its stop count. Dies if the file cannot be opened. 44 | sub writeSassFile 45 | { 46 | my ($filename, $loopSize, $loops) = @_; 47 | 48 | open my $fh, ">$filename" or die "$filename: $!"; 49 | # The printf args fill, in order: the stop count, the FFMA count and the yield-threshold test. The range is written as 0 .. N - 1 because Perl ranges are inclusive and the caller's $fops assumes exactly $loopSize FFMAs per loop iteration. 50 | printf $fh <<'EOF', $loops, $loopSize, $loopSize; 51 | # Kernel: microbench 52 | 53 | 54 | 55 | 0-10 : result, r1, r2, r3, count, stop 56 | 57 | 58 | 59 | --:-:-:-:1 MOV count, RZ; 60 | --:-:-:-:1 MOV32I stop, %d; 61 | --:-:-:-:1 MOV32I r1, 1.0; 62 | --:-:-:-:1 MOV32I r2, 1.0; 63 | --:-:-:-:4 MOV32I r3, 1.0; 64 | 65 | LOOP: 66 | 67 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 68 | --:-:-:-:1 IADD count, count, 1; 69 | 70 | 71 | my $out; 72 | 73 | foreach my $i (0 .. %d - 1) 74 | { 75 | my $y = %d > 64 && (($i + 32) & 63) ? '-' : 'Y'; 76 | 77 | $out .= "--:-:-:$y:1 FFMA result, r1, r2, r3;\n"; 78 | } 79 | return $out; 80 | 81 | 82 | --:-:-:Y:5 @P0 BRA LOOP; 83 | --:-:-:-:5 EXIT; 84 | EOF 85 | 86 | close $fh; 87 | } 88 | 89 | __END__ 90 | 91 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput4.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $loopSize = 512; 5 | my $blocks = 64; 6 | my $loops = 102400; 7 | my $fileName = 'throughput2.sass'; 8 | 9 | writeSassFile($fileName, $loops); 10 | 11 | #print `maxas.pl -p $fileName`; 12 | #exit; 13 | 14 | print `maxas.pl -i $fileName microbench.cubin`; 15 | exit if $?; 16 | 17 | foreach my $thread128 (4) 18 | { 19 | my $threads = $thread128 * 128; 20 | my $fops = 2 * $loops * $loopSize * $blocks * $threads; 21 | 22 | print "./microbench e $blocks $threads $fops\n\n"; 23 | my $data = `./microbench e $blocks $threads $fops`; 24 | exit($?)
if $?; 25 | 26 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 27 | 28 | printf "%d %d %d %.2f\n", $thread128, $threads, $gflops, 100 * $gflops / 3050.0; 29 | } 30 | 31 | exit; 32 | 33 | sub writeSassFile 34 | { 35 | my ($filename, $loops) = @_; 36 | 37 | open my $fh, ">$filename" or die "$filename: $!"; 38 | 39 | printf $fh <<'EOF', $loops; 40 | # Kernel: microbench 41 | 42 | 43 | 44 | 0-10 : result, r1, r2, r3 45 | 20-27 ~ count, stop 46 | 47 | 48 | 49 | --:-:-:-:1 MOV count, RZ; 50 | --:-:-:-:1 MOV32I stop, %d; 51 | --:-:-:-:1 MOV32I r1, 1.0; 52 | --:-:-:-:1 MOV32I r2, 1.0; 53 | --:-:-:-:4 MOV32I r3, 1.0; 54 | 55 | LOOP: 56 | 57 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 58 | --:-:-:-:1 IADD count, count, 1; 59 | 60 | 61 | my $out; 62 | 63 | foreach my $i (0 .. 511) 64 | { 65 | my $yield = ($i + 32) & 63 ? '-' : 'Y'; 66 | 67 | my $stall = $i == 511 ? 0 : 1; 68 | 69 | #$out .= "--:-:-:$yield:1 FFMA r3, r1, r2, r3;\n"; 70 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 71 | #$out .= "--:-:-:-:1 FFMA r3, r1, r2, r3;\n"; 72 | #$out .= "--:-:-:-:0 FFMA r3, r1, r2, r3;\n"; 73 | #$out .= "--:-:-:-:1 I2F.F32.S16 result, r1;\n"; 74 | 75 | #$out .= "--:-:-:$yield:$stall VADD.S16.S16.SAT.MRG_16L result, r1, r2, RZ;\n"; 76 | #$out .= "--:-:-:-:1 MOV result, RZ;\n"; 77 | 78 | $out .= "--:-:-:$yield:$stall IADD.SAT result, r1, r2;\n"; 79 | #$out .= "--:-:-:$yield:$stall VMAD.S8.S8.SAT result, r1, r2, r3;\n"; 80 | #$out .= "--:-:-:$yield:$stall XMAD result, r1, r2, r3;\n"; 81 | } 82 | return $out; 83 | 84 | 85 | --:-:-:Y:5 @P0 BRA LOOP; 86 | --:-:-:-:5 EXIT; 87 | EOF 88 | 89 | close $fh; 90 | } 91 | 92 | __END__ 93 | 94 | VMAD.U8.U8 95 | 96 | dddd 2655 / 4968 = 53.4% 97 | 1d1d 4594 / 4968 = 92.4% 98 | 11d 4746 / 4968 = 95.5% 99 | 111d 4841 / 4968 = 97.4% 100 | 101 | block context switches are a little more expensive than thread context switches 102 | 103 | stall codes: 104 | 105 | f : 13 clocks 106 | e : 8 clocks 107 | d : 6 clocks 108 | c : 8 clocks, no yield 109 | b 
: 11 clocks 110 | a : 10 clocks 111 | 9 : 9 clocks 112 | 8 : 8 clocks 113 | 7 : 7 clocks 114 | 6 : 6 clocks 115 | 5 : 5 clocks 116 | 4 : 4 clocks 117 | 3 : 3 clocks 118 | 2 : 2 clocks 119 | 1 : 1 clocks, no yield 120 | 0 : 0 clocks, no yield, dual issue -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/throughput5.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | my %p; 4 | 5 | $p{N} = 8192; 6 | $p{blocking} = 8; 7 | $p{unroll} = 8; 8 | $p{threads} = 64; #256 9 | 10 | $p{csize} = $p{blocking} * $p{blocking}; 11 | $p{loopSize} = $p{unroll} * $p{csize}; 12 | $p{width} = sqrt($p{csize} * $p{threads}); 13 | $p{blocks} = ($p{N} / $p{width}) * ($p{N} / $p{width}); 14 | $p{loops} = $p{N} / $p{unroll}; 15 | $p{fops} = 2 * $p{loops} * $p{loopSize} * $p{blocks} * $p{threads}; 16 | 17 | my $fileName = 'throughput2.sass'; 18 | 19 | my @params = qw(N blocking unroll threads csize loopSize loops width blocks fops); 20 | 21 | #print join("\t", @params), "\n"; 22 | #print join("\t", @p{@params}), "\n"; 23 | 24 | print map sprintf("%-9s: %d\n", $_, $p{$_}), @params; 25 | 26 | writeSassFile($fileName, $p{loopSize}, $p{loops}); 27 | 28 | #print `maxas.pl -p $fileName`; 29 | #exit; 30 | 31 | print `maxas.pl -i $fileName microbench.cubin`; 32 | 33 | exit if $?; 34 | 35 | my $data = `Release\\microbench.exe e $p{blocks} $p{threads} $p{fops} 50`; 36 | 37 | my ($gflops) = $data =~ /GFLOPS: ([0-9]+)/ms; 38 | 39 | print $data; 40 | 41 | #printf "%d %4d %4d %d\n", $thread128, $loopSize, $loops, $gflops; 42 | 43 | 44 | 45 | 46 | sub writeSassFile 47 | { 48 | my ($filename, $loopSize, $loops) = @_; 49 | 50 | open my $fh, ">$filename" or die "$filename: $!"; 51 | 52 | printf $fh <<'END_SASS', $loops; 53 | # Kernel: microbench 54 | 55 | 56 | 57 | 1, 9, 2,10,17,25,18,26 : cy0x<0-7> 58 | 5,13, 6,14,21,29,22,30 : cy1x<0-7> 59 | 3,11, 0, 8,19,27,16,24 : cy2x<0-7> 
60 | 7,15, 4,12,23,31,20,28 : cy3x<0-7> 61 | 35,43,32,40,51,59,48,56 : cy4x<0-7> 62 | 39,47,36,44,55,63,52,60 : cy5x<0-7> 63 | 33,41,34,42,49,57,50,58 : cy6x<0-7> 64 | 37,45,38,46,53,61,54,62 : cy7x<0-7> 65 | 66 | 64-71 : j0Ax<0-3>, j0By<0-3> 67 | 72-79 : j1Ax<0-3>, j1By<0-3> 68 | 69 | 0-79 : r<0-79> 70 | 71 | 100-101 : count, stop 72 | 73 | //102-112 ~ readAs, readBs, writeS 74 | 75 | 76 | 77 | --:-:-:-:1 MOV count, RZ; 78 | --:-:-:-:1 MOV32I stop, %d; 79 | //--:-:-:-:1 MOV writeS, RZ; 80 | //--:-:-:-:1 MOV readAs, RZ; 81 | //--:-:-:-:1 MOV readBs, RZ; 82 | 83 | 84 | return join '', map "--:-:-:-:1 MOV r$_, RZ;\n", 0..63; 85 | 86 | 87 | 88 | return join '', map "--:-:-:-:1 MOV32I r$_, 0x00010001;\n", 64..79; 89 | 90 | 91 | LOOP: 92 | 93 | --:-:-:-:1 ISETP.LE.AND P0, PT, count, stop, PT; 94 | --:-:-:-:1 IADD count, count, 1; 95 | 96 | 97 | my $out; 98 | 99 | my @swirl1 = ([0,0],[0,4],[4,4],[4,0]); 100 | my @swirl2 = ([0,0],[1,0],[1,1],[0,1]); 101 | my @swirl3 = ([0,2],[2,2],[2,0],[0,0]); 102 | 103 | my @cOrder; 104 | foreach my $s1 (@swirl1) 105 | { 106 | foreach my $s2 (@swirl2) 107 | { 108 | foreach my $s3 (@swirl3) 109 | { 110 | push @cOrder, [$s1->[0] + $s2->[0] + $s3->[0], $s1->[1] + $s2->[1] + $s3->[1]]; 111 | } 112 | } 113 | } 114 | 115 | foreach my $j (0..7) 116 | { 117 | my $odd = $j & 1; 118 | my $nOdd = !$odd + 0; 119 | 120 | my %%insert; 121 | 122 | #$insert{c62} = "01:-:-:-:5 BAR.SYNC 0;\n" if $j == 6; 123 | 124 | $insert{c62} = 125 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 126 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 127 | "--:-:-:-:1 LOP.XOR readAs, readAs, 0;\n" . 128 | "--:-:-:-:1 LOP.XOR readBs, readBs, 0;\n" . 129 | "--:-:-:-:1 LOP.XOR writeS, writeS, 0;\n" if $j == 8; 130 | 131 | foreach my $c (0 .. 63) 132 | { 133 | my ($x,$y) = @{$cOrder[$c]}; 134 | my $ins = $insert{"c$c"} || ''; 135 | my $stall = ($c == 63 && $j == 7) ? 0 : 1; #1; #$ins || 136 | my $yield = $c == 32 ? 'Y' : '-'; 137 | my $wait = '--'; #$c ? 
'--' : '01'; 138 | 139 | my $xReg = $x >> 1; 140 | my $yReg = $y >> 1; 141 | my $xPart = $x & 1 ? '.H1' : ''; 142 | my $yPart = $y & 1 ? '.H1' : ''; 143 | 144 | $out .= sprintf "$wait:-:-:$yield:$stall XMAD cy%%dx%%d, j%%dAx%%d%%s, j%%dBy%%d%%s, cy%%dx%%d;\n%%s", $y,$x, $odd,$xReg,$xPart, $odd,$yReg,$yPart, $y,$x, $ins; 145 | } 146 | } 147 | return $out; 148 | 149 | 150 | --:-:-:Y:5 @P0 BRA LOOP; 151 | --:-:-:-:5 EXIT; 152 | END_SASS 153 | 154 | close $fh; 155 | } 156 | 157 | __END__ 158 | 159 | my %%insert = ( 160 | c0 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax00, [readAs+0x10];\n", 161 | c2 => "--:-:-:-:1 LDS.U.128 j${nOdd}By00, [readBs+0x10];\n", 162 | c4 => "--:-:-:-:1 LDS.U.128 j${nOdd}Ax64, [readAs+0x10];\n", 163 | c6 => "--:-:1:-:1 LDS.U.128 j${nOdd}By64, [readBs+0x10];\n", 164 | ); -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/xmad.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | print `maxas.pl -i xmad2.sass microbench.cubin`; 5 | 6 | exit if $?; 7 | 8 | print `./microbench i 1 128`; 9 | 10 | 11 | __END__ 12 | 13 | -------------------------------------------------------------------------------- /Assembler/PascalAs/microbench/xmad2.sass: -------------------------------------------------------------------------------- 1 | # Kernel: microbench 2 | # InsCnt: 18 3 | # RegCnt: 5 4 | # SharedSize: 4096 5 | # BarCnt: 1 6 | # Params(3): 7 | # ord:addr:size:align 8 | # 0:0x140:8:0 9 | # 1:0x148:8:0 10 | # 2:0x150:8:0 11 | # 12 | # Instructions: 13 | 14 | 15 | blockDimX : c[0x0][0x8] 16 | blockDimY : c[0x0][0xc] 17 | blockDimZ : c[0x0][0x10] 18 | gridDimX : c[0x0][0x14] 19 | gridDimY : c[0x0][0x18] 20 | gridDimZ : c[0x0][0x1c] 21 | 22 | param_out[0] : c[0x0][0x140] 23 | param_out[1] : c[0x0][0x144] 24 | param_clocks[0] : c[0x0][0x148] 25 | param_clocks[1] : c[0x0][0x14c] 26 | param_in[0] : c[0x0][0x150] 27 | param_in[1] : 
c[0x0][0x154] 28 | 29 | 30 | 31 | 32 | 0-1 : out<0-1> 33 | 2-3 : clocks<0-1> 34 | 4-15 : result, result2, tid, bid, blockDim, clock1, clock2, scale, s 35 | 16-24 : a, b, c, x 36 | 37 | 38 | 39 | // Load in our params 40 | --:-:-:-:1 MOV out0, param_out[0]; 41 | --:-:-:-:1 MOV out1, param_out[1]; 42 | --:-:-:-:1 MOV clocks0, param_clocks[0]; 43 | --:-:-:-:1 MOV clocks1, param_clocks[1]; 44 | //--:-:-:-:1 MOV in, c[0x0][0x148]; 45 | --:-:-:-:1 MOV blockDim, blockDimX; 46 | 47 | --:-:-:-:1 PSETP.AND.AND P0, PT, !PT, PT, PT; 48 | 49 | --:-:-:-:6 MOV32I result, 0xffffffff; 50 | --:-:-:-:6 MOV32I result2, 0x0; 51 | --:-:-:-:1 MOV32I a, 1; 52 | --:-:-:-:1 MOV32I b, 1; 53 | --:-:-:-:6 MOV32I c, 0x0; 54 | 55 | // (127 - scale) << 23 56 | //--:-:-:-:6 MOV32I scale, 28; 57 | //--:-:-:-:6 IADD scale, -scale, 127; 58 | //--:-:-:-:6 SHL scale, scale, 23; 59 | 60 | 61 | //--:-:-:-:6 MOV32I c, 0x4f765432; 62 | 63 | //--:-:1:-:2 LDG.CI.128 a, [in]; 64 | 65 | //01:-:-:-:6 VMAD.S16.S16 result, a, b, c; 66 | 67 | //--:-:-:-:6 MOV result, a; 68 | 69 | // a >> 16 | (b & 0xffff0000) 70 | 71 | //--:-:-:-:6 SHR.U32 result, a, 16; 72 | //--:-:-:-:6 LOP3.LUT result, result, b, c, 0xf8; 73 | 74 | //--:-:-:-:6 I2I.S32.S16 result, a.H1; 75 | 76 | //--:-:-:Y:d IADD result.CC, a, -c; 77 | //--:-:-:Y:2 IADD.X result2, b, -RZ; 78 | 79 | //--:-:-:-:6 SHR result, a, 1; 80 | 81 | //--:-:-:-:6 BFI result, b, 0x1010, a; 82 | 83 | --:-:-:-:1 CS2R clock1, SR_CLOCKLO; 84 | 85 | //--:-:-:-:6 XMAD.S16.S16 c, a, b, RZ; 86 | //--:-:-:-:6 ISET.LT.AND s, c, RZ, PT; 87 | //--:-:-:-:6 IADD result.CC, c, result; 88 | //--:-:-:-:6 IADD.X result2, s, result2; 89 | 90 | //--:-:-:-:6 XMAD.S16.S16 result.CC, a, b, result; 91 | //--:-:-:-:6 IADD.X result2, result2, RZ; 92 | 93 | //--:-:-:-:6 SHF.R.S64 result, result, 1, result2; 94 | //--:-:-:-:6 MOV32I result2, 0; 95 | 96 | --:-:-:-:f LOP.AND.NZ P0, RZ, result, 1; 97 | 98 | --:-:-:-:6 @P0 VADD.S16.S16.SAT.MRG_16H result, a, b, result; 99 | 100 | //--:-:1:-:d I2F.F32.S32 
result2, a; 101 | //01:-:-:-:6 FMUL result2, result2, scale; 102 | //01:-:2:-:d F2I.S32.F32 result, result2; 103 | 104 | 02:-:-:-:6 CS2R clock2, SR_CLOCKLO; 105 | 106 | //F2I = "^$pred?F2I$ftz$x2x$round $r0, $cr20;" 107 | //I2F = "^$pred?I2F$x2x$rnd $r0, $cr20;" 108 | //x2x = "\.(?F|U|S)(?8|16|32|64)\.(?F|U|S)(?8|16|32|64)" 109 | //rnd = "(?:\.(?RN|RM|RP|RZ))?" 110 | //round = "(?:\.(?ROUND|FLOOR|CEIL|TRUNC))?" 111 | //r8 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 112 | //r20 = qr"(?\-)?(?\|)?(?$reg)\|?(?:\.(?H0|H1|B1|B2|B3))?(?\.reuse)?" 113 | 114 | 115 | //--:-:-:-:1 XMAD.MRG x, a, b.H1, RZ; 116 | //--:-:-:-:6 XMAD result, a.H1, b.H1, c; 117 | //--:-:-:-:1 XMAD.PSL.CBCC result, a.H1, x.H1, result; 118 | 119 | // Get the first clock value 120 | 121 | --:-:1:-:1 S2R tid, SR_TID.X; 122 | --:-:2:-:2 S2R bid, SR_CTAID.X; 123 | 124 | 125 | 126 | // Take the difference of clocks 127 | --:-:-:-:1 IADD clock1, clock2, -clock1; 128 | 129 | // Setup our output addresses 130 | // Stall your pipeline dependencies properly 131 | 03:-:-:-:1 XMAD tid, blockDim, bid, tid; 132 | --:-:-:Y:6 XMAD.MRG x, blockDim, bid.H1, RZ; 133 | --:-:-:Y:6 XMAD.PSL.CBCC tid, blockDim.H1, x.H1, tid; 134 | --:-:-:Y:6 SHL tid, tid, 0x2; 135 | 136 | --:-:-:-:1 IADD clocks, clocks, tid; 137 | --:-:-:-:1 IADD out, out, tid; 138 | 139 | // Output the results. 
140 | // No stall needed on prior instruction as memory store instructions have a 5 clock delay in picking up register values 141 | --:-:-:-:1 STG.E [clocks], result2; 142 | --:-:-:-:1 STG.E [out], result; 143 | --:-:-:-:5 EXIT; 144 | 145 | -------------------------------------------------------------------------------- /Assembler/PascalAs/pm_to_blib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/pm_to_blib -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/batched_gemm.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/batched_gemm.xlsx -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/cublas_sgemm.ptx: -------------------------------------------------------------------------------- 1 | .version 4.1 2 | .target sm_50 3 | .address_size 64 4 | 5 | // ptxas -v -arch=sm_50 -m 32 --opt-level 4 -o cublas_sgemm.cubin cublas_sgemm.ptx 6 | 7 | // You can use maxas to insert cublas_device.lib code into a cubin built from this ptx: 8 | 9 | // From C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.5\lib\Win32\cublas_device.lib 10 | 11 | // cuobjdump -lelf cublas_device.lib | find "sm_50" 12 | 13 | // cuobjdump -xelf maxwell_sgemm.asm.sm_50.cubin cublas_device.lib 14 | 15 | // maxas -l maxwell_sgemm.asm.sm_50.cubin 16 | 17 | // maxas -e -k maxwell_sgemm_128x128_nt maxwell_sgemm_128x128_nt.sass 18 | // maxas -e -k maxwell_sgemm_128x64_nt maxwell_sgemm_128x64_nt.sass 19 | 20 | // maxas -i maxwell_sgemm_128x128_nt.sass cublas_sgemm.cubin 21 | // maxas -i maxwell_sgemm_128x64_nt.sass cublas_sgemm.cubin 22 | 23 | // The sgemm.cpp code makes use of this cubin to 
benchmark the kernels outside of cublas. 24 | 25 | .visible .entry maxwell_sgemm_128x128_nt( 26 | .param .u64 .ptr.global.align 8 param_A, 27 | .param .u64 .ptr.global.align 8 param_B, 28 | .param .u64 .ptr.global.align 8 param_C, 29 | .param .s32 param_lda, 30 | .param .s32 param_ldb, 31 | .param .s32 param_ldc, 32 | .param .s32 param_k, 33 | .param .u64 .ptr.global.align 8 param_Alpha, 34 | .param .u64 .ptr.global.align 8 param_Beta, 35 | .param .s32 param_alpha, 36 | .param .s32 param_beta, 37 | .param .s32 param_flag 38 | ) 39 | .reqntid 256 40 | { 41 | .shared .align 16 .b8 share[16384]; 42 | 43 | ret; 44 | } 45 | 46 | .visible .entry maxwell_sgemm_128x64_nt( 47 | .param .u64 .ptr.global.align 8 param_A, 48 | .param .u64 .ptr.global.align 8 param_B, 49 | .param .u64 .ptr.global.align 8 param_C, 50 | .param .s32 param_lda, 51 | .param .s32 param_ldb, 52 | .param .s32 param_ldc, 53 | .param .s32 param_k, 54 | .param .u64 .ptr.global.align 8 param_Alpha, 55 | .param .u64 .ptr.global.align 8 param_Beta, 56 | .param .s32 param_alpha, 57 | .param .s32 param_beta, 58 | .param .s32 param_flag 59 | ) 60 | .reqntid 128 61 | { 62 | .shared .align 16 .b8 share[12288]; 63 | 64 | ret; 65 | } 66 | -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/new.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/new.cubin -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/sgemm.cu: -------------------------------------------------------------------------------- 1 | 2 | // Note this file isn't configured to automatically compile. 3 | // Here's how: 4 | 5 | // If you want to look at the ptx first: 6 | // nvcc -arch sm_50 -m 32 -ptx sgemm.cu 7 | 8 | // Manually compile your kernel to a cubin. 
9 | // You should only have to do this once, unless you change params or shared size or globals: 10 | // nvcc -arch sm_50 -m 32 -cubin sgemm.cu 11 | 12 | // If tweaking a kernel or writing a new one based on this shell code you would then do this: 13 | // maxas.pl -e kernel.cubin kernel.sass 14 | 15 | // I've already included a modified kernel (sgemm.sass) so the next step is.. 16 | 17 | // Splice the manually assembled code back into the cubin: 18 | // maxas.pl -i sgemm.sass sgemm.cubin 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | typedef texture floatTex; 26 | 27 | floatTex texA(0, cudaFilterModePoint, cudaAddressModeBorder); 28 | floatTex texB(0, cudaFilterModePoint, cudaAddressModeBorder); 29 | 30 | // Use extern C so C++ doesn't mangle our kernel name 31 | extern "C" 32 | // This kernel requires 256x1x1 threads per block 33 | __global__ void __launch_bounds__(256) sgemm_kernel_128( 34 | float *C, 35 | const int m, const int n, const int k, 36 | const int lda, const int ldb, const int ldc, 37 | float alpha, int *D) 38 | { 39 | // Declare any shared memory your kernel requires 40 | // Or you could just pass the amount in as a param to cuLaunchKernel 41 | __shared__ float4 share[1024]; 42 | 43 | int tid = threadIdx.x; 44 | 45 | // If you use indirect texture references, they will be passed as params at the end of the param list 46 | // So set that up here to make sure they're available in your kernel 47 | floatTex tex = tid > 127 ? texB : texA; 48 | 49 | // Make use of shared and your textures so it doesn't get optimized away 50 | share[tid] = tex1Dfetch(tex, tid); 51 | 52 | __syncthreads(); 53 | 54 | // output something so your setup isn't optimized away. 
55 | C[tid] = share[255-tid].x; 56 | } 57 | 58 | extern "C" 59 | __global__ void __launch_bounds__(64) sgemm_kernel_64( 60 | float *C, 61 | const int m, const int n, const int k, 62 | const int lda, const int ldb, const int ldc, 63 | float alpha, int *D) 64 | { 65 | __shared__ float4 share[512]; 66 | 67 | int tid = threadIdx.x; 68 | 69 | floatTex tex = tid > 127 ? texB : texA; 70 | 71 | share[tid] = tex1Dfetch(tex, tid); 72 | 73 | __syncthreads(); 74 | 75 | C[tid] = share[255-tid].x; 76 | } 77 | 78 | // A note about using the Cuda Runtime. 79 | // If that's your preference over the driver API then here's what you'd do: 80 | 81 | // In your project properties in the Cuda C/C++ panel: 82 | // -Set the "Keep Processed Files" (-keep) option 83 | // -Add a -v manually to the command line 84 | // If compiling on command line just add -keep -v options to nvcc. 85 | // Rebuild your solution and look in the log for these lines that follow the ptxas step: 86 | 87 | // #$ fatbinary --create="Release/kernel.fatbin" -32 --key="a7bce87544c2a492" --ident="C:/Users/Scott/Documents/sgemm6/sgemm6/kernel.cu" --cmdline="-v --opt-level 4 --generate-line-info " "--image=profile=sm_50,file=Release/kernel.sm_50.cubin" "--image=profile=compute_50,file=Release/kernel.ptx" --embedded-fatbin="Release/kernel.fatbin.c" --cuda 88 | // #$ cl.exe @Release/kernel.cu.cpp.ii.res > "Release/kernel.cu.cpp.ii" 89 | // #$ cl.exe @Release/kernel.cu.obj.res -Fo"Release/kernel.cu.obj" 90 | 91 | // You just need to manually run these 3 commands (or add them to a build script) 92 | // after you've modified the cubin generated from the preceding ptxas command. 93 | // That will give you a new .cu.obj file which will automatically be linked in for you next time you 94 | // build your project (or you could manually run the linker step as well). 95 | 96 | // Having done that you can call your kernel normally using the <<< >>> syntax. 
97 | // Debugging will have to be with the sass syntax but that's what you'll want to see anyway. 98 | // With fatbin you can also keep non-maxwell optimized versions of your code. 99 | 100 | 101 | // I just discovered this also works as a shortcut to the above: 102 | // nvcc -lib -arch sm_52 -m 32 -use-cubin code=sm_52,cubin=sgemm.cubin -o sgemm.lib sgemm.cu 103 | 104 | // The cu kernel definitions above need to have empty bodies. 105 | // And, the cu file must be compiled to a lib separately before linking. -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/sgemm.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/sgemm.cubin -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/sgemm.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | 4 | my $CU_AD_FORMAT_UNSIGNED_INT8 = 0x01; 5 | my $CU_AD_FORMAT_UNSIGNED_INT16 = 0x02; 6 | my $CU_AD_FORMAT_FLOAT = 0x20; 7 | 8 | if (!-f 'sgemm_pre_128.sass' || (stat 'sgemm128.sass')[9] > (stat 'sgemm_pre_128.sass')[9]) 9 | { 10 | print `maxas.pl -p sgemm128.sass sgemm_pre_128.sass`; 11 | exit if $?; 12 | print `maxas.pl -i sgemm128.sass sgemm.cubin`; 13 | exit if $?; 14 | print `maxas.pl -e -k sgemm_kernel_128 sgemm.cubin sgemm_final_128.sass`; 15 | } 16 | if (!-f 'sgemm_pre_64.sass' || (stat 'sgemm64.sass')[9] > (stat 'sgemm_pre_64.sass')[9]) 17 | { 18 | print `maxas.pl -p sgemm64.sass sgemm_pre_64.sass`; 19 | exit if $?; 20 | print `maxas.pl -i sgemm64.sass sgemm.cubin`; 21 | exit if $?; 22 | print `maxas.pl -e -k sgemm_kernel_64 sgemm.cubin sgemm_final_64.sass`; 23 | } 24 | 25 | #print `Release\\sgemm.exe $_ 20` foreach (80,60,40,30,20,10,9,8,7,6,5,4,3,2); 26 | 27 | `Release\\sgemm.exe 64 5 
$CU_AD_FORMAT_FLOAT`; 28 | 29 | print `Release\\sgemm.exe 64 20 $CU_AD_FORMAT_UNSIGNED_INT8`; 30 | exit; 31 | 32 | my %data; 33 | foreach my $thread128 (4 .. 64) 34 | { 35 | my $N = $thread128 * 128; 36 | 37 | my $iterations = int(20 * (64 * 128)**3 / $N**3); 38 | $iterations = 10000 if $iterations > 10000; 39 | 40 | print "$N $iterations\n"; 41 | 42 | my $data = `Release\\sgemm.exe $thread128 $iterations $CU_AD_FORMAT_UNSIGNED_INT16`; 43 | 44 | foreach my $bench (split "\n", $data) 45 | { 46 | if ($bench =~ /^(\w+)\s+GFLOPS: ([0-9.]+) /) 47 | { 48 | push @{$data{$N}}, $2; 49 | print "$1 $2\n"; 50 | } 51 | } 52 | } 53 | print join("\t", qw(size Max64 Max128 Cub64 Cub128)), "\n"; 54 | 55 | foreach my $N (sort { $a <=> $b } keys %data) 56 | { 57 | print join("\t", @{$data{$N}}), "\n"; 58 | } 59 | 60 | 61 | #print $data; 62 | 63 | __END__ 64 | 65 | 66 | 64 * 128 * 16 * 1.620 * .931 / 520 67 | 68 | Max64 GFLOPS: 1377.38 (size: 256, iterations: 2000) 69 | Max128 GFLOPS: 973.70 (size: 256, iterations: 2000) 70 | Cub64 GFLOPS: 1272.42 (size: 256, iterations: 2000) 71 | Cub128 GFLOPS: 948.15 (size: 256, iterations: 2000) 72 | 73 | my @data = grep /\S/, split "\n", $data; 74 | 75 | my $min; 76 | my %smData; 77 | my @sdata; 78 | foreach (@data) 79 | { 80 | next if /GFLOPS/; 81 | 82 | my ($sm, $clock, $by, $bx) = split /\s+/; 83 | 84 | $smData{$sm} = $clock if !$smData{$sm} || $clock < $smData{$sm}; 85 | 86 | $min = $clock if !$min || $clock < $min; 87 | 88 | push @sdata, [$sm, $clock, $by, $bx]; 89 | } 90 | 91 | foreach (@sdata) 92 | { 93 | $_->[1] -= $smData{$_->[0]}; 94 | } 95 | 96 | foreach (sort {$a->[1] <=> $b->[1] || $a->[0] <=> $b->[0]} @sdata) 97 | { 98 | printf "%02d %8u by: %2d bx: %2d\n", @$_; 99 | 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/sgemm.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution 
File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "sgemm", "sgemm.vcxproj", "{D571379D-3653-43CB-BE83-A6C68D392A05}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|Win32 = Debug|Win32 9 | Release|Win32 = Release|Win32 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.ActiveCfg = Debug|Win32 13 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Debug|Win32.Build.0 = Debug|Win32 14 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.ActiveCfg = Release|Win32 15 | {D571379D-3653-43CB-BE83-A6C68D392A05}.Release|Win32.Build.0 = Release|Win32 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/sgemm.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {D571379D-3653-43CB-BE83-A6C68D392A05} 15 | Win32Proj 16 | sgemm 17 | 18 | 19 | 20 | Application 21 | true 22 | Unicode 23 | 24 | 25 | Application 26 | false 27 | true 28 | Unicode 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | true 42 | 43 | 44 | false 45 | 46 | 47 | 48 | 49 | 50 | Level3 51 | Disabled 52 | _CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) 53 | $(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) 54 | 55 | 56 | Console 57 | true 58 | $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) 59 | cuda.lib;cublas.lib;%(AdditionalDependencies) 60 | 61 | 62 | 63 | 64 | Level3 65 | 66 | 67 | MaxSpeed 68 | true 69 | true 70 | _CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) 71 | 
$(CUDA_PATH_V6_5)\include;%(AdditionalIncludeDirectories) 72 | 73 | 74 | Console 75 | true 76 | true 77 | true 78 | $(CUDA_PATH_V6_5)\lib\Win32;%(AdditionalLibraryDirectories) 79 | cuda.lib;cublas.lib;%(AdditionalDependencies) 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Assembler/PascalAs/sgemm/sgemm_sm52_64.cubin -------------------------------------------------------------------------------- /Assembler/PascalAs/t/MaxAs-MaxAs.t: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | 4 | use Test::More tests => 1; 5 | BEGIN { use_ok('MaxAs::MaxAs') }; 6 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/Makefile: -------------------------------------------------------------------------------- 1 | BINS := sconv_fprop_K64_N64 sconv_bprop_C64_N64 sconv_update_C128_K128 \ 2 | sconv_bprop_C1_N64 sconv_fprop_K128_N128 sconv_bprop_C128_N128 3 | TARGETS := $(addsuffix .cubin, $(BINS)) 4 | TEMPLATES := $(addsuffix _template.cubin, $(BINS)) 5 | 6 | all: $(BINS) sconv_fprop sconv_bprop sconv_update 7 | 8 | $(BINS): 9 | nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin 10 | KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin 11 | 12 | sconv_fprop: sconv_fprop.cu 13 | nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart 14 | 15 | sconv_bprop: sconv_bprop.cu 16 | nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart 17 | 18 | sconv_update: sconv_update.cu 19 | nvcc -arch sm_35 -o $@ $^ -lcuda -lcudart 20 | 21 | clean: 22 | rm $(TARGETS) $(TEMPLATES) sconv_fprop sconv_bprop sconv_update 23 | 24 | .PHONY: 25 | all clean 26 | 27 | #utils 28 | print-% : ; 
$(info $* is $(flavor $*) variable set to [$($*)]) @true 29 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | std::map nervana_kernels; 11 | std::vector nervana_modules; 12 | 13 | int len_d2b(int n) { 14 | int i, j = 0; 15 | i = n; 16 | while (i) { 17 | i /= 2; 18 | j++; 19 | } 20 | return j; 21 | } 22 | 23 | void magic32(unsigned int nmax, unsigned int d, unsigned int& m, unsigned int& p) { 24 | long nc = ((nmax + 1) / d) * d - 1; 25 | long nbits = len_d2b(nmax); 26 | std::cout << "nbits " << nbits << std::endl; 27 | for(p = 0; p < 2 * nbits + 1; p++) { 28 | if(pow(2, p) > nc * (d - 1 - (long)(pow(2, p) - 1) % d)) { 29 | m = (pow(2, p) + d - 1 -(long)(pow(2, p) - 1) % d) / d; 30 | std::cout << "m " << m << std::endl; 31 | std::cout << "p " << p << std::endl; 32 | return; 33 | } 34 | } 35 | return; 36 | } 37 | 38 | void magic64(unsigned int d, unsigned int& magic, unsigned int& shift) { 39 | // 3 is a special case that only ends up in the high bits 40 | // if the nmax is 0xffffffff 41 | // we can't use 0xffffffff for all cases as some return a 33 bit 42 | // magic number 43 | unsigned long nmax; 44 | if(d == 3) 45 | nmax = 0xffffffff; 46 | else 47 | nmax = 0x7fffffff; 48 | magic32(nmax, d, magic, shift); 49 | if(magic != 1) 50 | shift -= 32; 51 | } 52 | 53 | bool load_kernels(const char* const base_path_cstr) { 54 | //better would be a vector, but there is a bug in nvcc that prevents this 55 | // (bug report filed) 56 | const int NUM_KERNELS = 6; 57 | std::string names[NUM_KERNELS] = { 58 | "sconv_fprop_K64_N64", 59 | "sconv_fprop_K128_N128", 60 | "sconv_bprop_C128_N128", 61 | "sconv_bprop_C64_N64", 62 | "sconv_bprop_C1_N64", 63 | "sconv_update_C128_K128" 64 | }; 65 | 66 | std::string 
base_path(base_path_cstr); 67 | 68 | for (int i = 0; i < NUM_KERNELS; ++i) { 69 | std::string kernel = names[i]; 70 | if (nervana_kernels.count(kernel) > 0) 71 | continue; 72 | 73 | CUmodule module; 74 | 75 | std::string path = base_path + kernel + std::string(".cubin"); 76 | CUresult res = cuModuleLoad(&module, path.c_str()); 77 | 78 | if (res != CUDA_SUCCESS) { 79 | std::cerr << "Failed to load: " << kernel << " " << res << std::endl; 80 | return false; 81 | } 82 | 83 | nervana_modules.push_back(module); 84 | 85 | CUfunction function; 86 | res = cuModuleGetFunction(&function, module, kernel.c_str()); 87 | if (res != CUDA_SUCCESS) { 88 | std::cerr << "Failed to extract: " << kernel << " " << res << std::endl; 89 | return false; 90 | } 91 | 92 | nervana_kernels.insert(std::make_pair(kernel, function)); 93 | } 94 | 95 | return true; 96 | } 97 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_bprop_C128_N128.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void sconv_bprop_C128_N128 ( 3 | float* param_test, 4 | float* param_O, 5 | const float* param_I, 6 | const float* param_F, 7 | float param_alpha, 8 | int param_N, 9 | int param_K, 10 | int param_D, 11 | int param_H, 12 | int param_W, 13 | int param_WN, 14 | int param_HWN, 15 | int param_DHWN, 16 | int param_C, 17 | int param_CRST, 18 | int param_RST, 19 | int param_RS, 20 | int param_magic_RS, 21 | int param_shift_RS, 22 | int param_S, 23 | int param_magic_S, 24 | int param_shift_S, 25 | int param_pad_d, 26 | int param_pad_h, 27 | int param_pad_w, 28 | int param_str_d, 29 | int param_str_h, 30 | int param_str_w, 31 | int param_Q, 32 | int param_PQ, 33 | int param_QN, 34 | int param_PQN, 35 | int param_MPQN, 36 | int param_magic_Q, 37 | int param_shift_Q, 38 | int param_magic_PQ, 39 | int param_shift_PQ, 40 | int param_R, 41 | int param_T, 42 | int param_magic_str_w, 43 | int 
param_shift_str_w, 44 | int param_magic_str_h, 45 | int param_shift_str_h, 46 | int param_magic_str_d, 47 | int param_shift_str_d) { 48 | __shared__ float share[128 * 8 * 4 + 8]; 49 | 50 | int tid = threadIdx.x; 51 | 52 | share[tid] = 1; 53 | 54 | *param_O = share[127-tid]; 55 | *param_test = share[127-tid]; 56 | } 57 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_bprop_C1_N64.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void sconv_bprop_C1_N64 ( 3 | float* param_test, 4 | float* param_I, 5 | const float* param_F, 6 | const float* param_E, 7 | float param_alpha, 8 | int param_N, 9 | int param_K, 10 | int param_D, 11 | int param_H, 12 | int param_W, 13 | int param_WN, 14 | int param_HWN, 15 | int param_DHWN, 16 | int param_C, 17 | int param_CRST, 18 | int param_RST, 19 | int param_magic_RST, 20 | int param_shift_RST, 21 | int param_RS, 22 | int param_magic_RS, 23 | int param_shift_RS, 24 | int param_S, 25 | int param_magic_S, 26 | int param_shift_S, 27 | int param_pad_d, 28 | int param_pad_h, 29 | int param_pad_w, 30 | int param_str_d, 31 | int param_str_h, 32 | int param_str_w, 33 | int param_Q, 34 | int param_PQ, 35 | int param_QN, 36 | int param_PQN, 37 | int param_MPQN, 38 | int param_magic_Q, 39 | int param_shift_Q, 40 | int param_magic_PQ, 41 | int param_shift_PQ, 42 | int param_CRST8, 43 | int param_MPQN8) { 44 | __shared__ float shared[64 * 8 * 4 * 2]; 45 | 46 | int tid = threadIdx.x; 47 | 48 | shared[tid] = 1; 49 | 50 | *param_I = shared[31 - tid]; 51 | *param_test = shared[31 - tid]; 52 | } 53 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_bprop_C64_N64.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void sconv_bprop_C64_N64 ( 3 | float* param_test, 4 | float* param_O, 5 | const float* 
param_I, 6 | const float* param_F, 7 | float param_alpha, 8 | int param_N, 9 | int param_K, 10 | int param_D, 11 | int param_H, 12 | int param_W, 13 | int param_WN, 14 | int param_HWN, 15 | int param_DHWN, 16 | int param_C, 17 | int param_CRST, 18 | int param_RST, 19 | int param_RS, 20 | int param_magic_RS, 21 | int param_shift_RS, 22 | int param_S, 23 | int param_magic_S, 24 | int param_shift_S, 25 | int param_pad_d, 26 | int param_pad_h, 27 | int param_pad_w, 28 | int param_str_d, 29 | int param_str_h, 30 | int param_str_w, 31 | int param_Q, 32 | int param_PQ, 33 | int param_QN, 34 | int param_PQN, 35 | int param_MPQN, 36 | int param_magic_Q, 37 | int param_shift_Q, 38 | int param_magic_PQ, 39 | int param_shift_PQ, 40 | int param_R, 41 | int param_T, 42 | int param_magic_str_w, 43 | int param_shift_str_w, 44 | int param_magic_str_h, 45 | int param_shift_str_h, 46 | int param_magic_str_d, 47 | int param_shift_str_d) { 48 | __shared__ float share[64 * 8 * 4 + 8]; 49 | 50 | int tid = threadIdx.x; 51 | 52 | share[tid] = 1; 53 | 54 | *param_O = share[63-tid]; 55 | *param_test = share[63-tid]; 56 | } 57 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_fprop_K128_N128.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void sconv_fprop_K128_N128 ( 3 | float* param_test, 4 | float *param_O, 5 | const float *param_I, 6 | const float *param_F, 7 | float param_alpha, 8 | int param_N, 9 | int param_K, 10 | int param_D, 11 | int param_H, 12 | int param_W, 13 | int param_WN, 14 | int param_HWN, 15 | int param_DHWN, 16 | int param_C, 17 | int param_KRST, 18 | int param_RST, 19 | int param_RS, 20 | int param_magic_RS, 21 | int param_shift_RS, 22 | int param_S, 23 | int param_magic_S, 24 | int param_shift_S, 25 | int param_pad_d, 26 | int param_pad_h, 27 | int param_pad_w, 28 | int param_str_d, 29 | int param_str_h, 30 | int param_str_w, 31 | int 
param_Q, 32 | int param_PQ, 33 | int param_QN, 34 | int param_PQN, 35 | int param_MPQN, 36 | int param_magic_Q, 37 | int param_shift_Q, 38 | int param_magic_PQ, 39 | int param_shift_PQ) { 40 | __shared__ float share[128 * 8 * 4 + 8]; 41 | 42 | int tid = threadIdx.x; 43 | 44 | share[tid] = 1; 45 | 46 | *param_O = share[127-tid]; 47 | *param_test = share[127-tid]; 48 | } 49 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_fprop_K64_N64.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void sconv_fprop_K64_N64 ( 3 | float* param_test, 4 | float *param_O, 5 | const float *param_I, 6 | const float *param_F, 7 | float param_alpha, 8 | int param_N, 9 | int param_K, 10 | int param_D, 11 | int param_H, 12 | int param_W, 13 | int param_WN, 14 | int param_HWN, 15 | int param_DHWN, 16 | int param_C, 17 | int param_KRST, 18 | int param_RST, 19 | int param_RS, 20 | int param_magic_RS, 21 | int param_shift_RS, 22 | int param_S, 23 | int param_magic_S, 24 | int param_shift_S, 25 | int param_pad_d, 26 | int param_pad_h, 27 | int param_pad_w, 28 | int param_str_d, 29 | int param_str_h, 30 | int param_str_w, 31 | int param_Q, 32 | int param_PQ, 33 | int param_QN, 34 | int param_PQN, 35 | int param_MPQN, 36 | int param_magic_Q, 37 | int param_shift_Q, 38 | int param_magic_PQ, 39 | int param_shift_PQ) { 40 | __shared__ float share[64 * 8 * 4 + 8]; 41 | 42 | int tid = threadIdx.x; 43 | 44 | share[tid] = 1; 45 | 46 | *param_O = share[63-tid]; 47 | *param_test = share[63-tid]; 48 | } 49 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Kernel/Convolution/Kepler/sconv_fprop_K64_N64_template.cubin -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_update.cu: -------------------------------------------------------------------------------- 1 | #include "sconv.h" 2 | 3 | bool update(const float *I, float *F, const float *O, 4 | unsigned int N, unsigned int C, unsigned int K, 5 | unsigned int D, unsigned int H, unsigned int W, 6 | unsigned int R, unsigned int S, unsigned int T, 7 | unsigned int M, unsigned int P, unsigned int Q, 8 | unsigned int str_d, unsigned int str_h, unsigned int str_w, 9 | unsigned int pad_d, unsigned int pad_h, unsigned int pad_w) { 10 | float alpha = 1.0f; 11 | unsigned int DHW, WN, HW, HWN, DHWN, CRST, RST, RS; 12 | unsigned int PQ, QN, PQN, MPQN; 13 | unsigned int magic_HW, magic_W; 14 | unsigned int shift_HW, shift_W; 15 | unsigned int magic_RST, magic_RS, magic_S; 16 | unsigned int shift_RST, shift_RS, shift_S; 17 | unsigned int magic_PQu, shift_PQu; 18 | unsigned int magic_Qu, shift_Qu; 19 | unsigned int grid_P = 1; 20 | unsigned int grid_Q = 1; 21 | unsigned int grid_PQ = grid_P * grid_Q; 22 | unsigned int grid_PQM = grid_PQ * M; 23 | // input 24 | WN = W * N; 25 | HW = H * W; 26 | HWN = H * WN; 27 | DHW = D * HW; 28 | DHWN = D * HWN; 29 | // filter 30 | RS = R * S; 31 | RST = T * RS; 32 | CRST = C * RS; 33 | // output 34 | QN = Q * N; 35 | PQN = P * QN; 36 | MPQN = M * PQN; 37 | // magic numbers 38 | magic32(CRST, RST, magic_RST, shift_RST); 39 | magic32(RST + 32, RS, magic_RS, shift_RS); 40 | magic32(RS + 32, S, magic_S, shift_S); 41 | magic32(DHW, HW, magic_HW, shift_HW); 42 | magic32(HW, W, magic_W, shift_W); 43 | magic32(grid_PQM, grid_PQ, magic_PQu, shift_PQu); 44 | magic32(grid_PQ, grid_Q, magic_Qu, shift_Qu); 45 | std::cout << "CRST: " << CRST << std::endl; 46 | // test param set up 47 | float *test_param; 48 
| cudaError_t cuda_error; 49 | cuda_error = cudaMalloc((void**)&test_param, sizeof(float) * 1024); 50 | cudaMemset(test_param, 0, sizeof(float) * 1024); 51 | void *args[43] = {&test_param, &F, &I, &O, &alpha, 52 | &N, &K, &D, &H, &W, &WN, &HWN, &DHWN, 53 | &C, &CRST, 54 | &RST, &magic_RST, &shift_RST, 55 | &RS, &magic_RS, &shift_RS, 56 | &S, &magic_S, &shift_S, 57 | &pad_d, &pad_h, &pad_w, 58 | &str_d, &str_h, &str_w, 59 | &P, &Q, &PQ, &QN, &PQN, &MPQN, 60 | &magic_Qu, &shift_Qu, 61 | &magic_PQu, &shift_PQu, 62 | &grid_P, &grid_Q, &grid_PQ}; 63 | int gridX = grid_PQM; 64 | int gridY = CRST / 128 + (CRST % 128 != 0); 65 | int gridZ = K / 128 + (K % 128 != 0); 66 | std::string kernel_name = "sconv_update_C128_K128"; 67 | CUresult res = cuLaunchKernel(nervana_kernels[kernel_name], gridX, gridY, gridZ, 256, 1, 1, 68 | 0, 0, args, NULL); 69 | if (res != CUDA_SUCCESS) { 70 | std::cerr << "Line " << __LINE__ << " error launching kernel " << kernel_name << " " << res << std::endl; 71 | return false; 72 | } 73 | cuCtxSynchronize(); 74 | float* h_test = (float *)malloc(sizeof(float) * 256); 75 | cuda_error = cudaMemcpy(h_test, test_param, sizeof(float) * 256, cudaMemcpyDeviceToHost); 76 | if (cuda_error != cudaSuccess) { 77 | std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; 78 | exit(1); 79 | } 80 | for (int i = 0; i < 256; ++i) { 81 | std::cout << h_test[i] << " "; 82 | } 83 | std::cout << std::endl; 84 | // free test_param 85 | free(h_test); 86 | return true; 87 | } 88 | 89 | int main() { 90 | cudaFree(0); 91 | float *d_I, *d_F, *d_O; 92 | unsigned int N = 128, C = 3, K = 128, D = 1, H = 224, W = 224, T = 1, R = 11, S = 11; 93 | unsigned int str_d = 1, str_h = 4, str_w = 4; 94 | unsigned int pad_d = 0, pad_h = 3, pad_w = 3; 95 | unsigned int M, P, Q; 96 | cudaError_t cuda_error; 97 | M = (D - T + 2 * pad_d) / str_d + 1; 98 | P = (H - R + 2 * pad_h) / str_h + 1; 99 | Q = (W - S + 2 * pad_w) / str_w + 1; 100 | float *h_O = (float *)malloc(K * 
M * P * Q * N * sizeof(float)); 101 | for (int i = 0; i < K * M * P * Q * N; ++i) { 102 | h_O[i] = 1; 103 | } 104 | float *h_I = (float *)malloc(C * D * H * W * N * sizeof(float)); 105 | for (int i = 0; i < C * D * H * W * N; ++i) { 106 | h_I[i] = 1; 107 | } 108 | float* h_F = (float *)malloc(sizeof(float) * C * R * S * T * K); 109 | // device memory 110 | cudaMalloc((void**)&d_I, sizeof(float) * C * D * H * W * N); 111 | cudaMalloc((void**)&d_F, sizeof(float) * C * R * S * T * K); 112 | cudaMalloc((void**)&d_O, sizeof(float) * K * M * P * Q * N); 113 | // memcpy h_I, h_O 114 | cudaMemcpy(d_I, h_I, sizeof(float) * C * D * H * W * N, 115 | cudaMemcpyHostToDevice); 116 | cudaMemcpy(d_O, h_O, sizeof(float) * K * M * P * Q * N, 117 | cudaMemcpyHostToDevice); 118 | // load kernels 119 | if (!load_kernels("./")) { 120 | std::cerr << "Couldn't load all kernels" << std::endl; 121 | exit(1); 122 | } 123 | // launch kernel 124 | if (!update(d_I, d_F, d_O, N, C, K, D, H, W, R, S, T, M, P, Q, str_d, str_h, str_w, pad_d, pad_h, pad_w)) { 125 | std::cerr << "Launch error" << std::endl; 126 | exit(1); 127 | } 128 | // output 129 | std::cout << "result" << std::endl; 130 | cuda_error = cudaMemcpy(h_F, d_F, sizeof(float) * C * R * S * T * K, cudaMemcpyDeviceToHost); 131 | if (cuda_error != cudaSuccess) { 132 | std::cerr << "Line " << __LINE__ << " memcpy error: " << cuda_error << std::endl; 133 | exit(1); 134 | } 135 | for (int i = 0; i < 128; ++i) { 136 | std::cout << h_F[i] << " "; 137 | } 138 | std::cout << std::endl; 139 | // free memory 140 | free(h_O); 141 | free(h_I); 142 | free(h_F); 143 | cudaFree(d_I); 144 | cudaFree(d_F); 145 | cudaFree(d_O); 146 | // run successfully 147 | std::cout << "finish" << std::endl; 148 | return 0; 149 | } 150 | -------------------------------------------------------------------------------- /Kernel/Convolution/Kepler/sconv_update_C128_K128.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | 
__global__ void sconv_update_C128_K128 ( 3 | float* param_test, 4 | float* param_F, 5 | const float* param_I, 6 | const float* param_E, 7 | float param_alpha, 8 | int param_N, 9 | int param_K, 10 | int param_D, 11 | int param_H, 12 | int param_W, 13 | int param_WN, 14 | int param_HWN, 15 | int param_DHWN, 16 | int param_C, 17 | int param_CRST, 18 | int param_RST, 19 | int param_magic_RST, 20 | int param_shift_RST, 21 | int param_RS, 22 | int param_magic_RS, 23 | int param_shift_RS, 24 | int param_S, 25 | int param_magic_S, 26 | int param_shift_S, 27 | int param_pad_d, 28 | int param_pad_h, 29 | int param_pad_w, 30 | int param_str_d, 31 | int param_str_h, 32 | int param_str_w, 33 | int param_P, 34 | int param_Q, 35 | int param_PQ, 36 | int param_QN, 37 | int param_PQN, 38 | int param_MPQN, 39 | int param_magic_Q, 40 | int param_shift_Q, 41 | int param_magic_PQ, 42 | int param_shift_PQ, 43 | int param_part_P, 44 | int param_part_Q, 45 | int param_part_PQ) { 46 | __shared__ float share[(128 * 16 + 32) * 4 + 6]; 47 | 48 | int tid = threadIdx.x; 49 | 50 | share[tid] = 1; 51 | 52 | *param_F = share[255 - tid]; 53 | *param_test = share[255 - tid]; 54 | } 55 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/Makefile: -------------------------------------------------------------------------------- 1 | BINS := sgemm_nn_128x128 sgemm_nt_128x128 sgemm_tn_128x128 \ 2 | sgemm_nn_128x128_vec sgemm_tn_128x128_vec sgemm_nt_128x128_vec 3 | TARGETS := $(addsuffix .cubin, $(BINS)) 4 | TEMPLATES := $(addsuffix _template.cubin, $(BINS)) 5 | 6 | all: $(BINS) 7 | 8 | $(BINS): 9 | nvcc -arch sm_35 -m 64 $@.cu -cubin -O3 -o $@_template.cubin 10 | KeplerAs.pl -i $@.sass $@_template.cubin $@.cubin 11 | 12 | clean: 13 | rm -f $(TARGETS) $(TEMPLATES) 14 | 15 | # Phony targets must be listed as prerequisites on the .PHONY line itself 16 | .PHONY: all clean 17 | 18 | #utils 19 | print-% : ; $(info $* is $(flavor $*) variable set to [$($*)]) @true 20 | 
-------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/README.md: -------------------------------------------------------------------------------- 1 | # KeplerGEMM 2 | 3 | Faster GEMM 4 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/sgemm_nn_128x128.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void __launch_bounds__(256) sgemm_nn_128x128 3 | ( 4 | const float* param_A, 5 | const float* param_B, 6 | float* param_C, 7 | float param_alpha, 8 | float param_beta, 9 | int param_lda, 10 | int param_ldb8, 11 | int param_ldc, 12 | int param_m, 13 | int param_n, 14 | int param_k 15 | ) { 16 | __shared__ float share[128 * 8 * 4 + 32]; 17 | 18 | int tid = threadIdx.x; 19 | 20 | share[tid] = 1; 21 | 22 | __syncthreads(); 23 | 24 | param_C[tid] = share[255 - tid]; 25 | } 26 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/sgemm_nn_128x128_vec.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void __launch_bounds__(256) sgemm_nn_128x128_vec 3 | ( 4 | const float* param_A, 5 | const float* param_B, 6 | float* param_C, 7 | float param_alpha, 8 | float param_beta, 9 | int param_lda, 10 | int param_ldb8, 11 | int param_ldc, 12 | int param_m, 13 | int param_n, 14 | int param_k 15 | ) { 16 | __shared__ float share[128 * 8 * 4 + 32]; 17 | 18 | int tid = threadIdx.x; 19 | 20 | share[tid] = 1; 21 | 22 | __syncthreads(); 23 | 24 | param_C[tid] = share[255 - tid]; 25 | } 26 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/sgemm_nt_128x128.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void __launch_bounds__(256) sgemm_nt_128x128 3 | ( 4 | const float* param_A, 5 | const float* 
param_B, 6 | float* param_C, 7 | float param_alpha, 8 | float param_beta, 9 | int param_lda, 10 | int param_ldb, 11 | int param_ldc, 12 | int param_m, 13 | int param_n, 14 | int param_k 15 | ) { 16 | __shared__ float share[128 * 8 * 4 + 32]; 17 | 18 | int tid = threadIdx.x; 19 | 20 | share[tid] = 1; 21 | 22 | __syncthreads(); 23 | 24 | param_C[tid] = share[255 - tid]; 25 | } 26 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/sgemm_nt_128x128_vec.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void __launch_bounds__(256) sgemm_nt_128x128_vec 3 | ( 4 | const float* param_A, 5 | const float* param_B, 6 | float* param_C, 7 | float param_alpha, 8 | float param_beta, 9 | int param_lda, 10 | int param_ldb, 11 | int param_ldc, 12 | int param_m, 13 | int param_n, 14 | int param_k 15 | ) { 16 | __shared__ float share[128 * 8 * 4 + 32]; 17 | 18 | int tid = threadIdx.x; 19 | 20 | share[tid] = 1; 21 | 22 | __syncthreads(); 23 | 24 | param_C[tid] = share[255 - tid]; 25 | } 26 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/sgemm_tn_128x128.cu: -------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void __launch_bounds__(256) sgemm_tn_128x128 3 | ( 4 | const float* param_A, 5 | const float* param_B, 6 | float* param_C, 7 | float param_alpha, 8 | float param_beta, 9 | int param_lda8, 10 | int param_ldb8, 11 | int param_ldc, 12 | int param_m, 13 | int param_n, 14 | int param_k 15 | ) { 16 | __shared__ float share[128 * 8 * 4 + 32]; 17 | 18 | int tid = threadIdx.x; 19 | 20 | share[tid] = 1; 21 | 22 | __syncthreads(); 23 | 24 | param_C[tid] = share[255 - tid]; 25 | } 26 | -------------------------------------------------------------------------------- /Kernel/SGEMM/Kepler/sgemm_tn_128x128_vec.cu: 
-------------------------------------------------------------------------------- 1 | extern "C" 2 | __global__ void __launch_bounds__(256) sgemm_tn_128x128_vec 3 | ( 4 | const float* param_A, 5 | const float* param_B, 6 | float* param_C, 7 | float param_alpha, 8 | float param_beta, 9 | int param_lda8, 10 | int param_ldb8, 11 | int param_ldc, 12 | int param_m, 13 | int param_n, 14 | int param_k 15 | ) { 16 | __shared__ float share[128 * 8 * 4 + 32]; 17 | 18 | int tid = threadIdx.x; 19 | 20 | share[tid] = 1; 21 | 22 | __syncthreads(); 23 | 24 | param_C[tid] = share[255 - tid]; 25 | } 26 | 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepPerf 2 | 3 | DeepPerf is developed to understand GPU microarchitectural features and improve performance for compute-intensive kernels. The methodology relies on a reverse engineering approach to crack the GPU ISA encodings in order to build a GPU assembler. An assembly microbenchmark suite correlates microarchitectural features with their performance factors to uncover instruction-level and memory hierarchy preferences. 4 | We use SGEMM and Convolution as examples to show how to achieve bare-metal performance tuning. You can use this SASS code directly in your deep learning framework to improve performance. 5 | 6 | The toolchain is an attempt to automatically crack different GPU ISA encodings and build an assembler adaptively for the purpose of performance enhancements to applications on GPUs. 7 | There are three directories in this folder, one for each of the three major steps of optimizing CUDA code at the assembly level. All the tools cover three recent NVIDIA GPU architectures: Kepler, Maxwell and Pascal. 
8 | 9 | 10 | -------------------------------------------------------------------------------- /Solver/.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | output/* 3 | -------------------------------------------------------------------------------- /Solver/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Cracking GPU ISA Encodings 3 | 4 | ## Output 5 | 6 | * Bit positions of opcodes 7 | * Bit positions of operands for different operand types 8 | * Bit positions of modifiers for each instruction 9 | 10 | ## How to run the workflow? 11 | 12 | The workflow is composed of four stages: 13 | 14 | 1. Generate PTX code->`./bin/generate_disassemble [arch]` 15 | * Generate PTX code (.ptx) in ptxgen directory and compile PTX to cubin; 16 | * Disassemble cubins to sass files, which feed into the following three solvers; 17 | * Each line of sass files looks like this: 18 | 19 | `/*0048*/ IADD R0, R2, R0; /*0x4800000000201c03*/` 20 | 21 | 2. Opcode solver->`./bin/opcode [arch]` 22 | * Probe 64-bit binary code of sass files by flipping each bit and observe whether opcodes change; 23 | 24 | 3. Modifier solver->`./bin/modifier [arch]` 25 | * Probe 64-bit binary code of sass files by flipping each bit and observe whether modifiers change; 26 | * Enumerate bits on all modifier positions to generate all the modifiers; 27 | 28 | 4. Operand solver->`./bin/operand [arch]` 29 | * Probe 64-bit binary code of sass files by flipping each bit and observe whether operands change; 30 | * Operand type: R: Register, S: Special Register, I: Immediate, C: constant[][], M: Memory, P: Predicate; 31 | 32 | 5. Allowed values for `[arch]` options: 'sm_30','sm_32','sm_35','sm_37','sm_50','sm_52','sm_53','sm_60','sm_61','sm_62'. 
33 | -------------------------------------------------------------------------------- /Solver/bin/generate_disassemble: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Please input architecture parameter argv[1] 4 | if [ "$#" -lt 1 ] 5 | then 6 | echo "Please input architecture parameter argv[1]" 7 | exit -1 8 | fi 9 | 10 | arch=$1 11 | prefix="data/"$arch"/" 12 | src_directory="src/" 13 | 14 | echo "Arch: "$arch 15 | echo "Data directory: "$prefix 16 | 17 | # 1. Generate ptx 18 | ptx_directory=$prefix"ptx/" 19 | mkdir -p $ptx_directory 20 | echo ".................................................................." 21 | echo "1. Generate .ptx files to "$ptx_directory" directory" 22 | echo "It may take serveral miniutes" 23 | echo ".................................................................." 24 | perl $src_directory"ptxgen.pl" $arch $ptx_directory 25 | 26 | # 2. Compile to cubins 27 | cubin_directory=$prefix"cubin/" 28 | mkdir -p $cubin_directory 29 | echo ".................................................................." 30 | echo "2. Compile .ptx file to cubin files in "$cubin_directory" directory" 31 | echo "It may take serveral miniutes" 32 | echo ".................................................................." 33 | ptx=$ptx_directory"*.ptx" 34 | for p in $ptx 35 | do 36 | f=`echo $p | cut -d / -f 4 |cut -d . -f 1` 37 | fout=$cubin_directory""$f".cubin" 38 | echo $fout 39 | ptxas -arch $arch -m 64 $p -o $fout > /dev/null 2>&1 40 | done 41 | 42 | # 3. Disassembly to sass 43 | asm_directory=$prefix"asm/" 44 | mkdir -p $asm_directory 45 | echo ".................................................................." 46 | echo "3. Disassemble .cubin file to sass files in "$asm_directory" directory" 47 | echo "It may take serveral miniutes" 48 | echo ".................................................................." 
49 | cubin=$cubin_directory"*.cubin" 50 | for p in $cubin 51 | do 52 | f=`echo $p | cut -d / -f 4 | cut -d . -f 1` 53 | fout=$asm_directory""$f".sass" 54 | echo $fout 55 | cuobjdump --gpu-architecture $arch --dump-sass $p > $fout 56 | done 57 | 58 | # 4.Put all sass results in one file 59 | echo ".................................................................." 60 | echo "4. Gathering results from ptxgen" 61 | echo ".................................................................." 62 | asm=$asm_directory"*.sass" 63 | if [ -f /tmp/all.sass ] 64 | then 65 | rm /tmp/all.sass 66 | else 67 | touch /tmp/all.sass 68 | fi 69 | 70 | for f in $asm 71 | do 72 | cat $f >> /tmp/all.sass 73 | done 74 | 75 | # Ignore non-instruction lines 76 | awk '{if (NF >= 5) {$1 = ""; print $0} }' /tmp/all.sass > /tmp/all_inst.sass 77 | # Make instruction uniq 78 | python $src_directory"unique.py" /tmp/all_inst.sass > $prefix""$arch".sass" 79 | # Generate test cubin 80 | nvcc -cubin -arch $arch $src_directory"test.cu" -o $prefix""$arch".cubin" 81 | 82 | rm /tmp/all.sass /tmp/all_inst.sass 83 | -------------------------------------------------------------------------------- /Solver/bin/modifier: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Please input architecture parameter argv[1] 4 | if [ "$#" -lt 1 ] 5 | then 6 | echo "Please input architecture parameter argv[1]" 7 | exit -1 8 | fi 9 | 10 | arch=$1 11 | prefix="data/"$arch"/" 12 | src_directory="src/" 13 | asm_directory=$prefix 14 | output_directory="output/"$arch"/" 15 | output_file=$output_directory""$arch".modifier" 16 | mkdir -p $output_directory 17 | rm -rf $output_file || true 18 | echo "Output file: "$output_file 19 | python $src_directory"modifier.py" $asm_directory""$arch".sass" $arch $output_file 20 | -------------------------------------------------------------------------------- /Solver/bin/opcode: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Please input architecture parameter argv[1] 4 | if [ "$#" -lt 1 ] 5 | then 6 | echo "Please input architecture parameter argv[1]" 7 | exit -1 8 | fi 9 | 10 | arch=$1 11 | prefix="data/"$arch"/" 12 | src_directory="src/" 13 | asm_directory=$prefix 14 | output_directory="output/"$arch"/" 15 | output_file=$output_directory""$arch".opcode" 16 | mkdir -p $output_directory 17 | rm -rf $output_file || true 18 | echo "Output file: "$output_file 19 | python $src_directory"opcode.py" $asm_directory""$arch".sass" $arch $output_file 20 | -------------------------------------------------------------------------------- /Solver/bin/operand: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Please input architecture parameter argv[1] 4 | if [ "$#" -lt 1 ] 5 | then 6 | echo "Please input architecture parameter argv[1]" 7 | exit -1 8 | fi 9 | 10 | arch=$1 11 | prefix="data/"$arch"/" 12 | src_directory="src/" 13 | asm_directory=$prefix 14 | output_directory="output/"$arch"/" 15 | output_file=$output_directory""$arch".operand" 16 | mkdir -p $output_directory 17 | rm -rf $output_file || true 18 | echo "Output file: "$output_file 19 | python $src_directory"operand.py" $asm_directory""$arch".sass" $arch $output_file 20 | -------------------------------------------------------------------------------- /Solver/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ncic-sugon/DeepPerf/e539add1347e9354712787f71f66ae346f006544/Solver/src/__init__.py -------------------------------------------------------------------------------- /Solver/src/dumper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import struct 3 | 4 | def arch2mode(arch): 5 | return arch.replace("_", "").upper() 6 | 7 | def dump(newcode, 
arch): 8 | version = arch.split("_")[1] 9 | if version < 40: 10 | tmp_bin = "/tmp/tmp_dumper.bin" 11 | fout = open(tmp_bin, "wb") 12 | fout.write(struct.pack("> j) & 0x1) << pos[j]) | bits 12 | enc = enc & (~(1 << pos[j])) 13 | dump_file = dump("0x{:016x}".format(enc | bits), arch) 14 | if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1 and dump_file.find("INVALID") == -1: 15 | line = dump_file.split("\n") 16 | if version < 40: 17 | line_inst = line[1].split(); 18 | else: 19 | line_inst = line[5].split(); 20 | line_inst.pop(0) 21 | logging.info("0b{:064b}".format(bits) + ": " + " ".join(line_inst)) 22 | -------------------------------------------------------------------------------- /Solver/src/inst.py: -------------------------------------------------------------------------------- 1 | from sets import Set 2 | 3 | class Inst: 4 | def __init__(self, inst, raw = True): 5 | # Fetech binary encoding 6 | if raw == True: # From cuobjdump 7 | self.__enc = inst[-2] 8 | inst.pop(-1) 9 | inst.pop(-1) 10 | inst.pop(-1) 11 | else: # From nvdisasm 12 | self.__enc = "" 13 | 14 | if inst[0] == '{': # Check dual issue 15 | inst.pop(0) 16 | self.__pred = "" 17 | if inst[0].find('@') != -1: # Check predicate, such as @P0 18 | self.__pred = inst.pop(0) 19 | 20 | # Remove semicolon of zero operand field instruction such as "RRO;" 21 | ops = inst.pop(0).replace(";", "") 22 | # Fetech opcode 23 | self.__op = ops.split(".")[0] 24 | # Split opcode 25 | self.__modifier = ops.split(".")[1:] 26 | # Fetech operands and remove ; and , 27 | self.__operands = ' '.join(inst).replace(";", "").replace(",", "").replace("-","").replace("|","") 28 | 29 | def op(self): 30 | return str(self.__op) 31 | 32 | def modifier(self): 33 | return str(self.__modifier) 34 | 35 | def enc(self): 36 | return str(self.__enc) 37 | 38 | def operands(self): 39 | return str(self.__operands) 40 | 41 | def pred(self): 42 | return str(self.__pred) 43 | 
-------------------------------------------------------------------------------- /Solver/src/modifier.py: -------------------------------------------------------------------------------- 1 | from inst import Inst 2 | from dumper import dump 3 | import enumerator 4 | import sys 5 | import logging 6 | 7 | if __name__ == "__main__": 8 | logging.basicConfig(filename = sys.argv[3], level = logging.INFO) 9 | logging.debug("argv[1]: Disassemble file") 10 | logging.debug("argv[2]: Arch") 11 | logging.debug("argv[3]: Output file") 12 | logging.debug("argv[4]: Instruction limit (default 100)") 13 | sass = sys.argv[1] 14 | arch = sys.argv[2] 15 | if len(sys.argv) >= 5: 16 | limit = sys.argv[4] 17 | else: 18 | limit = 100 19 | count = 0 20 | version = int(arch.split("_")[1]) 21 | with open(sass) as f: 22 | for line in f: 23 | pos = [] 24 | count += 1 25 | if count == limit: 26 | break 27 | line_split = line.split() 28 | # Construct instruction structure 29 | origin = Inst(line_split) 30 | # Find the 64-bit encodings 31 | base = int(origin.enc(), 16) 32 | # Bit by bit xor, observe whether opcode changes and guess what this bit represent 33 | for i in range(0, 64): 34 | mask = 2**i 35 | newcode = base ^ mask 36 | # Disassemble the new code 37 | dump_file = dump("0x{:016x}".format(newcode), arch) 38 | # Compare the disassemble to check which field changes: opcode, operand or modifer 39 | if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1: 40 | line = dump_file.split("\n") 41 | if version < 40: 42 | line_inst = line[1].split(); 43 | else: 44 | line_inst = line[5].split(); 45 | # [0]: header info, [1] instruction part 46 | line_inst.pop(0) 47 | # Parse the new generated disassembly 48 | inst = Inst(line_inst, raw = version > 40) 49 | if inst.modifier() != origin.modifier() and inst.op() == origin.op(): 50 | if i not in pos: 51 | pos.append(i) 52 | # Enumerate all modifiers 53 | if len(pos) > 0: 54 | logging.info("%s modifier bits %s: ", origin.op(), pos); 
55 | enumerator.enumerate(base, pos, arch) 56 | -------------------------------------------------------------------------------- /Solver/src/opcode.py: -------------------------------------------------------------------------------- 1 | from inst import Inst 2 | from dumper import dump 3 | import sys 4 | import logging 5 | 6 | if __name__ == "__main__": 7 | logging.basicConfig(filename = sys.argv[3], level = logging.INFO) 8 | logging.debug("argv[1]: Disassemble file") 9 | logging.debug("argv[2]: Arch") 10 | logging.debug("argv[3]: Output file") 11 | logging.debug("argv[4]: Instruction limit (default 100)") 12 | sass = sys.argv[1] 13 | arch = sys.argv[2] 14 | if len(sys.argv) >= 5: 15 | limit = sys.argv[4] 16 | else: 17 | limit = 100 18 | count = 0; 19 | version = int(arch.split("_")[1]) 20 | with open(sass) as f: 21 | for line in f: 22 | pos = [] 23 | bits = 0x0 24 | count += 1 25 | if count == limit: 26 | break 27 | line_split = line.split() 28 | # Construct instruction structure 29 | origin = Inst(line_split) 30 | # Find the 64-bit encodings 31 | base = int(origin.enc(), 16) 32 | # Bit by bit xor, observe whether opcode changes and guess what this bit represent 33 | for i in range(0, 64): 34 | mask = 2**i 35 | newcode = base ^ mask 36 | # Disassemble the new code 37 | dump_file = dump("0x{:016x}".format(newcode), arch) 38 | # Compare the disassemble to check which field changes: opcode, operand or modifer 39 | if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1: 40 | line = dump_file.split("\n") 41 | if version < 40: 42 | line_inst = line[1].split(); 43 | else: 44 | line_inst = line[5].split(); 45 | # [0]: header info, [1] instruction part 46 | line_inst.pop(0) 47 | # Parse the new generated disassembly 48 | inst = Inst(line_inst, raw = version > 40) 49 | # If opcode is changed, then this bit represent opcode, we find it! 
50 | # LDG and TEX are the same instructions in fact 51 | # RED and ATOM are the same instruction 52 | if inst.op() != origin.op() and not i in pos and not \ 53 | (inst.op() == "LDG" and origin.op() == "TEX") and not \ 54 | (inst.op() == "TEX" and origin.op() =="LDG") and not \ 55 | (inst.op() == "RED" and origin.op() == "ATOM") and not \ 56 | (inst.op() == "ATOM" and origin.op() == "RED"): 57 | logging.info("Opcode changes: %s => %s when bit [%d] is flipped from [%d]", \ 58 | origin.op(), inst.op(), i, (base >> i) & 0x1) 59 | bits = bits | (((base >> i) & 0x1) << i) 60 | pos.append(i) 61 | if len(pos) > 0: 62 | logging.info("0b{:064b}".format(bits) + ": %s opcode bits %s: ", origin.op(), pos); 63 | -------------------------------------------------------------------------------- /Solver/src/operand.py: -------------------------------------------------------------------------------- 1 | from sets import Set 2 | from inst import Inst 3 | from dumper import dump 4 | import sys 5 | import logging 6 | 7 | ops = dict() 8 | 9 | def check_operand_types(inst): 10 | operand_types = "" 11 | operands = inst.operands().split(); 12 | for operand in operands: 13 | key = operand[0] 14 | if key == 'R': # Register 15 | value = operand[1:] 16 | if value == 'Z' or value == 'N' or value == 'M' or \ 17 | value == 'P' or float(value).is_integer(): 18 | operand_types += 'R' 19 | else: 20 | return 'X' 21 | elif key == 'P': # Predicate 22 | value = operand[1:] 23 | if float(value).is_integer(): 24 | operand_types += 'P' 25 | else: 26 | return 'X' 27 | elif key == 'c': # Constant memory 28 | operand_types += 'C' 29 | elif key == '[': # Memory 30 | operand_types += 'M' 31 | elif key == 'S': # Special register 32 | operand_types += 'S' 33 | else: 34 | if len(operand) >= 2 and (operand[0:2] == "0x" or operand[0:3] == "-0x"): # Hex immediate 35 | operand_types += 'I' 36 | elif float(operand).is_integer(): # Immediate value 37 | operand_types += 'I' 38 | else: 39 | return 'X' 40 | if inst.op() 
not in ops: 41 | ops[inst.op()] = set() 42 | ops[inst.op()].add(operand_types) 43 | return operand_types 44 | elif inst.op() in ops and operand_types not in ops[inst.op()]: 45 | ops[inst.op()].add(operand_types) 46 | return operand_types 47 | else: 48 | return 'X' 49 | 50 | def change(inst, origin): 51 | if inst.op() != origin.op(): 52 | return -1 53 | elif inst.modifier() != origin.modifier(): 54 | return -2 55 | else: 56 | inst_operands = inst.operands().split() 57 | origin_operands = origin.operands().split() 58 | for i in range(len(origin_operands)): 59 | if (inst_operands[i] != origin_operands[i]): 60 | return i 61 | return -3 62 | 63 | if __name__ == "__main__": 64 | logging.basicConfig(filename = sys.argv[3], level = logging.INFO) 65 | logging.debug("argv[1]: Disassemble file") 66 | logging.debug("argv[2]: Arch") 67 | logging.debug("argv[3]: Output file") 68 | logging.debug("argv[4]: Instruction limit (default 100)") 69 | sass = sys.argv[1] 70 | arch = sys.argv[2] 71 | if len(sys.argv) >= 5: 72 | limit = sys.argv[4] 73 | else: 74 | limit = 100 75 | count = 0; 76 | version = int(arch.split("_")[1]) 77 | with open(sys.argv[1]) as f: 78 | for line in f: 79 | pos = [] 80 | count += 1 81 | if count == limit: 82 | break 83 | line_split = line.split() 84 | # Construct instruction structure 85 | origin = Inst(line_split) 86 | # Find the 64-bit encodings 87 | base = int(origin.enc(), 16) 88 | origin_operand_types = check_operand_types(origin) 89 | if len(origin.operands()) and origin_operand_types.find('X') == -1: 90 | pp = [[] for i in range(len(origin_operand_types))] 91 | logging.info(origin.op() + " " + origin.modifier()) 92 | logging.info("0b{:064b}".format(base) + ": " + origin.operands()) 93 | for i in range(0, 64): 94 | mask = 2**i 95 | newcode = base ^ mask 96 | # Disassemble the new code 97 | dump_file = dump("0x{:016x}".format(newcode), arch) 98 | if dump_file and dump_file.find("?") == -1 and dump_file.find("error") == -1: 99 | line = 
dump_file.split("\n") 100 | if version < 40: 101 | line_inst = line[1].split(); 102 | else: 103 | line_inst = line[5].split(); 104 | # [0]: header info, [1] instruction part 105 | line_inst.pop(0) 106 | inst = Inst(line_inst, raw = version > 40) 107 | pos = change(inst, origin) 108 | if pos >= 0: 109 | pp[pos].append(i) 110 | logging.info("0b{:064b}".format(newcode) + ": " + inst.operands()) 111 | logging.info("Operand combination types: %s", origin_operand_types) 112 | for i in range(0, len(pp)): 113 | logging.info("Operand type: %s", origin_operand_types[i]) 114 | logging.info("Encoding: %s", pp[i]) 115 | -------------------------------------------------------------------------------- /Solver/src/test.cu: -------------------------------------------------------------------------------- 1 | __global__ void test(float& a, float& b) { 2 | do { 3 | } while(1); 4 | } 5 | -------------------------------------------------------------------------------- /Solver/src/unique.py: -------------------------------------------------------------------------------- 1 | from sets import Set 2 | from inst import Inst 3 | import subprocess 4 | import sys 5 | 6 | if __name__ == "__main__": 7 | opset = Set([]) 8 | with open(sys.argv[1]) as f: 9 | for line in f: 10 | field = line.split() 11 | inst = Inst(field, False) 12 | if not inst.op() in opset: 13 | opset.add(inst.op()) 14 | sys.stdout.write(line) 15 | --------------------------------------------------------------------------------