├── .gitignore ├── COPYRIGHT ├── README.md ├── docs ├── Generic.SIMD.Library.WPMVP2014.pdf ├── Makefile ├── apiguide │ ├── Makefile │ ├── README.md │ ├── apidata.js │ ├── apiguide.html │ └── vendor │ │ ├── filter.js │ │ ├── jquery-1.10.1.min.js │ │ ├── jquery-ui.css │ │ └── jquery-ui.js ├── developer_guide.md ├── doxygen_main.txt ├── faq.md ├── getting_started.md ├── history.md ├── img │ ├── intel2power_apps.jpg │ ├── intel_apps.jpg │ ├── power2intel_apps.jpg │ └── power_apps.jpg ├── intrinsics.doxyfile ├── performance.md └── programming_guide.md ├── examples ├── HelloSIMD │ ├── HelloSIMD.cpp │ └── Makefile ├── RGB2Gray │ ├── Makefile │ ├── RGB2Gray.cpp │ └── RGB2Gray_tune.cpp ├── RGB2YUV │ ├── Makefile │ └── RGB2YUV.cpp ├── common.mk └── mandelbrot │ ├── Makefile │ └── mandelbrot.cpp ├── include ├── README.md ├── generic.h ├── generic4.h ├── generic8.h ├── gsimd.h ├── gsimd_utility.h ├── perfmeasure.h ├── platform_intrinsics.h ├── power7_intrinsics.h ├── power8_intrinsics.h ├── power_vsx4.h ├── sse4.h ├── svec-vsx.h └── timing.h ├── tests ├── Makefile ├── README ├── codegen.cpp ├── test_lanes4.cpp ├── test_lanes8.cpp ├── test_svec.cpp └── test_utility.h └── tools ├── allgroupspower7.sh ├── groupnamepower7.sh ├── grouppower7.sh └── p7.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .cproject 2 | .project 3 | docs/html/* 4 | docs/gh-pages.github/* 5 | tests/gtest-1.6.0/* 6 | .settings/* 7 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 2 | 3 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | * Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | * Neither the name of IBM Corp. nor the names of its contributors may be 16 | used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | The original source code covered by the above license above has been 32 | modified significantly by IBM Corp. 33 | Copyright 2013 the Generic SIMD Intrinsic Library project authors. All rights reserved. 34 | 35 | Copyright (c) 2010-2012, Intel Corporation 36 | All rights reserved. 
37 | 38 | Redistribution and use in source and binary forms, with or without 39 | modification, are permitted provided that the following conditions are 40 | met: 41 | 42 | * Redistributions of source code must retain the above copyright 43 | notice, this list of conditions and the following disclaimer. 44 | 45 | * Redistributions in binary form must reproduce the above copyright 46 | notice, this list of conditions and the following disclaimer in the 47 | documentation and/or other materials provided with the distribution. 48 | 49 | * Neither the name of Intel Corporation nor the names of its 50 | contributors may be used to endorse or promote products derived from 51 | this software without specific prior written permission. 52 | 53 | 54 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 55 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 56 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 57 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 58 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 59 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 60 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 61 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 62 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 63 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 64 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Generic SIMD Library 2 | 3 | The Generic SIMD Library allows users to write C++ SIMD codes that are portable across different SIMD ISAs. 
4 | 5 | ##Running examples 6 | ```c++ 7 | //HelloSIMD.cpp 8 | #include 9 | #include 10 | 11 | int main (int argc, char* argv[]) 12 | { 13 | svec<4,float> v1(1.1, 2.2, 3.3, 4.4); 14 | svec<4,float> v2 = v1 * 2; 15 | std::cout << "Hello World: " << v2 << std::endl; 16 | return 0; 17 | } 18 | ``` 19 | 20 | Let's use the example above to illustrate some of the basic features of the library: 21 | - The entire generic SIMD library is included from the header file . 22 | - Using proper platform-specific compiler flags, the code can be compiled by standard G++ into binaries for different target SIMD architectures. 23 | - In this example, svec<4,float> is the SIMD vector abstraction provided by the library. It represents a vector of 4 floating-point values. 24 | - Most operations on SIMD vectors use standard C++ operators such as "*" and "<<". 25 | 26 | ##Key features 27 | 28 | The library provides: 29 | - Fixed-lane SIMD vectors. Our SIMD vectors are defined based on the number of elements per vector (fixed-lane) instead of the byte-length of a vector (fixed-width). This is the key difference between our vector types and the ones defined in platform-specific intrinsics. 30 | 31 | We choose fixed-lane vector because it is more natural to SIMDize parallel loops that involve data of different length such as int and double. 32 | 33 | We intend to support vectors with arbitrary power-of-two lanes, but currently only 4-element vectors are supported. Vectors of 2 and 8 elements are under development. 34 | 35 | - Portable SIMD programming. The programming interface of the library is completely platform neutral. The library provides mapping from the interface to target SIMD platforms. The current release supports the following target platforms: 36 | + SSE4.2 37 | + VSX for P7 38 | + Scalar emulation for non-SIMD platforms 39 | 40 | - Overloaded C++ semantics on SIMD vectors. 
We define SIMD vector operations based on semantics of C++ operators instead of platform-specific ISA semantics. This is because the semantics of C++ operators are platform independent. Secondly, C++ operators provide a slightly higher semantics than platform-specific intrinsics and are more natural to program since most users understand C++ operators well. 41 | 42 | ##More Information 43 | - [Generic SIMD Intrinsics Library API](http://genericsimd.github.io/generic_simd/index.html) 44 | - [Generic SIMD API Guide](http://genericsimd.github.io/generic_simd/apiguide/apiguide.html) 45 | - [Getting Started](docs/getting_started.md) 46 | - [Programming Guide](docs/programming_guide.md) 47 | - [Developer Guide](docs/developer_guide.md) 48 | - [WPMVP2014 Paper] (https://github.com/genericsimd/generic_simd/raw/master/docs/Generic.SIMD.Library.WPMVP2014.pdf) 49 | - [FAQ & Trouble Shooting](docs/faq.md) 50 | - [Performance Data](docs/performance.md) 51 | - [History](docs/history.md) 52 | 53 | -------------------------------------------------------------------------------- /docs/Generic.SIMD.Library.WPMVP2014.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/Generic.SIMD.Library.WPMVP2014.pdf -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Make file to build the files 2 | 3 | PUBLISH_ROOT=gsimd_html 4 | GH_PAGES_ROOT=gh-pages.github 5 | default: html 6 | 7 | .PHONY: html 8 | 9 | html: intrinsics.doxyfile ../include/power_vsx4.h 10 | make clean 11 | doxygen $< 12 | 13 | gitpub: html 14 | cp -fR html/* ${GH_PAGES_ROOT} 15 | 16 | copy: html 17 | cp -R html/* ${PUBLISH_ROOT} 18 | chmod -R g+wxr ${PUBLISH_ROOT}/* 19 | chmod -R a+xr ${PUBLISH_ROOT}/* 20 | 21 | clean: 22 | @rm -fR html 23 | 
-------------------------------------------------------------------------------- /docs/apiguide/Makefile: -------------------------------------------------------------------------------- 1 | # Make file to build the files 2 | 3 | GH_PAGES_ROOT=../gh-pages.github 4 | 5 | gitpub: 6 | mkdir -p ${GH_PAGES_ROOT}/apiguide/ 7 | cp -rf * ${GH_PAGES_ROOT}/apiguide/ 8 | -------------------------------------------------------------------------------- /docs/apiguide/README.md: -------------------------------------------------------------------------------- 1 | # Generic SIMD API Guide 2 | 3 | The Generic SIMD API Guide is a simple tool to search the data types and APIs of the Generic SIMD library. 4 | 5 | The tool is a pure static html tool based on [filter.js framework](https://github.com/jiren/filter.js). 6 | 7 | The tool uses one search box and three checkbox filters 8 | - Lane: filter API json object's Lane attribute 9 | - Type: filter API json object's Type attribute 10 | - Category: filter API json object's Category attribute 11 | 12 | The search box is a full text search of all the json object's text. So if you search "add", you should get the result containing "address". 13 | 14 | ## API data 15 | The api data is defined in apidata.js as a json object. 16 | 17 | Each API json object has five attributes 18 | - *name*: String, the API's name. 19 | - *Lane*: Integer, could be only 4 or 8 right now. 
20 | - *Type*: String, the API's base(scalar) type, could be one of the following types 21 | + bool 22 | + int8_t 23 | + uint8_t 24 | + int16_t 25 | + uint16_t 26 | + int32_t 27 | + uint32_t 28 | + int64_t 29 | + uint64_t 30 | + float 31 | + double 32 | - *Category*: String, could be 33 | + datatype: data type or constructor 34 | + math: arithmetic operations 35 | + bitop: bit operations 36 | + cmp: compare 37 | + load: load operation 38 | + store: store operation 39 | + cast: cast operation 40 | + other: other operations 41 | - *Description*: String, detail description 42 | - *Example*: String. Optional. Example code. 43 | 44 | Example 45 | ```json 46 | { name: "svec< 4, bool >", 47 | Lane:4, 48 | Type: "bool", 49 | Category: "datatype", 50 | Description: "Data representation and operations on a vector of 4 boolean values. This is used in predicated vector operations. Specifically the ith value of svec<4,bool> indicates whether the ith lane of a predicated vector operation is enabled or not", 51 | Example: "Sample code. Use
for line break" 52 | } 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/apiguide/apidata.js: -------------------------------------------------------------------------------- 1 | apidata = [ 2 | { name: "svec< 4, bool >", 3 | Lane:4, 4 | Type: "bool", 5 | Category: "datatype", 6 | Description: "Data representation and operations on a vector of 4 boolean values. This is used in predicated vector operations. Specifically the ith value of svec<4,bool> indicates whether the ith lane of a predicated vector operation is enabled or not." 7 | }, 8 | { name: "svec< 4, bool >::svec()", 9 | Lane:4, 10 | Type: "bool", 11 | Category: "datatype", 12 | Description: "Default constructor.
Return a vector of 4 undefined values" 13 | }, 14 | { name: "svec< 4, bool >::svec(uint32_t a)", 15 | Lane:4, 16 | Type: "bool", 17 | Category: "datatype", 18 | Description: "Constructor.
Return a vector of 4 mask/booleans: {a,a,a,a}.
Note:a must be either 0 or -1." 19 | }, 20 | { name: "svec< 4, bool >::svec(uint32_t a, uint32_t b, uint32_t c, uint32_t d)", 21 | Lane:4, 22 | Type: "bool", 23 | Category: "datatype", 24 | Description: "Constructor.
Return a vector of 4 mask/booleans: {a,b,c,d}.
Note:a,b,c,d must be either 0 or -1." 25 | }, 26 | { name: "svec< 4, bool >::operator[](int index)", 27 | Lane:4, 28 | Type: "bool", 29 | Category: "other", 30 | Description: "Set or get the vector element specified by index.", 31 | Example: "svec<4,bool> mask(0,-1,-1,0);
bool a = mask[0];//a is false
mask[2] = 0; //mask is now{0,-1,0,0}" 32 | }, 33 | { name: "svec< 4, bool >::operator==(svec<4,bool> a)", 34 | Lane:4, 35 | Type: "bool", 36 | Category: "cmp", 37 | Description: "Element-wise compare equal. Return a bool vector.", 38 | Example: "a == b" 39 | }, 40 | { name: "svec< 4, bool >::operator!=(svec<4,bool> a)", 41 | Lane:4, 42 | Type: "bool", 43 | Category: "cmp", 44 | Description: "Element-wise compare not equal. Return a bool vector", 45 | Example: "a != b" 46 | }, 47 | { name: "svec< 4, bool >::store (svec< 4, bool > *p)", 48 | Lane:4, 49 | Type: "bool", 50 | Category: "store", 51 | Description: "Store the vector to address p. p does not have to be aligned. Each svec< 4, bool > requires 16 bytes", 52 | Example: "svec< 4, bool > mask(0,-1,-1,0);
void* dst=...;
mask.store((svec< 4, bool > *)dst);" 53 | }, 54 | { name: "static svec< 4, bool >::load (svec< 4, bool > *p)", 55 | Lane:4, 56 | Type: "bool", 57 | Category: "load", 58 | Description: "Class method, load the vector from the pointer p, and return a new svec< 4, bool > vector. p does not have to be aligned. Each svec< 4, bool > requires 16 bytes", 59 | Example: "void* src=...;
svec< 4, bool > mask = svec< 4, bool >::load((svec< 4, bool >*)src);" 60 | }, 61 | { name: "svec< 4, bool >::any_true()", 62 | Lane:4, 63 | Type: "bool", 64 | Category: "other", 65 | Description: "Check if any element in the mask vector is true. Return true if at least one element in the mask vector is true, otherwise false. This is a reduction operation that returns a scalar value.", 66 | Example: "" 67 | }, 68 | { name: "svec< 4, bool >::all_true()", 69 | Lane:4, 70 | Type: "bool", 71 | Category: "other", 72 | Description: "Check if all the elements in the mask vector is true. Return true if all the elements in the mask vector are true, otherwise false. This is a reduction operation that returns a scalar value.", 73 | Example: "" 74 | }, 75 | { name: "svec< 4, bool >::none_true()", 76 | Lane:4, 77 | Type: "bool", 78 | Category: "other", 79 | Description: "Check all the elements in the mask vector is false. Return true if all the elements in the mask vector are false, otherwise false. This is a reduction operation that returns a scalar value.", 80 | Example: "" 81 | }, 82 | ]; -------------------------------------------------------------------------------- /docs/apiguide/apiguide.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | Generic SIMD API Guide 14 | 66 | 67 | 68 | 71 | 72 |
73 |
74 | Search API: 75 |
76 | 77 |
78 |

Lanes

79 |
    80 |
  • All
  • 81 |
  • LANE4
  • 82 |
  • LANE8
  • 83 |
84 |
85 |
86 |

Types

87 |
    88 |
  • All
  • 89 |
  • bool
  • 90 |
  • int8_t
  • 91 |
  • uint8_t
  • 92 |
  • int16_t
  • 93 |
  • uint16_t
  • 94 |
  • int32_t
  • 95 |
  • uint32_t
  • 96 |
  • int64_t
  • 97 |
  • uint64_t
  • 98 |
  • float
  • 99 |
  • double
  • 100 |
101 |
102 |
103 |

Categories

104 |
    105 |
  • All
  • 106 |
  • Datatype/Constructor
  • 107 |
  • Arithmetic
  • 108 |
  • Bit Manipulation
  • 109 |
  • Compare
  • 110 |
  • Load
  • 111 |
  • Store
  • 112 |
  • Cast
  • 113 |
  • Other
  • 114 |
115 |
116 | 117 |
118 |
119 |

API List

120 |
121 | 122 |
123 | 124 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /docs/apiguide/vendor/filter.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Filter.js 3 | * version: 1.5.1 (22/4/2013) 4 | * 5 | * Licensed under the MIT: 6 | * http://www.opensource.org/licenses/mit-license.php 7 | * 8 | * Copyright 2013 Jiren Patel[ joshsoftware.com ] 9 | * 10 | * Dependency: 11 | * jQuery(v1.8 >=) 12 | */ 13 | 14 | (function(window) { 15 | 16 | 'use strict'; 17 | 18 | var FilterJS = function(data, container, view, options) { 19 | return new _FilterJS(data, container, view, options); 20 | }; 21 | 22 | FilterJS.VERSION = '1.5.1'; 23 | 24 | $.fn.filterjs = function(data, view, options) { 25 | var $this = $(this); 26 | if ($this.data('fjs')) return; 27 | $this.data('fjs', new _FilterJS(data, $this, view, options)); 28 | }; 29 | 30 | window.FilterJS = FilterJS; 31 | 32 | var _FilterJS = function(data, container, view, options) { 33 | var property_count = 0; 34 | 35 | this.data = data; 36 | this.view = view; 37 | this.container = container; 38 | this.options = options || {}; 39 | this.categories_map = {} 40 | this.record_ids = []; 41 | 42 | if (this.data.constructor != Array) this.data = [this.data]; 43 | 44 | for (name in this.data[0]){ 45 | this.root = name; 46 | property_count += 1; 47 | } 48 | 49 | if (property_count == 1){ 50 | this.getRecord = function(i, d){ return d[i][this.root]; } 51 | }else{ 52 | this.getRecord = function(i, d){ return d[i]; } 53 | this.root = 'fjs'; 54 | } 55 | 56 | this.id_field = this.options.id_field || 'id'; 57 | this.render(this.data); 58 | this.parseOptions(); 59 | this.buildCategoryMap(this.data); 60 | this.bindEvents(); 61 | 62 | this.options.callbacks = this.options.callbacks || {}; 63 | this.execCallBack('after_init', this.record_ids); 64 | this.execCallBack('after_add', this.data); 65 | this.options.filter_types = this.options.filter_types || {}; 66 | 
67 | if (!this.options.filter_types['range']) 68 | this.options.filter_types['range'] = this.rangeFilter; 69 | 70 | this.options.streaming = this.options.streaming || {}; 71 | if (this.options.streaming.data_url){ 72 | this.options.streaming.stream_after = (this.options.streaming.stream_after || 2)*1000; 73 | this.options.streaming.batch_size = this.options.streaming.batch_size || false; 74 | this.streamData(this.options.streaming.stream_after); 75 | } 76 | 77 | return this; 78 | }; 79 | 80 | _FilterJS.prototype = { 81 | 82 | //Render Html using JSON data 83 | render: function(data, offset) { 84 | var $container = $(this.container), record, el; 85 | 86 | if (!data) return; 87 | 88 | for (var i = 0, l = data.length; i < l; i++){ 89 | record = this.getRecord(i, data); 90 | el = $(this.view(record)); 91 | el.attr({id: this.root + '_' + record[this.id_field], 'data-fjs': true}); 92 | el = $container.append(el); 93 | } 94 | }, 95 | 96 | //Bind Events to filter html elements 97 | bindEvents: function() { 98 | var self = this, s = this.options.selectors, i = 0, l = s.length; 99 | 100 | for (i; i < l; i++){ 101 | this.bindSelectorEvent(s[i], self); 102 | } 103 | 104 | if (this.options.search){ 105 | $(this.options.search.input).on('keyup', function(e){ 106 | self.filter(); 107 | }); 108 | } 109 | }, 110 | 111 | bindSelectorEvent: function(selector, context) { 112 | $(selector.element).on(selector.events, function(e) { 113 | context.filter(); 114 | }); 115 | }, 116 | 117 | //Unbind fileter events 118 | clear: function() { 119 | var s = this.options.selectors, i = 0, l = s.length; 120 | 121 | for (i; i < l; i++) 122 | $(s[i].element).off(s[i].events); 123 | 124 | if (this.options.search) $(this.options.search.input).off('keyup'); 125 | 126 | this.category_map = null; 127 | this.record_ids = null; 128 | }, 129 | 130 | //Find elements accroding to selection criteria. 
131 | filter: function(){ 132 | var result, s, selected_vals, records, selected_none = false, i = 0, l = this.options.selectors.length; 133 | 134 | for (i; i < l; i++){ 135 | s = this.options.selectors[i]; 136 | selected_vals = $(s.element).filter(s.select).map(function() { 137 | return $(this).val(); 138 | }); 139 | 140 | if (selected_vals.length) { 141 | records = this.findObjects(selected_vals, this.categories_map[s.name], this.options.filter_types[s.type]); 142 | 143 | result = $.grep((result || this.record_ids), function(v) { 144 | return (records.indexOf(v) != -1); 145 | }); 146 | }else{ 147 | selected_none = true; 148 | } 149 | } 150 | 151 | if (selected_none && this.options.and_filter_on) result = []; 152 | 153 | if (this.options.search) result = this.search(this.options.search, result); 154 | 155 | this.hideShow(result); 156 | 157 | this.execCallBack('after_filter', result); 158 | }, 159 | 160 | //Compare and collect objects 161 | findObjects: function(category_vals, category_map, filter_type_func) { 162 | var r = [], ids, category_val, i = 0, l = category_vals.length; 163 | 164 | for (i; i < l; i++){ 165 | category_val = category_vals[i]; 166 | 167 | if (filter_type_func){ 168 | ids = $.map(category_map, function(n,v){ 169 | if (filter_type_func(category_val, v)) return n; 170 | }); 171 | } else { 172 | ids = category_map.constructor == Array ? category_map : category_map[category_val]; 173 | } 174 | 175 | if (ids) r = r.concat(ids); 176 | } 177 | 178 | return r; 179 | }, 180 | 181 | //Make eval expresssion to collect object from the json data. 
182 | buildEvalString: function(field_map) { 183 | var fields = field_map.split('.ARRAY.'), eval_str, i = 1, l = fields.length; 184 | 185 | eval_str = fields[0]; 186 | 187 | for (i; i < l; i++) { 188 | eval_str += ".filter_collect('" + fields[i] + "')"; 189 | } 190 | 191 | return eval_str; 192 | }, 193 | 194 | addFilterCriteria: function(name, criteria, ids_or_mapping) { 195 | this.categories_map[name] = {}; 196 | 197 | var selector = this.parseSelectorOptions({name: name}, [criteria]); 198 | ids_or_mapping = ids_or_mapping || $(selector.element).data('ids') || []; 199 | 200 | this.options.selectors.push(selector); 201 | this.categories_map[name] = ids_or_mapping; 202 | 203 | this.bindSelectorEvent(selector, this); 204 | }, 205 | 206 | //Create map accroding to selection criteria. 207 | parseOptions: function() { 208 | var filter_criteria = this.options.filter_criteria, selector, criteria, ele, ele_type; 209 | this.options.selectors = []; 210 | 211 | for (name in filter_criteria) { 212 | 213 | criteria = filter_criteria[name]; 214 | selector = this.parseSelectorOptions({name: name}, criteria); 215 | 216 | this.options.selectors.push(selector); 217 | 218 | criteria.push(this.buildEvalString(criteria[1])); 219 | this.categories_map[name] = {}; 220 | } 221 | }, 222 | 223 | parseSelectorOptions: function(selector, criteria) { 224 | selector.element = criteria[0].split(/.EVENT.|.SELECT.|.TYPE./)[0]; 225 | selector.events = (criteria[0].match(/.EVENT.(\S*)/) || [])[1]; 226 | selector.select = (criteria[0].match(/.SELECT.(\S*)/) || [])[1]; 227 | selector.type = (criteria[0].match(/.TYPE.(\S*)/) || [])[1]; 228 | 229 | var ele = $(selector.element), 230 | ele_type = ele.attr('type'); 231 | 232 | if (!selector.select){ 233 | if (ele.get(0).tagName == 'INPUT'){ 234 | if (ele_type == 'checkbox' || ele_type == 'radio'){ 235 | selector.select = ':checked'; 236 | }else if (ele_type == 'hidden'){ 237 | selector.select = ':input'; 238 | } 239 | }else if (ele.get(0).tagName == 
'SELECT'){ 240 | selector.select = 'select'; 241 | } 242 | } 243 | 244 | if (!selector.events){ 245 | if (ele_type == 'checkbox' ||ele_type == 'radio'){ 246 | selector.events = 'click'; 247 | }else if (ele_type == 'hidden' || ele.get(0).tagName == 'SELECT'){ 248 | selector.events = 'change'; 249 | } 250 | } 251 | 252 | return selector; 253 | }, 254 | 255 | buildCategoryMap: function(data) { 256 | var filter_criteria = this.options.filter_criteria, record, categories, obj, x; 257 | 258 | for (var i = 0, l = data.length; i < l; i++){ 259 | record = this.getRecord(i, data); 260 | this.record_ids.push(record[this.id_field]); 261 | 262 | for (name in filter_criteria) { 263 | categories = eval('record.' + filter_criteria[name][2]); 264 | obj = this.categories_map[name]; 265 | 266 | if (categories && categories.constructor == Array) { 267 | for (var j = 0, lj = categories.length; j < lj; j++){ 268 | x = categories[j]; 269 | obj[x] ? obj[x].push(record[this.id_field]) : obj[x] = [record[this.id_field]]; 270 | } 271 | } else { 272 | obj[categories] ? obj[categories].push(record[this.id_field]) : obj[categories] = [record[this.id_field]]; 273 | } 274 | } 275 | } 276 | }, 277 | 278 | hideShow: function(ids) { 279 | var e_id = '#' + this.root + '_', i = 0, l = ids.length; 280 | 281 | $(this.container + ' > *[data-fjs]').hide(); 282 | 283 | for (i; i < l; i++) 284 | $(e_id + ids[i]).show(); 285 | }, 286 | 287 | search: function (search_config, filter_result) { 288 | var val = $.trim($(search_config.input).val()); 289 | var search_in = search_config.search_in; 290 | var min_length = $.isNumeric(search_config.min_length) ? 
search_config.min_length : 1; 291 | 292 | if (val.length < min_length) return filter_result; 293 | 294 | var id_prefix = '#' + this.root + '_'; 295 | val = val.toUpperCase(); 296 | 297 | return $.map(filter_result, function (id) { 298 | var $ele = $(id_prefix + id); 299 | 300 | if (search_in) $ele = $ele.find(search_in); 301 | 302 | if ($ele.text().toUpperCase().indexOf(val) >= 0) return id; 303 | }); 304 | }, 305 | 306 | execCallBack: function(type, result){ 307 | if(this.options.callbacks[type]) 308 | this.options.callbacks[type].call(this, result) 309 | }, 310 | 311 | rangeFilter: function(category_value, v){ 312 | var range = category_value.split('-'); 313 | 314 | if (range.length == 2){ 315 | if (range[0] == 'below') range[0] = -Infinity; 316 | if (range[1] == 'above') range[1] = Infinity; 317 | if (Number(v) >= range[0] && Number(v) <= range[1]){ 318 | return true; 319 | } 320 | } 321 | }, 322 | 323 | //Collect Records by id array 324 | getRecordsByIds: function(ids){ 325 | var records = [], r, i = 0, l = this.data.length; 326 | 327 | for (i; i < l; i++){ 328 | r = this.getRecord(i, this.data); 329 | if (ids.indexOf(r[this.id_field]) != -1) records.push(r) 330 | } 331 | 332 | return records; 333 | }, 334 | 335 | addData: function(data){ 336 | if (data == undefined || data.length == 0 ) return; 337 | 338 | var i = 0, l = data.length, r, uniq_data = [], e_id = '#' + this.root + '_'; 339 | 340 | this.execCallBack('before_add', data) 341 | 342 | //for (i, l; i < l; i++){ 343 | // r = this.getRecord(i, data); 344 | // if ($(e_id + r.id).length == 0) uniq_data.push(data[i]); 345 | //} 346 | 347 | this.data = this.data.concat(data); 348 | this.render(data); 349 | this.buildCategoryMap(data); 350 | this.execCallBack('after_add', data) 351 | this.filter(); 352 | }, 353 | 354 | setStreamingTimer: function(){ 355 | var self = this, 356 | timer_func = this.options.streaming.batch_size ? 
setInterval : setTimeout; 357 | 358 | return timer_func(function(){ 359 | self.streamData(); 360 | }, this.options.streaming.stream_after); 361 | }, 362 | 363 | clearStreamingTimer: function(){ 364 | if (this.timer) clearTimeout(this.timer); 365 | }, 366 | 367 | fetchData: function(){ 368 | var self = this, 369 | params = this.options.params || {}, 370 | opts = this.options.streaming; 371 | 372 | params['offset'] = this.data.length; 373 | 374 | if (opts.batch_size) params['limit'] = opts.batch_size; 375 | if (this.options.search) params['q'] = $.trim($(this.options.search.input).val()); 376 | 377 | $.getJSON(opts.data_url, params).done(function(data){ 378 | 379 | if (params.limit != null && (!data || !data.length)){ 380 | self.stopStreaming(); 381 | }else{ 382 | self.setStreamInterval(); 383 | self.addData(data); 384 | } 385 | 386 | }).fail(function(e){ 387 | self.stopStreaming(); 388 | }); 389 | }, 390 | 391 | setStreamInterval: function(){ 392 | var self = this; 393 | if(self.options.streaming.stop_streaming == true) return; 394 | 395 | self.timer = setTimeout(function(){ 396 | self.fetchData(); 397 | }, self.options.streaming.stream_after); 398 | }, 399 | 400 | stopStreaming: function(){ 401 | this.options.streaming.stop_streaming = true; 402 | if (this.timer) clearTimeout(this.timer); 403 | }, 404 | 405 | resumeStreaming: function(){ 406 | this.options.streaming.stop_streaming = false; 407 | this.streamData(this.options.streaming.stream_after); 408 | }, 409 | 410 | streamData: function(time){ 411 | this.setStreamInterval(); 412 | if(!this.options.streaming.batch_size) this.stopStreaming(); 413 | } 414 | 415 | } 416 | 417 | 418 | })(this); 419 | 420 | /** 421 | * Recursive method to collect object from json object. 422 | * i.e. 
test = [ {"deal": {"id": 1 }}, {"deal": {"id": 2}}] 423 | * - to collect id from the json data 424 | * test.filter_collect('deal').filter_collect('id') 425 | * this will return [1,2] 426 | */ 427 | Array.prototype.filter_collect = function(field, arr) { 428 | var arr = arr || []; 429 | for (var i = 0, l = this.length; i < l; i++){ 430 | var obj = this[i]; 431 | if (obj.constructor == Array){ 432 | obj.filter_collect(field, arr); 433 | } 434 | else { 435 | arr.push(obj[field]); 436 | } 437 | } 438 | 439 | return arr; 440 | }; 441 | 442 | //In IE indexOf method not define. 443 | if (!Array.prototype.indexOf) { 444 | Array.prototype.indexOf = function(obj, start) { 445 | for (var i = (start || 0), j = this.length; i < j; i++) { 446 | if (this[i] === obj) { return i; } 447 | } 448 | return -1; 449 | } 450 | } 451 | -------------------------------------------------------------------------------- /docs/developer_guide.md: -------------------------------------------------------------------------------- 1 | #Developer Guide 2 | 3 | 4 | ##Source code structure 5 | 6 | The package contains the following structures: 7 | - docs Generate html docs through doxygen 8 | - examples Examples using the library 9 | - include The library source code 10 | - tests Unit tests, test library implementation 11 | 12 | When adding a new target platform, you need to add .h to include/ and may need to modify makefiles so that the new platform can be tested and will run with existing examples. 13 | 14 | ##Run unit tests 15 | 16 | Unit tests are in tests directory. The current unit test covers power_vsx4.h, generic4.h, generic8.h and sse4.h 17 | 18 | Please download googletest framework first from https://code.google.com/p/googletest/, and unzip it into "tests/gtest-1.6.0" directory. 19 | Or you can unzip it to where you want, and modify the "GTEST_DIR" value in tests/Makefile. 
20 | 21 | Then you can run the test 22 | ```bash 23 | $ cd tests 24 | $ make clean 25 | $ make {vsx4|sse4|generic4|generic8} # build/run unit tests for target SIMD ISA 26 | ``` 27 | The test app will test vsx4, generic4 and sse4 interfaces, and generate the report. 28 | 29 | 30 | ##Generate the documentation 31 | 32 | We use doxygen to generate documentation. The input files for doxygen are under /docs/. To update the documentation, either modify the *.txt files or doxygen annotations in the library source code. 33 | 34 | To publish new documentation, you need to go through the following steps: 35 | 36 | 1. Make sure you have doxygen installed 37 | 38 | 2. Checkout the gh-pages branch of your project to docs/gh-pages.github 39 | ```bash 40 | $ add docs/gh-pages.github to .gitignore 41 | $ cd docs 42 | # create a branch gh-pages from the github web interface 43 | # clone the project repo to docs/gh-pages.github 44 | $ git clone -b gh-pages https://github.com/genericsimd/generic_simd.git gh-pages.github 45 | $ cd gh-pages.github 46 | # remove all trunk files from gh-pages branch 47 | ``` 48 | 49 | 3. Generate new doxygen pages and copy into gh-pages.github 50 | ```bash 51 | $ cd docs 52 | $ make # generate documentation into docs/html 53 | $ make gitpub # copy docs/html into docs/gh-pages.github 54 | $ cd gh-pages.github 55 | $ git add -A # add everything under the directory 56 | $ git commit -a # checkin new documentation to github 57 | $ git push # push to github 58 | ``` 59 | Note: it may take 10 minutes before the new pages appear on http://genericsimd.github.io/generic_simd 60 | -------------------------------------------------------------------------------- /docs/doxygen_main.txt: -------------------------------------------------------------------------------- 1 | /** 2 | \mainpage 3 | This documentation describes the API of Generic SIMD Intrinsic Library. 4 | 5 | For usage instructions, please see the document https://github.com/genericsimd/generic_simd. 
6 | 7 | */ -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | #Frequently Asked Questions 2 | 3 | ## Frequently asked questions 4 | 5 | 1. What target SIMD platforms does the library support? 6 | 7 | Currently we support two SIMD platforms, SSE4.2 and VSX. We also 8 | support a generic implementation of the library using scalar code. 9 | 10 | 2. Failed to build unit tests under tests/ 11 | 12 | ```bash 13 | -bash-4.1$ make 14 | g++ -Igtest-1.6.0/include -Igtest-1.6.0 -c gtest-1.6.0/src/gtest-all.cc 15 | g++: error: gtest-1.6.0/src/gtest-all.cc: No such file or directory 16 | g++: fatal error: no input files 17 | ``` 18 | 19 | Our unit test engine uses the Google Test framework. Due to open-source 20 | license issues, googletest is not included in our source tree. Please 21 | download googletest from [here](https://code.google.com/p/googletest/) 22 | and unzip it into "tests/gtest-1.6.0/". Or you can unzip it to 23 | where you want, and modify the "GTEST_DIR" value in tests/Makefile. 24 | 25 | 3. Could I get slightly different results using svec_madd and 26 | svec_msub on different platforms? 27 | 28 | The vsx::svec_madd() and vsx::svec_msub() functions are mapped into madd and 29 | msub intrinsics directly, while generic::svec_madd() and 30 | generic::svec_msub() are implemented by scalar code. On rare occasions, 31 | a fused operation by one hardware instruction provides higher precision 32 | in float operations. So it's possible that vsx and generic provide 33 | slightly different results. 
34 | 35 | ## Known Bugs 36 | 37 | -------------------------------------------------------------------------------- /docs/getting_started.md: -------------------------------------------------------------------------------- 1 | #Getting Started 2 | 3 | ##Getting the source 4 | 5 | Clone the library from github 6 | ```bash 7 | $ git clone https://github.com/genericsimd/generic_simd.git generic_simd 8 | ``` 9 | The package contains the following directories: 10 | 11 | - docs/ input to doxygen and makefile to generate documents 12 | - examples/ Examples using the library 13 | - include/ The library source code 14 | - tests/ Unit tests, test library implementation 15 | 16 | ##Using the library 17 | 18 | The library is implemented completely inside header files, all of which are under include/. To use the library, follow these steps: 19 | 20 | 1. Include the library header into your source code 21 | 2. Programming according to library API 22 | 3. Build the binary w/ standard g++ like this: 23 | ```bash 24 | g++ -I /include -m{vsx|sse4.2} -Wno-int-to-pointer-cast -flax-vector-conversions ... 
25 | ``` 26 | - -mvsx: standard g++ option to generate VSX instructions 27 | - -msse4.2: standard g++ option to generate SSE4.2 instructions 28 | - if no -mvsx or -msse4.2 is specified: generate scalar codes emulating generic SIMD intrinsics 29 | - -Wno-int-to-pointer-cast -flax-vector-conversions: ignore some warnings and enable vector casts 30 | 31 | Consider the hello-world example: 32 | ```cpp 33 | //HelloSIMD.cpp 34 | #include <gsimd.h> 35 | #include <iostream> 36 | 37 | int main (int argc, char* argv[]) 38 | { 39 | svec<4,float> v1(1.1, 2.2, 3.3, 4.4); 40 | svec<4,float> v2 = v1 * 2; 41 | std::cout << "Hello World: " << v2 << std::endl; 42 | return 0; 43 | } 44 | ``` 45 | 46 | Example#1: how to build for VSX 47 | ```bash 48 | $ g++ -I../../include HelloSIMD.cpp -mvsx -flax-vector-conversions -o HelloSIMD -Wno-int-to-pointer-cast 49 | $ ./HelloSIMD 50 | Hello World: svec4_f[2.2, 4.4, 6.6, 8.8] 51 | ``` 52 | 53 | Example#2: how to build for SSE4.2 54 | ``` 55 | $ g++ -I../../include HelloSIMD.cpp -msse4.2 -o HelloSIMD -Wno-int-to-pointer-cast 56 | $ ./HelloSIMD 57 | Hello World: svec4_f[2.2, 4.4, 6.6, 8.8] 58 | ``` 59 | 60 | ##Running examples 61 | 62 | We provide a few examples under examples/, including: 63 | 64 | - HelloSIMD hello-world example 65 | - mandelbrot mandelbrot algorithm 66 | - RGB2Gray RGB to gray conversion 67 | 68 | To try out these examples, simply 69 | ```bash 70 | $ cd examples/RGB2Gray 71 | $ make 72 | $ make run 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/history.md: -------------------------------------------------------------------------------- 1 | #History 2 | 3 | ##Version 0.2 4 | 5 | - Add Intel SSE4.2 LANES=4 implementation. 6 | - Add gather_stride() and scatter_stride() interface. 7 | 8 | ##Version 0.1 9 | 10 | - Initial implementation for LANES=4. Including power vsx and generic support. 
11 | -------------------------------------------------------------------------------- /docs/img/intel2power_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/intel2power_apps.jpg -------------------------------------------------------------------------------- /docs/img/intel_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/intel_apps.jpg -------------------------------------------------------------------------------- /docs/img/power2intel_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/power2intel_apps.jpg -------------------------------------------------------------------------------- /docs/img/power_apps.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/genericsimd/generic_simd/3a92a1983b195a790742b3dce93a8bc3d0679dff/docs/img/power_apps.jpg -------------------------------------------------------------------------------- /docs/performance.md: -------------------------------------------------------------------------------- 1 | #Performance Data 2 | 3 | 4 | ##Performance Evaluation 5 | 6 | The performance goal of the generic SIMD intrinsic library is to match up with the performance of the same code written with platform-specific intrinsics. We have ported some examples from an intrinsics implementation to a Generic SIMD implementation, and evaluated their performance. 
7 | 8 | Applications include 9 | 10 | - RGB2Gray, float 11 | - Mandelbrot, float 12 | - SPSS-sweep, double 13 | - Dgemm, double 14 | - CVA-Mean, float 15 | 16 | Some of them only have a Power Platform intrinsics implementation, and some of them only have an Intel intrinsics implementation. We compared the generic SIMD's performance with the intrinsics implementation by measuring their speedup over scalar code. 17 | 18 | We also evaluated the portability of our Generic SIMD intrinsics library by running the generic SIMD version of the application on the platform without an intrinsics implementation. 19 | 20 | Below are the results. 21 | 22 | The following figures show the performance speedup versus the scalar code on different platforms. 23 | 24 | Higher is better. 25 | 26 | __App on Power Platform__
![App on Power Platform](img/power_apps.jpg) 27 | 28 | __App on Intel Platform__
![App on Intel Platform](img/intel_apps.jpg) 29 | 30 | __App with Power Intrinsics only__
![App with Power Intrinsics only](img/power2intel_apps.jpg) 31 | 32 | __App with Intel Intrinsics only__
![App with Intel Intrinsics only](img/intel2power_apps.jpg) 33 | -------------------------------------------------------------------------------- /docs/programming_guide.md: -------------------------------------------------------------------------------- 1 | #Programming Guide 2 | 3 | For detailed interface specification, refer to [Generic SIMD intrinsics library API] (http://genericsimd.github.io/generic_simd/index.html) 4 | 5 | ##Data Types 6 | 7 | The library supports templaterized SIMD vector types, *svec*, 8 | where *N* specifies elements per vector and has to be power of two. 9 | *STYPE* specifies scalar type of vector element: *bool*, *char*, "unsigned 10 | char*, *short*, *unsigned short*, *int*, *unsigned int*, *long long*, 11 | *unsigned long long*, *float*, and *double*. 12 | 13 | Currently the library supports only N = 4 14 | 15 | - *svec<4,bool>*: vector of 4 boolean 16 | - *svec<4,int8_t>, svec<4,uint8_t>: vector of 4 signed/unsigned 8-bit int 17 | - *svec<4,int16_t>*, *svec<4,uint16_t>*: vector of 4 signed/unsigned 16-bit int 18 | - *svec<4,int32_t>*, *svec<4,int32_t>*: vector of 4 signed/unsigned 32-bit int 19 | - *svec<4,int64_t>*, *svec<4,uint64_t>*: vector 4 signed/unsigned 32-bit int 20 | - *svec<4,float>*: vector of 4 float 21 | - *svec<4,double>*: vector of 4 double 22 | - *svec<4,void*>*: vector of 4 pointers 23 | 24 | In the rest of the document we use VTYPE to indicate SIMD vector types. 25 | 26 | ##Operations 27 | 28 | ###Constructor 29 | 30 | - Default constructor returns a vector with undefined value. e.g. "svec<4,int32_t> v;" 31 | You can modify it's elements by "[]" operator. 32 | - Construct a SIMD vector with four scalar values. e.g. "svec<4,int32_t> v(1,2,3,4)" 33 | - Construct a SIMD vector with one scalar value. e.g. "svec<4,int32_t> v(100)". 34 | 35 | All the four values in the SIMD vector is 100. 36 | 37 | 38 | ###Extract/insert single vector element 39 | 40 | "[]" operator is used to get and set the elements. 
41 | ```c++ 42 | svec<4,int32_t> v(1,2,3,4); 43 | int a = v[2]; // extracts the 3rd element of the vector (i.e., element index starts from 0), a is 3 now 44 | v[3] = 10; // assigns 10 to the 3rd element of the vector, v is [1,2,3,10] now 45 | ``` 46 | 47 | Due to the current limitation, bool vector's setter must use "-1" as true in the right hand side. 48 | ```c++ 49 | svec<4,bool> m(0); // construct a vector of boolean with all elements initialized to false 50 | m[0] = -1; // after assignment, 1st element of m is true. 51 | ``` 52 | 53 | ###Load and Store 54 | 55 | Store a vector to location p through instance method store(VTYPE *). 56 | 57 | Load a vector from location p through class static method VTYPE::(VTYPE *). 58 | e.g. "svec<4,int32_t>::load(an_address)" will return a new svec<4,int32_t> vector. 59 | 60 | Load a scalar value from an address and splat it into the whole vector could be done through class static method VTYPE::load_and_splat(STYPE *) 61 | 62 | There is another method called VTYPE::load_const(STYPE*), which has similar semantics. 63 | 64 | ###Compare Operations 65 | 66 | Compare two vectors, and return a svec<4,bool> vector. 67 | 68 | Operators: == != for all types 69 | 70 | Operators: >, >=, <, <= for all types except svec<4,bool>. 71 | 72 | ###Bit operations 73 | 74 | svec<4,bool> has operator ~ to reverse the boolean value. 75 | 76 | Binary bit operators &, |, ^ are available for all integer vector types. 77 | 78 | Logical operators !, &&, || are available for svec<4,bool> type. 79 | 80 | ###Math operations 81 | 82 | Support all types except svec<4,bool>. 83 | 84 | Unary operator "-" is used to get the neg value for non-boolean vectors 85 | 86 | Binary operators +, -, *, / can support VTYPE op VTYPE, VTYPE op STYPE, STYPE op VTYPE. 87 | 88 | Binary operators >>, <<, % can support VTYPE op VTYPE, VTYPE op STYPE over all integer types. 89 | 90 | \>> and << for shift, and % for remainder. 
91 | 92 | Please note shift by a vector can only has unsigned integer vector in the right hand. 93 | 94 | ###Instance methods operations 95 | 96 | broadcast(), rotate(), shuffle() support all types exclude svec<4,bool>(). 97 | 98 | round(), floor(), ceil(), sqrt(), rcp(), rsqrt(), exp(), log(), pow(VTYPE) support svec<4,float>, and svec<4,double>. 99 | 100 | All above will return a new vector. 101 | 102 | reduce_add(), reduce_max(), reduce_min() do a vector scope's reduction, and return a scalar value. 103 | 104 | any_true(), all_true(), none_true() do a svec<4,bool> vector's reduction, and return a boolean scalar value. 105 | 106 | ###Gather and Scatter 107 | 108 | Please refer the detail document for how to use gather and scatter. 109 | E.g. svec<4,int32_t> type 110 | 111 | - svec<4,int32_t>::gather() 112 | - svec<4,int32_t>::scatter() 113 | - svec<4,int32_T>::gather_base_offsets() 114 | - svec<4,int32_t>::scatter_base_offsets() 115 | - svec<4,int32_t>::gather_stride() 116 | - svec<4,int32_t>::scatter_stride() 117 | 118 | **Note** The current power processor has no gather/scatter instructions. The software based implementation is slow right now, especially the gather_base_offsets() and scatter_base_offsets(). 119 | 120 | In case of regular stride style gather/scatter, it's better to use gather_stride() and scatter_stride(). 121 | 122 | ###Multiply-Add and Multiply-Sub 123 | 124 | VTYPE svec_madd(VTYPE a, VTYPE b, VTYPE c) returns a * b + c; 125 | 126 | VTYPE svec_msub(VTYPE a, VTYPE b, VTYPE c) returns a * b - c; 127 | 128 | VTYPE svec_nmsub(VTYPE a, VTYPE b, VTYPE c) returns -(a * b - c); 129 | 130 | ###Select operation 131 | 132 | The prototype is svec_select(svec<4,bool> mask, VTYPE a, VTYPE b), and return a new vector whose elements are selected from _a_ or _b_ based on the mask. True from _a_ and false from _b_. 133 | 134 | There is another select svec_select(bool cond, VTYPE a, VTYPE b), which is the same as "cond ? a : b". 
135 | 136 | ###Type cast operation 137 | 138 | The prototype is svec_cast(FROM_VTYPE). It supports all combinations of type cast. Each element's cast semantics is the same as scalar cast. 139 | 140 | ###Operation with mask 141 | 142 | load, store, gatter, scatter, compare operations have a masked version. 143 | Please refer the detail document for detail. 144 | -------------------------------------------------------------------------------- /examples/HelloSIMD/HelloSIMD.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | 34 | /* 35 | * g++ -I../../include HelloSIMD.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -o HelloSIMD 36 | * */ 37 | 38 | #include 39 | #include 40 | 41 | int main (int argc, char* argv[]) 42 | { 43 | svec<4,float> v1(1.1, 2.2, 3.3, 4.4); 44 | svec<4,float> v2 = v1 * 2; 45 | std::cout << "Hello World: " << v2 << std::endl; 46 | return 0; 47 | } 48 | 49 | -------------------------------------------------------------------------------- /examples/HelloSIMD/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=HelloSIMD 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/RGB2Gray/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=RGB2Gray 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/RGB2Gray/RGB2Gray.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /** 34 | * RGB2Gray.cpp 35 | * 36 | * Created on: Jun 12, 2013 37 | * @author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 38 | 39 | */ 40 | 41 | 42 | /* 43 | * g++ -I../../include RGB2Gray.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -g -O2 -o RGB2Gray 44 | * */ 45 | 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | #define N (1048576) 55 | //#define N (1000000) 56 | 57 | //Doesn't work 58 | //__attribute__((optimize("no-tree-vectorize"))) 59 | 60 | void 61 | #ifdef __SSE4_2__ 62 | __attribute__((target("no-sse"))) 63 | #endif 64 | serial_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 65 | for(int i = 0; i < N; i++) { 66 | gray[i] = 0.3f * ra[i] + 0.59f * ga[i] + 0.11f * ba[i]; 67 | } 68 | } 69 | 70 | typedef svec<4,float> vfloat; 71 | 72 | void svec4_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 73 | 74 | for(int i = 0; i < N; i+=4) { 75 | vfloat a = vfloat::load((vfloat*)(ra+i)); 76 | vfloat b = vfloat::load((vfloat*)(ga+i)); 77 | vfloat c = vfloat::load((vfloat*)(ba+i)); 78 | vfloat out = 0.3f * a + 0.59f * b + 0.11f * c ; 79 | out.store((vfloat*)(gray+i)); 80 | } 81 | } 82 | 83 | void svec4_rgb2gray_ptr(float* ra, float* ga, float* ba, float* gray ) { 84 | 85 | for(int i = 0; i < N; i+=4) { 86 | vfloat a = *(vfloat*)(ra+i); 87 | vfloat b = *(vfloat*)(ga+i); 88 | vfloat c = *(vfloat*)(ba+i); 89 | vfloat out = 0.3f * a + 0.59f * b + 0.11f * c ; 90 | *(vfloat*)(gray+i) = out; 91 | } 92 | } 93 | 94 | #ifdef __ALTIVEC__ 95 | void intrinsics_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 96 | __vector float c1 = vec_splats(0.3f); 97 | __vector float c2 = vec_splats(0.59f); 98 | __vector float c3 = vec_splats(0.11f); 99 | 100 | for(int i = 0; i < N; i+=4) { 101 | __vector float a = vec_vsx_ld(0, ra+i); 102 | __vector float b = vec_vsx_ld(0, ga+i); 103 | __vector float c = vec_vsx_ld(0, ba+i); 104 | __vector float out = c1 * a + c2 
* b + c3 * c ; 105 | vec_vsx_st(out, 0, gray+i); 106 | } 107 | } 108 | #endif 109 | 110 | #ifdef __SSE4_2__ 111 | 112 | void sse_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 113 | __m128 c1 = _mm_set1_ps(0.3f); 114 | __m128 c2 = _mm_set1_ps(0.59f); 115 | __m128 c3 = _mm_set1_ps(0.11f); 116 | 117 | for(int i = 0; i < N; i+=4) { 118 | __m128 a = _mm_loadu_ps(ra+i); 119 | __m128 b = _mm_loadu_ps(ga+i); 120 | __m128 c = _mm_loadu_ps(ba+i); 121 | __m128 ab = _mm_add_ps(_mm_mul_ps(c1, a), _mm_mul_ps(c2, b)); 122 | __m128 out = _mm_add_ps(ab, _mm_mul_ps(c3, c)); 123 | _mm_storeu_ps(gray+i, out); 124 | } 125 | } 126 | 127 | #endif 128 | 129 | void svec4_rgb2gray_fma(float* ra, float* ga, float* ba, float* gray) { 130 | for(int i = 0; i < N; i+=4) { 131 | vfloat a = vfloat::load((vfloat*)(ra+i)); 132 | vfloat b = vfloat::load((vfloat*)(ga+i)); 133 | vfloat c = vfloat::load((vfloat*)(ba+i)); 134 | vfloat out = 0.3 * a; 135 | out = svec_madd(vfloat(0.59), b, out); 136 | out = svec_madd(vfloat(0.11), c, out); 137 | out.store((vfloat*)(gray+i)); 138 | } 139 | } 140 | 141 | 142 | float r[N+10000] POST_ALIGN(16); 143 | float g[N+20000] POST_ALIGN(16); 144 | float b[N+30000] POST_ALIGN(16); 145 | float gray[N+40000] POST_ALIGN(16); 146 | 147 | #define ITERATIONS 1000 148 | int main (int argc, char* argv[]) 149 | { 150 | for(int i = 0; i < N; i++) { 151 | r[N] = random() % 256; 152 | g[N] = random() % 256; 153 | b[N] = random() % 256; 154 | } 155 | std::cout<< "Convert " << N << " pixels RGB to gray." 
<< std::endl; 156 | 157 | reset_and_start_stimer(); 158 | for(int i = 0; i < ITERATIONS; i++) { serial_rgb2gray(r, g, b, gray);} 159 | double dt = get_elapsed_seconds(); 160 | std::cout<< "serial version: " << dt << " seconds" << std::endl; 161 | 162 | reset_and_start_stimer(); 163 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray(r, g, b, gray);} 164 | double dt2 = get_elapsed_seconds(); 165 | std::cout<< "svec4 version: " << dt2 << " seconds" << std::endl; 166 | 167 | reset_and_start_stimer(); 168 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_ptr(r, g, b, gray); } 169 | double dt3 = get_elapsed_seconds(); 170 | std::cout<< "svec4 ptr ld/st version: " << dt3 << " seconds" << std::endl; 171 | 172 | #ifdef __ALTIVEC__ 173 | reset_and_start_stimer(); 174 | for(int i = 0; i < ITERATIONS; i++) { intrinsics_rgb2gray(r, g, b, gray);} 175 | double dt5 = get_elapsed_seconds(); 176 | std::cout<< "Intrinsics version: " << dt5 << " seconds" << std::endl; 177 | #endif 178 | 179 | #ifdef __SSE4_2__ 180 | reset_and_start_stimer(); 181 | for(int i = 0; i < ITERATIONS; i++) { sse_rgb2gray(r, g, b, gray);} 182 | double dt6 = get_elapsed_seconds(); 183 | std::cout<< "SSE version: " << dt6 << " seconds" << std::endl; 184 | #endif 185 | 186 | reset_and_start_stimer(); 187 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_fma(r, g, b, gray); } 188 | double dt4 = get_elapsed_seconds(); 189 | std::cout<< "svec4 fma version: " << dt4 << " seconds" << std::endl; 190 | 191 | 192 | return 0; 193 | } 194 | 195 | -------------------------------------------------------------------------------- /examples/RGB2Gray/RGB2Gray_tune.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /* 34 | * RGB2Gray_tune.cpp 35 | * 36 | * Created on: Jun 12, 2013 37 | * Author: haichuan 38 | */ 39 | 40 | 41 | /* 42 | * g++ -I../../include RGB2Gray.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -g -O2 -o RGB2Gray 43 | * */ 44 | 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #ifdef __ALTIVEC__ 53 | #include 54 | using namespace vsx; 55 | #else 56 | #ifdef __SSE4_2__ 57 | #include 58 | using namespace sse; 59 | #else 60 | #include 61 | using namespace generic; 62 | #endif //__SSE4_2__ 63 | #endif //__ALTIVEC__ 64 | 65 | 66 | 67 | //#define N (16000) 68 | //#define N 1000000 69 | #define N (1048576) 70 | #ifdef __SSE4_2__ 71 | __attribute__((target("no-sse"))) 72 | #endif 73 | void serial_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 74 | for(int i = 0; i < N; i++) { 75 | gray[i] = 0.3f * ra[i] + 0.59f * ga[i] + 0.11f * ba[i]; 76 | } 77 | } 78 | 79 | void 80 | __attribute__((optimize("no-tree-vectorize"))) 81 | svec4_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 82 | 83 | for(int i = 0; i < N; i+=4) { 84 | svec4_f a = svec4_f::load((svec4_f*)(ra+i)); 85 | svec4_f b = svec4_f::load((svec4_f*)(ga+i)); 86 | svec4_f c = svec4_f::load((svec4_f*)(ba+i)); 87 | svec4_f out = 0.3f * a + 0.59f * b + 0.11f * c ; 88 | out.store((svec4_f*)(gray+i)); 89 | } 90 | } 91 | 92 | void 93 | __attribute__((optimize("no-tree-vectorize"))) 94 | svec4_rgb2gray_ptr(float* ra, float* ga, float* ba, float* gray ) { 95 | 96 | for(int i = 0; i < N; i+=4) { 97 | svec4_f a = *(svec4_f*)(ra+i); 98 | svec4_f b = *(svec4_f*)(ga+i); 99 | svec4_f c = *(svec4_f*)(ba+i); 100 | svec4_f out = 0.3f * a + 0.59f * b + 0.11f * c ; 101 | *(svec4_f*)(gray+i) = out; 102 | } 103 | } 104 | 105 | 106 | void 107 | __attribute__((optimize("no-tree-vectorize"))) 108 | svec4_rgb2gray_fma(float* ra, float* ga, float* ba, float* gray) { 109 | for(int i = 0; i < N; i+=4) { 110 | svec4_f a = 
svec4_f::load((svec4_f*)(ra+i)); 111 | svec4_f b = svec4_f::load((svec4_f*)(ga+i)); 112 | svec4_f c = svec4_f::load((svec4_f*)(ba+i)); 113 | svec4_f out = 0.3 * a; 114 | out = svec_madd(svec4_f(0.59), b, out); 115 | out = svec_madd(svec4_f(0.11), c, out); 116 | out.store((svec4_f*)(gray+i)); 117 | } 118 | } 119 | 120 | #ifdef __ALTIVEC__ 121 | void intrinsics_rgb2gray(float* ra, float* ga, float* ba, float* gray ) { 122 | __vector float c1 = vec_splats(0.3f); 123 | __vector float c2 = vec_splats(0.59f); 124 | __vector float c3 = vec_splats(0.11f); 125 | 126 | for(int i = 0; i < N; i+=4) { 127 | __vector float a = vec_vsx_ld(0, ra+i); 128 | __vector float b = vec_vsx_ld(0, ga+i); 129 | __vector float c = vec_vsx_ld(0, ba+i); 130 | __vector float out = c1 * a + c2 * b + c3 * c ; 131 | vec_vsx_st(out, 0, gray+i); 132 | } 133 | } 134 | #endif 135 | 136 | 137 | #ifdef __SSE4_2__ 138 | 139 | void sse_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 140 | __m128 c1 = _mm_set1_ps(0.3f); 141 | __m128 c2 = _mm_set1_ps(0.59f); 142 | __m128 c3 = _mm_set1_ps(0.11f); 143 | 144 | for(int i = 0; i < N; i+=4) { 145 | __m128 a = _mm_loadu_ps(ra+i); 146 | __m128 b = _mm_loadu_ps(ga+i); 147 | __m128 c = _mm_loadu_ps(ba+i); 148 | __m128 ab = _mm_add_ps(_mm_mul_ps(c1, a), _mm_mul_ps(c2, b)); 149 | __m128 out = _mm_add_ps(ab, _mm_mul_ps(c3, c)); 150 | _mm_storeu_ps(gray+i, out); 151 | } 152 | } 153 | #endif 154 | 155 | 156 | #ifdef __AVX__ 157 | #include "immintrin.h" 158 | void avx_rgb2gray(float* ra, float* ga, float* ba, float* gray) { 159 | __m256 c1 = _mm256_set1_ps(0.3f); 160 | __m256 c2 = _mm256_set1_ps(0.59f); 161 | __m256 c3 = _mm256_set1_ps(0.11f); 162 | 163 | for(int i = 0; i < N; i+=8) { 164 | __m256 a = _mm256_loadu_ps(ra+i); 165 | __m256 b = _mm256_loadu_ps(ga+i); 166 | __m256 c = _mm256_loadu_ps(ba+i); 167 | __m256 ab = _mm256_add_ps(_mm256_mul_ps(c1, a), _mm256_mul_ps(c2, b)); 168 | __m256 out = _mm256_add_ps(ab, _mm256_mul_ps(c3, c)); 169 | 
_mm256_storeu_ps(gray+i, out); 170 | } 171 | } 172 | 173 | #endif 174 | 175 | float r[N+10000] POST_ALIGN(16); 176 | float g[N+20000] POST_ALIGN(16); 177 | float b[N+30000] POST_ALIGN(16); 178 | float gray[N+40000] POST_ALIGN(16); 179 | 180 | #define ITERATIONS 1000 181 | int main (int argc, char* argv[]) 182 | { 183 | 184 | for(int i = 0; i < N; i++) { 185 | r[N] = random() % 256; 186 | g[N] = random() % 256; 187 | b[N] = random() % 256; 188 | } 189 | std::cout<< "Convert " << N << " pixels RGB to gray." << std::endl; 190 | 191 | HPM_PERF_CREATE; 192 | 193 | HPM_PERF_START; 194 | reset_and_start_stimer(); 195 | for(int i = 0; i < ITERATIONS; i++) { serial_rgb2gray(r, g, b, gray);} 196 | double dt = get_elapsed_seconds(); 197 | HPM_PERF_STOP; 198 | std::cout<< "serial version: " << dt << " seconds" << std::endl; 199 | 200 | HPM_PERF_START; 201 | reset_and_start_stimer(); 202 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray(r, g, b, gray);} 203 | double dt2 = get_elapsed_seconds(); 204 | HPM_PERF_STOP; 205 | std::cout<< "svec4 version: " << dt2 << " seconds" << std::endl; 206 | 207 | HPM_PERF_START; 208 | reset_and_start_stimer(); 209 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_ptr(r, g, b, gray); } 210 | double dt3 = get_elapsed_seconds(); 211 | HPM_PERF_STOP; 212 | std::cout<< "svec4 ptr ld/st version: " << dt3 << " seconds" << std::endl; 213 | 214 | HPM_PERF_START; 215 | reset_and_start_stimer(); 216 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_fma(r, g, b, gray); } 217 | double dt4 = get_elapsed_seconds(); 218 | HPM_PERF_STOP; 219 | std::cout<< "svec4 fma version: " << dt4 << " seconds" << std::endl; 220 | 221 | #ifdef __ALTIVEC__ 222 | HPM_PERF_START; 223 | reset_and_start_stimer(); 224 | for(int i = 0; i < ITERATIONS; i++) { intrinsics_rgb2gray(r, g, b, gray);} 225 | double dt5 = get_elapsed_seconds(); 226 | HPM_PERF_STOP; 227 | std::cout<< "Intrinsics version: " << dt5 << " seconds" << std::endl; 228 | #endif 229 | 230 | #ifdef 
__SSE4_2__ 231 | HPM_PERF_START; 232 | reset_and_start_stimer(); 233 | for(int i = 0; i < ITERATIONS; i++) { sse_rgb2gray(r, g, b, gray);} 234 | double dt6 = get_elapsed_seconds(); 235 | HPM_PERF_STOP; 236 | std::cout<< "SSE version: " << dt6 << " seconds" << std::endl; 237 | #endif 238 | 239 | #ifdef __AVX__ 240 | HPM_PERF_START; 241 | reset_and_start_stimer(); 242 | for(int i = 0; i < ITERATIONS; i++) { avx_rgb2gray(r, g, b, gray);} 243 | double dt7 = get_elapsed_seconds(); 244 | HPM_PERF_STOP; 245 | std::cout<< "AVX version: " << dt7 << " seconds" << std::endl; 246 | #endif 247 | 248 | HPM_PERF_CLOSE; 249 | return 0; 250 | } 251 | 252 | -------------------------------------------------------------------------------- /examples/RGB2YUV/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=RGB2YUV 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/RGB2YUV/RGB2YUV.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * RGB2YUV.cpp 35 | * 36 | * Created on: Jun 12, 2013 37 | * @author: Haichuan Wang (hwang154@illinois.edu) 38 | * 39 | * Parameters are from http://en.wikipedia.org/wiki/YUV 40 | * Storages are based on SoA 41 | */ 42 | 43 | 44 | /* 45 | * IBM Power compiling 46 | * g++ -I../../include RGB2YUV.cpp -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -g -O2 -o RGB2YUV 47 | * */ 48 | 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #include 55 | #include 56 | 57 | //#define N (1048576) 58 | #define N (512*512) 59 | 60 | //Doesn't work 61 | //__attribute__((optimize("no-tree-vectorize"))) 62 | 63 | void 64 | #ifdef __SSE4_2__ 65 | __attribute__((target("no-sse"))) 66 | #endif 67 | serial_rgb2gray(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 68 | for(int i = 0; i < N; i++) { 69 | ya[i] = 0.299f * ra[i] + 0.584f * ga[i] + 0.114f * ba[i]; 70 | ua[i] = -0.14713f * ra[i] -0.28886f * ga[i] + 0.436f * ba[i]; 71 | va[i] = 0.615f * ra[i] - 0.51499f * ga[i] - 0.10001f * ba[i]; 72 | } 73 | } 74 | 75 | typedef svec<4,float> vfloat; 76 | 77 | void svec4_rgb2gray(float* ra, float* ga, float* ba, float* 
ya, float* ua, float* va) { 78 | 79 | for(int i = 0; i < N; i+=4) { 80 | vfloat a = vfloat::load((vfloat*)(ra+i)); 81 | vfloat b = vfloat::load((vfloat*)(ga+i)); 82 | vfloat c = vfloat::load((vfloat*)(ba+i)); 83 | vfloat y = 0.299f * a + 0.584f * b + 0.114f * c ; 84 | y.store((vfloat*)(ya+i)); 85 | vfloat u = -0.14713f * a - 0.28886f * b + 0.436f * c; 86 | u.store((vfloat*)(ua+i)); 87 | vfloat v = 0.615f * a - 0.51499f * b - 0.10001f * c; 88 | v.store((vfloat*)(va+i)); 89 | } 90 | } 91 | 92 | void svec4_rgb2gray_ptr(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 93 | 94 | for(int i = 0; i < N; i+=4) { 95 | vfloat a = *(vfloat*)(ra+i); 96 | vfloat b = *(vfloat*)(ga+i); 97 | vfloat c = *(vfloat*)(ba+i); 98 | vfloat y = 0.299f * a + 0.584f * b + 0.114f * c ; 99 | *(vfloat*)(ya+i) = y; 100 | vfloat u = -0.14713f * a - 0.28886f * b + 0.436f * c; 101 | *(vfloat*)(ua+i) = u; 102 | vfloat v = 0.615f * a - 0.51499f * b - 0.10001f * c; 103 | *(vfloat*)(va+i) = v; 104 | } 105 | } 106 | 107 | #ifdef __ALTIVEC__ 108 | void intrinsics_rgb2gray(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 109 | __vector float c11 = vec_splats(0.299f); 110 | __vector float c12 = vec_splats(0.584f); 111 | __vector float c13 = vec_splats(0.114f); 112 | __vector float c21 = vec_splats(-0.1471f); 113 | __vector float c22 = vec_splats(-0.28886f); 114 | __vector float c23 = vec_splats(0.436f); 115 | __vector float c31 = vec_splats(0.615f); 116 | __vector float c32 = vec_splats(-0.51499f); 117 | __vector float c33 = vec_splats(-0.10001f); 118 | 119 | for(int i = 0; i < N; i+=4) { 120 | __vector float a = vec_vsx_ld(0, ra+i); 121 | __vector float b = vec_vsx_ld(0, ga+i); 122 | __vector float c = vec_vsx_ld(0, ba+i); 123 | __vector float y = c11 * a + c12 * b + c13 * c ; 124 | vec_vsx_st(y, 0, ya+i); 125 | __vector float u = c21 * a + c22 * b + c23 * c ; 126 | vec_vsx_st(u, 0, ua+i); 127 | __vector float v = c31 * a + c32 * b + c33 * c ; 128 | vec_vsx_st(v, 0, 
va+i); 129 | } 130 | } 131 | #endif 132 | 133 | #ifdef __SSE4_2__ 134 | 135 | /* SSE implementation of the RGB->YUV conversion (4 floats per iteration). */ 135 | void sse_rgb2gray(float* ra, float* ga, float* ba, float* ya, float* ua, float* va) { 136 | __m128 c11 = _mm_set1_ps(0.299f); 137 | __m128 c12 = _mm_set1_ps(0.584f); 138 | __m128 c13 = _mm_set1_ps(0.114f); 139 | __m128 c21 = _mm_set1_ps(-0.14713f); /* consistency fix: was -0.1471f, serial reference uses -0.14713f */ 140 | __m128 c22 = _mm_set1_ps(-0.28886f); 141 | __m128 c23 = _mm_set1_ps(0.436f); 142 | __m128 c31 = _mm_set1_ps(0.615f); 143 | __m128 c32 = _mm_set1_ps(-0.51499f); 144 | __m128 c33 = _mm_set1_ps(-0.10001f); 145 | 146 | 147 | for(int i = 0; i < N; i+=4) { 148 | __m128 a = _mm_loadu_ps(ra+i); 149 | __m128 b = _mm_loadu_ps(ga+i); 150 | __m128 c = _mm_loadu_ps(ba+i); 151 | __m128 y = _mm_add_ps(_mm_add_ps(_mm_mul_ps(c11, a), _mm_mul_ps(c12, b)), _mm_mul_ps(c13, c)); 152 | _mm_storeu_ps(ya+i, y); 153 | __m128 u = _mm_add_ps(_mm_add_ps(_mm_mul_ps(c21, a), _mm_mul_ps(c22, b)), _mm_mul_ps(c23, c)); 154 | _mm_storeu_ps(ua+i, u); 155 | __m128 v = _mm_add_ps(_mm_add_ps(_mm_mul_ps(c31, a), _mm_mul_ps(c32, b)), _mm_mul_ps(c33, c)); 156 | _mm_storeu_ps(va+i, v); 157 | } 158 | } 159 | 160 | #endif 161 | 162 | 163 | //the strange 100,200,300,... offset is used to reduce the effect of "address conflicts" 164 | float r[N+100] POST_ALIGN(16); 165 | float g[N+200] POST_ALIGN(16); 166 | float b[N+300] POST_ALIGN(16); 167 | float y[N+400] POST_ALIGN(16); 168 | float u[N+500] POST_ALIGN(16); 169 | float v[N+600] POST_ALIGN(16); 170 | 171 | #define ITERATIONS 1000 172 | int main (int argc, char* argv[]) 173 | { 174 | for(int i = 0; i < N; i++) { 175 | r[i] = random() % 256; /* bug fix: was r[N] — out-of-range write left the inputs uninitialized */ 176 | g[i] = random() % 256; /* bug fix: was g[N] */ 177 | b[i] = random() % 256; /* bug fix: was b[N] */ 178 | } 179 | std::cout<< "Convert " << N << " pixels RGB to YUV."
<< std::endl; 180 | 181 | reset_and_start_stimer(); 182 | for(int i = 0; i < ITERATIONS; i++) { serial_rgb2gray(r, g, b, y, u, v);} 183 | double dt = get_elapsed_seconds(); 184 | std::cout<< "serial version: " << dt << " seconds" << std::endl; 185 | 186 | reset_and_start_stimer(); 187 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray(r, g, b, y, u, v);} 188 | double dt2 = get_elapsed_seconds(); 189 | std::cout<< "svec4 version: " << dt2 << " seconds" << std::endl; 190 | 191 | reset_and_start_stimer(); 192 | for(int i = 0; i < ITERATIONS; i++) { svec4_rgb2gray_ptr(r, g, b, y, u, v); } 193 | double dt3 = get_elapsed_seconds(); 194 | std::cout<< "svec4 ptr ld/st version: " << dt3 << " seconds" << std::endl; 195 | 196 | #ifdef __ALTIVEC__ 197 | reset_and_start_stimer(); 198 | for(int i = 0; i < ITERATIONS; i++) { intrinsics_rgb2gray(r, g, b, y, u, v);} 199 | double dt5 = get_elapsed_seconds(); 200 | std::cout<< "Power VSX version: " << dt5 << " seconds" << std::endl; 201 | #endif 202 | 203 | #ifdef __SSE4_2__ 204 | reset_and_start_stimer(); 205 | for(int i = 0; i < ITERATIONS; i++) { sse_rgb2gray(r, g, b, y, u, v);} 206 | double dt6 = get_elapsed_seconds(); 207 | std::cout<< "SSE version: " << dt6 << " seconds" << std::endl; 208 | #endif 209 | 210 | return 0; 211 | } 212 | 213 | -------------------------------------------------------------------------------- /examples/common.mk: -------------------------------------------------------------------------------- 1 | .PHONY: %.perfexe %.perfcollect %.perfbr %.perfhw 2 | 3 | 4 | ifeq (${BASE_DIR},) 5 | BASE_DIR=../..
6 | endif 7 | 8 | 9 | BITS=64 10 | CC=gcc 11 | CXX=g++ 12 | CCFLAGS+= -I${BASE_DIR}/include -O2 -m$(BITS) 13 | CXXFLAGS+= -I${BASE_DIR}/include -O2 -m$(BITS) 14 | 15 | CCFLAGS+= -Wno-int-to-pointer-cast 16 | CXXFLAGS+= -Wno-int-to-pointer-cast 17 | ########################################################## 18 | # Platform specific options 19 | # ppc64 or intel 20 | ########################################################## 21 | 22 | MACHINE=$(shell uname -m) 23 | ifeq ($(firstword $(filter ppc64,$(MACHINE))),ppc64) 24 | CXXFLAGS += -mno-vrsave -mvsx -flax-vector-conversions -mcpu=power7 25 | CCFLAGS += -mno-vrsave -mvsx -flax-vector-conversions -mcpu=power7 26 | ifeq (${PPC_ISA}, P8) 27 | PLATFORM = ppc64_P8 28 | CCFLAGS += -D__POWER8 29 | CXXFLAGS += -D__POWER8 30 | else 31 | PLATFORM = ppc64_P7 32 | endif 33 | else 34 | PLATFORM=x86-64 35 | CCFLAGS += -msse4.2 36 | CXXFLAGS += -msse4.2 37 | endif 38 | 39 | 40 | default: ${EXAMPLE} 41 | 42 | 43 | ${EXAMPLE}: ${EXAMPLE}.cpp 44 | ${CXX} ${CXXFLAGS} $< -o $@ 45 | 46 | 47 | run: ${EXAMPLE} 48 | ./$< ${RUN_ARGS} 49 | 50 | ${EXAMPLE}_tune: ${EXAMPLE}_tune.cpp 51 | ${CXX} ${CXXFLAGS} $< -o $@ 52 | 53 | tune: ${EXAMPLE}_tune 54 | ./$< ${RUN_ARGS} 55 | 56 | TMP=__perf.tmp 57 | 58 | #special for collecting all perf data 59 | %.perf: %.perfbr %.perficache %.perfdcache %.perfllc 60 | @echo "end" 61 | @rm -f ${TMP} 62 | 63 | 64 | 65 | 66 | %.perfhw: CXXFLAGS+= -DPERF_HW 67 | %.perfhw: %.cpp 68 | ${CXX} ${CXXFLAGS} $< -o $@ 69 | ./$@ | tee ${TMP} 70 | @grep "HPM Event" ${TMP} | tail -1 71 | @grep "HPM Values" ${TMP} 72 | 73 | 74 | %.perfbr: CXXFLAGS+= -DPERF_BR 75 | %.perfbr: %.cpp 76 | ${CXX} ${CXXFLAGS} $< -o $@ 77 | ./$@ | tee ${TMP} 78 | @grep "HPM Event" ${TMP} | tail -1 79 | @grep "HPM Values" ${TMP} 80 | 81 | %.perficache: CXXFLAGS+= -DPERF_ICACHE 82 | %.perficache: %.cpp 83 | ${CXX} ${CXXFLAGS} $< -o $@ 84 | ./$@ | tee ${TMP} 85 | @grep "HPM Event" ${TMP} | tail -1 86 | @grep "HPM Values" ${TMP} 87 | 88 | 89 | 
%.perfdcache: CXXFLAGS+= -DPERF_DCACHE 90 | %.perfdcache: %.cpp 91 | ${CXX} ${CXXFLAGS} $< -o $@ 92 | ./$@ | tee ${TMP} 93 | @grep "HPM Event" ${TMP} | tail -1 94 | @grep "HPM Values" ${TMP} 95 | 96 | %.perfllc: CXXFLAGS+= -DPERF_LLC 97 | %.perfllc: %.cpp 98 | ${CXX} ${CXXFLAGS} $< -o $@ 99 | ./$@ | tee ${TMP} 100 | @grep "HPM Event" ${TMP} | tail -1 101 | @grep "HPM Values" ${TMP} 102 | 103 | 104 | clean: 105 | rm -f ${EXAMPLE} ${EXAMPLE}_tune 106 | 107 | -------------------------------------------------------------------------------- /examples/mandelbrot/Makefile: -------------------------------------------------------------------------------- 1 | 2 | 3 | EXAMPLE=mandelbrot 4 | RUN_ARGS= 5 | 6 | include ../common.mk -------------------------------------------------------------------------------- /examples/mandelbrot/mandelbrot.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | /* 43 | g++ -I../../include mandelbrot.cc -mvsx -flax-vector-conversions -Wno-int-to-pointer-cast -O3 -o mandelbrot 44 | */ 45 | 46 | /* 47 | Scalar version of mandelbrot 48 | */ 49 | static int mandel(float c_re, float c_im, int count) { 50 | float z_re = c_re, z_im = c_im; 51 | int cci=0; 52 | for (cci = 0; cci < count; ++cci) { 53 | if (z_re * z_re + z_im * z_im > 4.f) 54 | break; 55 | 56 | float new_re = z_re*z_re - z_im*z_im; 57 | float new_im = 2.f * z_re * z_im; 58 | z_re = c_re + new_re; 59 | z_im = c_im + new_im; 60 | } 61 | return cci; 62 | } 63 | 64 | void mandelbrot_serial(float x0, float y0, float x1, float y1, 65 | int width, int height, int maxIterations, 66 | int output[]) 67 | { 68 | float dx = (x1 - x0) / width; 69 | float dy = (y1 - y0) / height; 70 | 71 | for (int j = 0; j < height; j++) { 72 | for (int i = 0; i < width; ++i) { 73 | float x = x0 + i * dx; 74 | float y = y0 + j * dy; 75 | 76 | int index = (j * width + i); 77 | output[index] = mandel(x, y, maxIterations); 78 | } 79 | } 80 | } 81 | 82 | 83 | /* 84 | 
Generic Intrinsics 85 | */ 86 | void mandelbrot_generic(float x0, float y0, float x1, float y1, 87 | int width, int height, int maxIterations, 88 | int output[]) 89 | { 90 | typedef svec<4,float> vfloat; 91 | typedef svec<4,int> vint; 92 | typedef svec<4,unsigned int> vuint; 93 | typedef svec<4,short> vshort; 94 | typedef svec<4,bool> vbool; 95 | 96 | float dx = (x1 - x0) / width; 97 | float dy = (y1 - y0) / height; 98 | 99 | vfloat v_x0(x0); 100 | vfloat v_y0(y0); 101 | vfloat v_x1(x1); 102 | vfloat v_y1(y1); 103 | vint vci_4(4); 104 | 105 | vfloat v_w((float)width); 106 | vfloat v_h((float)height); 107 | 108 | vfloat v_dx = (v_x1 - v_x0) / v_w; 109 | vfloat v_dy = (v_y1 - v_y0) / v_h; 110 | 111 | for (int j = 0; j < height; j++) { 112 | vint v_j(j); 113 | vint v_i(0,1,2,3); 114 | vfloat v_i_f(0.0,1.0,2.0,3.0); 115 | vfloat v_j_f((float)j); 116 | 117 | //this is the 'parallel loop' 118 | for (int i = 0; i < width; i+=4) { 119 | //float x = x0 + i * dx; 120 | //float y = y0 + j * dy; 121 | vfloat v_x = v_x0 + (v_i_f * v_dx); 122 | vfloat v_y = v_y0 + (v_j_f * v_dy); 123 | 124 | //int index = (j * width + i); 125 | //vint v_index = svec_add(svec_mulo((vshort)v_j,(vshort)v_w),v_i); 126 | vint v_index = v_j * svec_cast(v_w) + v_i; 127 | 128 | // //output[index] = mandel(x, y, maxIterations); 129 | 130 | //float z_re = x, z_im = y; 131 | vfloat v_z_re = v_x; 132 | vfloat v_z_im = v_y; 133 | 134 | int ci=0; 135 | //float ct_4=4.f; 136 | //float ct_2=2.f; 137 | vint vci(0); 138 | vbool vzero(0); 139 | vint vci_1(1); 140 | vfloat v_ct_4(4.f); 141 | vfloat v_ct_2(2.f); 142 | 143 | vbool v_mask(0xffff); 144 | 145 | //next stay the same 146 | for (ci = 0; ci < maxIterations; ++ci) { 147 | //if (z_re * z_re + z_im * z_im > ct_4) 148 | // break; 149 | vfloat v_m = v_z_re*v_z_re + v_z_im*v_z_im; 150 | vbool v_cmp = v_m > v_ct_4; 151 | 152 | //v_mask = vec_andc(v_mask, v_cmp); 153 | v_mask = v_mask & (~v_cmp); 154 | 155 | int allexit = svec_all_true(v_cmp); 156 | 157 | if( 
allexit ) break; 158 | 159 | //here some threads will stop; how do we implement that 160 | 161 | //float new_re = z_re*z_re - z_im*z_im; 162 | vfloat v_new_re = v_z_re*v_z_re - v_z_im*v_z_im; 163 | //float new_im = ct_2 * z_re * z_im; 164 | vfloat v_new_im = v_ct_2 * (v_z_re*v_z_im); 165 | 166 | //z_re = x + new_re; 167 | v_z_re = v_x+v_new_re; 168 | //z_im = y + new_im; 169 | v_z_im = v_y + v_new_im; 170 | 171 | vint vnci = vci + vci_1; 172 | vci = svec_select(v_mask, vnci, vci); 173 | } 174 | //store vci 175 | //output[index] = ci; 176 | 177 | int index = (j * width + i); 178 | vci.store((vint*)(output+index)); 179 | 180 | //increment vector i 181 | v_i = v_i + vci_4; 182 | v_i_f = v_i_f + v_ct_4; 183 | } 184 | } 185 | } 186 | 187 | 188 | #ifdef __ALTIVEC__ 189 | void mandelbrot_intrinsics(float x0, float y0, float x1, float y1, 190 | int width, int height, int maxIterations, 191 | int output[]) 192 | { 193 | typedef __vector float vfloat; 194 | typedef __vector signed int vint; 195 | typedef __vector unsigned int vuint; 196 | typedef __vector signed short vshort; 197 | typedef __vector bool int vbool; 198 | 199 | float dx = (x1 - x0) / width; 200 | float dy = (y1 - y0) / height; 201 | 202 | vfloat v_x0={x0,x0,x0,x0}; 203 | vfloat v_y0={y0,y0,y0,y0}; 204 | vfloat v_x1={x1,x1,x1,x1}; 205 | vfloat v_y1={y1,y1,y1,y1}; 206 | vint vci_4 = {4,4,4,4}; 207 | 208 | vfloat v_w={(float)width,(float)width,(float)width,(float)width}; 209 | vfloat v_h={(float)height,(float)height,(float)height,(float)height}; 210 | 211 | 212 | vfloat v_dx = vec_div( vec_sub(v_x1,v_x0), v_w); 213 | vfloat v_dy = vec_div( vec_sub(v_y1,v_y0), v_h); 214 | 215 | for (int j = 0; j < height; j++) { 216 | vint v_j = {j,j,j,j}; 217 | vint v_i = {0,1,2,3}; 218 | vfloat v_i_f = {0.0,1.0,2.0,3.0}; 219 | vfloat v_j_f = {(float)j,(float)j,(float)j,(float)j}; 220 | 221 | 222 | //this is the 'parallel loop' 223 | for (int i = 0; i < width; i+=4) { 224 | //float x = x0 + i * dx; 225 | //float y = y0 + j * dy; 
226 | vfloat v_x = vec_add(v_x0,vec_mul(v_i_f,v_dx)); 227 | vfloat v_y = vec_add(v_y0,vec_mul(v_j_f,v_dy)); 228 | 229 | //int index = (j * width + i); 230 | vint v_index = vec_add(vec_mulo((vshort)v_j,(vshort)v_w),v_i); 231 | 232 | // //output[index] = mandel(x, y, maxIterations); 233 | 234 | //float z_re = x, z_im = y; 235 | vfloat v_z_re = v_x; 236 | vfloat v_z_im = v_y; 237 | 238 | int ci=0; 239 | //float ct_4=4.f; 240 | //float ct_2=2.f; 241 | vint vci = {0,0,0,0}; 242 | vbool vzero={0,0,0,0}; 243 | vint vci_1 = {1,1,1,1}; 244 | vfloat v_ct_4 = {4.f,4.f,4.f,4.f}; 245 | vfloat v_ct_2 = {2.f,2.f,2.f,2.f}; 246 | 247 | vbool v_mask = {0xffff,0xffff,0xffff,0xffff}; 248 | 249 | //next stay the same 250 | 251 | for (ci = 0; ci < maxIterations; ++ci) { 252 | //if (z_re * z_re + z_im * z_im > ct_4) 253 | // break; 254 | vfloat v_m = vec_add(vec_mul(v_z_re,v_z_re),vec_mul(v_z_im,v_z_im)); 255 | vbool v_cmp = vec_cmpgt(v_m,v_ct_4); 256 | 257 | //v_mask = v_mask & !v_cmp 258 | v_mask = vec_andc(v_mask, v_cmp); 259 | 260 | int allexit = vec_all_ne(v_cmp, vzero); 261 | 262 | if( allexit ) break; 263 | 264 | //here some threads will stop; how do we implement that 265 | 266 | //float new_re = z_re*z_re - z_im*z_im; 267 | vfloat v_new_re = vec_sub((vec_mul(v_z_re,v_z_re)),(vec_mul(v_z_im,v_z_im))); 268 | //float new_im = ct_2 * z_re * z_im; 269 | vfloat v_new_im = vec_mul(v_ct_2,vec_mul(v_z_re,v_z_im)); 270 | 271 | //z_re = x + new_re; 272 | v_z_re = vec_add(v_x,v_new_re); 273 | //z_im = y + new_im; 274 | v_z_im = vec_add(v_y, v_new_im); 275 | 276 | vint vnci = vec_add(vci,vci_1); 277 | vci = vec_sel(vci, vnci, v_mask); 278 | } 279 | //store vci 280 | //output[index] = ci; 281 | 282 | int index = (j * width + i); 283 | vec_st(vci, 0, output+index); 284 | 285 | //increment vector i 286 | v_i = vec_add(v_i, vci_4); 287 | v_i_f = vec_add(v_i_f, v_ct_4); 288 | } 289 | 290 | } 291 | } 292 | #endif 293 | 294 | 295 | /* Write a PPM image file with the image of the Mandelbrot set */ 
296 | static void 297 | writePPM(int *buf, int width, int height, const char *fn) { 298 | FILE *fp = fopen(fn, "wb"); 299 | fprintf(fp, "P6\n"); 300 | fprintf(fp, "%d %d\n", width, height); 301 | fprintf(fp, "255\n"); 302 | for (int i = 0; i < width*height; ++i) { 303 | // Map the iteration count to colors by just alternating between 304 | // two greys. 305 | char c = (buf[i] & 0x1) ? 240 : 20; 306 | for (int j = 0; j < 3; ++j) 307 | fputc(c, fp); 308 | } 309 | fclose(fp); 310 | printf("Wrote image file %s\n", fn); 311 | } 312 | 313 | 314 | static void 315 | writePPM_d(int *buf, int width, int height, const char *fn) { 316 | for (int i = 0; i < width; ++i) { 317 | for (int j = 0; j < height; ++j) { 318 | int index = i*width+j; 319 | printf("%4d ", buf[index]); 320 | } 321 | printf("\n"); 322 | } 323 | printf("Wrote image file %s\n", fn); 324 | } 325 | 326 | 327 | int main() { 328 | unsigned int width = 768; 329 | unsigned int height = 512; 330 | 331 | //unsigned int width = 1024; 332 | //unsigned int height = 1024; 333 | 334 | float x0 = -2; 335 | float x1 = 1; 336 | float y0 = -1; 337 | float y1 = 1; 338 | 339 | int maxIterations = 10; 340 | int *buf = new int[width*height]; 341 | 342 | // 343 | // Compute the image using the scalar and generic intrinsics implementations; report the minimum 344 | // time of three runs. 
345 | // 346 | 347 | double minSerial = 1e30; 348 | for (int i = 0; i < 3; ++i) { 349 | reset_and_start_stimer(); 350 | mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); 351 | double dt = get_elapsed_seconds(); 352 | minSerial = std::min(minSerial, dt); 353 | } 354 | printf("[mandelbrot serial]:\t\t[%.4f] seconds\n", minSerial); 355 | writePPM(buf, width, height, "mandelbrot-serial.ppm"); 356 | 357 | 358 | double minGeneric = 1e30; 359 | for (int i = 0; i < 3; ++i) { 360 | reset_and_start_stimer(); 361 | mandelbrot_generic(x0, y0, x1, y1, width, height, maxIterations, buf); 362 | double dt = get_elapsed_seconds(); 363 | minGeneric = std::min(minGeneric, dt); 364 | } 365 | printf("[mandelbrot generic simd]:\t\t[%.4f] seconds\n", minGeneric); 366 | writePPM(buf, width, height, "mandelbrot-generic.ppm"); 367 | printf("[mandelbrot generic speedup]:\t\t%.2fx from GENERIC\n", minSerial/minGeneric); 368 | 369 | #ifdef __ALTIVEC__ 370 | double minIntrinsics = 1e30; 371 | for (int i = 0; i < 3; ++i) { 372 | reset_and_start_stimer(); 373 | mandelbrot_intrinsics(x0, y0, x1, y1, width, height, maxIterations, buf); 374 | double dt = get_elapsed_seconds(); 375 | minIntrinsics = std::min(minIntrinsics, dt); 376 | } 377 | printf("[mandelbrot intrinsics simd]:\t\t[%.4f] seconds\n", minIntrinsics); 378 | writePPM(buf, width, height, "mandelbrot-intrinsics.ppm"); 379 | printf("[mandelbrot intrinsics speedup]:\t%.2fx from INTRINSICS\n", minSerial/minIntrinsics); 380 | #endif 381 | return 0; 382 | } 383 | -------------------------------------------------------------------------------- /include/README.md: -------------------------------------------------------------------------------- 1 | # Header Files Organization 2 | 3 | The key header files is gsimd.h, which is the only header file that user codes include. 4 | 5 | The below structure is the header file organization. 
6 | ``` 7 | gsimd.h 8 | | 9 | + generic.h 10 | | | 11 | | + generic4.h: Generic implementation of LANES=4 12 | | + generic8.h: Generic implementation of LANES=8 13 | | 14 | + sse4.h: Intel SSE4.2 intrinsics implementaiton of LANES=4 15 | | 16 | +-power_vsx4.h: IBM Power VSX intrinsics implementation of LANES=4 17 | | 18 | + power7_intrinsics.h Intrinsics only available on IBM Power7 Platform 19 | + power8_intrinsics.h Intrinsics only available on IBM Power8 Platform 20 | 21 | gsimd_utility.h: Common macros definitions 22 | ``` 23 | -------------------------------------------------------------------------------- /include/generic.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * generic.h 35 | * 36 | * Created on: Oct.7, 2013 37 | * @author: Haichuan Wang (hwang154@illinois.edu) 38 | * @brief: Generic SIMD Library header configuration file for generic implementation 39 | * The common defintions for all generic implementations 40 | */ 41 | 42 | #ifndef GENERIC_H_ 43 | #define GENERIC_H_ 44 | 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | #include "gsimd_utility.h" 51 | 52 | namespace generic { 53 | //simple trick to generate a compiler error if invalid template 54 | //arguments are used 55 | 56 | template 57 | struct svec : public invalid_template_arguments::type { 58 | //here we need to add the static assert 59 | }; 60 | 61 | } //generic namespace 62 | 63 | #include 64 | #include 65 | 66 | 67 | #endif /* GENERIC_H_ */ 68 | -------------------------------------------------------------------------------- /include/gsimd.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 
12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /** 34 | * gsimd.h 35 | * 36 | * Created on: Aug 14, 2013 37 | * @author: Haichuan Wang (haichuan@us.ibm.com hwang154@illinois.edu) 38 | * @brief: Generic SIMD Library header configuration file 39 | */ 40 | 41 | #ifndef GSIMD_H_ 42 | #define GSIMD_H_ 43 | 44 | //a macro to for GSIMD 45 | #define __GSIMD__ 46 | #ifdef __ALTIVEC__ 47 | #include 48 | using namespace vsx; 49 | #else 50 | #ifdef __SSE4_2__ 51 | #include 52 | using namespace sse; 53 | #else 54 | #include 55 | using namespace generic; 56 | #endif //__SSE4_2__ 57 | #endif //__ALTIVEC__ 58 | 59 | 60 | 61 | 62 | #endif /* GSIMD_H_ */ 63 | -------------------------------------------------------------------------------- /include/perfmeasure.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * perfmeasure.h 35 | * 36 | * Created on: Jun 3, 2013 37 | * author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 38 | * 39 | * Header file for call linux perf tool to measure HPM 40 | * Reference: http://web.eece.maine.edu/~vweaver/projects/perf_events/perf_event_open.html 41 | */ 42 | 43 | #ifndef PERFMEASURE_H_ 44 | #define PERFMEASURE_H_ 45 | 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | long 55 | perf_event_open(struct perf_event_attr *hw_event, pid_t pid, 56 | int cpu, int group_fd, unsigned long flags) 57 | { 58 | int ret; 59 | 60 | ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, 61 | group_fd, flags); 62 | return ret; 63 | } 64 | 65 | typedef struct hpm_fds_t { 66 | int fd[6]; //max 6 events 67 | } hpm_fds_t; 68 | 69 | typedef struct hpm_group_t { 70 | __u32 type;//type of events 71 | __u32 size;//4 or 6 72 | __u64 event[6];//max 6 events 73 | const char* event_name[6];//each event's name 74 | } hpm_group_t; 75 | 76 | static hpm_group_t hw_group = { PERF_TYPE_HARDWARE, 77 | 4, 78 | {PERF_COUNT_HW_INSTRUCTIONS, 79 | PERF_COUNT_HW_CPU_CYCLES, 80 | PERF_COUNT_HW_CACHE_REFERENCES, 81 | PERF_COUNT_HW_CACHE_MISSES, 82 | 0, 83 | 0}, 84 | { 85 | "Instr","Cycles", "Cache Ref", "Cache Miss", "", "" 86 | } 87 | }; 88 | 89 | static hpm_group_t br_group = { PERF_TYPE_HARDWARE, 90 | 4, 91 | {PERF_COUNT_HW_INSTRUCTIONS, 92 | 
PERF_COUNT_HW_CPU_CYCLES, 93 | PERF_COUNT_HW_BRANCH_INSTRUCTIONS, 94 | PERF_COUNT_HW_BRANCH_MISSES, 95 | 0, 96 | 0}, 97 | { 98 | "Instr","Cycles", "Branch Instr", "Branch Miss", "", "" 99 | } 100 | }; 101 | 102 | 103 | static hpm_group_t sw_group = { PERF_TYPE_SOFTWARE, 104 | 5, //not support by power 105 | {PERF_COUNT_SW_CPU_CLOCK, 106 | PERF_COUNT_SW_TASK_CLOCK, 107 | PERF_COUNT_SW_PAGE_FAULTS, 108 | PERF_COUNT_SW_CONTEXT_SWITCHES, 109 | PERF_COUNT_SW_CPU_MIGRATIONS, 110 | 0, /*PERF_COUNT_SW_ALIGNMENT_FAULTS*/}, 111 | { 112 | "CPU clock","Task clock", "Page fault", "Context switch", "Migration", "Aligment fault" 113 | } 114 | }; 115 | 116 | #define CACHE_READ(name) (name | (PERF_COUNT_HW_CACHE_OP_READ << 8) | ( PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)) 117 | #define CACHE_READ_MISS(name) (name | (PERF_COUNT_HW_CACHE_OP_READ << 8) | ( PERF_COUNT_HW_CACHE_RESULT_MISS << 16)) 118 | #define CACHE_WRITE(name) (name | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | ( PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)) 119 | #define CACHE_WRITE_MISS(name) (name | (PERF_COUNT_HW_CACHE_OP_WRITE << 8) | ( PERF_COUNT_HW_CACHE_RESULT_MISS << 16)) 120 | 121 | static hpm_group_t icache_group = { PERF_TYPE_HW_CACHE, 122 | 2, //not support by power 123 | { 124 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_L1I), 125 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_ITLB), 126 | CACHE_READ(PERF_COUNT_HW_CACHE_L1I), 127 | CACHE_READ(PERF_COUNT_HW_CACHE_ITLB), 128 | 0, 129 | 0, /*PERF_COUNT_SW_ALIGNMENT_FAULTS*/}, 130 | { 131 | "L1I Miss", "ITLB Miss", "L1I Read", "ITLB Read", "", "" 132 | } 133 | }; 134 | 135 | static hpm_group_t dcache_group = { PERF_TYPE_HW_CACHE, 136 | 3, //not support by power 137 | { 138 | CACHE_READ(PERF_COUNT_HW_CACHE_L1D), 139 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_L1D), 140 | CACHE_WRITE_MISS(PERF_COUNT_HW_CACHE_L1D), 141 | CACHE_WRITE(PERF_COUNT_HW_CACHE_L1D), 142 | CACHE_READ(PERF_COUNT_HW_CACHE_DTLB), 143 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_DTLB), 144 | }, 145 | { 146 | "L1D Read", "L1D Read 
Miss", "L1D Write Miss", "L1D Write", "L1D Write", "DTLB Ref" 147 | } 148 | }; 149 | 150 | static hpm_group_t llc_group = { PERF_TYPE_HW_CACHE, 151 | 4, //not support by power 152 | { 153 | CACHE_READ(PERF_COUNT_HW_CACHE_LL), 154 | CACHE_READ_MISS(PERF_COUNT_HW_CACHE_LL), 155 | CACHE_WRITE(PERF_COUNT_HW_CACHE_LL), 156 | CACHE_WRITE_MISS(PERF_COUNT_HW_CACHE_LL), 157 | CACHE_WRITE(PERF_COUNT_HW_CACHE_L1D), 158 | CACHE_READ(PERF_COUNT_HW_CACHE_DTLB), 159 | }, 160 | { 161 | "LLC Read", "LLC Read Miss", "LLC Write", "LLC Write Miss", "L1D Write", "DTLB Ref" 162 | } 163 | }; 164 | 165 | 166 | void perf_events_create(hpm_fds_t* fds, hpm_group_t* egroup) { 167 | int i; 168 | int size = egroup->size; 169 | struct perf_event_attr pe; 170 | memset(&pe, 0, sizeof(struct perf_event_attr)); 171 | pe.type = egroup->type; 172 | pe.size = sizeof(struct perf_event_attr); 173 | pe.disabled = 1; 174 | //pe.exclude_kernel = 1; 175 | //pe.exclude_idle = 1; 176 | pe.exclude_hv = 1; 177 | 178 | for(i = 0; i < size; ++i) { 179 | pe.config = egroup->event[i]; 180 | fds->fd[i] = perf_event_open(&pe, 0, -1, -1, 0); 181 | if (fds->fd[i] == -1) { 182 | fprintf(stderr, "Error opening leader %llx, %d, %s\n", pe.config, i, egroup->event_name[i]); 183 | exit(EXIT_FAILURE); 184 | } 185 | } 186 | } 187 | 188 | //void perf_events_create2(hpm_fds_t* fds) { 189 | // int i; 190 | // struct perf_event_attr pe; 191 | // memset(&pe, 0, sizeof(struct perf_event_attr)); 192 | // pe.type = PERF_TYPE_HARDWARE; 193 | // pe.size = sizeof(struct perf_event_attr); 194 | // pe.disabled = 1; 195 | // //pe.exclude_kernel = 1; 196 | // //pe.exclude_idle = 1; 197 | // pe.exclude_hv = 1; 198 | // 199 | // //instrs 200 | // pe.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS; 201 | // fds->fd0 = perf_event_open(&pe, 0, -1, -1, 0); 202 | // if (fds->fd0 == -1) { 203 | // fprintf(stderr, "Error opening leader %llx\n", pe.config); 204 | // exit(EXIT_FAILURE); 205 | // } 206 | // //cycles 207 | // pe.config = 
PERF_COUNT_HW_BRANCH_MISSES; 208 | // fds->fd1 = perf_event_open(&pe, 0, -1, fds->fd0, 0); 209 | // if (fds->fd1 == -1) { 210 | // fprintf(stderr, "Error opening event %llx\n", pe.config); 211 | // exit(EXIT_FAILURE); 212 | // } 213 | // //cache ref 214 | // pe.config = PERF_COUNT_HW_INSTRUCTIONS; //PERF_COUNT_HW_STALLED_CYCLES_FRONTEND ; 215 | // fds->fd2 = perf_event_open(&pe, 0, -1, fds->fd0, 0); 216 | // if (fds->fd2 == -1) { 217 | // fprintf(stderr, "Error opening event %llx\n", pe.config); 218 | // exit(EXIT_FAILURE); 219 | // } 220 | // 221 | // //cache ref 222 | // pe.config = PERF_COUNT_HW_CPU_CYCLES; //PERF_COUNT_HW_STALLED_CYCLES_BACKEND ; 223 | // fds->fd3 = perf_event_open(&pe, 0, -1, fds->fd0, 0); 224 | // if (fds->fd3 == -1) { 225 | // fprintf(stderr, "Error opening event %llx\n", pe.config); 226 | // exit(EXIT_FAILURE); 227 | // } 228 | //} 229 | 230 | void perf_events_start(hpm_fds_t* fds, hpm_group_t* egroup) { 231 | int i; 232 | int size = egroup->size; 233 | for(i = 0; i < size; ++i) { 234 | ioctl(fds->fd[i], PERF_EVENT_IOC_RESET, 0); 235 | } 236 | for(int i = 0; i < egroup->size; ++i) { 237 | ioctl(fds->fd[i], PERF_EVENT_IOC_ENABLE, 0); 238 | } 239 | } 240 | 241 | 242 | void perf_events_stop_report(hpm_fds_t* fds, hpm_group_t* egroup) { 243 | int size = egroup->size; 244 | int i; 245 | for(i = 0; i < size; ++i) { 246 | ioctl(fds->fd[i], PERF_EVENT_IOC_DISABLE, 0); 247 | } 248 | printf("[HPM Event]"); 249 | //title 250 | for(i = 0; i < size; ++i) { 251 | printf("%s,", egroup->event_name[i]); 252 | } 253 | printf("\n[HPM Values]"); 254 | long long c; 255 | for(i = 0; i < size; ++i) { 256 | read(fds->fd[i], &c, sizeof(long long)); 257 | printf("%lld,", c); 258 | } 259 | printf("\n"); 260 | } 261 | 262 | 263 | //void perf_events_stop_report2(hpm_fds_t* fds) { 264 | // long long c0,c1,c2,c3; 265 | // ioctl(fds->fd0, PERF_EVENT_IOC_DISABLE, 0); 266 | // read(fds->fd0, &c0, sizeof(long long)); 267 | // ioctl(fds->fd1, PERF_EVENT_IOC_DISABLE, 0); 268 | 
// read(fds->fd1, &c1, sizeof(long long)); 269 | // ioctl(fds->fd2, PERF_EVENT_IOC_DISABLE, 0); 270 | // read(fds->fd2, &c2, sizeof(long long)); 271 | // ioctl(fds->fd3, PERF_EVENT_IOC_DISABLE, 0); 272 | // read(fds->fd3, &c3, sizeof(long long)); 273 | // 274 | //// printf("[HPM Perf]Branch instrs:%lld; Misbranch instrs:%lld; Frontend Stall:%lld; Backend Stall:%lld\n", 275 | //// c0, c1, c2,c3); 276 | // printf("[HPM Perf]Branch instrs:%lld; Misbranch instrs:%lld; Instrs:%lld; Cycles:%lld\n", 277 | // c0, c1, c2,c3); 278 | //} 279 | 280 | void perf_events_close(hpm_fds_t* fds, hpm_group_t* egroup) { 281 | int i; 282 | int size = egroup->size; 283 | for(i = 0; i < size; ++i) { 284 | close(fds->fd[i]); 285 | } 286 | } 287 | 288 | /***** Macro Definition *****/ 289 | #if (defined PERF_HW) || (defined PERF_BR) || (defined PERF_SW) || (defined PERF_ICACHE) || (defined PERF_DCACHE) || (defined PERF_LLC) 290 | 291 | #ifdef PERF_HW 292 | #define GNAME hw_group 293 | #endif 294 | 295 | #ifdef PERF_BR 296 | #define GNAME br_group 297 | #endif 298 | 299 | #ifdef PERF_SW 300 | #define GNAME sw_group 301 | #endif 302 | 303 | #ifdef PERF_ICACHE 304 | #define GNAME icache_group 305 | #endif 306 | 307 | #ifdef PERF_DCACHE 308 | #define GNAME dcache_group 309 | #endif 310 | 311 | #ifdef PERF_LLC 312 | #define GNAME llc_group 313 | #endif 314 | 315 | #define HPM_PERF_CREATE hpm_fds_t __hpm_fds; perf_events_create(&__hpm_fds, &GNAME) 316 | #define HPM_PERF_START perf_events_start(&__hpm_fds, &GNAME) 317 | #define HPM_PERF_STOP perf_events_stop_report(&__hpm_fds, &GNAME) 318 | #define HPM_PERF_CLOSE perf_events_close(&__hpm_fds, &GNAME); 319 | #else 320 | #define HPM_PERF_CREATE 321 | #define HPM_PERF_START 322 | #define HPM_PERF_STOP 323 | #define HPM_PERF_CLOSE 324 | #endif 325 | 326 | #endif /* PERFMEASURE_H_ */ 327 | -------------------------------------------------------------------------------- /include/platform_intrinsics.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | #include "power7_intrinsics.h" 34 | 35 | #ifdef __POWER8 36 | #include "power8_intrinsics.h" 37 | #endif 38 | -------------------------------------------------------------------------------- /include/power7_intrinsics.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// GCC inline-asm operand constraints for VSX registers:
// VSXW = write-only output, VSXR = read input, VSXWC = early-clobber output.
#define VSXW "=wa"
#define VSXR "wa"
#define VSXWC "=&wa"

/// already implemented in POWER7
//

// Broadcast lane `v` of `a` into all four lanes. When `v` is a compile-time
// constant in [0,4) a single xxspltw is emitted; otherwise it falls back to a
// scalar extract plus vector-literal splat.
static FORCEINLINE __vector float vec_splat_p7(__vector float a, const int v){
  if(__builtin_constant_p(v) && v >= 0 && v < 4) {
    __vector float register r;
    asm ("xxspltw %x[xt], %x[xa],%[im] " : [xt] VSXW(r) : [xa] VSXR(a), [im] "i"(v) );
    return r;
  } else {
    float f = vec_extract(a, v);
    __vector float r = {f,f,f,f};
    return r;
  }
}

// Integer variants reuse the float splat via bit-preserving vector casts.
static FORCEINLINE __vector signed int vec_splat_p7(__vector signed int a, const int v){
  return (__vector signed int)vec_splat_p7((__vector float)a, v);
}

static FORCEINLINE __vector unsigned int vec_splat_p7(__vector unsigned int a, const int v){
  return (__vector unsigned int)vec_splat_p7((__vector float)a, v);
}

/**
 * @brief use xxpermdi
 */
// Broadcast doubleword lane `v` (0 or 1) into both lanes. perm_v selects
// {hi,hi} (0) or {lo,lo} (3) for xxpermdi's 2-bit permute immediate.
static FORCEINLINE __vector double vec_splat_p7(__vector double a, const int v){
  if(__builtin_constant_p(v) && v >= 0 && v < 2) {
    __vector double register r;
    const int perm_v = (v == 0 ? 0 : 3);
    asm ("xxpermdi %x[xt], %x[xa], %x[xb], %[im] " : [xt] VSXW(r) : [xa] VSXR(a), [xb] VSXR(a), [im] "i"(perm_v) );
    return r;
  } else {
    double d = vec_extract(a, v);
    __vector double r = {d,d};
    return r;
  }
}

/**
 * @brief use xxpermdi
 */
static FORCEINLINE __vector long long vec_splat_p7(__vector long long a, const int v){
  return (__vector long long)vec_splat_p7((__vector double )a, v);
}

/**
 * @brief use xxpermdi
 */
static FORCEINLINE __vector unsigned long long vec_splat_p7(__vector unsigned long long a, const int v){
  return (__vector unsigned long long)vec_splat_p7((__vector double )a, v);
}


// Broadcast a scalar double (in an FPR, constraint "f") to both lanes.
static FORCEINLINE __vector double vec_smear_p7(double a){
  __vector double register r;
  asm ("xxspltd %x[xt], %x[xa], 0" : [xt] VSXW(r) : [xa] "f"(a) );
  return r;
}

// All-zero float vector via vspltisw 0 (no memory load needed).
static FORCEINLINE __vector float vec_zero_p7(){
  __vector float register r;
  asm ("vspltisw %[xt], 0": [xt] "=v"(r) );
  return r;
}

// Load-and-splat a 64-bit value from memory (lxvdsx duplicates the
// doubleword into both lanes in one instruction).
static FORCEINLINE __vector unsigned long long vec_smear_i64_p7(unsigned long long *ptr) {
  __vector unsigned long long r;
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  return r;
}

static FORCEINLINE __vector long long vec_smear_i64_p7(long long *ptr) {
  __vector long long r;
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  return r;
}

static FORCEINLINE __vector double vec_smear_double_p7(double *ptr) {
  __vector double r;
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  return r;
}

// const-pointer variant; uses explicit base-register addressing ("b").
// NOTE(review): unlike the "Z" form above, this asm has no memory input
// operand, so the compiler cannot see the load's dependence — confirm a
// "memory" clobber is not needed at the call sites.
static FORCEINLINE __vector double vec_smear_const_double_p7(const double *ptr) {
  __vector double r;
  //asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  asm ("lxvdsx %x[xt], 0, %[xb]" : [xt] VSXW(r) : [xb] "b"(ptr) );
  return r;
}

// NOTE(review): takes `const long long *` but returns an unsigned vector —
// presumably intentional bit-level smear; confirm against callers.
static FORCEINLINE __vector unsigned long long vec_smear_const_i64_p7(const long long *ptr) {
  __vector unsigned long long r;
  //asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r) : "Z"(*ptr) );
  asm ("lxvdsx %x[xt], 0, %[xb]" : [xt] VSXW(r) : [xb] "b"(ptr) );
  return r;
}

/**
 *\brief This one is not really a smear constant. Need fix it.
 */
// Loads a full vector then splats word 0; the pointer is to a whole vector,
// not a scalar, hence the original author's caveat above.
static FORCEINLINE __vector float vec_smear_const_float_p7(const __vector float *ptr) {
  __vector float r, r1;
  asm ("lxvw4x %x[xt], 0, %[xb]" : [xt] VSXW(r) : [xb] "b"(ptr) );
  asm ("vspltw %x[xt], %x[xa], %[im]" : [xt] VSXW(r1) : [xa] VSXR(r) , [im] "i"(0));;
  return r1;
}

// Lane-wise single-precision negate.
static FORCEINLINE __vector float vec_neg_p7(__vector float a) {
  __vector float register r;
  asm ("xvnegsp %x[xt], %x[xa]" : [xt] VSXW(r) : [xa] VSXR(a));
  return r;
}


--------------------------------------------------------------------------------
/include/power8_intrinsics.h:
--------------------------------------------------------------------------------
/**
 * IBM Confidential
 */

/**
Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved.

Copyright IBM Corp. 2013, 2013. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.
  * Neither the name of IBM Corp.
nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// VSX register constraints (same convention as power7_intrinsics.h).
#define VSXW "=wa"
#define VSXR "wa"
#define VSXWC "=&wa"

//
//// POWER8 intrinsics
//

#ifdef __POWER8

////////////////////////////////////////////////
// int 64 math/logic operations

// Lane-wise 64-bit add (vaddudm, new in POWER8).
static FORCEINLINE __vector signed long long vec_add_p8(__vector signed long long a,
                                                        __vector signed long long b){
  __vector signed long long register r;
  asm ("vaddudm %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

// Lane-wise 64-bit subtract (vsubudm).
static FORCEINLINE __vector signed long long vec_sub_p8(__vector signed long long a,
                                                        __vector signed long long b){
  __vector signed long long register r;
  asm ("vsubudm %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}


// Lane-wise 64-bit logical shift left; shift counts come from b (vsld).
static FORCEINLINE __vector signed long long vec_sld_p8(__vector signed long long a,
                                                        __vector signed long long b){
  __vector signed long long register r;
  asm ("vsld %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}


// NOTE(review): vpkuhum packs halfwords to bytes; packing two doubleword
// vectors into one word vector — which the signature suggests — would be
// vpkudum (as used by GATHER_WORD_OFF32_P8 below). Confirm the opcode.
static FORCEINLINE __vector unsigned int vec_pack_p8(__vector signed long long a,
                                                     __vector signed long long b){
  __vector unsigned int register r;
  asm ("vpkuhum %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

// NOTE(review): vupkhsw/vupklsw sign-extend words to doublewords although
// the return type is unsigned — presumably a bit-pattern widen; confirm.
static FORCEINLINE __vector unsigned long long vec_unpackh_p8(__vector unsigned int a){
  __vector unsigned long long register r;
  asm ("vupkhsw %[xt], %[xa]" : [xt] "=v"(r) : [xa] "v"(a) );
  return r;
}

static FORCEINLINE __vector unsigned long long vec_unpackl_p8(__vector unsigned int a){
  __vector unsigned long long register r;
  asm ("vupklsw %[xt], %[xa]" : [xt] "=v"(r) : [xa] "v"(a) );
  return r;
}

// Lane-wise 64-bit compare-equal; each lane becomes all-ones or all-zeros.
static FORCEINLINE __vector signed long long vec_cmpeq_p8(__vector signed long long a,
                                                          __vector signed long long b){
  __vector signed long long register r;
  asm ("vcmpequd %[xt], %[xa], %[xb]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

// Bit-select: result bit = mask bit ? b bit : a bit.
static FORCEINLINE __vector unsigned long long vec_sel_p8(__vector unsigned long long a,
                                                          __vector unsigned long long b,
                                                          __vector unsigned long long m /*mask*/){
  __vector unsigned long long register r;
  asm ("vsel %[xt], %[xa], %[xb], %[xc]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b), [xc] "v"(m));
  return r;
}

static FORCEINLINE __vector double vec_sel_p8(__vector double a,__vector double b, __vector double m){
  __vector double register r;
  asm ("vsel %[xt], %[xa], %[xb], %[xc]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b), [xc] "v"(m));
  return r;
}

// GPR -> vector broadcast: mtvsrd/mtvsrwz (direct moves, new in POWER8)
// place the scalar in the VSR, then xxsplt* replicates it to all lanes.
static FORCEINLINE __vector signed int vec_smear_p8(signed int a){
  __vector signed int register r, t;
  asm ("mtvsrwz %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "r"(a) );
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

static FORCEINLINE __vector unsigned int vec_smear_p8(unsigned int a){
  __vector unsigned int register r, t;
  asm ("mtvsrwz %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "r"(a) );
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

// float broadcast: xscvdpspn converts to single-precision in the VSR
// without touching memory, then xxspltw replicates word 0.
static FORCEINLINE __vector float vec_smear_p8(float a){
  __vector float register r, t;
  asm ("xscvdpspn %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "f"(a) );
  asm ("xxspltw %x[xt], %x[xa], 0" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

static FORCEINLINE __vector unsigned long long vec_smear_i64_p8(long long a){
  __vector unsigned long long register r, t;
  asm ("mtvsrd %x[xt], %[xa]" : [xt] VSXW(t) : [xa] "r"(a) );
  asm ("xxspltd %x[xt], %x[xa], 0" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

// Load-and-splat a 32-bit value from memory (lxsiwzx + xxspltw).
static FORCEINLINE __vector float vec_smear_float_p8(float *ptr){
  __vector float register r, t;
  //asm ("lxsiwzx %x[xt], 0, %[xb]" : [xt] VSXW(t) : [xb] "b"(ptr): "memory");
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(t) : "Z"(*ptr));
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}

static FORCEINLINE __vector unsigned int vec_smear_i32_p8(unsigned int *ptr){
  __vector unsigned int register r, t;
  //asm ("lxsiwzx %x[xt], 0, %[xb]" : [xt] VSXW(t) : [xb] "b"(ptr): "memory");
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(t) : "Z"(*ptr));
  asm ("xxspltw %x[xt], %x[xa], 1" : [xt] VSXW(r) : [xa] VSXR(t) );
  return r;
}


// Lane-wise 32-bit multiply, low word kept (vmuluwm, POWER8).
static FORCEINLINE __vector unsigned int vec_mul_p8(__vector unsigned int a,__vector unsigned int b) {
  __vector unsigned int register r;
  asm ("vmuluwm %[xt], %[xa], %[xb]": [xt] "=v"(r) : [xa] "v"(a), [xb] "v"(b) );
  return r;
}

//P8 specific extract
// Move the high doubleword of the vector to a GPR (mfvsrd).
static FORCEINLINE uint64_t vec_extract_l(__vector int a) {
  uint64_t register r;
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(a) );
  return r;
}

// Rotate the low doubleword into the high position (vsldoi by 8 bytes),
// then move it to a GPR.
static FORCEINLINE uint64_t vec_extract_r(__vector int a) {
  uint64_t register r;
  __vector int register tmp;
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(a), [xb] "v"(a),[im] "i"(8) );
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(tmp) );
  return r;
}

static FORCEINLINE uint64_t vec_extract_l(__vector float a) {
  uint64_t register r;
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(a) );
  return r;
}

static FORCEINLINE uint64_t vec_extract_r(__vector float a) {
  uint64_t register r;
  __vector int register tmp;
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(a), [xb] "v"(a),[im] "i"(8) );
  asm ("mfvsrd %[ra], %x[xs]": [ra] "=r"(r) : [xs] VSXR(tmp) );
  return r;
}

// Gather four 32-bit elements from arbitrary addresses: two lxsiwzx loads
// are merged per pair (xxmrghd), then both pairs are packed to words
// (vpkudum).
#define GATHER_WORD_OFF32_P8(TYPE)                                                                   \
static FORCEINLINE __vector TYPE vec_gather_p8(TYPE *ptr0,                                           \
                                               TYPE *ptr1,                                           \
                                               TYPE *ptr2,                                           \
                                               TYPE *ptr3){                                          \
  __vector TYPE register r0,r1,r2,r3;                                                                \
  __vector TYPE register t0,t1;                                                                      \
  __vector TYPE register r;                                                                          \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r0) : "Z"(*ptr0));                                           \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r1) : "Z"(*ptr1));                                           \
  asm ("xxmrghd %x[xt], %x[xa], %x[xb]" : [xt] VSXW(t0) : [xa] VSXR(r0), [xb] VSXR(r1) );            \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r2) : "Z"(*ptr2));                                           \
  asm ("lxsiwzx %x[xt],%y1" : [xt] VSXW(r3) : "Z"(*ptr3));                                           \
  asm ("xxmrghd %x[xt], %x[xa], %x[xb]" : [xt] VSXW(t1) : [xa] VSXR(r2), [xb] VSXR(r3) );            \
  asm ("vpkudum %[xt], %[xa], %[xb]" : [xt] "=v"(r) : [xa] "v"(t0), [xb] "v"(t1) );                  \
  return r;                                                                                          \
}

GATHER_WORD_OFF32_P8(float)
GATHER_WORD_OFF32_P8(signed int)
GATHER_WORD_OFF32_P8(unsigned int)



// Gather two 64-bit elements: lxvdsx splats each source, xxmrghd merges
// the high doublewords.
#define GATHER_D_WORD_OFF32_P8(TYPE)                                                                 \
static FORCEINLINE __vector TYPE vec_gather_p8(TYPE *ptr0,                                           \
                                               TYPE *ptr1){                                          \
  __vector TYPE register r0,r1;                                                                      \
  __vector TYPE register r;                                                                          \
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r0) : "Z"(*ptr0));                                            \
  asm ("lxvdsx %x[xt],%y1" : [xt] VSXW(r1) : "Z"(*ptr1));                                            \
  asm ("xxmrghd %x[xt], %x[xa], %x[xb]" : [xt] VSXW(r) : [xa] VSXR(r0), [xb] VSXR(r1) );             \
  return r;                                                                                          \
}

GATHER_D_WORD_OFF32_P8(double)
GATHER_D_WORD_OFF32_P8(signed long)

//POWER 8 Scatter Intrinsics

// Store one 32-bit lane to memory: rotate the wanted lane to the front
// (vsldoi by IMM bytes), then stxsiwx stores word 1 of the VSR.
// NOTE(review): these expand to non-static, non-inline definitions in a
// header — including this header from two translation units would violate
// the one-definition rule; confirm whether static/FORCEINLINE was intended.
#define SCATTER_WORD_OFF32_P8(TYPE,IMM)                                                              \
void vec_scatter_step_##IMM(TYPE* ptr0, __vector TYPE val){                                          \
  __vector TYPE register tmp;                                                                        \
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(val), [xb] "v"(val),[im] "i"(IMM) ); \
  asm ("stxsiwx %x[xt],%y1" : : [xt] VSXR(tmp), "Z"(*ptr0));                                         \
}

// Lane 0 needs no rotate.
#define SCATTER_WORD_OFF32_Z_P8(TYPE)                                                                \
void vec_scatter_step_0(TYPE* ptr0, __vector TYPE val){                                              \
  asm ("stxsiwx %x[xt],%y1" : : [xt] VSXR(val), "Z"(*ptr0));                                         \
}

SCATTER_WORD_OFF32_Z_P8(float)
SCATTER_WORD_OFF32_P8(float,4)
SCATTER_WORD_OFF32_P8(float,8)
SCATTER_WORD_OFF32_P8(float,12)

SCATTER_WORD_OFF32_Z_P8(signed int)
SCATTER_WORD_OFF32_P8(signed int,4)
SCATTER_WORD_OFF32_P8(signed int,8)
SCATTER_WORD_OFF32_P8(signed int,12)

SCATTER_WORD_OFF32_Z_P8(unsigned int)
SCATTER_WORD_OFF32_P8(unsigned int,4)
SCATTER_WORD_OFF32_P8(unsigned int,8)
SCATTER_WORD_OFF32_P8(unsigned int,12)

// 64-bit scatter: rotate the low doubleword up, then stxsdx.
#define SCATTER_D_WORD_OFF32_P8(TYPE)                                                                \
void vec_scatter_step_8(TYPE* ptr0, __vector TYPE val){                                              \
  __vector TYPE tmp;                                                                                 \
  asm ("vsldoi %[xt], %[xa], %[xb], %[im]": [xt] "=v"(tmp) : [xa] "v"(val), [xb] "v"(val),[im] "i"(8) ); \
  asm ("stxsdx %x[xt],%y1" : : [xt] VSXR(tmp), "Z"(*ptr0));                                          \
}

#define SCATTER_D_WORD_OFF32_Z_P8(TYPE)                                                              \
void vec_scatter_step_0(TYPE* ptr0, __vector TYPE val){                                              \
  asm ("stxsdx %x[xt],%y1" : : [xt] VSXR(val), "Z"(*ptr0));                                          \
}

SCATTER_D_WORD_OFF32_P8(double)
SCATTER_D_WORD_OFF32_P8(signed long)
SCATTER_D_WORD_OFF32_Z_P8(double)
SCATTER_D_WORD_OFF32_Z_P8(signed long)
#endif
--------------------------------------------------------------------------------
/include/svec-vsx.h:
--------------------------------------------------------------------------------
/**
Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved.

Copyright IBM Corp. 2013, 2013. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

  * Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
  * Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.
  * Neither the name of IBM Corp. nor the names of its contributors may be
    used to endorse or promote products derived from this software
    without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | /* 33 | * svec-vsx.h 34 | * 35 | * Created on: Jul 7, 2013 36 | * Author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 37 | */ 38 | 39 | #ifndef SVEC_VSX_H_ 40 | #define SVEC_VSX_H_ 41 | 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | 48 | #include "gsimd_utility.h" 49 | #include "platform_intrinsics.h" 50 | 51 | std::ostream& operator<< (std::ostream &out, uint8_t &v) { 52 | out << uint16_t(v); 53 | return out; 54 | } 55 | 56 | std::ostream& operator<< (std::ostream &out, int8_t &v) { 57 | out << int16_t(v); 58 | return out; 59 | } 60 | 61 | namespace vsx { 62 | 63 | template 64 | class svec_internal { 65 | protected: 66 | FORCEINLINE int lanes_per_reg() { return sizeof(REGTYPE)/sizeof(STYPE);} 67 | FORCEINLINE int regs() { return N/lanes_per_reg();} 68 | 69 | 70 | FORCEINLINE svec_internal() {} 71 | 72 | FORCEINLINE svec_internal(const REGTYPE vva[]) { 73 | for(int i=0; i < regs() ; i++) { 74 | va[i] = vva[i]; 75 | } 76 | } 77 | 78 | FORCEINLINE svec_internal(const STYPE v) { 79 | REGTYPE t; 80 | switch(lanes_per_reg()) { 81 | case 2: {//uint64_t, int64_t, double for 128bit{ 82 | t = REGTYPE(v,v); 83 | } 84 | break; 85 | case 4: {// 86 | t = REGTYPE(v,v,v,v); 87 | } 88 | break; 89 | case 8: { 90 | t = REGTYPE(v,v,v,v,v,v,v,v); 91 | } 92 | break; 93 | case 16: { // 94 | t = REGTYPE(v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v); 95 | } 96 | break; 97 | case 32: {//suppose 256bit SIMD 
for 8 bit 98 | t = REGTYPE(v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v, 99 | v,v,v,v,v,v,v,v,v,v,v,v,v,v,v,v); 100 | } 101 | break; 102 | } //switch 103 | for(int i=0; i < N; i+=lanes_per_reg()) { 104 | va[i/lanes_per_reg()] = t; 105 | } //for 106 | } 107 | 108 | 109 | FORCEINLINE svec_internal(const STYPE v[]) { 110 | for(int i=0; i < N; i+=lanes_per_reg()) { 111 | REGTYPE t; 112 | switch(lanes_per_reg()) { 113 | case 2: {//uint64_t, int64_t, double for 128bit{ 114 | t = REGTYPE(v[i], v[i+1]); 115 | } 116 | break; 117 | case 4: {// 118 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3]); 119 | } 120 | break; 121 | case 8: { 122 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3], v[i+4], v[i+5], v[i+6], v[i+7]); 123 | } 124 | break; 125 | case 16: { // 126 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3], v[i+4], v[i+5], v[i+6], v[i+7], 127 | v[i+8], v[i+9], v[i+10], v[i+11], v[i+12], v[i+13], v[i+14], v[i+15]); 128 | } 129 | break; 130 | case 32: {//suppose 256bit SIMD for 8 bit 131 | t = REGTYPE(v[i], v[i+1], v[i+2], v[i+3], v[i+4], v[i+5], v[i+6], v[i+7], 132 | v[i+8], v[i+9], v[i+10], v[i+11], v[i+12], v[i+13], v[i+14], v[i+15], 133 | v[i+16], v[i+17], v[i+18], v[i+19], v[i+20], v[i+21], v[i+22], v[i+23], 134 | v[i+24], v[i+25], v[i+26], v[i+27], v[i+28], v[i+29], v[i+30], v[i+31]); 135 | } 136 | break; 137 | } //switch 138 | va[i/lanes_per_reg()] = t; 139 | } //for 140 | } 141 | 142 | public: 143 | /** 144 | * @brief Internal use for get the storage register 145 | */ 146 | FORCEINLINE REGTYPE & reg(int index) { return va[index];} 147 | FORCEINLINE const REGTYPE & reg(int index) const { return va[index];} 148 | 149 | FORCEINLINE STYPE& operator[](int index) {return ((STYPE *)va)[index];} 150 | FORCEINLINE const STYPE& operator[](int index) const {return ((STYPE *)va)[index]; } 151 | 152 | friend std::ostream& operator<< (std::ostream &out, const svec_internal &v) { 153 | out << "svec_"<< iu_get_type_name() << "<" << N << ">["; 154 | stdout_scalar(out, v[0]); 155 | for(int i = 1; i < N ; i++) { 
156 | out << ", "; 157 | stdout_scalar(out, v[i]); 158 | } 159 | out << "]"; 160 | return out; 161 | } 162 | 163 | REGTYPE va[N/(sizeof(REGTYPE)/sizeof(STYPE))]; 164 | }; 165 | 166 | 167 | template 168 | class svec_bool: public svec_internal { 169 | 170 | public: 171 | FORCEINLINE svec_bool() { } 172 | FORCEINLINE svec_bool(const __vector uint32_t vva[]) : svec_internal(vva) { } 173 | 174 | /** 175 | * @brief bool type's initial function need set each element full bits, either 0 or 0xFFFFFFFF. 176 | * @param v an array of bool values. 177 | * @return a svec_bool type object 178 | */ 179 | FORCEINLINE svec_bool(const bool v[]) { 180 | for(int i=0; i < N; i+=4) { 181 | __vector uint32_t t = { v[i] ? -1 : 0, v[i+1] ? -1 : 0, 182 | v[i+2] ? -1 : 0, v[i+3] ? -1 : 0 }; 183 | this->va[i>>2] = t; 184 | } 185 | } 186 | 187 | FORCEINLINE svec_bool(const bool &v0, const bool &v1, const bool &v2, const bool &v3) { 188 | __vector uint32_t t = {v0 ? -1 : 0, v1 ? -1 : 0, v2 ? -1 : 0, v3 ? -1 : 0}; 189 | this->va[0] = t; 190 | } 191 | FORCEINLINE svec_bool(const bool &v0, const bool &v1, const bool &v2, const bool &v3, 192 | const bool &v4, const bool &v5, const bool &v6, const bool &v7) { 193 | __vector uint32_t t0 = {v0 ? -1 : 0, v1 ? -1 : 0, v2 ? -1 : 0, v3 ? -1 : 0}; 194 | this->va[0] = t0; 195 | __vector uint32_t t1 = {v4 ? -1 : 0, v5 ? -1 : 0, v6 ? -1 : 0, v7 ? -1 : 0}; 196 | this->va[1] = t1; 197 | } 198 | FORCEINLINE svec_bool(const bool& v) { 199 | if(__builtin_constant_p(v)){ 200 | __vector uint32_t t = (v) ? 
vec_splat_s32(-1) : vec_splat_s32(0); 201 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 202 | this->va[i/this->lanes_per_reg()] = t; 203 | } //fo 204 | } else { 205 | svec_internal(v); 206 | } 207 | } 208 | }; 209 | 210 | 211 | template 212 | class svec_i8: public svec_internal { 213 | 214 | public: 215 | FORCEINLINE svec_i8() { } 216 | FORCEINLINE svec_i8(const __vector int8_t vva[]) : svec_internal(vva) {} 217 | FORCEINLINE svec_i8(const int8_t v[]) : svec_internal(v) {} 218 | FORCEINLINE svec_i8(const int8_t& v0, const int8_t& v1, const int8_t& v2, const int8_t& v3) { 219 | __vector int8_t t = {v0, v1, v2, v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 220 | this->va[0] = t; 221 | } 222 | FORCEINLINE svec_i8(const int8_t& v0, const int8_t& v1, const int8_t& v2, const int8_t& v3, 223 | const int8_t& v4, const int8_t& v5, const int8_t& v6, const int8_t& v7) { 224 | __vector int8_t t = {v0, v1, v2, v3, v4, v5, v6, v7, 0, 0, 0, 0, 0, 0, 0, 0}; 225 | this->va[0] = t; 226 | } 227 | FORCEINLINE svec_i8(const int8_t& v) { 228 | if(__builtin_constant_p(v) && (v <= 15) && (v >= -16)){ 229 | __vector int8_t t = vec_splat_s8(v); //will gen one instr.vspltisb 230 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 231 | this->va[i/this->lanes_per_reg()] = t; 232 | } //fo 233 | } else { 234 | svec_internal(v); 235 | } 236 | } 237 | 238 | }; 239 | 240 | template 241 | class svec_u8: public svec_internal { 242 | 243 | public: 244 | FORCEINLINE svec_u8() { } 245 | FORCEINLINE svec_u8(const __vector uint8_t vva[]) : svec_internal(vva) {} 246 | FORCEINLINE svec_u8(const uint8_t v[]) : svec_internal(v) {} 247 | FORCEINLINE svec_u8(const uint8_t& v0, const uint8_t& v1, const uint8_t& v2, const uint8_t& v3) { 248 | __vector uint8_t t = {v0, v1, v2, v3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 249 | this->va[0] = t; 250 | } 251 | FORCEINLINE svec_u8(const uint8_t& v0, const uint8_t& v1, const uint8_t& v2, const uint8_t& v3, 252 | const uint8_t& v4, const uint8_t& v5, const uint8_t& v6, const 
uint8_t& v7) { 253 | __vector uint8_t t = {v0, v1, v2, v3, v4, v5, v6, v7, 0, 0, 0, 0, 0, 0, 0, 0}; 254 | this->va[0] = t; 255 | } 256 | FORCEINLINE svec_u8(const uint8_t& v) { 257 | if(__builtin_constant_p(v) && (v <= 15)){ 258 | __vector uint8_t t = vec_splat_u8(v); //will gen one instr.vspltisb 259 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 260 | this->va[i/this->lanes_per_reg()] = t; 261 | } //fo 262 | } else { 263 | svec_internal(v); 264 | } 265 | } 266 | }; 267 | 268 | template 269 | class svec_i16: public svec_internal { 270 | 271 | public: 272 | FORCEINLINE svec_i16() { } 273 | FORCEINLINE svec_i16(const __vector int16_t vva[]) : svec_internal(vva) {} 274 | FORCEINLINE svec_i16(const int16_t v[]) : svec_internal(v) {} 275 | FORCEINLINE svec_i16(const int16_t& v0, const int16_t& v1, const int16_t& v2, const int16_t& v3) { 276 | __vector int16_t t = {v0, v1, v2, v3, 0, 0, 0, 0}; 277 | this->va[0] = t; 278 | } 279 | FORCEINLINE svec_i16(const int16_t& v0, const int16_t& v1, const int16_t& v2, const int16_t& v3, 280 | const int16_t& v4, const int16_t& v5, const int16_t& v6, const int16_t& v7) { 281 | __vector int16_t t = {v0, v1, v2, v3, v4, v5, v6, v7}; 282 | this->va[0] = t; 283 | } 284 | FORCEINLINE svec_i16(const int16_t& v) { 285 | if(__builtin_constant_p(v) && (v <= 15) && (v >= -16)){ 286 | __vector int16_t t = vec_splat_s16(v); //will gen one instr.vspltisb 287 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 288 | this->va[i/this->lanes_per_reg()] = t; 289 | } //fo 290 | } else { 291 | svec_internal(v); 292 | } 293 | } 294 | }; 295 | 296 | template 297 | class svec_u16: public svec_internal { 298 | 299 | public: 300 | FORCEINLINE svec_u16() { } 301 | FORCEINLINE svec_u16(const __vector uint16_t vva[]) : svec_internal(vva) {} 302 | FORCEINLINE svec_u16(const uint16_t v[]) : svec_internal(v) {} 303 | FORCEINLINE svec_u16(const uint16_t& v0, const uint16_t& v1, const uint16_t& v2, const uint16_t& v3) { 304 | __vector uint16_t t = {v0, v1, v2, v3, 
0, 0, 0, 0}; 305 | this->va[0] = t; 306 | } 307 | FORCEINLINE svec_u16(const uint16_t& v0, const uint16_t& v1, const uint16_t& v2, const uint16_t& v3, 308 | const uint16_t& v4, const uint16_t& v5, const uint16_t& v6, const uint16_t& v7) { 309 | __vector uint16_t t = {v0, v1, v2, v3, v4, v5, v6, v7}; 310 | this->va[0] = t; 311 | } 312 | FORCEINLINE svec_u16(const uint16_t& v) { 313 | if(__builtin_constant_p(v) && (v <= 15)){ 314 | __vector uint16_t t = vec_splat_u16(v); //will gen one instr.vspltisb 315 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 316 | this->va[i/this->lanes_per_reg()] = t; 317 | } //fo 318 | } else { 319 | svec_internal(v); 320 | } 321 | } 322 | }; 323 | 324 | template 325 | class svec_i32: public svec_internal { 326 | 327 | public: 328 | FORCEINLINE svec_i32() { } 329 | FORCEINLINE svec_i32(const __vector int32_t vva[]) : svec_internal(vva) {} 330 | FORCEINLINE svec_i32(const int32_t v[]) : svec_internal(v) {} 331 | FORCEINLINE svec_i32(const int32_t& v0, const int32_t& v1, const int32_t& v2, const int32_t& v3) { 332 | __vector int32_t t = {v0, v1, v2, v3}; 333 | this->va[0] = t; 334 | } 335 | FORCEINLINE svec_i32(const int32_t& v0, const int32_t& v1, const int32_t& v2, const int32_t& v3, 336 | const int32_t& v4, const int32_t& v5, const int32_t& v6, const int32_t& v7) { 337 | __vector int32_t t0 = {v0, v1, v2, v3}; 338 | this->va[0] = t0; 339 | __vector int32_t t1 = {v4, v5, v6, v7}; 340 | this->va[1] = t1; 341 | } 342 | FORCEINLINE svec_i32(const int32_t& v) { 343 | if(__builtin_constant_p(v) && (v <= 15) && (v >= -16)){ 344 | __vector int32_t t = vec_splat_s32(v); //will gen one instr.vspltisb 345 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 346 | this->va[i/this->lanes_per_reg()] = t; 347 | } //fo 348 | } else { 349 | svec_internal(v); 350 | } 351 | } 352 | }; 353 | 354 | template 355 | class svec_u32: public svec_internal { 356 | 357 | public: 358 | FORCEINLINE svec_u32() { } 359 | FORCEINLINE svec_u32(const __vector uint32_t 
vva[]) : svec_internal(vva) {} 360 | FORCEINLINE svec_u32(const uint32_t v[]) : svec_internal(v) {} 361 | FORCEINLINE svec_u32(const uint32_t& v0, const uint32_t& v1, const uint32_t& v2, const uint32_t& v3) { 362 | __vector uint32_t t = {v0, v1, v2, v3}; 363 | this->va[0] = t; 364 | } 365 | FORCEINLINE svec_u32(const uint32_t& v0, const uint32_t& v1, const uint32_t& v2, const uint32_t& v3, 366 | const uint32_t& v4, const uint32_t& v5, const uint32_t& v6, const uint32_t& v7) { 367 | __vector uint32_t t0 = {v0, v1, v2, v3}; 368 | this->va[0] = t0; 369 | __vector uint32_t t1 = {v4, v5, v6, v7}; 370 | this->va[1] = t1; 371 | } 372 | FORCEINLINE svec_u32(const uint32_t& v) { 373 | if(__builtin_constant_p(v) && (v <= 15)){ 374 | __vector uint32_t t = vec_splat_u32(v); //BUGFIX: was vec_splat_u8, which splats an 8-bit immediate into each byte (0x0v0v0v0v per word); vec_splat_u32 gens one instr. vspltisw 375 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 376 | this->va[i/this->lanes_per_reg()] = t; 377 | } //for 378 | } else { 379 | svec_internal(v); 380 | } 381 | } 382 | }; 383 | 384 | 385 | template 386 | class svec_i64: public svec_internal { 387 | 388 | public: 389 | FORCEINLINE svec_i64() { } 390 | FORCEINLINE svec_i64(const __vector int64_t vva[]) : svec_internal(vva) {} 391 | FORCEINLINE svec_i64(const int64_t v[]) : svec_internal(v) {} 392 | FORCEINLINE svec_i64(const int64_t& v0, const int64_t& v1, const int64_t& v2, const int64_t& v3) { 393 | __vector int64_t t0 = {v0, v1}; 394 | this->va[0] = t0; 395 | __vector int64_t t1 = {v2, v3}; 396 | this->va[1] = t1; 397 | } 398 | FORCEINLINE svec_i64(const int64_t& v0, const int64_t& v1, const int64_t& v2, const int64_t& v3, 399 | const int64_t& v4, const int64_t& v5, const int64_t& v6, const int64_t& v7) { 400 | __vector int64_t t0 = {v0, v1}; 401 | this->va[0] = t0; 402 | __vector int64_t t1 = {v2, v3}; 403 | this->va[1] = t1; 404 | __vector int64_t t2 = {v4, v5}; 405 | this->va[2] = t2; 406 | __vector int64_t t3 = {v6, v7}; 407 | this->va[3] = t3; 408 | } 409 | FORCEINLINE svec_i64(const int64_t& v) { 410 | 
if(__builtin_constant_p(v)){ 411 | __vector int64_t t; 412 | #ifdef __POWER8 413 | if ((v >= -16l) && (v <= 15l)) { 414 | const int iv = (int)v; 415 | __vector signed int x = {iv,iv,iv,iv}; 416 | t = vec_unpackh_p8(x); 417 | } else 418 | #endif 419 | t = (__vector int64_t)(v,v); 420 | 421 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 422 | this->va[i/this->lanes_per_reg()] = t; 423 | } //for 424 | } else { 425 | svec_internal(v); 426 | } 427 | } 428 | }; 429 | 430 | template 431 | class svec_u64: public svec_internal { 432 | 433 | public: 434 | FORCEINLINE svec_u64() { } 435 | FORCEINLINE svec_u64(const __vector uint64_t vva[]) : svec_internal(vva) {} 436 | FORCEINLINE svec_u64(const uint64_t v[]) : svec_internal(v) {} 437 | FORCEINLINE svec_u64(const uint64_t& v0, const uint64_t& v1, const uint64_t& v2, const uint64_t& v3) { 438 | __vector uint64_t t0 = {v0, v1}; 439 | this->va[0] = t0; 440 | __vector uint64_t t1 = {v2, v3}; 441 | this->va[1] = t1; 442 | } 443 | FORCEINLINE svec_u64(const uint64_t& v0, const uint64_t& v1, const uint64_t& v2, const uint64_t& v3, 444 | const uint64_t& v4, const uint64_t& v5, const uint64_t& v6, const uint64_t& v7) { 445 | __vector uint64_t t0 = {v0, v1}; 446 | this->va[0] = t0; 447 | __vector uint64_t t1 = {v2, v3}; 448 | this->va[1] = t1; 449 | __vector uint64_t t2 = {v4, v5}; 450 | this->va[2] = t2; 451 | __vector uint64_t t3 = {v6, v7}; 452 | this->va[3] = t3; 453 | } 454 | FORCEINLINE svec_u64(const uint64_t& v) { 455 | if(__builtin_constant_p(v)){ 456 | __vector uint64_t t; 457 | #ifdef __POWER8 458 | if ((v >= 0ul) && (v <= 31ul)) { 459 | const int iv = (int)v; 460 | __vector signed int x = {iv,iv,iv,iv}; 461 | t = vec_unpackh_p8(x); 462 | } else 463 | #endif 464 | t = (__vector uint64_t)(v,v); 465 | 466 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 467 | this->va[i/this->lanes_per_reg()] = t; 468 | } //for 469 | } else { 470 | svec_internal(v); 471 | } 472 | } 473 | }; 474 | 475 | 476 | template 477 | class svec_f: 
public svec_internal { 478 | 479 | public: 480 | FORCEINLINE svec_f() { } 481 | FORCEINLINE svec_f(const __vector float vva[]) : svec_internal(vva) {} 482 | FORCEINLINE svec_f(const float v[]) : svec_internal(v) {} 483 | FORCEINLINE svec_f(const float& v0, const float& v1, const float& v2, const float& v3) { 484 | __vector float t = {v0, v1, v2, v3}; 485 | this->va[0] = t; 486 | } 487 | FORCEINLINE svec_f(const float& v0, const float& v1, const float& v2, const float& v3, 488 | const float& v4, const float& v5, const float& v6, const float& v7) { 489 | __vector float t0 = {v0, v1, v2, v3}; 490 | this->va[0] = t0; 491 | __vector float t1 = {v4, v5, v6, v7}; 492 | this->va[1] = t1; 493 | } 494 | FORCEINLINE svec_f(const float& v) { 495 | if(__builtin_constant_p(v)){ 496 | __vector float t; 497 | float p; int iv; 498 | p = 1.0; iv = (int)(p*v); 499 | if (( (((float)iv)/p) == v ) && (iv >= -16) && (iv <= 15)) { //BUGFIX: was (iv >= -32); vec_splat_s32/vspltisw takes a 5-bit signed immediate, valid range -16..15 (matches the checks in the p=2.0 and p=4.0 branches below) 500 | t = vec_ctf(vec_splat_s32(iv),0); 501 | } else { 502 | p = 2.0; iv = (int)(p*v); 503 | if (( (((float)iv)/p) == v ) && (iv >= -16) && (iv <= 15)) { 504 | t = vec_ctf(vec_splat_s32(iv),1); 505 | } else { 506 | p = 4.0; iv = (int)(p*v); 507 | if (( (((float)iv)/p) == v ) && (iv >= -16) && (iv <= 15)) { 508 | t = vec_ctf(vec_splat_s32(iv),2); 509 | } else { 510 | t = (__vector float)(v, v, v, v); 511 | } 512 | } 513 | } 514 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 515 | this->va[i/this->lanes_per_reg()] = t; 516 | } //for 517 | } else { //use built-in constructor 518 | svec_internal(v); 519 | } 520 | } 521 | 522 | }; 523 | 524 | template 525 | class svec_d: public svec_internal { 526 | 527 | public: 528 | FORCEINLINE svec_d() { } 529 | FORCEINLINE svec_d(const __vector double vva[]) : svec_internal(vva) {} 530 | FORCEINLINE svec_d(const double v[]) : svec_internal(v) {} 531 | FORCEINLINE svec_d(const double& v0, const double& v1, const double& v2, const double& v3) { 532 | __vector double t0 = {v0, v1}; 533 | this->va[0] = t0; 534 | __vector double 
t1 = {v2, v3}; 535 | this->va[1] = t1; 536 | } 537 | FORCEINLINE svec_d(const double& v0, const double& v1, const double& v2, const double& v3, 538 | const double& v4, const double& v5, const double& v6, const double& v7) { 539 | __vector double t0 = {v0, v1}; 540 | this->va[0] = t0; 541 | __vector double t1 = {v2, v3}; 542 | this->va[1] = t1; 543 | __vector double t2 = {v4, v5}; 544 | this->va[2] = t2; 545 | __vector double t3 = {v6, v7}; 546 | this->va[3] = t3; 547 | } 548 | FORCEINLINE svec_d(const double& v) { 549 | __vector double t = vec_smear_p7(v); 550 | for(int i=0; i < N; i+=this->lanes_per_reg()) { 551 | this->va[i/this->lanes_per_reg()] = t; 552 | } //for 553 | } 554 | }; 555 | 556 | 557 | ////////////Section of class member functions 558 | 559 | 560 | 561 | 562 | } //namespace vsx 563 | 564 | 565 | #endif /* SVEC_VSX_H_ */ 566 | -------------------------------------------------------------------------------- /include/timing.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | 32 | The original source code covered by the above license above has been 33 | modified significantly by IBM Corp. 34 | Copyright 2013 the Generic SIMD Intrinsic Library project authors. All rights reserved. 35 | 36 | Copyright (c) 2010-2012, Intel Corporation 37 | All rights reserved. 38 | 39 | Redistribution and use in source and binary forms, with or without 40 | modification, are permitted provided that the following conditions are 41 | met: 42 | 43 | * Redistributions of source code must retain the above copyright 44 | notice, this list of conditions and the following disclaimer. 45 | 46 | * Redistributions in binary form must reproduce the above copyright 47 | notice, this list of conditions and the following disclaimer in the 48 | documentation and/or other materials provided with the distribution. 49 | 50 | * Neither the name of Intel Corporation nor the names of its 51 | contributors may be used to endorse or promote products derived from 52 | this software without specific prior written permission. 
53 | 54 | 55 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 56 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 57 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 58 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 59 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 60 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 61 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 62 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 63 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 64 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 65 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 66 | */ 67 | 68 | #include 69 | 70 | 71 | #ifdef WIN32 72 | #include 73 | #define rdtsc __rdtsc 74 | #else 75 | #ifdef __cplusplus 76 | extern "C" { 77 | #endif /* __cplusplus */ 78 | 79 | #include 80 | 81 | __inline__ uint64_t rdtsc() { 82 | 83 | #ifdef __PPC__ 84 | uint32_t tbl, tbu0, tbu1; 85 | do { 86 | __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); 87 | __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); 88 | __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); 89 | } while (tbu0 != tbu1); 90 | 91 | return (((uint64_t)tbu0) << 32) | tbl; 92 | #else 93 | 94 | uint32_t low, high; 95 | #ifdef __x86_64 96 | __asm__ __volatile__ ( 97 | "xorl %%eax,%%eax \n cpuid" 98 | ::: "%rax", "%rbx", "%rcx", "%rdx" ); 99 | #else 100 | __asm__ __volatile__ ( 101 | "xorl %%eax,%%eax \n cpuid" 102 | ::: "%eax", "%ebx", "%ecx", "%edx" ); 103 | #endif 104 | __asm__ __volatile__ ( 105 | "rdtsc" : "=a" (low), "=d" (high)); 106 | return (uint64_t)high << 32 | low; 107 | #endif 108 | } 109 | #ifdef __cplusplus 110 | } 111 | #endif /* __cplusplus */ 112 | #endif 113 | 114 | //rdtsc based, used in simulator or must set thread affinity first 115 | static uint64_t start, end; 116 | 117 | static 
inline void reset_and_start_timer() 118 | { 119 | start = rdtsc(); 120 | } 121 | 122 | /* Returns the number of millions of elapsed processor cycles since the 123 | last reset_and_start_timer() call. */ 124 | static inline double get_elapsed_mcycles() 125 | { 126 | end = rdtsc(); 127 | return (end-start) / (1024. * 1024.); 128 | } 129 | 130 | //timeofday based, used in real hardware 131 | static double start_s; 132 | 133 | static inline double get_usec() 134 | { 135 | struct timeval tim; 136 | gettimeofday(&tim, 0); 137 | return tim.tv_sec+(tim.tv_usec/1000000.0); 138 | } 139 | 140 | /* 141 | * Start timer 142 | * Simulator (__POWER8 defined): use rdtsc register 143 | * Real machine: use gettimeofday() 144 | */ 145 | static inline void reset_and_start_stimer() 146 | { 147 | #ifdef __POWER8 148 | reset_and_start_timer(); 149 | #else 150 | start_s = get_usec(); 151 | #endif 152 | } 153 | 154 | /* 155 | * End timer and report 156 | * Simulator (__POWER8 defined): use rdtsc register 157 | * Real machine: use gettimeofday() 158 | */ 159 | static inline double get_elapsed_seconds() 160 | { 161 | #ifdef __POWER8 162 | return get_elapsed_mcycles(); 163 | #else 164 | return get_usec() - start_s; 165 | #endif 166 | } 167 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | GXX = g++ 2 | #GXX = /opt/at5.0/bin/g++ 3 | ECHO=echo 4 | 5 | GTEST_DIR=gtest-1.6.0 6 | SIMD_TARGETS=vsx4 generic4 sse4 7 | 8 | CXXFLAGS=-I../include -Wno-int-to-pointer-cast -flax-vector-conversions -g 9 | #disable assert failure 10 | CXXFLAGS+= -DNDEBUG 11 | ########### The below section is used for new test on gtest framework 12 | 13 | help: 14 | @-$(ECHO) "Usage: 'make ' to build/run unit-tests for a target SIMD platform" 15 | @-$(ECHO) " where are: ${SIMD_TARGETS}" 16 | @-$(ECHO) " e.g., use 'make clean; make vsx4' to build/run vsx4 unit tests" 17 | 18 | # Need build the 
libgtest.a 19 | check_googletest: 20 | @if [ ! -d ${GTEST_DIR} ]; then \ 21 | $(ECHO) "${GTEST_DIR} does not exist, please refer to README to install googletest."; \ 22 | exit 2; \ 23 | fi 24 | 25 | libgtest.a: check_googletest 26 | ${GXX} -I${GTEST_DIR}/include -I${GTEST_DIR} -c ${GTEST_DIR}/src/gtest-all.cc 27 | ar -rv $@ gtest-all.o 28 | 29 | vsx4: test_lanes4.cpp libgtest.a ../include/power_vsx4.h 30 | ${GXX} -I${GTEST_DIR}/include -l pthread -mvsx ${CXXFLAGS} $^ -o $@ 31 | ./$@ 32 | 33 | generic4: test_lanes4.cpp libgtest.a 34 | ${GXX} -I${GTEST_DIR}/include -l pthread ${CXXFLAGS} $^ -o $@ 35 | ./$@ 36 | 37 | generic8: test_lanes8.cpp libgtest.a 38 | ${GXX} -I${GTEST_DIR}/include -l pthread ${CXXFLAGS} $^ -o $@ 39 | ./$@ 40 | 41 | sse4: test_lanes4.cpp libgtest.a ../include/sse4.h 42 | ${GXX} -I${GTEST_DIR}/include -l pthread -msse4.2 ${CXXFLAGS} $^ -o $@ 43 | ./$@ 44 | 45 | #test_svec: test_svec.cpp libgtest.a ../include/svec-vsx.h 46 | # ${GXX} -I${GTEST_DIR}/include -l pthread -mvsx ${CXXFLAGS} $^ -o $@ 47 | 48 | 49 | #codegen: codegen.cpp 50 | # ${GXX} -mno-vrsave -mvsx ${CXXFLAGS} $< -O2 -S 51 | # ${GXX} -mno-vrsave -mvsx ${CXXFLAGS} $< -O2 -c -g -Wa,-a,-ad > $@.cs 52 | 53 | clean: 54 | rm -f *.o *.exe core *~ ${TARGETS} *.output *.a 55 | -------------------------------------------------------------------------------- /tests/README: -------------------------------------------------------------------------------- 1 | The tests are dependent on google test framework. Due to the 2 | opensource license issue, we don't include it in our source tree. 3 | 4 | Please download googletest framework first from 5 | https://code.google.com/p/googletest/ 6 | and unzip it into "tests/gtest-1.6.0" directory. 7 | Alternatively one can modify the "GTEST_DIR" value in 8 | tests/Makefile. 
9 | 10 | -------------------------------------------------------------------------------- /tests/codegen.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /** 34 | * @brief Test the intrinsics' code mapping with our own code mapping 35 | * codegen.cpp 36 | * 37 | * Created on: Jul 12, 2013 38 | * Author: Haichuan 39 | */ 40 | 41 | #include 42 | using namespace vsx; 43 | 44 | static char mem[128] POST_ALIGN(16); 45 | static svec4_i1* p_vi1 = (svec4_i1*)mem; 46 | static svec4_i32* p_vi32 = (svec4_i32*)mem; 47 | static svec4_i64* p_vi64 = (svec4_i64*)mem; 48 | static svec4_f* p_vf = (svec4_f*)mem; 49 | static svec4_d* p_vd = (svec4_d*)mem; 50 | 51 | const svec4_i32 base_off(0,1,2,3); 52 | 53 | FORCEINLINE svec4_d gather(double* base, svec4_i32 off) { 54 | int* off_addr = (int*)(&(off.v)); 55 | 56 | double d0 = *(base + svec_extract(off, 0)); 57 | double d1 = *(base + svec_extract(off, 1)); 58 | double d2 = *(base + svec_extract(off, 2)); 59 | double d3 = *(base + svec_extract(off, 3)); 60 | return svec4_d(d0, d1, d2, d3); 61 | } 62 | 63 | svec4_d test_gather(int scale) { 64 | svec4_i32 off_ip = scale * base_off; 65 | return gather((double*)mem, off_ip); 66 | } 67 | 68 | FORCEINLINE svec4_d gather_opt(double* base, svec4_i32 off) { 69 | int* off_addr = (int*)(&(off.v)); 70 | 71 | double d0 = *(base + off_addr[0]); 72 | double d1 = *(base + off_addr[1]); 73 | double d2 = *(base + off_addr[2]); 74 | double d3 = *(base + off_addr[3]); 75 | return svec4_d(d0, d1, d2, d3); 76 | } 77 | 78 | svec4_d test_gather_opt(int scale) { 79 | svec4_i32 off_ip = scale * base_off; 80 | return gather_opt((double*)mem, off_ip); 81 | } 82 | 83 | 84 | FORCEINLINE svec4_d gather_stride(double* base, int off0, int off1, int off2, int off3) { 85 | double d0 = *(base + off0); 86 | double d1 = *(base + off1); 87 | double d2 = *(base + off2); 88 | double d3 = *(base + off3); 89 | return svec4_d(d0, d1, d2, d3); 90 | } 91 | 92 | svec4_d test_gather_stride(int scale) { 93 | return gather_stride((double*)mem, 0, scale*1, scale*2, scale*3); 94 | } 95 | 96 | FORCEINLINE svec4_d gather_stride2(double* base, long long off, long long 
stride) { 97 | long long stride2 = stride * 2; 98 | double d0 = *(base + off); 99 | double d1 = *(base + off+stride); 100 | double d2 = *(base + off+stride2); 101 | double d3 = *(base + off+stride2+stride); 102 | return svec4_d(d0, d1, d2, d3); 103 | } 104 | 105 | svec4_d test_gather_stride2(int scale) { 106 | return gather_stride2((double*)mem, scale, (long long)scale); 107 | } 108 | 109 | 110 | FORCEINLINE svec4_d gather_stride3(double* base, long long stride) { 111 | double d0 = *base; 112 | base += stride; 113 | double d1 = *base; 114 | base += stride; 115 | double d2 = *base; 116 | base += stride; 117 | double d3 = *base; 118 | return svec4_d(d0, d1, d2, d3); 119 | } 120 | 121 | svec4_d test_gather_stride3(int scale) { 122 | return gather_stride3((double*)(mem+scale), (long long)scale); 123 | } 124 | 125 | FORCEINLINE svec4_d gather_stride4(double* base, long long off, long long stride) { 126 | base += off; 127 | double d0 = *base; 128 | base += stride; 129 | double d1 = *base; 130 | base += stride; 131 | double d2 = *base; 132 | base += stride; 133 | double d3 = *base; 134 | return svec4_d(d0, d1, d2, d3); 135 | } 136 | 137 | svec4_d test_gather_stride4(int scale) { 138 | return gather_stride4((double*)mem, scale, (long long)scale); 139 | } 140 | 141 | FORCEINLINE svec4_d gather_stride5(double* base, long long stride) { 142 | long long stride2 = stride * 2; 143 | double d0 = *(base); 144 | double d1 = *(base + stride); 145 | double d2 = *(base + stride2); 146 | double d3 = *(base + stride2+stride); 147 | return svec4_d(d0, d1, d2, d3); 148 | } 149 | 150 | svec4_d test_gather_stride5(int scale) { 151 | return gather_stride5((double*)(mem+scale), (long long)scale); 152 | } 153 | 154 | 155 | int test_access(svec4_i1 v) { 156 | // li 0,48 157 | // addi 9,1,-64 158 | // stxvw4x 34,9,0 159 | //.LBE21: 160 | // .loc 1 20 0 161 | // lwa 3,-4(1) 162 | // blr 163 | 164 | // int i = v_i32[3]; 165 | 166 | 167 | // li 0,48 168 | // addi 9,1,-64 169 | // stxvw4x 34,9,0 170 
| //.LBE18: 171 | // .loc 1 30 0 172 | // lwa 3,-4(1) 173 | // blr 174 | // int i = vec_extract(v_i32.v, 3); 175 | 176 | 177 | // li 0,100 178 | // ld 11,.LC1@toc(2) 179 | // stw 0,-16(1) 180 | // addi 10,1,-64 181 | //.LBB28: 182 | //.LBB29: 183 | //.LBB30: 184 | // .file 2 "../include/power_vsx4.h" 185 | // .loc 2 1065 0 186 | // ld 9,.LC2@toc(2) 187 | //.LBE30: 188 | //.LBE29: 189 | //.LBE28: 190 | // .loc 1 39 0 191 | // li 0,48 192 | // lxvw4x 33,0,11 193 | // lvewx 0,10,0 194 | // .loc 1 57 0 195 | // li 3,0 196 | // .loc 1 39 0 197 | // vperm 0,2,0,1 198 | //.LVL1: 199 | //.LBB33: 200 | //.LBB32: 201 | //.LBB31: 202 | // .loc 2 1065 0 203 | // stxvw4x 32,0,9 204 | //.LBE31: 205 | //.LBE32: 206 | //.LBE33: 207 | // .loc 1 57 0 208 | // blr 209 | 210 | v[3] = 15; 211 | 212 | int r = v[2]; 213 | 214 | 215 | // .loc 1 43 0 216 | // li 3,0 217 | //.LBB31: 218 | //.LBB30: 219 | //.LBB29: 220 | // .loc 2 1065 0 221 | // stxvw4x 34,0,9 222 | //.LBE29: 223 | //.LBE30: 224 | //.LBE31: 225 | // .loc 1 43 0 226 | // blr 227 | 228 | // vec_insert(100, v_i32.v, 3); 229 | v.store(p_vi1); 230 | 231 | return r; 232 | } 233 | 234 | void test_broadcasts_64(svec4_d v_d) { 235 | // v_d[0] = 1.1; 236 | 237 | // li 0,16 238 | // .loc 2 1107 0 239 | // lfd 13,-16(1) 240 | // xxpermdi 0,13,13,0 241 | // stxvd2x 0,0,9 242 | // .loc 2 1108 0 243 | // stxvd2x 0,9,0 244 | // __vector double splat_d = vec_splat_p7(v_d.v[0], 0); 245 | // svec4_d nvd(splat_d, splat_d); 246 | 247 | 248 | // ld 9,.LC3@toc(2) 249 | //.LBE118: 250 | //.LBE117: 251 | //.LBE116: 252 | //.LBE115: 253 | // .loc 1 97 0 254 | // lxvd2x 0,11,0 255 | //.LVL3: 256 | //.LBB130: 257 | //.LBB123: 258 | //.LBB121: 259 | //.LBB119: 260 | // .loc 2 1174 0 261 | // li 0,16 262 | //.LVL4: 263 | //.LBE119: 264 | //.LBE121: 265 | //.LBE123: 266 | //.LBB124: 267 | //.LBB125: 268 | //.LBB126: 269 | //.LBB127: 270 | //.LBB128: 271 | // .file 3 "../include/power9_intrinsics.h" 272 | // .loc 3 746 0 273 | //#APP 274 | //# 746 
"../include/power9_intrinsics.h" 1 275 | // xxpermdi 0, 0, 0, 0 276 | //# 0 "" 2 277 | //.LVL5: 278 | //#NO_APP 279 | //.LBE128: 280 | //.LBE127: 281 | //.LBE126: 282 | //.LBE125: 283 | //.LBE124: 284 | //.LBB129: 285 | //.LBB122: 286 | //.LBB120: 287 | // .loc 2 1173 0 288 | // stxvd2x 0,0,9 289 | // .loc 2 1174 0 290 | // stxvd2x 0,9,0 291 | svec4_d nvd = v_d.broadcast(0); 292 | 293 | nvd.store(p_vd); 294 | // DUMP(nvd); 295 | 296 | } 297 | 298 | 299 | void test_broadcasts_32(svec4_i32 v_i32) { 300 | // li 0,48 301 | // addi 9,1,-80 302 | // stxvw4x 34,9,0 303 | // lwz 0,-24(1) 304 | //.LVL3: 305 | //.LBB66: 306 | //.LBB67: 307 | //.LBB68: 308 | //.LBB69: 309 | // .loc 2 1065 0 310 | // addi 11,1,-80 311 | // ld 9,.LC3@toc(2) 312 | // stw 0,-16(1) 313 | // stw 0,-12(1) 314 | // stw 0,-8(1) 315 | // stw 0,-4(1) 316 | // li 0,64 317 | //.LVL4: 318 | // lxvw4x 32,11,0 319 | // stxvw4x 32,0,9 320 | //.LBE69: 321 | //.LBE68: 322 | //.LBE67: 323 | //.LBE66: 324 | // .loc 1 99 0 325 | // blr 326 | 327 | // svec4_i32 vi = v_i32.broadcast(2); 328 | 329 | 330 | 331 | // li 0,48 332 | // addi 9,1,-80 333 | // stxvw4x 34,9,0 334 | // lwz 0,-24(1) 335 | //.LVL3: 336 | //.LBB46: 337 | //.LBB47: 338 | //.LBB48: 339 | // .loc 2 1065 0 340 | // addi 11,1,-80 341 | // ld 9,.LC3@toc(2) 342 | // stw 0,-16(1) 343 | // stw 0,-12(1) 344 | // stw 0,-8(1) 345 | // stw 0,-4(1) 346 | // li 0,64 347 | //.LVL4: 348 | // lxvw4x 32,11,0 349 | // stxvw4x 32,0,9 350 | // v_i32[2] = 100; 351 | // svec4_i32 vi = svec4_i32(vec_splats(vec_extract(v_i32.v, 2))); 352 | 353 | 354 | // .loc 2 1065 0 355 | // ld 9,.LC3@toc(2) 356 | //.LBE53: 357 | //.LBE52: 358 | //.LBE51: 359 | //.LBB56: 360 | //.LBB57: 361 | // .file 3 "../include/power9_intrinsics.h" 362 | // .loc 3 734 0 363 | //#APP 364 | //# 734 "../include/power9_intrinsics.h" 1 365 | // xxspltw 34, 34,2 366 | //# 0 "" 2 367 | //.LVL3: 368 | //#NO_APP 369 | //.LBE57: 370 | //.LBE56: 371 | //.LBB58: 372 | //.LBB55: 373 | //.LBB54: 374 | // .loc 2 
1065 0 375 | // stxvw4x 34,0,9 376 | //.LBE54: 377 | //.LBE55: 378 | //.LBE58: 379 | //.LBE50: 380 | // .loc 1 156 0 381 | // blr 382 | 383 | svec4_i32 vi = svec4_i32(vec_splat_p7(v_i32.v, 2)); 384 | 385 | // DUMP(vi); 386 | 387 | vi.store(p_vi32); 388 | } 389 | 390 | 391 | void test_splats(int i) { 392 | //integer 393 | // svec4_i32 i0(i+1); 394 | // i0.store(p_vi32); 395 | 396 | // //float 397 | // svec4_f f0(0.25f); 398 | // f0.store(p_vf+2); 399 | // 400 | // //integer 401 | 402 | // .loc 2 1107 0 403 | // std 3,-16(1) 404 | // .loc 2 1108 0 405 | // li 0,16 406 | // .loc 2 1107 0 407 | // lfd 13,-16(1) 408 | // xxpermdi 0,13,13,0 409 | // stxvd2x 0,0,9 410 | // .loc 2 1108 0 411 | svec4_i64 i1(vec_splats((signed long long)(i+2)), vec_splats((signed long long)(i+2))); 412 | i1.store(p_vi64); 413 | 414 | // 415 | // //float 416 | // svec4_f f1(vec_splats(0.5f)); 417 | // f1.store(p_vf+3); 418 | } 419 | 420 | 421 | 422 | 423 | int main(int argc, char* argv[]) 424 | { 425 | int j = 0; 426 | svec4_i32 v_i32 = * p_vi32; 427 | svec4_d v_d = * p_vd; 428 | //test_splats(argc); 429 | svec4_i1 v_i1 = * p_vi1; 430 | test_access(v_i1); 431 | // test_broadcasts_32(v_i32); 432 | // test_broadcasts_64(v_d); 433 | 434 | DUMP(test_gather(argc+1)); 435 | DUMP(test_gather_opt(argc+1)); 436 | DUMP(test_gather_stride2(argc+1)); 437 | DUMP(test_gather_stride3(argc+1)); 438 | DUMP(test_gather_stride4(argc+1)); 439 | DUMP(test_gather_stride5(argc+1)); 440 | 441 | return 0; 442 | } 443 | 444 | 445 | -------------------------------------------------------------------------------- /tests/test_svec.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 
5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | */ 32 | 33 | /* 34 | * test_svec-vsx.cpp 35 | * 36 | * Created on: Jul 7, 2013 37 | * Author: Haichuan Wang (haichuan@us.ibm.com, hwang154@illinois.edu) 38 | */ 39 | 40 | #include <gtest/gtest.h> 41 | #include <svec-vsx.h> 42 | 43 | using namespace vsx; 44 | 45 | #define EXPECT_VEC_EQ(v1, v2) EXPECT_TRUE(vec_all_eq(v1, v2)) 46 | #define DUMP(v) std::cout << #v << ":" << (v) << std::endl 47 | 48 | 49 | 50 | TEST(svec_bool, ConstructorByScalars) 51 | { 52 | 53 | __vector unsigned int t = { -1, 0, -1, 0}; 54 | svec_bool<4> v1(1, 0, 1, 0); 55 | EXPECT_VEC_EQ(v1.reg(0), t); 56 | 57 | svec_bool<8> v2(1, 0, 1, 0, 1, 0, 1, 0); 58 | EXPECT_VEC_EQ(v2.reg(0), t); 59 | EXPECT_VEC_EQ(v2.reg(1), t); 60 | 61 | bool a[] = {1, 0, 1, 0}; 62 | svec_bool<4> v3(a); 63 | EXPECT_VEC_EQ(v3.reg(0), t); 64 | 65 | __vector uint32_t va[] = { t, t }; 66 | svec_bool<8> v4(va); 67 | EXPECT_VEC_EQ(v4.reg(0), t); 68 | EXPECT_VEC_EQ(v4.reg(1), t); 69 | } 70 | 71 | TEST(svec_8, ConstructorByScalars) 72 | { 73 | 74 | svec_i8<4> v1(100,0,-50,1); 75 | __vector int8_t t = { 100, 0, -50, 1, 0,0,0,0, 0,0,0,0, 0,0,0,0}; 76 | DUMP(v1); 77 | EXPECT_VEC_EQ(v1.reg(0), t); 78 | 79 | } 80 | 81 | 82 | int main(int argc, char* argv[]) 83 | { 84 | testing::InitGoogleTest(&argc, argv); 85 | return RUN_ALL_TESTS(); 86 | } 87 | -------------------------------------------------------------------------------- /tests/test_utility.h: -------------------------------------------------------------------------------- 1 | /** 2 | Copyright 2012 the Generic SIMD Intrinsic Library project authors. All rights reserved. 3 | 4 | Copyright IBM Corp. 2013, 2013. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are 8 | met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 
12 | * Redistributions in binary form must reproduce the above 13 | copyright notice, this list of conditions and the following 14 | disclaimer in the documentation and/or other materials provided 15 | with the distribution. 16 | * Neither the name of IBM Corp. nor the names of its contributors may be 17 | used to endorse or promote products derived from this software 18 | without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | */ 32 | 33 | /** 34 | * test_utility.h 35 | * 36 | * Created on: Aug 16, 2013 37 | * @author: Haichuan Wang (hwang154@illinois.edu) 38 | * @brief: common functions for test different lanes. 
39 | */ 40 | 41 | #ifndef TEST_UTILITY_H_ 42 | #define TEST_UTILITY_H_ 43 | 44 | #define EXPECT_SVEC_EQ(v1, v2) EXPECT_TRUE(((v1) == (v2)).all_true()) 45 | #define EXPECT_SVEC_MASKED_EQ(v1, v2, mask) EXPECT_TRUE((svec_masked_equal((v1), (v2), (mask)) == mask).all_true()) 46 | 47 | /** 48 | * @brief macros for check float equal 49 | */ 50 | #define EXPECT_SVEC_FEQ(v1, v2) EXPECT_TRUE( \ 51 | (v1 - v2).abs().reduce_add() < 0.005 * LANES) 52 | 53 | 54 | #define DUMP(v) std::cout << #v << ":" << (v) << std::endl 55 | 56 | template <typename VTYPE, typename STYPE, int LANES> 57 | VTYPE random_vec(int maxValue) { 58 | VTYPE vec; 59 | for (int i=0; i 68 | VTYPE random_vec() { 69 | return random_vec<VTYPE, STYPE, LANES>(-1); 70 | } 71 | 72 | template <typename VTYPE, typename VTYPE2, int LANES> 73 | VTYPE ref_shr(VTYPE val, VTYPE2 s) { 74 | VTYPE ret; 75 | for(int i = 0; i < LANES; i++) { 76 | ret[i] = val[i] >> s[i]; 77 | } 78 | return ret; 79 | } 80 | 81 | template <typename VTYPE, int LANES> 82 | VTYPE ref_shr(VTYPE val, int s) { 83 | VTYPE ret; 84 | for(int i = 0; i < LANES; i++) { 85 | ret[i] = val[i] >> s; 86 | } 87 | return ret; 88 | } 89 | 90 | 91 | template <typename VTYPE, typename VTYPE2, int LANES> 92 | VTYPE ref_shl(VTYPE val, VTYPE2 s) { 93 | VTYPE ret; 94 | for(int i = 0; i < LANES; i++) { 95 | ret[i] = val[i] << s[i]; 96 | } 97 | return ret; 98 | } 99 | 100 | template <typename VTYPE, int LANES> 101 | VTYPE ref_shl(VTYPE val, int s) { 102 | VTYPE ret; 103 | for(int i = 0; i < LANES; i++) { 104 | ret[i] = val[i] << s; 105 | } 106 | return ret; 107 | } 108 | 109 | 110 | 111 | template <typename TO, typename FROM, typename STO, int LANES> 112 | TO ref_cast(FROM val) { 113 | TO ret; 114 | for(int i = 0; i < LANES; i++) { 115 | ret[i] = (STO)val[i]; 116 | } 117 | return ret; 118 | } 119 | 120 | 121 | #endif /* TEST_UTILITY_H_ */ 122 | -------------------------------------------------------------------------------- /tools/allgroupspower7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | LOG=$1_pmc.log 3 | echo "run $1 to get pmc" | tee $LOG 4 | for group in {0..260} 5 | do 6 | ./grouppower7.sh $group $1 2>&1 | tee -a $LOG 7 | done 8 | 
-------------------------------------------------------------------------------- /tools/grouppower7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/ksh 2 | g=$1 3 | shift 4 | echo "************************* group $g" 1>&2 5 | case $g in 6 | 0) perf stat -e r1001E,r200F4,r300F2,r40002,r500fa,r600f4 $*;; 7 | 1) perf stat -e r140A0,r240A2,r340A4,r440AE,r500fa,r600f4 $*;; 8 | 2) perf stat -e r1409C,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 9 | 3) perf stat -e r10068,r20004,r3409C,r400F6,r500fa,r600f4 $*;; 10 | 4) perf stat -e r140AC,r2409E,r340AE,r440A4,r500fa,r600f4 $*;; 11 | 5) perf stat -e r148AA,r248AE,r3409C,r440A8,r500fa,r600f4 $*;; 12 | 6) perf stat -e r140A0,r240A2,r340A8,r440AA,r500fa,r600f4 $*;; 13 | 7) perf stat -e r140AC,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 14 | 8) perf stat -e r140AE,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 15 | 9) perf stat -e r140A4,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 16 | 10) perf stat -e r100F6,r2D090,r3D092,r4D890,r500fa,r600f4 $*;; 17 | 11) perf stat -e r15088,r20066,r300FC,r400FC,r500fa,r600f4 $*;; 18 | 12) perf stat -e r1C05E,r2C05E,r3C05E,r4C05E,r500fa,r600f4 $*;; 19 | 13) perf stat -e r1C05C,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 20 | 14) perf stat -e r10002,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 21 | 15) perf stat -e r1D090,r200FE,r3C05A,r400F0,r500fa,r600f4 $*;; 22 | 16) perf stat -e r1001E,r2C058,r3C05A,r400FA,r500fa,r600f4 $*;; 23 | 17) perf stat -e r1001E,r2C058,r3C05A,r4C058,r500fa,r600f4 $*;; 24 | 18) perf stat -e r1D090,r24048,r30002,r400FA,r500fa,r600f4 $*;; 25 | 19) perf stat -e r100F6,r2D090,r3D092,r40002,r500fa,r600f4 $*;; 26 | 20) perf stat -e r1C050,r2E050,r3C056,r4E054,r500fa,r600f4 $*;; 27 | 21) perf stat -e r1E050,r2E054,r3E054,r4C054,r500fa,r600f4 $*;; 28 | 22) perf stat -e r1C054,r2C058,r3E052,r4C052,r500fa,r600f4 $*;; 29 | 23) perf stat -e r1E052,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 30 | 24) perf stat -e r1C052,r2C056,r3C054,r4C056,r500fa,r600f4 $*;; 31 | 
25) perf stat -e r1E054,r2E052,r3E056,r4E052,r500fa,r600f4 $*;; 32 | 26) perf stat -e r1E054,r2E056,r3E056,r4E056,r500fa,r600f4 $*;; 33 | 27) perf stat -e r1E050,r2E058,r3E052,r4E058,r500fa,r600f4 $*;; 34 | 28) perf stat -e r1C050,r2C050,r3C052,r4C058,r500fa,r600f4 $*;; 35 | 29) perf stat -e r1C050,r2C050,r30002,r4001E,r500fa,r600f4 $*;; 36 | 30) perf stat -e r1C052,r2C054,r30002,r4C054,r500fa,r600f4 $*;; 37 | 31) perf stat -e r10002,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 38 | 32) perf stat -e r1006E,r20006,r3000C,r4000C,r500fa,r600f4 $*;; 39 | 33) perf stat -e r1006E,r20006,r30006,r4000C,r500fa,r600f4 $*;; 40 | 34) perf stat -e r1C880,r2C080,r3C082,r4D0A6,r500fa,r600f4 $*;; 41 | 35) perf stat -e r12088,r2208A,r3208C,r400F8,r500fa,r600f4 $*;; 42 | 36) perf stat -e r12086,r22082,r3208E,r4C0AA,r500fa,r600f4 $*;; 43 | 37) perf stat -e r12082,r2001E,r30012,r400F8,r500fa,r600f4 $*;; 44 | 38) perf stat -e r1C8B0,r2C8B4,r3C8B8,r4C8BC,r500fa,r600f4 $*;; 45 | 39) perf stat -e r1C8B0,r2C0B0,r3C0B2,r400F8,r500fa,r600f4 $*;; 46 | 40) perf stat -e r1C8B4,r2C0B4,r3C0B6,r400F8,r500fa,r600f4 $*;; 47 | 41) perf stat -e r1C8B8,r2C0B8,r3C0BA,r400F8,r500fa,r600f4 $*;; 48 | 42) perf stat -e r1C8BC,r2C0BC,r3C0BE,r400F8,r500fa,r600f4 $*;; 49 | 43) perf stat -e r10018,r2408A,r34096,r4408E,r500fa,r600f4 $*;; 50 | 44) perf stat -e r100FA,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 51 | 45) perf stat -e r10012,r2000C,r300F4,r440B0,r500fa,r600f4 $*;; 52 | 46) perf stat -e r10062,r20060,r30060,r440B4,r500fa,r600f4 $*;; 53 | 47) perf stat -e r140B2,r20062,r30062,r40062,r500fa,r600f4 $*;; 54 | 48) perf stat -e r140B0,r240B2,r340B4,r440B6,r500fa,r600f4 $*;; 55 | 49) perf stat -e r10060,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 56 | 50) perf stat -e r1000E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 57 | 51) perf stat -e r10004,r200F4,r30002,r40004,r500fa,r600f4 $*;; 58 | 52) perf stat -e r1001E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 59 | 53) perf stat -e r1000E,r2000E,r3001E,r40002,r500fa,r600f4 $*;; 60 
| 54) perf stat -e r16280,r26280,r36280,r46282,r500fa,r600f4 $*;; 61 | 55) perf stat -e r16382,r2001E,r36380,r40002,r500fa,r600f4 $*;; 62 | 56) perf stat -e r16280,r26280,r36282,r46280,r500fa,r600f4 $*;; 63 | 57) perf stat -e r16082,r26080,r30002,r4001E,r500fa,r600f4 $*;; 64 | 58) perf stat -e r10002,r2001E,r36182,r46182,r500fa,r600f4 $*;; 65 | 59) perf stat -e r10002,r2001E,r36180,r46180,r500fa,r600f4 $*;; 66 | 60) perf stat -e r16282,r26282,r30002,r4001E,r500fa,r600f4 $*;; 67 | 61) perf stat -e r10081,r20081,r30081,r40081,r500fa,r600f4 $*;; 68 | 62) perf stat -e r10083,r20083,r30083,r40083,r500fa,r600f4 $*;; 69 | 63) perf stat -e r10881,r20881,r30881,r40881,r500fa,r600f4 $*;; 70 | 64) perf stat -e r10883,r20883,r30883,r40883,r500fa,r600f4 $*;; 71 | 65) perf stat -e r14098,r2409A,r34088,r44082,r500fa,r600f4 $*;; 72 | 66) perf stat -e r1C040,r200F2,r300F6,r400F2,r500fa,r600f4 $*;; 73 | 67) perf stat -e r1C048,r2001E,r300F6,r40002,r500fa,r600f4 $*;; 74 | 68) perf stat -e r1C042,r2C044,r300F6,r40002,r500fa,r600f4 $*;; 75 | 69) perf stat -e r10064,r2C0AC,r3C0AE,r4C8AC,r500fa,r600f4 $*;; 76 | 70) perf stat -e r10064,r20064,r3C8A8,r40008,r500fa,r600f4 $*;; 77 | 71) perf stat -e r1C8A8,r2C0A8,r3001E,r40002,r500fa,r600f4 $*;; 78 | 72) perf stat -e r1C8A4,r2C0A4,r3C0A6,r40002,r500fa,r600f4 $*;; 79 | 73) perf stat -e r1C88C,r2C08C,r3C08E,r40002,r500fa,r600f4 $*;; 80 | 74) perf stat -e r100F8,r20008,r34086,r4001E,r500fa,r600f4 $*;; 81 | 75) perf stat -e r1209C,r2209E,r320A0,r420A2,r500fa,r600f4 $*;; 82 | 76) perf stat -e r16180,r26182,r30002,r4001E,r500fa,r600f4 $*;; 83 | 77) perf stat -e r16182,r26180,r30002,r4001E,r500fa,r600f4 $*;; 84 | 78) perf stat -e r10006,r20006,r30006,r400F2,r500fa,r600f4 $*;; 85 | 79) perf stat -e r10016,r20006,r30006,r40006,r500fa,r600f4 $*;; 86 | 80) perf stat -e r12092,r22094,r32096,r42098,r500fa,r600f4 $*;; 87 | 81) perf stat -e r1006E,r2006E,r3006E,r4006E,r500fa,r600f4 $*;; 88 | 82) perf stat -e r100F2,r200F2,r3000A,r400F2,r500fa,r600f4 $*;; 
89 | 83) perf stat -e r100F2,r2001E,r30002,r400F2,r500fa,r600f4 $*;; 90 | 84) perf stat -e r14888,r2488C,r34890,r44898,r500fa,r600f4 $*;; 91 | 85) perf stat -e r14090,r24092,r34094,r44890,r500fa,r600f4 $*;; 92 | 86) perf stat -e r100F6,r200FC,r30002,r4001E,r500fa,r600f4 $*;; 93 | 87) perf stat -e r1C040,r20016,r300F6,r40018,r500fa,r600f4 $*;; 94 | 88) perf stat -e r1000E,r20014,r30004,r40014,r500fa,r600f4 $*;; 95 | 89) perf stat -e r10026,r20012,r3001A,r40016,r500fa,r600f4 $*;; 96 | 90) perf stat -e r100F4,r20018,r3003E,r40012,r500fa,r600f4 $*;; 97 | 91) perf stat -e r10028,r2001C,r3003F,r4000A,r500fa,r600f4 $*;; 98 | 92) perf stat -e r1001C,r2003C,r30002,r4001C,r500fa,r600f4 $*;; 99 | 93) perf stat -e r100F8,r2001A,r30014,r4001A,r500fa,r600f4 $*;; 100 | 94) perf stat -e r1C040,r2C040,r3C042,r4C042,r500fa,r600f4 $*;; 101 | 95) perf stat -e r1C048,r2C046,r3C04A,r4C048,r500fa,r600f4 $*;; 102 | 96) perf stat -e r1C04A,r2C048,r3C046,r4C048,r500fa,r600f4 $*;; 103 | 97) perf stat -e r1C044,r2C044,r3C04C,r4C044,r500fa,r600f4 $*;; 104 | 98) perf stat -e r1C04E,r2C042,r3C044,r4C046,r500fa,r600f4 $*;; 105 | 99) perf stat -e r1C042,r2C044,r3C04E,r4C048,r500fa,r600f4 $*;; 106 | 100) perf stat -e r1C04C,r2C048,r3C04C,r4C044,r500fa,r600f4 $*;; 107 | 101) perf stat -e r10002,r2C040,r300FE,r4C042,r500fa,r600f4 $*;; 108 | 102) perf stat -e r1C040,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 109 | 103) perf stat -e r1C042,r2C044,r3C044,r4C044,r500fa,r600f4 $*;; 110 | 104) perf stat -e r1C040,r200FE,r300FE,r400FA,r500fa,r600f4 $*;; 111 | 105) perf stat -e r1C042,r2C042,r3C042,r4C042,r500fa,r600f4 $*;; 112 | 106) perf stat -e r1C05C,r20002,r3C044,r4C044,r500fa,r600f4 $*;; 113 | 107) perf stat -e r1C04A,r20002,r3C042,r4C042,r500fa,r600f4 $*;; 114 | 108) perf stat -e r1C04A,r20002,r300F6,r4C042,r500fa,r600f4 $*;; 115 | 109) perf stat -e r14040,r24040,r3404A,r44048,r500fa,r600f4 $*;; 116 | 110) perf stat -e r14048,r24042,r3404C,r44042,r500fa,r600f4 $*;; 117 | 111) perf stat -e 
r1404A,r24048,r34044,r44044,r500fa,r600f4 $*;; 118 | 112) perf stat -e r14044,r24046,r34046,r44046,r500fa,r600f4 $*;; 119 | 113) perf stat -e r1404E,r24044,r3404E,r44048,r500fa,r600f4 $*;; 120 | 114) perf stat -e r14046,r24048,r3404A,r44048,r500fa,r600f4 $*;; 121 | 115) perf stat -e r14042,r24044,r34044,r44044,r500fa,r600f4 $*;; 122 | 116) perf stat -e r1404C,r24048,r3404A,r44048,r500fa,r600f4 $*;; 123 | 117) perf stat -e r14046,r24042,r34042,r44042,r500fa,r600f4 $*;; 124 | 118) perf stat -e r14040,r24040,r30002,r4001E,r500fa,r600f4 $*;; 125 | 119) perf stat -e r14042,r24044,r3404A,r40002,r500fa,r600f4 $*;; 126 | 120) perf stat -e r1001E,r20002,r34044,r44044,r500fa,r600f4 $*;; 127 | 121) perf stat -e r1404A,r20002,r34042,r44042,r500fa,r600f4 $*;; 128 | 122) perf stat -e r1D8A8,r2D8AC,r3D8B4,r4D8B8,r500fa,r600f4 $*;; 129 | 123) perf stat -e r1D8BC,r2C880,r30066,r400F0,r500fa,r600f4 $*;; 130 | 124) perf stat -e r1A080,r2A082,r3A098,r4A09A,r500fa,r600f4 $*;; 131 | 125) perf stat -e r1A09C,r2A09E,r3A0A0,r4A0A2,r500fa,r600f4 $*;; 132 | 126) perf stat -e r1A898,r2A88C,r3A08C,r4A08E,r500fa,r600f4 $*;; 133 | 127) perf stat -e r1A084,r2A086,r3A884,r40002,r500fa,r600f4 $*;; 134 | 128) perf stat -e r1A090,r2A092,r3A890,r40002,r500fa,r600f4 $*;; 135 | 129) perf stat -e r1B880,r2B080,r3B082,r40002,r500fa,r600f4 $*;; 136 | 130) perf stat -e r1A8AC,r2A0AC,r3A0AE,r40002,r500fa,r600f4 $*;; 137 | 131) perf stat -e r1A8BC,r2A0BC,r3A0BE,r40002,r500fa,r600f4 $*;; 138 | 132) perf stat -e r1B88C,r2B08C,r3B08E,r40002,r500fa,r600f4 $*;; 139 | 133) perf stat -e r1A8A8,r2A0A8,r3A0AA,r4A0A4,r500fa,r600f4 $*;; 140 | 134) perf stat -e r1A888,r2A088,r3A08A,r40002,r500fa,r600f4 $*;; 141 | 135) perf stat -e r1A894,r2A094,r3A096,r40002,r500fa,r600f4 $*;; 142 | 136) perf stat -e r1B888,r2B088,r3B08A,r40002,r500fa,r600f4 $*;; 143 | 137) perf stat -e r1B884,r2B084,r3B086,r40002,r500fa,r600f4 $*;; 144 | 138) perf stat -e r1A880,r2A89C,r3A8A0,r4A898,r500fa,r600f4 $*;; 145 | 139) perf stat -e 
r1B890,r2B090,r3B09C,r40002,r500fa,r600f4 $*;; 146 | 140) perf stat -e r1B894,r2B094,r3B096,r4B0A0,r500fa,r600f4 $*;; 147 | 141) perf stat -e r1B098,r2B09A,r3B092,r4B09E,r500fa,r600f4 $*;; 148 | 142) perf stat -e r1A8B0,r2A0B0,r3A0B2,r40002,r500fa,r600f4 $*;; 149 | 143) perf stat -e r1A8B4,r2A0B4,r3A0B6,r40002,r500fa,r600f4 $*;; 150 | 144) perf stat -e r1A8B8,r2A0B8,r3A0BA,r40002,r500fa,r600f4 $*;; 151 | 145) perf stat -e r10068,r200F4,r30002,r4A8BC,r500fa,r600f4 $*;; 152 | 146) perf stat -e r1C884,r2B88C,r3A884,r4A880,r500fa,r600f4 $*;; 153 | 147) perf stat -e r1A888,r2A8BC,r3A884,r4A880,r500fa,r600f4 $*;; 154 | 148) perf stat -e r100F4,r2A8BC,r3A8B8,r4A880,r500fa,r600f4 $*;; 155 | 149) perf stat -e r1B88C,r2A8BC,r3A8B4,r4A8B0,r500fa,r600f4 $*;; 156 | 150) perf stat -e r1D0A4,r2003E,r3001C,r40008,r500fa,r600f4 $*;; 157 | 151) perf stat -e r10066,r2C090,r30066,r4208E,r500fa,r600f4 $*;; 158 | 152) perf stat -e r1D098,r2D09A,r3D0A0,r4D0A4,r500fa,r600f4 $*;; 159 | 153) perf stat -e r1C8A0,r2C0A0,r3C0A2,r40002,r500fa,r600f4 $*;; 160 | 154) perf stat -e r1D096,r2D097,r3D09C,r40002,r500fa,r600f4 $*;; 161 | 155) perf stat -e r1D09C,r2D09E,r3D0A0,r40002,r500fa,r600f4 $*;; 162 | 156) perf stat -e r1D0A1,r2D09F,r3D09D,r40002,r500fa,r600f4 $*;; 163 | 157) perf stat -e r1D8B8,r2D0B8,r3D0BA,r40002,r500fa,r600f4 $*;; 164 | 158) perf stat -e r16480,r26480,r3001E,r40002,r500fa,r600f4 $*;; 165 | 159) perf stat -e r16482,r26482,r3001E,r40002,r500fa,r600f4 $*;; 166 | 160) perf stat -e r100F0,r24080,r30016,r40002,r500fa,r600f4 $*;; 167 | 161) perf stat -e r12080,r200F8,r300F8,r4001E,r500fa,r600f4 $*;; 168 | 162) perf stat -e r100F2,r2000A,r300F2,r400F2,r500fa,r600f4 $*;; 169 | 163) perf stat -e r1000C,r2001A,r3001E,r4001C,r500fa,r600f4 $*;; 170 | 164) perf stat -e r1000A,r248AE,r340A4,r400F6,r500fa,r600f4 $*;; 171 | 165) perf stat -e r1408C,r2408E,r3488C,r40002,r500fa,r600f4 $*;; 172 | 166) perf stat -e r10038,r2000A,r3001E,r40066,r500fa,r600f4 $*;; 173 | 167) perf stat -e 
r140A6,r200F8,r300F6,r400F6,r500fa,r600f4 $*;; 174 | 168) perf stat -e r12084,r22086,r3C0A8,r400F6,r500fa,r600f4 $*;; 175 | 169) perf stat -e r1001A,r2D8A8,r3D8B8,r44084,r500fa,r600f4 $*;; 176 | 170) perf stat -e r100F4,r2001E,r30004,r40002,r500fa,r600f4 $*;; 177 | 171) perf stat -e r10002,r200F0,r300F8,r400F8,r500fa,r600f4 $*;; 178 | 172) perf stat -e r100F8,r200F0,r300FC,r400F6,r500fa,r600f4 $*;; 179 | 173) perf stat -e r1001E,r2001E,r30002,r40066,r500fa,r600f4 $*;; 180 | 174) perf stat -e r1D0A2,r2004A,r300F6,r4004A,r500fa,r600f4 $*;; 181 | 175) perf stat -e r10028,r2C09C,r3C09E,r4004C,r500fa,r600f4 $*;; 182 | 176) perf stat -e r10068,r200F0,r3D054,r4004E,r500fa,r600f4 $*;; 183 | 177) perf stat -e r10000,r2001E,r3D094,r40002,r500fa,r600f4 $*;; 184 | 178) perf stat -e r10014,r2001E,r30014,r40002,r500fa,r600f4 $*;; 185 | 179) perf stat -e r1D094,r2001E,r3209A,r40002,r500fa,r600f4 $*;; 186 | 180) perf stat -e r1001E,r228A4,r320A4,r420A6,r500fa,r600f4 $*;; 187 | 181) perf stat -e r1F080,r2F080,r3F080,r4F080,r500fa,r600f4 $*;; 188 | 182) perf stat -e r15080,r25082,r35084,r45086,r500fa,r600f4 $*;; 189 | 183) perf stat -e r1D0AC,r2D0AE,r3D8AC,r4D8B8,r500fa,r600f4 $*;; 190 | 184) perf stat -e r1F082,r2F082,r3F082,r4F082,r500fa,r600f4 $*;; 191 | 185) perf stat -e r1001E,r2D8B4,r3D0B4,r4D0B6,r500fa,r600f4 $*;; 192 | 186) perf stat -e r1001E,r2D8BC,r3D0BC,r4D0BE,r500fa,r600f4 $*;; 193 | 187) perf stat -e r1D0B0,r2D8A8,r3D0A8,r4D0AA,r500fa,r600f4 $*;; 194 | 188) perf stat -e r1C094,r2C096,r3001E,r4C894,r500fa,r600f4 $*;; 195 | 189) perf stat -e r1001E,r2C884,r3C084,r4C086,r500fa,r600f4 $*;; 196 | 190) perf stat -e r1001E,r2C888,r3C088,r4C08A,r500fa,r600f4 $*;; 197 | 191) perf stat -e r16080,r26082,r3F080,r4001E,r500fa,r600f4 $*;; 198 | 192) perf stat -e r1C894,r2C8AC,r3C098,r4C09A,r500fa,r600f4 $*;; 199 | 193) perf stat -e r1508A,r25088,r3C098,r4C09A,r500fa,r600f4 $*;; 200 | 194) perf stat -e r140B8,r240BA,r3001E,r40002,r500fa,r600f4 $*;; 201 | 195) perf stat -e 
r100F0,r200F2,r30016,r40002,r500fa,r600f4 $*;; 202 | 196) perf stat -e r16880,r26880,r36082,r46080,r500fa,r600f4 $*;; 203 | 197) perf stat -e r10002,r2001E,r36080,r46080,r500fa,r600f4 $*;; 204 | 198) perf stat -e r10002,r2001E,r36482,r400FA,r500fa,r600f4 $*;; 205 | 199) perf stat -e r10002,r2001E,r36382,r46382,r500fa,r600f4 $*;; 206 | 200) perf stat -e r10002,r2001E,r36480,r400FA,r500fa,r600f4 $*;; 207 | 201) perf stat -e r10002,r200F4,r3001E,r46380,r500fa,r600f4 $*;; 208 | 202) perf stat -e r100F6,r240BC,r340BE,r40002,r500fa,r600f4 $*;; 209 | 203) perf stat -e r12090,r220A8,r3001E,r40002,r500fa,r600f4 $*;; 210 | 204) perf stat -e r1001E,r20006,r30008,r40002,r500fa,r600f4 $*;; 211 | 205) perf stat -e r10008,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 212 | 206) perf stat -e r10010,r20010,r30010,r40010,r500fa,r600f4 $*;; 213 | 207) perf stat -e r10024,r20010,r30024,r40010,r500fa,r600f4 $*;; 214 | 208) perf stat -e r10020,r200F4,r30020,r40002,r500fa,r600f4 $*;; 215 | 209) perf stat -e r10022,r200F4,r30022,r40002,r500fa,r600f4 $*;; 216 | 210) perf stat -e r1208A,r22096,r3D0B2,r40002,r500fa,r600f4 $*;; 217 | 211) perf stat -e r100F6,r200FC,r300F0,r400F0,r500fa,r600f4 $*;; 218 | 212) perf stat -e r1001E,r200F6,r300FC,r400FC,r500fa,r600f4 $*;; 219 | 213) perf stat -e r100FA,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 220 | 214) perf stat -e r100F4,r200F4,r3001E,r400FA,r500fa,r600f4 $*;; 221 | 215) perf stat -e r100F2,r200F4,r300F2,r400F2,r500fa,r600f4 $*;; 222 | 216) perf stat -e r10002,r200F0,r300F0,r400F0,r500fa,r600f4 $*;; 223 | 217) perf stat -e r10002,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 224 | 218) perf stat -e r100F6,r200FC,r30002,r400FC,r500fa,r600f4 $*;; 225 | 219) perf stat -e r10000,r20000,r30000,r40000,r500fa,r600f4 $*;; 226 | 220) perf stat -e r10002,r200F8,r300F8,r4001E,r500fa,r600f4 $*;; 227 | 221) perf stat -e r100F0,r200F2,r300F4,r400F8,r500fa,r600f4 $*;; 228 | 222) perf stat -e r100F8,r200F2,r3001E,r400F6,r500fa,r600f4 $*;; 229 | 223) perf stat -e 
r10036,r20036,r30036,r40002,r500fa,r600f4 $*;; 230 | 224) perf stat -e r1D04A,r2002E,r30002,r4D048,r500fa,r600f4 $*;; 231 | 225) perf stat -e r1003E,r20002,r3D046,r40024,r500fa,r600f4 $*;; 232 | 226) perf stat -e r1D048,r2D048,r30002,r40020,r500fa,r600f4 $*;; 233 | 227) perf stat -e r10002,r2002C,r3D04A,r4C042,r500fa,r600f4 $*;; 234 | 228) perf stat -e r1D044,r20002,r30030,r40026,r500fa,r600f4 $*;; 235 | 229) perf stat -e r1003F,r20024,r3D04E,r40002,r500fa,r600f4 $*;; 236 | 230) perf stat -e r1D040,r20020,r30002,r4D048,r500fa,r600f4 $*;; 237 | 231) perf stat -e r1D042,r2D048,r30002,r40028,r500fa,r600f4 $*;; 238 | 232) perf stat -e r10002,r2002A,r3D044,r4D048,r500fa,r600f4 $*;; 239 | 233) perf stat -e r1D04C,r20028,r3C042,r40002,r500fa,r600f4 $*;; 240 | 234) perf stat -e r1003E,r20002,r3D042,r4002C,r500fa,r600f4 $*;; 241 | 235) perf stat -e r1D04E,r20026,r30030,r40002,r500fa,r600f4 $*;; 242 | 236) perf stat -e r1003F,r20002,r3D04C,r4002A,r500fa,r600f4 $*;; 243 | 237) perf stat -e r1D084,r2D086,r30002,r4001E,r500fa,r600f4 $*;; 244 | 238) perf stat -e r10002,r2001E,r3D088,r4D08A,r500fa,r600f4 $*;; 245 | 239) perf stat -e r1D082,r2D08C,r30002,r40064,r500fa,r600f4 $*;; 246 | 240) perf stat -e r10032,r20030,r30030,r40002,r500fa,r600f4 $*;; 247 | 241) perf stat -e r10034,r20034,r30034,r40002,r500fa,r600f4 $*;; 248 | 242) perf stat -e r10002,r2D05E,r3D05E,r4D05E,r500fa,r600f4 $*;; 249 | 243) perf stat -e r1D05E,r2D05E,r3D05E,r40002,r500fa,r600f4 $*;; 250 | 244) perf stat -e r10002,r2D05C,r3D05C,r4D05C,r500fa,r600f4 $*;; 251 | 245) perf stat -e r1D05C,r2D05C,r3D05C,r40002,r500fa,r600f4 $*;; 252 | 246) perf stat -e r1003E,r20002,r3D05A,r4003E,r500fa,r600f4 $*;; 253 | 247) perf stat -e r10002,r2D052,r3D056,r4D056,r500fa,r600f4 $*;; 254 | 248) perf stat -e r1D050,r2D054,r3D052,r40002,r500fa,r600f4 $*;; 255 | 249) perf stat -e r10002,r2D056,r3D056,r4D054,r500fa,r600f4 $*;; 256 | 250) perf stat -e r1D054,r2D050,r30002,r4D058,r500fa,r600f4 $*;; 257 | 251) perf stat -e 
r1D052,r2D058,r30002,r4D052,r500fa,r600f4 $*;; 258 | 252) perf stat -e r1D08E,r20002,r3003A,r40034,r500fa,r600f4 $*;; 259 | 253) perf stat -e r10002,r20038,r3003A,r40032,r500fa,r600f4 $*;; 260 | 254) perf stat -e r10002,r2003A,r3D080,r40032,r500fa,r600f4 $*;; 261 | 255) perf stat -e r1003C,r20002,r30032,r40038,r500fa,r600f4 $*;; 262 | 256) perf stat -e r1003D,r20032,r3003F,r40002,r500fa,r600f4 $*;; 263 | 257) perf stat -e r10030,r200F4,r30002,r40030,r500fa,r600f4 $*;; 264 | 258) perf stat -e r1D082,r20002,r30064,r40064,r500fa,r600f4 $*;; 265 | 259) perf stat -e r1001E,r2001E,r30002,r40032,r500fa,r600f4 $*;; 266 | 260) perf stat -e r1D040,r20020,r3D0A2,r4000A,r500fa,r600f4 $*;; 267 | *) echo "GROUP NOT FOUND $g";; 268 | esac 269 | 270 | $(dirname $0)/groupnamepower7.sh $g 271 | -------------------------------------------------------------------------------- /tools/p7.sh: -------------------------------------------------------------------------------- 1 | #!/bin/ksh 2 | g=$1 3 | shift 4 | p=$1 5 | shift 6 | echo "************************* group $g" 1>&2 7 | case $g in 8 | 0) perf stat -p $p -e r1001E,r200F4,r300F2,r40002,r500fa,r600f4 $*;; 9 | 1) perf stat -p $p -e r140A0,r240A2,r340A4,r440AE,r500fa,r600f4 $*;; 10 | 2) perf stat -p $p -e r1409C,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 11 | 3) perf stat -p $p -e r10068,r20004,r3409C,r400F6,r500fa,r600f4 $*;; 12 | 4) perf stat -p $p -e r140AC,r2409E,r340AE,r440A4,r500fa,r600f4 $*;; 13 | 5) perf stat -p $p -e r148AA,r248AE,r3409C,r440A8,r500fa,r600f4 $*;; 14 | 6) perf stat -p $p -e r140A0,r240A2,r340A8,r440AA,r500fa,r600f4 $*;; 15 | 7) perf stat -p $p -e r140AC,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 16 | 8) perf stat -p $p -e r140AE,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 17 | 9) perf stat -p $p -e r140A4,r240A8,r340A0,r440A2,r500fa,r600f4 $*;; 18 | 10) perf stat -p $p -e r100F6,r2D090,r3D092,r4D890,r500fa,r600f4 $*;; 19 | 11) perf stat -p $p -e 
r15088,r20066,r300FC,r400FC,r500fa,r600f4 $*;; 20 | 12) perf stat -p $p -e r1C05E,r2C05E,r3C05E,r4C05E,r500fa,r600f4 $*;; 21 | 13) perf stat -p $p -e r1C05C,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 22 | 14) perf stat -p $p -e r10002,r2C05C,r3C05C,r4C05C,r500fa,r600f4 $*;; 23 | 15) perf stat -p $p -e r1D090,r200FE,r3C05A,r400F0,r500fa,r600f4 $*;; 24 | 16) perf stat -p $p -e r1001E,r2C058,r3C05A,r400FA,r500fa,r600f4 $*;; 25 | 17) perf stat -p $p -e r1001E,r2C058,r3C05A,r4C058,r500fa,r600f4 $*;; 26 | 18) perf stat -p $p -e r1D090,r24048,r30002,r400FA,r500fa,r600f4 $*;; 27 | 19) perf stat -p $p -e r100F6,r2D090,r3D092,r40002,r500fa,r600f4 $*;; 28 | 20) perf stat -p $p -e r1C050,r2E050,r3C056,r4E054,r500fa,r600f4 $*;; 29 | 21) perf stat -p $p -e r1E050,r2E054,r3E054,r4C054,r500fa,r600f4 $*;; 30 | 22) perf stat -p $p -e r1C054,r2C058,r3E052,r4C052,r500fa,r600f4 $*;; 31 | 23) perf stat -p $p -e r1E052,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 32 | 24) perf stat -p $p -e r1C052,r2C056,r3C054,r4C056,r500fa,r600f4 $*;; 33 | 25) perf stat -p $p -e r1E054,r2E052,r3E056,r4E052,r500fa,r600f4 $*;; 34 | 26) perf stat -p $p -e r1E054,r2E056,r3E056,r4E056,r500fa,r600f4 $*;; 35 | 27) perf stat -p $p -e r1E050,r2E058,r3E052,r4E058,r500fa,r600f4 $*;; 36 | 28) perf stat -p $p -e r1C050,r2C050,r3C052,r4C058,r500fa,r600f4 $*;; 37 | 29) perf stat -p $p -e r1C050,r2C050,r30002,r4001E,r500fa,r600f4 $*;; 38 | 30) perf stat -p $p -e r1C052,r2C054,r30002,r4C054,r500fa,r600f4 $*;; 39 | 31) perf stat -p $p -e r10002,r2C052,r3C052,r4C052,r500fa,r600f4 $*;; 40 | 32) perf stat -p $p -e r1006E,r20006,r3000C,r4000C,r500fa,r600f4 $*;; 41 | 33) perf stat -p $p -e r1006E,r20006,r30006,r4000C,r500fa,r600f4 $*;; 42 | 34) perf stat -p $p -e r1C880,r2C080,r3C082,r4D0A6,r500fa,r600f4 $*;; 43 | 35) perf stat -p $p -e r12088,r2208A,r3208C,r400F8,r500fa,r600f4 $*;; 44 | 36) perf stat -p $p -e r12086,r22082,r3208E,r4C0AA,r500fa,r600f4 $*;; 45 | 37) perf stat -p $p -e r12082,r2001E,r30012,r400F8,r500fa,r600f4 $*;; 46 
| 38) perf stat -p $p -e r1C8B0,r2C8B4,r3C8B8,r4C8BC,r500fa,r600f4 $*;; 47 | 39) perf stat -p $p -e r1C8B0,r2C0B0,r3C0B2,r400F8,r500fa,r600f4 $*;; 48 | 40) perf stat -p $p -e r1C8B4,r2C0B4,r3C0B6,r400F8,r500fa,r600f4 $*;; 49 | 41) perf stat -p $p -e r1C8B8,r2C0B8,r3C0BA,r400F8,r500fa,r600f4 $*;; 50 | 42) perf stat -p $p -e r1C8BC,r2C0BC,r3C0BE,r400F8,r500fa,r600f4 $*;; 51 | 43) perf stat -p $p -e r10018,r2408A,r34096,r4408E,r500fa,r600f4 $*;; 52 | 44) perf stat -p $p -e r100FA,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 53 | 45) perf stat -p $p -e r10012,r2000C,r300F4,r440B0,r500fa,r600f4 $*;; 54 | 46) perf stat -p $p -e r10062,r20060,r30060,r440B4,r500fa,r600f4 $*;; 55 | 47) perf stat -p $p -e r140B2,r20062,r30062,r40062,r500fa,r600f4 $*;; 56 | 48) perf stat -p $p -e r140B0,r240B2,r340B4,r440B6,r500fa,r600f4 $*;; 57 | 49) perf stat -p $p -e r10060,r2000C,r300F4,r40060,r500fa,r600f4 $*;; 58 | 50) perf stat -p $p -e r1000E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 59 | 51) perf stat -p $p -e r10004,r200F4,r30002,r40004,r500fa,r600f4 $*;; 60 | 52) perf stat -p $p -e r1001E,r2000E,r3000E,r4000E,r500fa,r600f4 $*;; 61 | 53) perf stat -p $p -e r1000E,r2000E,r3001E,r40002,r500fa,r600f4 $*;; 62 | 54) perf stat -p $p -e r16280,r26280,r36280,r46282,r500fa,r600f4 $*;; 63 | 55) perf stat -p $p -e r16382,r2001E,r36380,r40002,r500fa,r600f4 $*;; 64 | 56) perf stat -p $p -e r16280,r26280,r36282,r46280,r500fa,r600f4 $*;; 65 | 57) perf stat -p $p -e r16082,r26080,r30002,r4001E,r500fa,r600f4 $*;; 66 | 58) perf stat -p $p -e r10002,r2001E,r36182,r46182,r500fa,r600f4 $*;; 67 | 59) perf stat -p $p -e r10002,r2001E,r36180,r46180,r500fa,r600f4 $*;; 68 | 60) perf stat -p $p -e r16282,r26282,r30002,r4001E,r500fa,r600f4 $*;; 69 | 61) perf stat -p $p -e r10081,r20081,r30081,r40081,r500fa,r600f4 $*;; 70 | 62) perf stat -p $p -e r10083,r20083,r30083,r40083,r500fa,r600f4 $*;; 71 | 63) perf stat -p $p -e r10881,r20881,r30881,r40881,r500fa,r600f4 $*;; 72 | 64) perf stat -p $p -e 
r10883,r20883,r30883,r40883,r500fa,r600f4 $*;; 73 | 65) perf stat -p $p -e r14098,r2409A,r34088,r44082,r500fa,r600f4 $*;; 74 | 66) perf stat -p $p -e r1C040,r200F2,r300F6,r400F2,r500fa,r600f4 $*;; 75 | 67) perf stat -p $p -e r1C048,r2001E,r300F6,r40002,r500fa,r600f4 $*;; 76 | 68) perf stat -p $p -e r1C042,r2C044,r300F6,r40002,r500fa,r600f4 $*;; 77 | 69) perf stat -p $p -e r10064,r2C0AC,r3C0AE,r4C8AC,r500fa,r600f4 $*;; 78 | 70) perf stat -p $p -e r10064,r20064,r3C8A8,r40008,r500fa,r600f4 $*;; 79 | 71) perf stat -p $p -e r1C8A8,r2C0A8,r3001E,r40002,r500fa,r600f4 $*;; 80 | 72) perf stat -p $p -e r1C8A4,r2C0A4,r3C0A6,r40002,r500fa,r600f4 $*;; 81 | 73) perf stat -p $p -e r1C88C,r2C08C,r3C08E,r40002,r500fa,r600f4 $*;; 82 | 74) perf stat -p $p -e r100F8,r20008,r34086,r4001E,r500fa,r600f4 $*;; 83 | 75) perf stat -p $p -e r1209C,r2209E,r320A0,r420A2,r500fa,r600f4 $*;; 84 | 76) perf stat -p $p -e r16180,r26182,r30002,r4001E,r500fa,r600f4 $*;; 85 | 77) perf stat -p $p -e r16182,r26180,r30002,r4001E,r500fa,r600f4 $*;; 86 | 78) perf stat -p $p -e r10006,r20006,r30006,r400F2,r500fa,r600f4 $*;; 87 | 79) perf stat -p $p -e r10016,r20006,r30006,r40006,r500fa,r600f4 $*;; 88 | 80) perf stat -p $p -e r12092,r22094,r32096,r42098,r500fa,r600f4 $*;; 89 | 81) perf stat -p $p -e r1006E,r2006E,r3006E,r4006E,r500fa,r600f4 $*;; 90 | 82) perf stat -p $p -e r100F2,r200F2,r3000A,r400F2,r500fa,r600f4 $*;; 91 | 83) perf stat -p $p -e r100F2,r2001E,r30002,r400F2,r500fa,r600f4 $*;; 92 | 84) perf stat -p $p -e r14888,r2488C,r34890,r44898,r500fa,r600f4 $*;; 93 | 85) perf stat -p $p -e r14090,r24092,r34094,r44890,r500fa,r600f4 $*;; 94 | 86) perf stat -p $p -e r100F6,r200FC,r30002,r4001E,r500fa,r600f4 $*;; 95 | 87) perf stat -p $p -e r1C040,r20016,r300F6,r40018,r500fa,r600f4 $*;; 96 | 88) perf stat -p $p -e r1000E,r20014,r30004,r40014,r500fa,r600f4 $*;; 97 | 89) perf stat -p $p -e r10026,r20012,r3001A,r40016,r500fa,r600f4 $*;; 98 | 90) perf stat -p $p -e r100F4,r20018,r3003E,r40012,r500fa,r600f4 $*;; 99 
| 91) perf stat -p $p -e r10028,r2001C,r3003F,r4000A,r500fa,r600f4 $*;; 100 | 92) perf stat -p $p -e r1001C,r2003C,r30002,r4001C,r500fa,r600f4 $*;; 101 | 93) perf stat -p $p -e r100F8,r2001A,r30014,r4001A,r500fa,r600f4 $*;; 102 | 94) perf stat -p $p -e r1C040,r2C040,r3C042,r4C042,r500fa,r600f4 $*;; 103 | 95) perf stat -p $p -e r1C048,r2C046,r3C04A,r4C048,r500fa,r600f4 $*;; 104 | 96) perf stat -p $p -e r1C04A,r2C048,r3C046,r4C048,r500fa,r600f4 $*;; 105 | 97) perf stat -p $p -e r1C044,r2C044,r3C04C,r4C044,r500fa,r600f4 $*;; 106 | 98) perf stat -p $p -e r1C04E,r2C042,r3C044,r4C046,r500fa,r600f4 $*;; 107 | 99) perf stat -p $p -e r1C042,r2C044,r3C04E,r4C048,r500fa,r600f4 $*;; 108 | 100) perf stat -p $p -e r1C04C,r2C048,r3C04C,r4C044,r500fa,r600f4 $*;; 109 | 101) perf stat -p $p -e r10002,r2C040,r300FE,r4C042,r500fa,r600f4 $*;; 110 | 102) perf stat -p $p -e r1C040,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 111 | 103) perf stat -p $p -e r1C042,r2C044,r3C044,r4C044,r500fa,r600f4 $*;; 112 | 104) perf stat -p $p -e r1C040,r200FE,r300FE,r400FA,r500fa,r600f4 $*;; 113 | 105) perf stat -p $p -e r1C042,r2C042,r3C042,r4C042,r500fa,r600f4 $*;; 114 | 106) perf stat -p $p -e r1C05C,r20002,r3C044,r4C044,r500fa,r600f4 $*;; 115 | 107) perf stat -p $p -e r1C04A,r20002,r3C042,r4C042,r500fa,r600f4 $*;; 116 | 108) perf stat -p $p -e r1C04A,r20002,r300F6,r4C042,r500fa,r600f4 $*;; 117 | 109) perf stat -p $p -e r14040,r24040,r3404A,r44048,r500fa,r600f4 $*;; 118 | 110) perf stat -p $p -e r14048,r24042,r3404C,r44042,r500fa,r600f4 $*;; 119 | 111) perf stat -p $p -e r1404A,r24048,r34044,r44044,r500fa,r600f4 $*;; 120 | 112) perf stat -p $p -e r14044,r24046,r34046,r44046,r500fa,r600f4 $*;; 121 | 113) perf stat -p $p -e r1404E,r24044,r3404E,r44048,r500fa,r600f4 $*;; 122 | 114) perf stat -p $p -e r14046,r24048,r3404A,r44048,r500fa,r600f4 $*;; 123 | 115) perf stat -p $p -e r14042,r24044,r34044,r44044,r500fa,r600f4 $*;; 124 | 116) perf stat -p $p -e r1404C,r24048,r3404A,r44048,r500fa,r600f4 $*;; 125 | 117) 
perf stat -p $p -e r14046,r24042,r34042,r44042,r500fa,r600f4 $*;; 126 | 118) perf stat -p $p -e r14040,r24040,r30002,r4001E,r500fa,r600f4 $*;; 127 | 119) perf stat -p $p -e r14042,r24044,r3404A,r40002,r500fa,r600f4 $*;; 128 | 120) perf stat -p $p -e r1001E,r20002,r34044,r44044,r500fa,r600f4 $*;; 129 | 121) perf stat -p $p -e r1404A,r20002,r34042,r44042,r500fa,r600f4 $*;; 130 | 122) perf stat -p $p -e r1D8A8,r2D8AC,r3D8B4,r4D8B8,r500fa,r600f4 $*;; 131 | 123) perf stat -p $p -e r1D8BC,r2C880,r30066,r400F0,r500fa,r600f4 $*;; 132 | 124) perf stat -p $p -e r1A080,r2A082,r3A098,r4A09A,r500fa,r600f4 $*;; 133 | 125) perf stat -p $p -e r1A09C,r2A09E,r3A0A0,r4A0A2,r500fa,r600f4 $*;; 134 | 126) perf stat -p $p -e r1A898,r2A88C,r3A08C,r4A08E,r500fa,r600f4 $*;; 135 | 127) perf stat -p $p -e r1A084,r2A086,r3A884,r40002,r500fa,r600f4 $*;; 136 | 128) perf stat -p $p -e r1A090,r2A092,r3A890,r40002,r500fa,r600f4 $*;; 137 | 129) perf stat -p $p -e r1B880,r2B080,r3B082,r40002,r500fa,r600f4 $*;; 138 | 130) perf stat -p $p -e r1A8AC,r2A0AC,r3A0AE,r40002,r500fa,r600f4 $*;; 139 | 131) perf stat -p $p -e r1A8BC,r2A0BC,r3A0BE,r40002,r500fa,r600f4 $*;; 140 | 132) perf stat -p $p -e r1B88C,r2B08C,r3B08E,r40002,r500fa,r600f4 $*;; 141 | 133) perf stat -p $p -e r1A8A8,r2A0A8,r3A0AA,r4A0A4,r500fa,r600f4 $*;; 142 | 134) perf stat -p $p -e r1A888,r2A088,r3A08A,r40002,r500fa,r600f4 $*;; 143 | 135) perf stat -p $p -e r1A894,r2A094,r3A096,r40002,r500fa,r600f4 $*;; 144 | 136) perf stat -p $p -e r1B888,r2B088,r3B08A,r40002,r500fa,r600f4 $*;; 145 | 137) perf stat -p $p -e r1B884,r2B084,r3B086,r40002,r500fa,r600f4 $*;; 146 | 138) perf stat -p $p -e r1A880,r2A89C,r3A8A0,r4A898,r500fa,r600f4 $*;; 147 | 139) perf stat -p $p -e r1B890,r2B090,r3B09C,r40002,r500fa,r600f4 $*;; 148 | 140) perf stat -p $p -e r1B894,r2B094,r3B096,r4B0A0,r500fa,r600f4 $*;; 149 | 141) perf stat -p $p -e r1B098,r2B09A,r3B092,r4B09E,r500fa,r600f4 $*;; 150 | 142) perf stat -p $p -e r1A8B0,r2A0B0,r3A0B2,r40002,r500fa,r600f4 $*;; 151 | 
143) perf stat -p $p -e r1A8B4,r2A0B4,r3A0B6,r40002,r500fa,r600f4 $*;; 152 | 144) perf stat -p $p -e r1A8B8,r2A0B8,r3A0BA,r40002,r500fa,r600f4 $*;; 153 | 145) perf stat -p $p -e r10068,r200F4,r30002,r4A8BC,r500fa,r600f4 $*;; 154 | 146) perf stat -p $p -e r1C884,r2B88C,r3A884,r4A880,r500fa,r600f4 $*;; 155 | 147) perf stat -p $p -e r1A888,r2A8BC,r3A884,r4A880,r500fa,r600f4 $*;; 156 | 148) perf stat -p $p -e r100F4,r2A8BC,r3A8B8,r4A880,r500fa,r600f4 $*;; 157 | 149) perf stat -p $p -e r1B88C,r2A8BC,r3A8B4,r4A8B0,r500fa,r600f4 $*;; 158 | 150) perf stat -p $p -e r1D0A4,r2003E,r3001C,r40008,r500fa,r600f4 $*;; 159 | 151) perf stat -p $p -e r10066,r2C090,r30066,r4208E,r500fa,r600f4 $*;; 160 | 152) perf stat -p $p -e r1D098,r2D09A,r3D0A0,r4D0A4,r500fa,r600f4 $*;; 161 | 153) perf stat -p $p -e r1C8A0,r2C0A0,r3C0A2,r40002,r500fa,r600f4 $*;; 162 | 154) perf stat -p $p -e r1D096,r2D097,r3D09C,r40002,r500fa,r600f4 $*;; 163 | 155) perf stat -p $p -e r1D09C,r2D09E,r3D0A0,r40002,r500fa,r600f4 $*;; 164 | 156) perf stat -p $p -e r1D0A1,r2D09F,r3D09D,r40002,r500fa,r600f4 $*;; 165 | 157) perf stat -p $p -e r1D8B8,r2D0B8,r3D0BA,r40002,r500fa,r600f4 $*;; 166 | 158) perf stat -p $p -e r16480,r26480,r3001E,r40002,r500fa,r600f4 $*;; 167 | 159) perf stat -p $p -e r16482,r26482,r3001E,r40002,r500fa,r600f4 $*;; 168 | 160) perf stat -p $p -e r100F0,r24080,r30016,r40002,r500fa,r600f4 $*;; 169 | 161) perf stat -p $p -e r12080,r200F8,r300F8,r4001E,r500fa,r600f4 $*;; 170 | 162) perf stat -p $p -e r100F2,r2000A,r300F2,r400F2,r500fa,r600f4 $*;; 171 | 163) perf stat -p $p -e r1000C,r2001A,r3001E,r4001C,r500fa,r600f4 $*;; 172 | 164) perf stat -p $p -e r1000A,r248AE,r340A4,r400F6,r500fa,r600f4 $*;; 173 | 165) perf stat -p $p -e r1408C,r2408E,r3488C,r40002,r500fa,r600f4 $*;; 174 | 166) perf stat -p $p -e r10038,r2000A,r3001E,r40066,r500fa,r600f4 $*;; 175 | 167) perf stat -p $p -e r140A6,r200F8,r300F6,r400F6,r500fa,r600f4 $*;; 176 | 168) perf stat -p $p -e r12084,r22086,r3C0A8,r400F6,r500fa,r600f4 $*;; 177 
| 169) perf stat -p $p -e r1001A,r2D8A8,r3D8B8,r44084,r500fa,r600f4 $*;; 178 | 170) perf stat -p $p -e r100F4,r2001E,r30004,r40002,r500fa,r600f4 $*;; 179 | 171) perf stat -p $p -e r10002,r200F0,r300F8,r400F8,r500fa,r600f4 $*;; 180 | 172) perf stat -p $p -e r100F8,r200F0,r300FC,r400F6,r500fa,r600f4 $*;; 181 | 173) perf stat -p $p -e r1001E,r2001E,r30002,r40066,r500fa,r600f4 $*;; 182 | 174) perf stat -p $p -e r1D0A2,r2004A,r300F6,r4004A,r500fa,r600f4 $*;; 183 | 175) perf stat -p $p -e r10028,r2C09C,r3C09E,r4004C,r500fa,r600f4 $*;; 184 | 176) perf stat -p $p -e r10068,r200F0,r3D054,r4004E,r500fa,r600f4 $*;; 185 | 177) perf stat -p $p -e r10000,r2001E,r3D094,r40002,r500fa,r600f4 $*;; 186 | 178) perf stat -p $p -e r10014,r2001E,r30014,r40002,r500fa,r600f4 $*;; 187 | 179) perf stat -p $p -e r1D094,r2001E,r3209A,r40002,r500fa,r600f4 $*;; 188 | 180) perf stat -p $p -e r1001E,r228A4,r320A4,r420A6,r500fa,r600f4 $*;; 189 | 181) perf stat -p $p -e r1F080,r2F080,r3F080,r4F080,r500fa,r600f4 $*;; 190 | 182) perf stat -p $p -e r15080,r25082,r35084,r45086,r500fa,r600f4 $*;; 191 | 183) perf stat -p $p -e r1D0AC,r2D0AE,r3D8AC,r4D8B8,r500fa,r600f4 $*;; 192 | 184) perf stat -p $p -e r1F082,r2F082,r3F082,r4F082,r500fa,r600f4 $*;; 193 | 185) perf stat -p $p -e r1001E,r2D8B4,r3D0B4,r4D0B6,r500fa,r600f4 $*;; 194 | 186) perf stat -p $p -e r1001E,r2D8BC,r3D0BC,r4D0BE,r500fa,r600f4 $*;; 195 | 187) perf stat -p $p -e r1D0B0,r2D8A8,r3D0A8,r4D0AA,r500fa,r600f4 $*;; 196 | 188) perf stat -p $p -e r1C094,r2C096,r3001E,r4C894,r500fa,r600f4 $*;; 197 | 189) perf stat -p $p -e r1001E,r2C884,r3C084,r4C086,r500fa,r600f4 $*;; 198 | 190) perf stat -p $p -e r1001E,r2C888,r3C088,r4C08A,r500fa,r600f4 $*;; 199 | 191) perf stat -p $p -e r16080,r26082,r3F080,r4001E,r500fa,r600f4 $*;; 200 | 192) perf stat -p $p -e r1C894,r2C8AC,r3C098,r4C09A,r500fa,r600f4 $*;; 201 | 193) perf stat -p $p -e r1508A,r25088,r3C098,r4C09A,r500fa,r600f4 $*;; 202 | 194) perf stat -p $p -e r140B8,r240BA,r3001E,r40002,r500fa,r600f4 $*;; 
203 | 195) perf stat -p $p -e r100F0,r200F2,r30016,r40002,r500fa,r600f4 $*;; 204 | 196) perf stat -p $p -e r16880,r26880,r36082,r46080,r500fa,r600f4 $*;; 205 | 197) perf stat -p $p -e r10002,r2001E,r36080,r46080,r500fa,r600f4 $*;; 206 | 198) perf stat -p $p -e r10002,r2001E,r36482,r400FA,r500fa,r600f4 $*;; 207 | 199) perf stat -p $p -e r10002,r2001E,r36382,r46382,r500fa,r600f4 $*;; 208 | 200) perf stat -p $p -e r10002,r2001E,r36480,r400FA,r500fa,r600f4 $*;; 209 | 201) perf stat -p $p -e r10002,r200F4,r3001E,r46380,r500fa,r600f4 $*;; 210 | 202) perf stat -p $p -e r100F6,r240BC,r340BE,r40002,r500fa,r600f4 $*;; 211 | 203) perf stat -p $p -e r12090,r220A8,r3001E,r40002,r500fa,r600f4 $*;; 212 | 204) perf stat -p $p -e r1001E,r20006,r30008,r40002,r500fa,r600f4 $*;; 213 | 205) perf stat -p $p -e r10008,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 214 | 206) perf stat -p $p -e r10010,r20010,r30010,r40010,r500fa,r600f4 $*;; 215 | 207) perf stat -p $p -e r10024,r20010,r30024,r40010,r500fa,r600f4 $*;; 216 | 208) perf stat -p $p -e r10020,r200F4,r30020,r40002,r500fa,r600f4 $*;; 217 | 209) perf stat -p $p -e r10022,r200F4,r30022,r40002,r500fa,r600f4 $*;; 218 | 210) perf stat -p $p -e r1208A,r22096,r3D0B2,r40002,r500fa,r600f4 $*;; 219 | 211) perf stat -p $p -e r100F6,r200FC,r300F0,r400F0,r500fa,r600f4 $*;; 220 | 212) perf stat -p $p -e r1001E,r200F6,r300FC,r400FC,r500fa,r600f4 $*;; 221 | 213) perf stat -p $p -e r100FA,r200F4,r3001E,r400F4,r500fa,r600f4 $*;; 222 | 214) perf stat -p $p -e r100F4,r200F4,r3001E,r400FA,r500fa,r600f4 $*;; 223 | 215) perf stat -p $p -e r100F2,r200F4,r300F2,r400F2,r500fa,r600f4 $*;; 224 | 216) perf stat -p $p -e r10002,r200F0,r300F0,r400F0,r500fa,r600f4 $*;; 225 | 217) perf stat -p $p -e r10002,r200FE,r300F6,r400F0,r500fa,r600f4 $*;; 226 | 218) perf stat -p $p -e r100F6,r200FC,r30002,r400FC,r500fa,r600f4 $*;; 227 | 219) perf stat -p $p -e r10000,r20000,r30000,r40000,r500fa,r600f4 $*;; 228 | 220) perf stat -p $p -e r10002,r200F8,r300F8,r4001E,r500fa,r600f4 
$*;; 229 | 221) perf stat -p $p -e r100F0,r200F2,r300F4,r400F8,r500fa,r600f4 $*;; 230 | 222) perf stat -p $p -e r100F8,r200F2,r3001E,r400F6,r500fa,r600f4 $*;; 231 | 223) perf stat -p $p -e r10036,r20036,r30036,r40002,r500fa,r600f4 $*;; 232 | 224) perf stat -p $p -e r1D04A,r2002E,r30002,r4D048,r500fa,r600f4 $*;; 233 | 225) perf stat -p $p -e r1003E,r20002,r3D046,r40024,r500fa,r600f4 $*;; 234 | 226) perf stat -p $p -e r1D048,r2D048,r30002,r40020,r500fa,r600f4 $*;; 235 | 227) perf stat -p $p -e r10002,r2002C,r3D04A,r4C042,r500fa,r600f4 $*;; 236 | 228) perf stat -p $p -e r1D044,r20002,r30030,r40026,r500fa,r600f4 $*;; 237 | 229) perf stat -p $p -e r1003F,r20024,r3D04E,r40002,r500fa,r600f4 $*;; 238 | 230) perf stat -p $p -e r1D040,r20020,r30002,r4D048,r500fa,r600f4 $*;; 239 | 231) perf stat -p $p -e r1D042,r2D048,r30002,r40028,r500fa,r600f4 $*;; 240 | 232) perf stat -p $p -e r10002,r2002A,r3D044,r4D048,r500fa,r600f4 $*;; 241 | 233) perf stat -p $p -e r1D04C,r20028,r3C042,r40002,r500fa,r600f4 $*;; 242 | 234) perf stat -p $p -e r1003E,r20002,r3D042,r4002C,r500fa,r600f4 $*;; 243 | 235) perf stat -p $p -e r1D04E,r20026,r30030,r40002,r500fa,r600f4 $*;; 244 | 236) perf stat -p $p -e r1003F,r20002,r3D04C,r4002A,r500fa,r600f4 $*;; 245 | 237) perf stat -p $p -e r1D084,r2D086,r30002,r4001E,r500fa,r600f4 $*;; 246 | 238) perf stat -p $p -e r10002,r2001E,r3D088,r4D08A,r500fa,r600f4 $*;; 247 | 239) perf stat -p $p -e r1D082,r2D08C,r30002,r40064,r500fa,r600f4 $*;; 248 | 240) perf stat -p $p -e r10032,r20030,r30030,r40002,r500fa,r600f4 $*;; 249 | 241) perf stat -p $p -e r10034,r20034,r30034,r40002,r500fa,r600f4 $*;; 250 | 242) perf stat -p $p -e r10002,r2D05E,r3D05E,r4D05E,r500fa,r600f4 $*;; 251 | 243) perf stat -p $p -e r1D05E,r2D05E,r3D05E,r40002,r500fa,r600f4 $*;; 252 | 244) perf stat -p $p -e r10002,r2D05C,r3D05C,r4D05C,r500fa,r600f4 $*;; 253 | 245) perf stat -p $p -e r1D05C,r2D05C,r3D05C,r40002,r500fa,r600f4 $*;; 254 | 246) perf stat -p $p -e 
r1003E,r20002,r3D05A,r4003E,r500fa,r600f4 $*;; 255 | 247) perf stat -p $p -e r10002,r2D052,r3D056,r4D056,r500fa,r600f4 $*;; 256 | 248) perf stat -p $p -e r1D050,r2D054,r3D052,r40002,r500fa,r600f4 $*;; 257 | 249) perf stat -p $p -e r10002,r2D056,r3D056,r4D054,r500fa,r600f4 $*;; 258 | 250) perf stat -p $p -e r1D054,r2D050,r30002,r4D058,r500fa,r600f4 $*;; 259 | 251) perf stat -p $p -e r1D052,r2D058,r30002,r4D052,r500fa,r600f4 $*;; 260 | 252) perf stat -p $p -e r1D08E,r20002,r3003A,r40034,r500fa,r600f4 $*;; 261 | 253) perf stat -p $p -e r10002,r20038,r3003A,r40032,r500fa,r600f4 $*;; 262 | 254) perf stat -p $p -e r10002,r2003A,r3D080,r40032,r500fa,r600f4 $*;; 263 | 255) perf stat -p $p -e r1003C,r20002,r30032,r40038,r500fa,r600f4 $*;; 264 | 256) perf stat -p $p -e r1003D,r20032,r3003F,r40002,r500fa,r600f4 $*;; 265 | 257) perf stat -p $p -e r10030,r200F4,r30002,r40030,r500fa,r600f4 $*;; 266 | 258) perf stat -p $p -e r1D082,r20002,r30064,r40064,r500fa,r600f4 $*;; 267 | 259) perf stat -p $p -e r1001E,r2001E,r30002,r40032,r500fa,r600f4 $*;; 268 | 260) perf stat -p $p -e r1D040,r20020,r3D0A2,r4000A,r500fa,r600f4 $*;; 269 | *) echo "GROUP NOT FOUND $g";; 270 | esac 271 | 272 | /gsa/yktgsa/home/h/a/haichuan/workspace/gsimd/examples/RGB2Gray/groupnamepower7.sh $g 273 | --------------------------------------------------------------------------------